From 7c15d9bb8231f998ae7dc0b72415f5215459f7fb Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Tue, 19 Jul 2016 15:00:04 -0700
Subject: mm: Add is_migrate_cma_page

Code such as hardened user copy[1] needs a way to tell if a
page is CMA or not. Add is_migrate_cma_page in a similar way
to is_migrate_isolate_page.

[1]http://article.gmane.org/gmane.linux.kernel.mm/155238

Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/mmzone.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 02069c23486d..c8478b29f070 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -68,8 +68,10 @@ extern char * const migratetype_names[MIGRATE_TYPES];
 
 #ifdef CONFIG_CMA
 #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
+#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
 #else
 #  define is_migrate_cma(migratetype) false
+#  define is_migrate_cma_page(_page) false
 #endif
 
 #define for_each_migratetype_order(order, type) \
-- 
cgit v1.2.3


From 0f60a8efe4005ab5e65ce000724b04d4ca04a199 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 12 Jul 2016 16:19:48 -0700
Subject: mm: Implement stack frame object validation

This creates per-architecture function arch_within_stack_frames() that
should validate if a given object is contained by a kernel stack frame.
Initial implementation is on x86.

This is based on code from PaX.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/Kconfig                       |  9 ++++++++
 arch/x86/Kconfig                   |  1 +
 arch/x86/include/asm/thread_info.h | 44 ++++++++++++++++++++++++++++++++++++++
 include/linux/thread_info.h        |  9 ++++++++
 4 files changed, 63 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 15996290fed4..ef86cded5402 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -424,6 +424,15 @@ config CC_STACKPROTECTOR_STRONG
 
 endchoice
 
+config HAVE_ARCH_WITHIN_STACK_FRAMES
+	bool
+	help
+	  An architecture should select this if it can walk the kernel stack
+	  frames to determine if an object is part of either the arguments
+	  or local variables (i.e. that it excludes saved return addresses,
+	  and similar) by implementing an inline arch_within_stack_frames(),
+	  which is used by CONFIG_HARDENED_USERCOPY.
+
 config HAVE_CONTEXT_TRACKING
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d9a94da0c29f..a2865ddfc1ff 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -91,6 +91,7 @@ config X86
 	select HAVE_ARCH_SOFT_DIRTY		if X86_64
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+	select HAVE_ARCH_WITHIN_STACK_FRAMES
 	select HAVE_EBPF_JIT			if X86_64
 	select HAVE_CC_STACKPROTECTOR
 	select HAVE_CMPXCHG_DOUBLE
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 30c133ac05cd..ab386f1336f2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -180,6 +180,50 @@ static inline unsigned long current_stack_pointer(void)
 	return sp;
 }
 
+/*
+ * Walks up the stack frames to make sure that the specified object is
+ * entirely contained by a single stack frame.
+ *
+ * Returns:
+ *		 1 if within a frame
+ *		-1 if placed across a frame boundary (or outside stack)
+ *		 0 unable to determine (no frame pointers, etc)
+ */
+static inline int arch_within_stack_frames(const void * const stack,
+					   const void * const stackend,
+					   const void *obj, unsigned long len)
+{
+#if defined(CONFIG_FRAME_POINTER)
+	const void *frame = NULL;
+	const void *oldframe;
+
+	oldframe = __builtin_frame_address(1);
+	if (oldframe)
+		frame = __builtin_frame_address(2);
+	/*
+	 * low ----------------------------------------------> high
+	 * [saved bp][saved ip][args][local vars][saved bp][saved ip]
+	 *                     ^----------------^
+	 *               allow copies only within here
+	 */
+	while (stack <= frame && frame < stackend) {
+		/*
+		 * If obj + len extends past the last frame, this
+		 * check won't pass and the next frame will be 0,
+		 * causing us to bail out and correctly report
+		 * the copy as invalid.
+		 */
+		if (obj + len <= frame)
+			return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1;
+		oldframe = frame;
+		frame = *(const void * const *)frame;
+	}
+	return -1;
+#else
+	return 0;
+#endif
+}
+
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index b4c2a485b28a..3d5c80b4391d 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -146,6 +146,15 @@ static inline bool test_and_clear_restore_sigmask(void)
 #error "no set_restore_sigmask() provided and default one won't work"
 #endif
 
+#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
+static inline int arch_within_stack_frames(const void * const stack,
+					   const void * const stackend,
+					   const void *obj, unsigned long len)
+{
+	return 0;
+}
+#endif
+
 #endif	/* __KERNEL__ */
 
 #endif /* _LINUX_THREAD_INFO_H */
-- 
cgit v1.2.3


From f5509cc18daa7f82bcc553be70df2117c8eedc16 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 7 Jun 2016 11:05:33 -0700
Subject: mm: Hardened usercopy

This is the start of porting PAX_USERCOPY into the mainline kernel. This
is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The
work is based on code by PaX Team and Brad Spengler, and an earlier port
from Casey Schaufler. Additional non-slab page tests are from Rik van Riel.

This patch contains the logic for validating several conditions when
performing copy_to_user() and copy_from_user() on the kernel object
being copied to/from:
- address range doesn't wrap around
- address range isn't NULL or zero-allocated (with a non-zero copy size)
- if on the slab allocator:
  - object size must be less than or equal to copy size (when check is
    implemented in the allocator, which appear in subsequent patches)
- otherwise, object must not span page allocations (excepting Reserved
  and CMA ranges)
- if on the stack
  - object must not extend before/after the current process stack
  - object must be contained by a valid stack frame (when there is
    arch/build support for identifying stack frames)
- object must not overlap with kernel text

Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
---
 include/linux/slab.h        |  12 ++
 include/linux/thread_info.h |  15 +++
 mm/Makefile                 |   4 +
 mm/usercopy.c               | 268 ++++++++++++++++++++++++++++++++++++++++++++
 security/Kconfig            |  28 +++++
 5 files changed, 327 insertions(+)
 create mode 100644 mm/usercopy.c

diff --git a/include/linux/slab.h b/include/linux/slab.h
index aeb3e6d00a66..96a16a3fb7cb 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -155,6 +155,18 @@ void kfree(const void *);
 void kzfree(const void *);
 size_t ksize(const void *);
 
+#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
+const char *__check_heap_object(const void *ptr, unsigned long n,
+				struct page *page);
+#else
+static inline const char *__check_heap_object(const void *ptr,
+					      unsigned long n,
+					      struct page *page)
+{
+	return NULL;
+}
+#endif
+
 /*
  * Some archs want to perform DMA into kmalloc caches and need a guaranteed
  * alignment larger than the alignment of a 64-bit integer.
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 3d5c80b4391d..f24b99eac969 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -155,6 +155,21 @@ static inline int arch_within_stack_frames(const void * const stack,
 }
 #endif
 
+#ifdef CONFIG_HARDENED_USERCOPY
+extern void __check_object_size(const void *ptr, unsigned long n,
+					bool to_user);
+
+static inline void check_object_size(const void *ptr, unsigned long n,
+				     bool to_user)
+{
+	__check_object_size(ptr, n, to_user);
+}
+#else
+static inline void check_object_size(const void *ptr, unsigned long n,
+				     bool to_user)
+{ }
+#endif /* CONFIG_HARDENED_USERCOPY */
+
 #endif	/* __KERNEL__ */
 
 #endif /* _LINUX_THREAD_INFO_H */
diff --git a/mm/Makefile b/mm/Makefile
index 78c6f7dedb83..32d37247c7e5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n
 KCOV_INSTRUMENT_mmzone.o := n
 KCOV_INSTRUMENT_vmstat.o := n
 
+# Since __builtin_frame_address does work as used, disable the warning.
+CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
+
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -99,3 +102,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
 obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
 obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
+obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/usercopy.c b/mm/usercopy.c
new file mode 100644
index 000000000000..8ebae91a6b55
--- /dev/null
+++ b/mm/usercopy.c
@@ -0,0 +1,268 @@
+/*
+ * This implements the various checks for CONFIG_HARDENED_USERCOPY*,
+ * which are designed to protect kernel memory from needless exposure
+ * and overwrite under many unintended conditions. This code is based
+ * on PAX_USERCOPY, which is:
+ *
+ * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
+ * Security Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/sections.h>
+
+enum {
+	BAD_STACK = -1,
+	NOT_STACK = 0,
+	GOOD_FRAME,
+	GOOD_STACK,
+};
+
+/*
+ * Checks if a given pointer and length is contained by the current
+ * stack frame (if possible).
+ *
+ * Returns:
+ *	NOT_STACK: not at all on the stack
+ *	GOOD_FRAME: fully within a valid stack frame
+ *	GOOD_STACK: fully on the stack (when can't do frame-checking)
+ *	BAD_STACK: error condition (invalid stack position or bad stack frame)
+ */
+static noinline int check_stack_object(const void *obj, unsigned long len)
+{
+	const void * const stack = task_stack_page(current);
+	const void * const stackend = stack + THREAD_SIZE;
+	int ret;
+
+	/* Object is not on the stack at all. */
+	if (obj + len <= stack || stackend <= obj)
+		return NOT_STACK;
+
+	/*
+	 * Reject: object partially overlaps the stack (passing the
+	 * the check above means at least one end is within the stack,
+	 * so if this check fails, the other end is outside the stack).
+	 */
+	if (obj < stack || stackend < obj + len)
+		return BAD_STACK;
+
+	/* Check if object is safely within a valid frame. */
+	ret = arch_within_stack_frames(stack, stackend, obj, len);
+	if (ret)
+		return ret;
+
+	return GOOD_STACK;
+}
+
+static void report_usercopy(const void *ptr, unsigned long len,
+			    bool to_user, const char *type)
+{
+	pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
+		to_user ? "exposure" : "overwrite",
+		to_user ? "from" : "to", ptr, type ? : "unknown", len);
+	/*
+	 * For greater effect, it would be nice to do do_group_exit(),
+	 * but BUG() actually hooks all the lock-breaking and per-arch
+	 * Oops code, so that is used here instead.
+	 */
+	BUG();
+}
+
+/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */
+static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
+		     unsigned long high)
+{
+	unsigned long check_low = (uintptr_t)ptr;
+	unsigned long check_high = check_low + n;
+
+	/* Does not overlap if entirely above or entirely below. */
+	if (check_low >= high || check_high < low)
+		return false;
+
+	return true;
+}
+
+/* Is this address range in the kernel text area? */
+static inline const char *check_kernel_text_object(const void *ptr,
+						   unsigned long n)
+{
+	unsigned long textlow = (unsigned long)_stext;
+	unsigned long texthigh = (unsigned long)_etext;
+	unsigned long textlow_linear, texthigh_linear;
+
+	if (overlaps(ptr, n, textlow, texthigh))
+		return "<kernel text>";
+
+	/*
+	 * Some architectures have virtual memory mappings with a secondary
+	 * mapping of the kernel text, i.e. there is more than one virtual
+	 * kernel address that points to the kernel image. It is usually
+	 * when there is a separate linear physical memory mapping, in that
+	 * __pa() is not just the reverse of __va(). This can be detected
+	 * and checked:
+	 */
+	textlow_linear = (unsigned long)__va(__pa(textlow));
+	/* No different mapping: we're done. */
+	if (textlow_linear == textlow)
+		return NULL;
+
+	/* Check the secondary mapping... */
+	texthigh_linear = (unsigned long)__va(__pa(texthigh));
+	if (overlaps(ptr, n, textlow_linear, texthigh_linear))
+		return "<linear kernel text>";
+
+	return NULL;
+}
+
+static inline const char *check_bogus_address(const void *ptr, unsigned long n)
+{
+	/* Reject if object wraps past end of memory. */
+	if (ptr + n < ptr)
+		return "<wrapped address>";
+
+	/* Reject if NULL or ZERO-allocation. */
+	if (ZERO_OR_NULL_PTR(ptr))
+		return "<null>";
+
+	return NULL;
+}
+
+static inline const char *check_heap_object(const void *ptr, unsigned long n,
+					    bool to_user)
+{
+	struct page *page, *endpage;
+	const void *end = ptr + n - 1;
+	bool is_reserved, is_cma;
+
+	/*
+	 * Some architectures (arm64) return true for virt_addr_valid() on
+	 * vmalloced addresses. Work around this by checking for vmalloc
+	 * first.
+	 */
+	if (is_vmalloc_addr(ptr))
+		return NULL;
+
+	if (!virt_addr_valid(ptr))
+		return NULL;
+
+	page = virt_to_head_page(ptr);
+
+	/* Check slab allocator for flags and size. */
+	if (PageSlab(page))
+		return __check_heap_object(ptr, n, page);
+
+	/*
+	 * Sometimes the kernel data regions are not marked Reserved (see
+	 * check below). And sometimes [_sdata,_edata) does not cover
+	 * rodata and/or bss, so check each range explicitly.
+	 */
+
+	/* Allow reads of kernel rodata region (if not marked as Reserved). */
+	if (ptr >= (const void *)__start_rodata &&
+	    end <= (const void *)__end_rodata) {
+		if (!to_user)
+			return "<rodata>";
+		return NULL;
+	}
+
+	/* Allow kernel data region (if not marked as Reserved). */
+	if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
+		return NULL;
+
+	/* Allow kernel bss region (if not marked as Reserved). */
+	if (ptr >= (const void *)__bss_start &&
+	    end <= (const void *)__bss_stop)
+		return NULL;
+
+	/* Is the object wholly within one base page? */
+	if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
+		   ((unsigned long)end & (unsigned long)PAGE_MASK)))
+		return NULL;
+
+	/* Allow if start and end are inside the same compound page. */
+	endpage = virt_to_head_page(end);
+	if (likely(endpage == page))
+		return NULL;
+
+	/*
+	 * Reject if range is entirely either Reserved (i.e. special or
+	 * device memory), or CMA. Otherwise, reject since the object spans
+	 * several independently allocated pages.
+	 */
+	is_reserved = PageReserved(page);
+	is_cma = is_migrate_cma_page(page);
+	if (!is_reserved && !is_cma)
+		goto reject;
+
+	for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
+		page = virt_to_head_page(ptr);
+		if (is_reserved && !PageReserved(page))
+			goto reject;
+		if (is_cma && !is_migrate_cma_page(page))
+			goto reject;
+	}
+
+	return NULL;
+
+reject:
+	return "<spans multiple pages>";
+}
+
+/*
+ * Validates that the given object is:
+ * - not bogus address
+ * - known-safe heap or stack object
+ * - not in kernel text
+ */
+void __check_object_size(const void *ptr, unsigned long n, bool to_user)
+{
+	const char *err;
+
+	/* Skip all tests if size is zero. */
+	if (!n)
+		return;
+
+	/* Check for invalid addresses. */
+	err = check_bogus_address(ptr, n);
+	if (err)
+		goto report;
+
+	/* Check for bad heap object. */
+	err = check_heap_object(ptr, n, to_user);
+	if (err)
+		goto report;
+
+	/* Check for bad stack object. */
+	switch (check_stack_object(ptr, n)) {
+	case NOT_STACK:
+		/* Object is not touching the current process stack. */
+		break;
+	case GOOD_FRAME:
+	case GOOD_STACK:
+		/*
+		 * Object is either in the correct frame (when it
+		 * is possible to check) or just generally on the
+		 * process stack (when frame checking not available).
+		 */
+		return;
+	default:
+		err = "<process stack>";
+		goto report;
+	}
+
+	/* Check for object in kernel to avoid text exposure. */
+	err = check_kernel_text_object(ptr, n);
+	if (!err)
+		return;
+
+report:
+	report_usercopy(ptr, n, to_user, err);
+}
+EXPORT_SYMBOL(__check_object_size);
diff --git a/security/Kconfig b/security/Kconfig
index 176758cdfa57..df28f2b6f3e1 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -118,6 +118,34 @@ config LSM_MMAP_MIN_ADDR
 	  this low address space will need the permission specific to the
 	  systems running LSM.
 
+config HAVE_HARDENED_USERCOPY_ALLOCATOR
+	bool
+	help
+	  The heap allocator implements __check_heap_object() for
+	  validating memory ranges against heap object sizes in
+	  support of CONFIG_HARDENED_USERCOPY.
+
+config HAVE_ARCH_HARDENED_USERCOPY
+	bool
+	help
+	  The architecture supports CONFIG_HARDENED_USERCOPY by
+	  calling check_object_size() just before performing the
+	  userspace copies in the low level implementation of
+	  copy_to_user() and copy_from_user().
+
+config HARDENED_USERCOPY
+	bool "Harden memory copies between kernel and userspace"
+	depends on HAVE_ARCH_HARDENED_USERCOPY
+	select BUG
+	help
+	  This option checks for obviously wrong memory regions when
+	  copying memory to/from the kernel (via copy_to_user() and
+	  copy_from_user() functions) by rejecting memory ranges that
+	  are larger than the specified heap object, span multiple
+	  separately allocates pages, are not on the process stack,
+	  or are part of the kernel text. This kills entire classes
+	  of heap overflow exploits and similar kernel memory exposures.
+
 source security/selinux/Kconfig
 source security/smack/Kconfig
 source security/tomoyo/Kconfig
-- 
cgit v1.2.3


From 5b710f34e194c6b7710f69fdb5d798fdf35b98c1 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jun 2016 15:04:01 -0700
Subject: x86/uaccess: Enable hardened usercopy

Enables CONFIG_HARDENED_USERCOPY checks on x86. This is done both in
copy_*_user() and __copy_*_user() because copy_*_user() actually calls
down to _copy_*_user() and not __copy_*_user().

Based on code from PaX and grsecurity.

Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
---
 arch/x86/Kconfig                  |  1 +
 arch/x86/include/asm/uaccess.h    | 10 ++++++----
 arch/x86/include/asm/uaccess_32.h |  2 ++
 arch/x86/include/asm/uaccess_64.h |  2 ++
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a2865ddfc1ff..9640942b68b9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -80,6 +80,7 @@ config X86
 	select HAVE_ALIGNED_STRUCT_PAGE		if SLUB
 	select HAVE_AOUT			if X86_32
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_ARCH_HARDENED_USERCOPY
 	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KASAN			if X86_64 && SPARSEMEM_VMEMMAP
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 2982387ba817..d3312f0fcdfc 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -742,9 +742,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
 	 * case, and do only runtime checking for non-constant sizes.
 	 */
 
-	if (likely(sz < 0 || sz >= n))
+	if (likely(sz < 0 || sz >= n)) {
+		check_object_size(to, n, false);
 		n = _copy_from_user(to, from, n);
-	else if(__builtin_constant_p(n))
+	} else if (__builtin_constant_p(n))
 		copy_from_user_overflow();
 	else
 		__copy_from_user_overflow(sz, n);
@@ -762,9 +763,10 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
 	might_fault();
 
 	/* See the comment in copy_from_user() above. */
-	if (likely(sz < 0 || sz >= n))
+	if (likely(sz < 0 || sz >= n)) {
+		check_object_size(from, n, true);
 		n = _copy_to_user(to, from, n);
-	else if(__builtin_constant_p(n))
+	} else if (__builtin_constant_p(n))
 		copy_to_user_overflow();
 	else
 		__copy_to_user_overflow(sz, n);
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 4b32da24faaf..7d3bdd1ed697 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -37,6 +37,7 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero
 static __always_inline unsigned long __must_check
 __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 {
+	check_object_size(from, n, true);
 	return __copy_to_user_ll(to, from, n);
 }
 
@@ -95,6 +96,7 @@ static __always_inline unsigned long
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
 	might_fault();
+	check_object_size(to, n, false);
 	if (__builtin_constant_p(n)) {
 		unsigned long ret;
 
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 2eac2aa3e37f..673059a109fe 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -54,6 +54,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
 {
 	int ret = 0;
 
+	check_object_size(dst, size, false);
 	if (!__builtin_constant_p(size))
 		return copy_user_generic(dst, (__force void *)src, size);
 	switch (size) {
@@ -119,6 +120,7 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
 {
 	int ret = 0;
 
+	check_object_size(src, size, true);
 	if (!__builtin_constant_p(size))
 		return copy_user_generic((__force void *)dst, src, size);
 	switch (size) {
-- 
cgit v1.2.3


From dfd45b6103c973bfcea2341d89e36faf947dbc33 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jun 2016 15:06:53 -0700
Subject: ARM: uaccess: Enable hardened usercopy

Enables CONFIG_HARDENED_USERCOPY checks on arm.

Based on code from PaX and grsecurity.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/arm/Kconfig               |  1 +
 arch/arm/include/asm/uaccess.h | 11 +++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 90542db1220d..f56b29b3f57e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -35,6 +35,7 @@ config ARM
 	select HARDIRQS_SW_RESEND
 	select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT)
 	select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
+	select HAVE_ARCH_HARDENED_USERCOPY
 	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
 	select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
 	select HAVE_ARCH_MMAP_RND_BITS if MMU
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 35c9db857ebe..7fb59199c6bb 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -496,7 +496,10 @@ arm_copy_from_user(void *to, const void __user *from, unsigned long n);
 static inline unsigned long __must_check
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-	unsigned int __ua_flags = uaccess_save_and_enable();
+	unsigned int __ua_flags;
+
+	check_object_size(to, n, false);
+	__ua_flags = uaccess_save_and_enable();
 	n = arm_copy_from_user(to, from, n);
 	uaccess_restore(__ua_flags);
 	return n;
@@ -511,11 +514,15 @@ static inline unsigned long __must_check
 __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
 #ifndef CONFIG_UACCESS_WITH_MEMCPY
-	unsigned int __ua_flags = uaccess_save_and_enable();
+	unsigned int __ua_flags;
+
+	check_object_size(from, n, true);
+	__ua_flags = uaccess_save_and_enable();
 	n = arm_copy_to_user(to, from, n);
 	uaccess_restore(__ua_flags);
 	return n;
 #else
+	check_object_size(from, n, true);
 	return arm_copy_to_user(to, from, n);
 #endif
 }
-- 
cgit v1.2.3


From faf5b63e294151d6ac24ca6906d6f221bd3496cd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jun 2016 15:59:42 -0700
Subject: arm64/uaccess: Enable hardened usercopy

Enables CONFIG_HARDENED_USERCOPY checks on arm64. As done by KASAN in -next,
renames the low-level functions to __arch_copy_*_user() so a static inline
can do additional work before the copy.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/arm64/Kconfig               |  1 +
 arch/arm64/include/asm/uaccess.h | 29 ++++++++++++++++++++++-------
 arch/arm64/kernel/arm64ksyms.c   |  4 ++--
 arch/arm64/lib/copy_from_user.S  |  4 ++--
 arch/arm64/lib/copy_to_user.S    |  4 ++--
 5 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 5a0a691d4220..9cdb2322c811 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -51,6 +51,7 @@ config ARM64
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_BITREVERSE
+	select HAVE_ARCH_HARDENED_USERCOPY
 	select HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 9e397a542756..92848b00e3cd 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -256,24 +256,39 @@ do {									\
 		-EFAULT;						\
 })
 
-extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n);
-extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n);
+extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
+extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
 extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n);
 extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
 
+static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+	check_object_size(to, n, false);
+	return __arch_copy_from_user(to, from, n);
+}
+
+static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+	check_object_size(from, n, true);
+	return __arch_copy_to_user(to, from, n);
+}
+
 static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-	if (access_ok(VERIFY_READ, from, n))
-		n = __copy_from_user(to, from, n);
-	else /* security hole - plug it */
+	if (access_ok(VERIFY_READ, from, n)) {
+		check_object_size(to, n, false);
+		n = __arch_copy_from_user(to, from, n);
+	} else /* security hole - plug it */
 		memset(to, 0, n);
 	return n;
 }
 
 static inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-	if (access_ok(VERIFY_WRITE, to, n))
-		n = __copy_to_user(to, from, n);
+	if (access_ok(VERIFY_WRITE, to, n)) {
+		check_object_size(from, n, true);
+		n = __arch_copy_to_user(to, from, n);
+	}
 	return n;
 }
 
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 678f30b05a45..2dc44406a7ad 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -34,8 +34,8 @@ EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
 	/* user mem (segment) */
-EXPORT_SYMBOL(__copy_from_user);
-EXPORT_SYMBOL(__copy_to_user);
+EXPORT_SYMBOL(__arch_copy_from_user);
+EXPORT_SYMBOL(__arch_copy_to_user);
 EXPORT_SYMBOL(__clear_user);
 EXPORT_SYMBOL(__copy_in_user);
 
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 17e8306dca29..0b90497d4424 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -66,7 +66,7 @@
 	.endm
 
 end	.req	x5
-ENTRY(__copy_from_user)
+ENTRY(__arch_copy_from_user)
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
 	    CONFIG_ARM64_PAN)
 	add	end, x0, x2
@@ -75,7 +75,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
 	    CONFIG_ARM64_PAN)
 	mov	x0, #0				// Nothing to copy
 	ret
-ENDPROC(__copy_from_user)
+ENDPROC(__arch_copy_from_user)
 
 	.section .fixup,"ax"
 	.align	2
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 21faae60f988..7a7efe255034 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -65,7 +65,7 @@
 	.endm
 
 end	.req	x5
-ENTRY(__copy_to_user)
+ENTRY(__arch_copy_to_user)
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
 	    CONFIG_ARM64_PAN)
 	add	end, x0, x2
@@ -74,7 +74,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
 	    CONFIG_ARM64_PAN)
 	mov	x0, #0
 	ret
-ENDPROC(__copy_to_user)
+ENDPROC(__arch_copy_to_user)
 
 	.section .fixup,"ax"
 	.align	2
-- 
cgit v1.2.3


From 73d35887e24da77e8d1321b2e92bd9b9128e2fc2 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jun 2016 15:09:50 -0700
Subject: ia64/uaccess: Enable hardened usercopy

Enables CONFIG_HARDENED_USERCOPY checks on ia64.

Based on code from PaX and grsecurity.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/ia64/Kconfig               |  1 +
 arch/ia64/include/asm/uaccess.h | 18 +++++++++++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index e109ee95e919..7451f1dce0af 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -53,6 +53,7 @@ config IA64
 	select MODULES_USE_ELF_RELA
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_ARCH_HARDENED_USERCOPY
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h
index 2189d5ddc1ee..465c70982f40 100644
--- a/arch/ia64/include/asm/uaccess.h
+++ b/arch/ia64/include/asm/uaccess.h
@@ -241,12 +241,18 @@ extern unsigned long __must_check __copy_user (void __user *to, const void __use
 static inline unsigned long
 __copy_to_user (void __user *to, const void *from, unsigned long count)
 {
+	if (!__builtin_constant_p(count))
+		check_object_size(from, count, true);
+
 	return __copy_user(to, (__force void __user *) from, count);
 }
 
 static inline unsigned long
 __copy_from_user (void *to, const void __user *from, unsigned long count)
 {
+	if (!__builtin_constant_p(count))
+		check_object_size(to, count, false);
+
 	return __copy_user((__force void __user *) to, from, count);
 }
 
@@ -258,8 +264,11 @@ __copy_from_user (void *to, const void __user *from, unsigned long count)
 	const void *__cu_from = (from);							\
 	long __cu_len = (n);								\
 											\
-	if (__access_ok(__cu_to, __cu_len, get_fs()))					\
-		__cu_len = __copy_user(__cu_to, (__force void __user *) __cu_from, __cu_len);	\
+	if (__access_ok(__cu_to, __cu_len, get_fs())) {					\
+		if (!__builtin_constant_p(n))						\
+			check_object_size(__cu_from, __cu_len, true);			\
+		__cu_len = __copy_user(__cu_to, (__force void __user *)  __cu_from, __cu_len);	\
+	}										\
 	__cu_len;									\
 })
 
@@ -270,8 +279,11 @@ __copy_from_user (void *to, const void __user *from, unsigned long count)
 	long __cu_len = (n);								\
 											\
 	__chk_user_ptr(__cu_from);							\
-	if (__access_ok(__cu_from, __cu_len, get_fs()))					\
+	if (__access_ok(__cu_from, __cu_len, get_fs())) {				\
+		if (!__builtin_constant_p(n))						\
+			check_object_size(__cu_to, __cu_len, false);			\
 		__cu_len = __copy_user((__force void __user *) __cu_to, __cu_from, __cu_len);	\
+	}										\
 	__cu_len;									\
 })
 
-- 
cgit v1.2.3


From 1d3c1324746fed0e34a5b94d3ed303e7521ed603 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jun 2016 15:10:01 -0700
Subject: powerpc/uaccess: Enable hardened usercopy

Enables CONFIG_HARDENED_USERCOPY checks on powerpc.

Based on code from PaX and grsecurity.

Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig               |  1 +
 arch/powerpc/include/asm/uaccess.h | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 0a9d439bcda6..0ac48a81632d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -164,6 +164,7 @@ config PPC
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS
+	select HAVE_ARCH_HARDENED_USERCOPY
 
 config GENERIC_CSUM
 	def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index b7c20f0b8fbe..c1dc6c14deb8 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -310,10 +310,15 @@ static inline unsigned long copy_from_user(void *to,
 {
 	unsigned long over;
 
-	if (access_ok(VERIFY_READ, from, n))
+	if (access_ok(VERIFY_READ, from, n)) {
+		if (!__builtin_constant_p(n))
+			check_object_size(to, n, false);
 		return __copy_tofrom_user((__force void __user *)to, from, n);
+	}
 	if ((unsigned long)from < TASK_SIZE) {
 		over = (unsigned long)from + n - TASK_SIZE;
+		if (!__builtin_constant_p(n - over))
+			check_object_size(to, n - over, false);
 		return __copy_tofrom_user((__force void __user *)to, from,
 				n - over) + over;
 	}
@@ -325,10 +330,15 @@ static inline unsigned long copy_to_user(void __user *to,
 {
 	unsigned long over;
 
-	if (access_ok(VERIFY_WRITE, to, n))
+	if (access_ok(VERIFY_WRITE, to, n)) {
+		if (!__builtin_constant_p(n))
+			check_object_size(from, n, true);
 		return __copy_tofrom_user(to, (__force void __user *)from, n);
+	}
 	if ((unsigned long)to < TASK_SIZE) {
 		over = (unsigned long)to + n - TASK_SIZE;
+		if (!__builtin_constant_p(n))
+			check_object_size(from, n - over, true);
 		return __copy_tofrom_user(to, (__force void __user *)from,
 				n - over) + over;
 	}
@@ -372,6 +382,10 @@ static inline unsigned long __copy_from_user_inatomic(void *to,
 		if (ret == 0)
 			return 0;
 	}
+
+	if (!__builtin_constant_p(n))
+		check_object_size(to, n, false);
+
 	return __copy_tofrom_user((__force void __user *)to, from, n);
 }
 
@@ -398,6 +412,9 @@ static inline unsigned long __copy_to_user_inatomic(void __user *to,
 		if (ret == 0)
 			return 0;
 	}
+	if (!__builtin_constant_p(n))
+		check_object_size(from, n, true);
+
 	return __copy_tofrom_user(to, (__force const void __user *)from, n);
 }
 
-- 
cgit v1.2.3


From 9d9208a15800f9f06f102f9aac1e8b323c3b8575 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jun 2016 15:10:13 -0700
Subject: sparc/uaccess: Enable hardened usercopy

Enables CONFIG_HARDENED_USERCOPY checks on sparc.

Based on code from PaX and grsecurity.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/sparc/Kconfig                  |  1 +
 arch/sparc/include/asm/uaccess_32.h | 14 ++++++++++----
 arch/sparc/include/asm/uaccess_64.h | 11 +++++++++--
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 546293d9e6c5..59b09600dd32 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -43,6 +43,7 @@ config SPARC
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
 	select CPU_NO_EFFICIENT_FFS
+	select HAVE_ARCH_HARDENED_USERCOPY
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h
index 57aca2792d29..341a5a133f48 100644
--- a/arch/sparc/include/asm/uaccess_32.h
+++ b/arch/sparc/include/asm/uaccess_32.h
@@ -248,22 +248,28 @@ unsigned long __copy_user(void __user *to, const void __user *from, unsigned lon
 
 static inline unsigned long copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-	if (n && __access_ok((unsigned long) to, n))
+	if (n && __access_ok((unsigned long) to, n)) {
+		if (!__builtin_constant_p(n))
+			check_object_size(from, n, true);
 		return __copy_user(to, (__force void __user *) from, n);
-	else
+	} else
 		return n;
 }
 
 static inline unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
+	if (!__builtin_constant_p(n))
+		check_object_size(from, n, true);
 	return __copy_user(to, (__force void __user *) from, n);
 }
 
 static inline unsigned long copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-	if (n && __access_ok((unsigned long) from, n))
+	if (n && __access_ok((unsigned long) from, n)) {
+		if (!__builtin_constant_p(n))
+			check_object_size(to, n, false);
 		return __copy_user((__force void __user *) to, from, n);
-	else
+	} else
 		return n;
 }
 
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index e9a51d64974d..8bda94fab8e8 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -210,8 +210,12 @@ unsigned long copy_from_user_fixup(void *to, const void __user *from,
 static inline unsigned long __must_check
 copy_from_user(void *to, const void __user *from, unsigned long size)
 {
-	unsigned long ret = ___copy_from_user(to, from, size);
+	unsigned long ret;
 
+	if (!__builtin_constant_p(size))
+		check_object_size(to, size, false);
+
+	ret = ___copy_from_user(to, from, size);
 	if (unlikely(ret))
 		ret = copy_from_user_fixup(to, from, size);
 
@@ -227,8 +231,11 @@ unsigned long copy_to_user_fixup(void __user *to, const void *from,
 static inline unsigned long __must_check
 copy_to_user(void __user *to, const void *from, unsigned long size)
 {
-	unsigned long ret = ___copy_to_user(to, from, size);
+	unsigned long ret;
 
+	if (!__builtin_constant_p(size))
+		check_object_size(from, size, true);
+	ret = ___copy_to_user(to, from, size);
 	if (unlikely(ret))
 		ret = copy_to_user_fixup(to, from, size);
 	return ret;
-- 
cgit v1.2.3


From 97433ea4fda62349bfa42089455593cbcb57e06c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 7 Jul 2016 11:38:39 -0700
Subject: s390/uaccess: Enable hardened usercopy

Enables CONFIG_HARDENED_USERCOPY checks on s390.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/s390/Kconfig       | 1 +
 arch/s390/lib/uaccess.c | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a8c259059adf..9f694311c9ed 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -122,6 +122,7 @@ config S390
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_EARLY_PFN_TO_NID
+	select HAVE_ARCH_HARDENED_USERCOPY
 	select HAVE_ARCH_JUMP_LABEL
 	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
 	select HAVE_ARCH_SECCOMP_FILTER
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index ae4de559e3a0..6986c20166f0 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -104,6 +104,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
 
 unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
+	check_object_size(to, n, false);
 	if (static_branch_likely(&have_mvcos))
 		return copy_from_user_mvcos(to, from, n);
 	return copy_from_user_mvcp(to, from, n);
@@ -177,6 +178,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x,
 
 unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
+	check_object_size(from, n, true);
 	if (static_branch_likely(&have_mvcos))
 		return copy_to_user_mvcos(to, from, n);
 	return copy_to_user_mvcs(to, from, n);
-- 
cgit v1.2.3


From 04385fc5e8fffed84425d909a783c0f0c587d847 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jun 2016 15:20:59 -0700
Subject: mm: SLAB hardened usercopy support

Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the
SLAB allocator to catch any copies that may span objects.

Based on code from PaX and grsecurity.

Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
---
 init/Kconfig |  1 +
 mm/slab.c    | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index c02d89777713..1312d7b5a5fb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1758,6 +1758,7 @@ choice
 
 config SLAB
 	bool "SLAB"
+	select HAVE_HARDENED_USERCOPY_ALLOCATOR
 	help
 	  The regular slab allocator that is established and known to work
 	  well in all environments. It organizes cache hot objects in
diff --git a/mm/slab.c b/mm/slab.c
index cc8bbc1e6bc9..5e2d5f349aca 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4477,6 +4477,36 @@ static int __init slab_proc_init(void)
 module_init(slab_proc_init);
 #endif
 
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+				struct page *page)
+{
+	struct kmem_cache *cachep;
+	unsigned int objnr;
+	unsigned long offset;
+
+	/* Find and validate object. */
+	cachep = page->slab_cache;
+	objnr = obj_to_index(cachep, page, (void *)ptr);
+	BUG_ON(objnr >= cachep->num);
+
+	/* Find offset within object. */
+	offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+
+	/* Allow address range falling entirely within object size. */
+	if (offset <= cachep->object_size && n <= cachep->object_size - offset)
+		return NULL;
+
+	return cachep->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
 /**
  * ksize - get the actual amount of memory allocated for a given object
  * @objp: Pointer to the object
-- 
cgit v1.2.3


From ed18adc1cdd00a5c55a20fbdaed4804660772281 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 23 Jun 2016 15:24:05 -0700
Subject: mm: SLUB hardened usercopy support

Under CONFIG_HARDENED_USERCOPY, this adds object size checking to the
SLUB allocator to catch any copies that may span objects. Includes a
redzone handling fix discovered by Michael Ellerman.

Based on code from PaX and grsecurity.

Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Reviwed-by: Laura Abbott <labbott@redhat.com>
---
 init/Kconfig |  1 +
 mm/slub.c    | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index 1312d7b5a5fb..0c847063bb27 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1766,6 +1766,7 @@ config SLAB
 
 config SLUB
 	bool "SLUB (Unqueued Allocator)"
+	select HAVE_HARDENED_USERCOPY_ALLOCATOR
 	help
 	   SLUB is a slab allocator that minimizes cache line usage
 	   instead of managing queues of cached objects (SLAB approach).
diff --git a/mm/slub.c b/mm/slub.c
index 825ff4505336..256a8efd165e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3614,6 +3614,46 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 EXPORT_SYMBOL(__kmalloc_node);
 #endif
 
+#ifdef CONFIG_HARDENED_USERCOPY
+/*
+ * Rejects objects that are incorrectly sized.
+ *
+ * Returns NULL if check passes, otherwise const char * to name of cache
+ * to indicate an error.
+ */
+const char *__check_heap_object(const void *ptr, unsigned long n,
+				struct page *page)
+{
+	struct kmem_cache *s;
+	unsigned long offset;
+	size_t object_size;
+
+	/* Find object and usable object size. */
+	s = page->slab_cache;
+	object_size = slab_ksize(s);
+
+	/* Reject impossible pointers. */
+	if (ptr < page_address(page))
+		return s->name;
+
+	/* Find offset within object. */
+	offset = (ptr - page_address(page)) % s->size;
+
+	/* Adjust for redzone and reject if within the redzone. */
+	if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+		if (offset < s->red_left_pad)
+			return s->name;
+		offset -= s->red_left_pad;
+	}
+
+	/* Allow address range falling entirely within object size. */
+	if (offset <= object_size && n <= object_size - offset)
+		return NULL;
+
+	return s->name;
+}
+#endif /* CONFIG_HARDENED_USERCOPY */
+
 static size_t __ksize(const void *object)
 {
 	struct page *page;
-- 
cgit v1.2.3


From 784bdf3bb694b256fcd6120b93e8947a84249a3a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 29 Jul 2016 16:32:30 +0200
Subject: futex: Assume all mappings are private on !MMU systems

To quote Rick why there is no need for shared mapping on !MMU systems:

|With MMU, shared futex keys need to identify the physical backing for
|a memory address because it may be mapped at different addresses in
|different processes (or even multiple times in the same process).
|Without MMU this cannot happen. You only have physical addresses. So
|the "private futex" behavior of using the virtual address as the key
|is always correct (for both shared and private cases) on nommu
|systems.

This patch disables the FLAGS_SHARED in a way that allows the compiler to
remove that code.

[bigeasy: Added changelog ]
Reported-by: Rich Felker <dalias@libc.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20160729143230.GA21715@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 33664f70e2d2..46cb3a301bc1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -179,7 +179,15 @@ int __read_mostly futex_cmpxchg_enabled;
  * Futex flags used to encode options to functions and preserve them across
  * restarts.
  */
-#define FLAGS_SHARED		0x01
+#ifdef CONFIG_MMU
+# define FLAGS_SHARED		0x01
+#else
+/*
+ * NOMMU does not have per process address space. Let the compiler optimize
+ * code away.
+ */
+# define FLAGS_SHARED		0x00
+#endif
 #define FLAGS_CLOCKRT		0x02
 #define FLAGS_HAS_TIMEOUT	0x04
 
@@ -405,6 +413,16 @@ static void get_futex_key_refs(union futex_key *key)
 	if (!key->both.ptr)
 		return;
 
+	/*
+	 * On MMU less systems futexes are always "private" as there is no per
+	 * process address space. We need the smp wmb nevertheless - yes,
+	 * arch/blackfin has MMU less SMP ...
+	 */
+	if (!IS_ENABLED(CONFIG_MMU)) {
+		smp_mb(); /* explicit smp_mb(); (B) */
+		return;
+	}
+
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
 		ihold(key->shared.inode); /* implies smp_mb(); (B) */
@@ -436,6 +454,9 @@ static void drop_futex_key_refs(union futex_key *key)
 		return;
 	}
 
+	if (!IS_ENABLED(CONFIG_MMU))
+		return;
+
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
 		iput(key->shared.inode);
-- 
cgit v1.2.3


From 3b2c1710fac7fb278b760d1545e637cbb5ea5b5b Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Wed, 13 Jul 2016 16:32:03 +0300
Subject: drm/i915: Wait up to 3ms for the pcu to ack the cdclk change request
 on SKL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bspec tells us to keep bashing the PCU for up to 3ms when trying to
inform it about an upcoming change in the cdclk frequency. Currently
we only keep at it for 15*10usec (+ whatever delays gets added by
the sandybridge_pcode_read() itself). Let's change the limit to 3ms.

I decided to keep 10 usec delay per iteration for now, even though
the spec doesn't really tell us to do that.

Cc: stable@vger.kernel.org
Fixes: 5d96d8afcfbb ("drm/i915/skl: Deinit/init the display at suspend/resume")
Cc: David Weinehall <david.weinehall@intel.com>
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1468416723-23440-1-git-send-email-ville.syrjala@linux.intel.com
Tested-by: David Weinehall <david.weinehall@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
(cherry picked from commit 848496e5902833600f7992f4faa82dc1546051ba)
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/intel_display.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 9f56ac11975f..e5c46a6ec65b 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -5691,15 +5691,7 @@ static bool skl_cdclk_pcu_ready(struct drm_i915_private *dev_priv)
 
 static bool skl_cdclk_wait_for_pcu_ready(struct drm_i915_private *dev_priv)
 {
-	unsigned int i;
-
-	for (i = 0; i < 15; i++) {
-		if (skl_cdclk_pcu_ready(dev_priv))
-			return true;
-		udelay(10);
-	}
-
-	return false;
+	return _wait_for(skl_cdclk_pcu_ready(dev_priv), 3000, 10) == 0;
 }
 
 static void skl_set_cdclk(struct drm_i915_private *dev_priv, int cdclk, int vco)
-- 
cgit v1.2.3


From a7b4667a00025ac28300737c868bd4818b6d8c4d Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Mon, 23 May 2016 17:42:48 +0300
Subject: drm/i915: Never fully mask the the EI up rps interrupt on SNB/IVB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SNB (and IVB too I suppose) starts to misbehave if the GPU gets stuck
in an infinite batch buffer loop. The GPU apparently hogs something
critical and CPUs start to lose interrupts and whatnot. We can keep
the system limping along by unmasking some interrupts in
GEN6_PMINTRMSK. The EI up interrupt has been previously chosen for
that task, so let's never mask it.

v2: s/gen6_rps_pm_mask/gen6_sanitize_rps_pm_mask/ (Chris)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93122
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1464014568-4529-1-git-send-email-ville.syrjala@linux.intel.com
Cc: stable@vger.kernel.org
(cherry picked from commit 12c100bfa5d9103b6c4d43636fee09c31e75605a)
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/intel_pm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 5a8ee0c76593..5ca68d2ed7d4 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -4892,7 +4892,8 @@ void gen6_rps_idle(struct drm_i915_private *dev_priv)
 		else
 			gen6_set_rps(dev_priv, dev_priv->rps.idle_freq);
 		dev_priv->rps.last_adj = 0;
-		I915_WRITE(GEN6_PMINTRMSK, 0xffffffff);
+		I915_WRITE(GEN6_PMINTRMSK,
+			   gen6_sanitize_rps_pm_mask(dev_priv, ~0));
 	}
 	mutex_unlock(&dev_priv->rps.hw_lock);
 
-- 
cgit v1.2.3


From 0596a9048bf2aca2a74b312493f39e4d5ac3b653 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 14 Jun 2016 14:18:27 +0100
Subject: Btrfs: add missing check for writeback errors on fsync

When we start an fsync we start ordered extents for all delalloc ranges.
However before attempting to log the inode, we only wait for those ordered
extents if we are not doing a full sync (bit BTRFS_INODE_NEEDS_FULL_SYNC
is set in the inode's flags). This means that if an ordered extent
completes with an IO error before we check if we can skip logging the
inode, we will not catch and report the IO error to user space. This is
because on an IO error, when the ordered extent completes we do not
update the inode, so if the inode was not previously updated by the
current transaction we end up not logging it through calls to fsync and
therefore not check its mapping flags for the presence of IO errors.

Fix this by checking for errors in the flags of the inode's mapping when
we notice we can skip logging the inode.

This caused sporadic failures in the test generic/331 (which explicitly
tests for IO errors during an fsync call).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/file.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bcfb4a27ddd4..6f049109fffe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2033,6 +2033,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		 */
 		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			  &BTRFS_I(inode)->runtime_flags);
+		/*
+		 * An ordered extent might have started before and completed
+		 * already with io errors, in which case the inode was not
+		 * updated and we end up here. So check the inode's mapping
+		 * flags for any errors that might have happened while doing
+		 * writeback of file data.
+		 */
+		ret = btrfs_inode_check_errors(inode);
 		inode_unlock(inode);
 		goto out;
 	}
-- 
cgit v1.2.3


From 801bec365e0e19f2ba066cd3e25a67dee21b4aae Mon Sep 17 00:00:00 2001
From: Robbie Ko <robbieko@synology.com>
Date: Tue, 23 Jun 2015 18:39:46 +0800
Subject: Btrfs: send, fix failure to move directories with the same name
 around

When doing an incremental send we can end up not moving directories that
have the same name. This happens when the same parent directory has
different child directories with the same name in the parent and send
snapshots.

For example, consider the following scenario:

  Parent snapshot:

  .                   (ino 256)
  |---- d/            (ino 257)
  |     |--- p1/      (ino 258)
  |
  |---- p1/           (ino 259)

  Send snapshot:

  .                    (ino 256)
  |--- d/              (ino 257)
       |--- p1/        (ino 259)
             |--- p1/  (ino 258)

The directory named "d" (inode 257) has in both snapshots an entry with
the name "p1" but it refers to different inodes in both snapshots (inode
258 in the parent snapshot and inode 259 in the send snapshot). When
attempting to move inode 258, the operation is delayed because its new
parent, inode 259, was not yet moved/renamed (as the stream is currently
processing inode 258). Then when processing inode 259, we also end up
delaying its move/rename operation so that it happens after inode 258 is
moved/renamed. This decision to delay the move/rename rename operation
of inode 259 is due to the fact that the new parent inode (257) still
has inode 258 as its child, which has the same name has inode 259. So
we end up with inode 258 move/rename operation waiting for inode's 259
move/rename operation, which in turn it waiting for inode's 258
move/rename. This results in ending the send stream without issuing
move/rename operations for inodes 258 and 259 and generating the
following warnings in syslog/dmesg:

[148402.979747] ------------[ cut here ]------------
[148402.980588] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6177 btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.981928] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148402.986999] CPU: 14 PID: 4117 Comm: btrfs Tainted: G        W       4.6.0-rc7-btrfs-next-31+ #1
[148402.988136] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148402.988136]  0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148402.988136]  0000000000000000 ffff88022139fce8 ffffffff81052b14 000018212139fac8
[148402.988136]  ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148402.988136] Call Trace:
[148402.988136]  [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148402.988136]  [<ffffffff81052b14>] __warn+0xc2/0xdd
[148402.988136]  [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148402.988136]  [<ffffffffa04bc831>] btrfs_ioctl_send+0xe03/0xe51 [btrfs]
[148402.988136]  [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148402.988136]  [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148402.988136]  [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148402.988136]  [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148402.988136]  [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148402.988136]  [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148402.988136]  [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148402.988136]  [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148402.988136]  [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148402.988136]  [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.011373] ---[ end trace a4539270c8056f8b ]---
[148403.012296] ------------[ cut here ]------------
[148403.013071] WARNING: CPU: 14 PID: 4117 at fs/btrfs/send.c:6194 btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.014447] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[148403.019708] CPU: 14 PID: 4117 Comm: btrfs Tainted: G        W       4.6.0-rc7-btrfs-next-31+ #1
[148403.020104] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[148403.020104]  0000000000000000 ffff88022139fca8 ffffffff8126b42c 0000000000000000
[148403.020104]  0000000000000000 ffff88022139fce8 ffffffff81052b14 000018322139fac8
[148403.020104]  ffff88022b0db400 0000000000000000 0000000000000001 0000000000000000
[148403.020104] Call Trace:
[148403.020104]  [<ffffffff8126b42c>] dump_stack+0x67/0x90
[148403.020104]  [<ffffffff81052b14>] __warn+0xc2/0xdd
[148403.020104]  [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[148403.020104]  [<ffffffffa04bc847>] btrfs_ioctl_send+0xe19/0xe51 [btrfs]
[148403.020104]  [<ffffffffa048b358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[148403.020104]  [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[148403.020104]  [<ffffffff8108eb51>] ? __lock_is_held+0x3c/0x57
[148403.020104]  [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[148403.020104]  [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[148403.020104]  [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[148403.020104]  [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[148403.020104]  [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[148403.020104]  [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[148403.020104]  [<ffffffff8108e89d>] ? trace_hardirqs_off_caller+0x3f/0xaa
[148403.038981] ---[ end trace a4539270c8056f8c ]---

There's another issue caused by similar (but more complex) changes in the
directory hierarchy that makes move/rename operations fail, described with
the following example:

  Parent snapshot:

  .
  |---- a/                                                   (ino 262)
  |     |---- c/                                             (ino 268)
  |
  |---- d/                                                   (ino 263)
        |---- ance/                                          (ino 267)
                |---- e/                                     (ino 264)
                |---- f/                                     (ino 265)
                |---- ance/                                  (ino 266)

  Send snapshot:

  .
  |---- a/                                                   (ino 262)
  |---- c/                                                   (ino 268)
  |     |---- ance/                                          (ino 267)
  |
  |---- d/                                                   (ino 263)
  |     |---- ance/                                          (ino 266)
  |
  |---- f/                                                   (ino 265)
        |---- e/                                             (ino 264)

When the inode 265 is processed, the path for inode 267 is computed, which
at that time corresponds to "d/ance", and it's stored in the names cache.
Later on when processing inode 266, we end up orphanizing (renaming to a
name matching the pattern o<ino>-<gen>-<seq>) inode 267 because it has
the same name as inode 266 and it's currently a child of the new parent
directory (inode 263) for inode 266. After the orphanization and while we
are still processing inode 266, a rename operation for inode 266 is
generated. However the source path for that rename operation is incorrect
because it ends up using the old, pre-orphanization, name of inode 267.
The no longer valid name for inode 267 was previously cached when
processing inode 265 and it remains usable and considered valid until
the inode currently being processed has a number greater than 267.
This resulted in the receiving side failing with the following error:

  ERROR: rename d/ance/ance -> d/ance failed: No such file or directory

So fix these issues by detecting such circular dependencies for rename
operations and by clearing the cached name of an inode once the inode
is orphanized.

A test case for fstests will follow soon.

Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and organized, and improved
 comments]

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 95 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b71dd298385c..993e1bab0a6b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -231,7 +231,6 @@ struct pending_dir_move {
 	u64 parent_ino;
 	u64 ino;
 	u64 gen;
-	bool is_orphan;
 	struct list_head update_refs;
 };
 
@@ -1861,7 +1860,8 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 	 * was already unlinked/moved, so we can safely assume that we will not
 	 * overwrite anything at this point in time.
 	 */
-	if (other_inode > sctx->send_progress) {
+	if (other_inode > sctx->send_progress ||
+	    is_waiting_for_move(sctx, other_inode)) {
 		ret = get_inode_info(sctx->parent_root, other_inode, NULL,
 				who_gen, NULL, NULL, NULL, NULL);
 		if (ret < 0)
@@ -3047,7 +3047,6 @@ static int add_pending_dir_move(struct send_ctx *sctx,
 	pm->parent_ino = parent_ino;
 	pm->ino = ino;
 	pm->gen = ino_gen;
-	pm->is_orphan = is_orphan;
 	INIT_LIST_HEAD(&pm->list);
 	INIT_LIST_HEAD(&pm->update_refs);
 	RB_CLEAR_NODE(&pm->node);
@@ -3113,6 +3112,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
 	return NULL;
 }
 
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+		     u64 ino, u64 gen, u64 *ancestor_ino)
+{
+	int ret = 0;
+	u64 parent_inode = 0;
+	u64 parent_gen = 0;
+	u64 start_ino = ino;
+
+	*ancestor_ino = 0;
+	while (ino != BTRFS_FIRST_FREE_OBJECTID) {
+		fs_path_reset(name);
+
+		if (is_waiting_for_rm(sctx, ino))
+			break;
+		if (is_waiting_for_move(sctx, ino)) {
+			if (*ancestor_ino == 0)
+				*ancestor_ino = ino;
+			ret = get_first_ref(sctx->parent_root, ino,
+					    &parent_inode, &parent_gen, name);
+		} else {
+			ret = __get_cur_name_and_parent(sctx, ino, gen,
+							&parent_inode,
+							&parent_gen, name);
+			if (ret > 0) {
+				ret = 0;
+				break;
+			}
+		}
+		if (ret < 0)
+			break;
+		if (parent_inode == start_ino) {
+			ret = 1;
+			if (*ancestor_ino == 0)
+				*ancestor_ino = ino;
+			break;
+		}
+		ino = parent_inode;
+		gen = parent_gen;
+	}
+	return ret;
+}
+
 static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 {
 	struct fs_path *from_path = NULL;
@@ -3123,6 +3164,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 	u64 parent_ino, parent_gen;
 	struct waiting_dir_move *dm = NULL;
 	u64 rmdir_ino = 0;
+	u64 ancestor;
+	bool is_orphan;
 	int ret;
 
 	name = fs_path_alloc();
@@ -3135,9 +3178,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 	dm = get_waiting_dir_move(sctx, pm->ino);
 	ASSERT(dm);
 	rmdir_ino = dm->rmdir_ino;
+	is_orphan = dm->orphanized;
 	free_waiting_dir_move(sctx, dm);
 
-	if (pm->is_orphan) {
+	if (is_orphan) {
 		ret = gen_unique_name(sctx, pm->ino,
 				      pm->gen, from_path);
 	} else {
@@ -3155,6 +3199,22 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 		goto out;
 
 	sctx->send_progress = sctx->cur_ino + 1;
+	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+	if (ret) {
+		LIST_HEAD(deleted_refs);
+		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+		ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+					   &pm->update_refs, &deleted_refs,
+					   is_orphan);
+		if (ret < 0)
+			goto out;
+		if (rmdir_ino) {
+			dm = get_waiting_dir_move(sctx, pm->ino);
+			ASSERT(dm);
+			dm->rmdir_ino = rmdir_ino;
+		}
+		goto out;
+	}
 	fs_path_reset(name);
 	to_path = name;
 	name = NULL;
@@ -3325,6 +3385,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
 	u64 left_gen;
 	u64 right_gen;
 	int ret = 0;
+	struct waiting_dir_move *wdm;
 
 	if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
 		return 0;
@@ -3383,7 +3444,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
 		goto out;
 	}
 
-	if (is_waiting_for_move(sctx, di_key.objectid)) {
+	wdm = get_waiting_dir_move(sctx, di_key.objectid);
+	if (wdm && !wdm->orphanized) {
 		ret = add_pending_dir_move(sctx,
 					   sctx->cur_ino,
 					   sctx->cur_inode_gen,
@@ -3643,11 +3705,26 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 				goto out;
 			if (ret) {
 				struct name_cache_entry *nce;
+				struct waiting_dir_move *wdm;
 
 				ret = orphanize_inode(sctx, ow_inode, ow_gen,
 						cur->full_path);
 				if (ret < 0)
 					goto out;
+
+				/*
+				 * If ow_inode has its rename operation delayed
+				 * make sure that its orphanized name is used in
+				 * the source path when performing its rename
+				 * operation.
+				 */
+				if (is_waiting_for_move(sctx, ow_inode)) {
+					wdm = get_waiting_dir_move(sctx,
+								   ow_inode);
+					ASSERT(wdm);
+					wdm->orphanized = true;
+				}
+
 				/*
 				 * Make sure we clear our orphanized inode's
 				 * name from the name cache. This is because the
@@ -3663,6 +3740,19 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 					name_cache_delete(sctx, nce);
 					kfree(nce);
 				}
+
+				/*
+				 * ow_inode might currently be an ancestor of
+				 * cur_ino, therefore compute valid_path (the
+				 * current path of cur_ino) again because it
+				 * might contain the pre-orphanization name of
+				 * ow_inode, which is no longer valid.
+				 */
+				fs_path_reset(valid_path);
+				ret = get_cur_path(sctx, sctx->cur_ino,
+					   sctx->cur_inode_gen, valid_path);
+				if (ret < 0)
+					goto out;
 			} else {
 				ret = send_unlink(sctx, cur->full_path);
 				if (ret < 0)
-- 
cgit v1.2.3


From 7969e77a73164b418da851cbae35cdd6c1b43fee Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 17 Jun 2016 17:13:36 +0100
Subject: Btrfs: send, add missing error check for calls to path_loop()

The function path_loop() can return a negative integer, signaling an
error, 0 if there's no path loop and 1 if there's a path loop. We were
treating any non zero values as meaning that a path loop exists. Fix
this by explicitly checking for errors and gracefully return them to
user space.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 993e1bab0a6b..0dc05bb856ff 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3200,6 +3200,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 
 	sctx->send_progress = sctx->cur_ino + 1;
 	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+	if (ret < 0)
+		goto out;
 	if (ret) {
 		LIST_HEAD(deleted_refs);
 		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
-- 
cgit v1.2.3


From 4122ea64f877f632390bca1886c8a6f666f28398 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 27 Jun 2016 16:54:44 +0100
Subject: Btrfs: incremental send, fix invalid paths for rename operations

Example scenario:

  Parent snapshot:

  .                                                       (ino 277)
  |---- tmp/                                              (ino 278)
  |---- pre/                                              (ino 280)
  |      |---- wait_dir/                                  (ino 281)
  |
  |---- desc/                                             (ino 282)
  |---- ance/                                             (ino 283)
  |       |---- below_ance/                               (ino 279)
  |
  |---- other_dir/                                        (ino 284)

  Send snapshot:

  .                                                       (ino 277)
  |---- tmp/                                              (ino 278)
         |---- other_dir/                                 (ino 284)
                   |---- below_ance/                      (ino 279)
                   |            |---- pre/                (ino 280)
                   |
                   |---- wait_dir/                        (ino 281)
                              |---- desc/                 (ino 282)
                                      |---- ance/         (ino 283)

While computing the send stream the following steps happen:

1) While processing inode 279 we end up delaying its rename operation
   because its new parent in the send snapshot, inode 284, was not
   yet processed and therefore not yet renamed;

2) Later when processing inode 280 we end up renaming it immediately to
   "ance/below_once/pre" and not delay its rename operation because its
   new parent (inode 279 in the send snapshot) has its rename operation
   delayed and inode 280 is not an encestor of inode 279 (its parent in
   the send snapshot) in the parent snapshot;

3) When processing inode 281 we end up delaying its rename operation
   because its new parent in the send snapshot, inode 284, was not yet
   processed and therefore not yet renamed;

4) When processing inode 282 we do not delay its rename operation because
   its parent in the send snapshot, inode 281, already has its own rename
   operation delayed and our current inode (282) is not an ancestor of
   inode 281 in the parent snapshot. Therefore inode 282 is renamed to
   "ance/below_ance/pre/wait_dir";

5) When processing inode 283 we realize that we can rename it because one
   of its ancestors in the send snapshot, inode 281, has its rename
   operation delayed and inode 283 is not an ancestor of inode 281 in the
   parent snapshot. So a rename operation to rename inode 283 to
   "ance/below_ance/pre/wait_dir/desc/ance" is issued. This path is
   invalid due to a missing path building loop that was undetected by
   the incremental send implementation, as inode 283 ends up getting
   included twice in the path (once with its path in the parent snapshot).
   Therefore its rename operation must wait before the ancestor inode 284
   is renamed.

Fix this by not terminating the rename dependency checks when we find an
ancestor, in the send snapshot, that has its rename operation delayed. So
that we continue doing the same checks if the current inode is not an
ancestor, in the parent snapshot, of an ancestor in the send snapshot we
are processing in the loop.

The problem and reproducer were reported by Robbie Ko, as part of a patch
titled "Btrfs: incremental send, avoid ancestor rename to descendant".
However the fix was unnecessarily complicated and can be addressed with
much less code and effort.

Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 0dc05bb856ff..3d6a4dde8fe4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3534,7 +3534,8 @@ static int wait_for_parent_move(struct send_ctx *sctx,
 			ret = is_ancestor(sctx->parent_root,
 					  sctx->cur_ino, sctx->cur_inode_gen,
 					  ino, path_before);
-			break;
+			if (ret)
+				break;
 		}
 
 		fs_path_reset(path_before);
-- 
cgit v1.2.3


From 99ea42ddb13bad6188d4f183f8866cacd14fa141 Mon Sep 17 00:00:00 2001
From: Robbie Ko <robbieko@synology.com>
Date: Tue, 23 Jun 2015 18:39:49 +0800
Subject: Btrfs: incremental send, fix premature rmdir operations

Under certain situations, an incremental send operation can contain
a rmdir operation that will make the receiving end fail when attempting
to execute it, because the target directory is not yet empty.

Consider the following example:

  Parent snapshot:

  .                                                             (ino 256)
  |--- a/                                                       (ino 257)
  |    |--- c/                                                  (ino 260)
  |
  |--- del/                                                     (ino 259)
        |--- tmp/                                               (ino 258)
        |--- x/                                                 (ino 261)

  Send snapshot:

  .                                                             (ino 256)
  |--- a/                                                       (ino 257)
  |    |--- x/                                                  (ino 261)
  |
  |--- c/                                                       (ino 260)
       |--- tmp/                                                (ino 258)

1) When processing inode 258, we delay its rename operation because inode
   260 is its new parent in the send snapshot and it was not yet renamed
   (since 260 > 258, that is, beyond the current progress);

2) When processing inode 259, we realize we can not yet send an rmdir
   operation (against inode 259) because inode 258 was still not yet
   renamed/moved away from inode 259. Therefore we update data structures
   so that after inode 258 is renamed, we try again to see if we can
   finally send an rmdir operation for inode 259;

3) When we process inode 260, we send a rename operation for it followed
   by a rename operation for inode 258. Once we send the rename operation
   for inode 258 we then check if we can finally issue an rmdir for its
   previous parent, inode 259, by calling the can_rmdir() function with
   a value of sctx->cur_ino + 1 (260 + 1 = 261) for its "progress"
   argument. This makes can_rmdir() return true (value 1) because even
   though there's still a child inode of inode 259 that was not yet
   renamed/moved, which is inode 261, the given value of progress (261)
   is not lower then 261 (that is, not lower than the inode number of
   some child of inode 259). So we end up sending a rmdir operation for
   inode 259 before its child inode 261 is processed and renamed.

So fix this by passing the correct progress value to the call to
can_rmdir() from within apply_dir_move() (where we issue delayed rename
operations), which should match stcx->cur_ino (the number of the inode
currently being processed) and not sctx->cur_ino + 1.

A test case for fstests follows soon.

Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed, clear and well formatted]

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3d6a4dde8fe4..4115aba001ad 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3236,7 +3236,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 			/* already deleted */
 			goto finish;
 		}
-		ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+		ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
 		if (ret < 0)
 			goto out;
 		if (!ret)
-- 
cgit v1.2.3


From 443f9d266c4848a3a9ea386f66c4bed2f5997940 Mon Sep 17 00:00:00 2001
From: Robbie Ko <robbieko@synology.com>
Date: Mon, 22 Jun 2015 17:08:45 +0800
Subject: Btrfs: send, fix warning due to late freeing of orphan_dir_info
 structures

Under certain situations, when doing an incremental send, we can end up
not freeing orphan_dir_info structures as soon as they are no longer
needed. Instead we end up freeing them only after finishing the send
stream, which causes a warning to be emitted:

[282735.229200] ------------[ cut here ]------------
[282735.229968] WARNING: CPU: 9 PID: 10588 at fs/btrfs/send.c:6298 btrfs_ioctl_send+0xe2f/0xe51 [btrfs]
[282735.231282] Modules linked in: btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis ppdev tpm parport_pc psmouse parport sg pcspkr i2c_piix4 i2c_core evdev processor serio_raw button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
[282735.237130] CPU: 9 PID: 10588 Comm: btrfs Tainted: G        W       4.6.0-rc7-btrfs-next-31+ #1
[282735.239309] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[282735.240160]  0000000000000000 ffff880224273ca8 ffffffff8126b42c 0000000000000000
[282735.240160]  0000000000000000 ffff880224273ce8 ffffffff81052b14 0000189a24273ac8
[282735.240160]  ffff8802210c9800 0000000000000000 0000000000000001 0000000000000000
[282735.240160] Call Trace:
[282735.240160]  [<ffffffff8126b42c>] dump_stack+0x67/0x90
[282735.240160]  [<ffffffff81052b14>] __warn+0xc2/0xdd
[282735.240160]  [<ffffffff81052beb>] warn_slowpath_null+0x1d/0x1f
[282735.240160]  [<ffffffffa03c99d5>] btrfs_ioctl_send+0xe2f/0xe51 [btrfs]
[282735.240160]  [<ffffffffa0398358>] btrfs_ioctl+0x14f/0x1f81 [btrfs]
[282735.240160]  [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
[282735.240160]  [<ffffffff8118da05>] vfs_ioctl+0x18/0x34
[282735.240160]  [<ffffffff8118e00c>] do_vfs_ioctl+0x550/0x5be
[282735.240160]  [<ffffffff81196f0c>] ? __fget+0x6b/0x77
[282735.240160]  [<ffffffff81196fa1>] ? __fget_light+0x62/0x71
[282735.240160]  [<ffffffff8118e0d1>] SyS_ioctl+0x57/0x79
[282735.240160]  [<ffffffff8149e025>] entry_SYSCALL_64_fastpath+0x18/0xa8
[282735.240160]  [<ffffffff81100c6b>] ? time_hardirqs_off+0x9/0x14
[282735.240160]  [<ffffffff8108e87d>] ? trace_hardirqs_off_caller+0x1f/0xaa
[282735.256343] ---[ end trace a4539270c8056f93 ]---

Consider the following example:

  Parent snapshot:

  .                                                             (ino 256)
  |--- a/                                                       (ino 257)
  |    |--- c/                                                  (ino 260)
  |
  |--- del/                                                     (ino 259)
        |--- tmp/                                               (ino 258)
        |--- x/                                                 (ino 261)
        |--- y/                                                 (ino 262)

  Send snapshot:

  .                                                             (ino 256)
  |--- a/                                                       (ino 257)
  |    |--- x/                                                  (ino 261)
  |    |--- y/                                                  (ino 262)
  |
  |--- c/                                                       (ino 260)
       |--- tmp/                                                (ino 258)

1) When processing inode 258, we end up delaying its rename operation
   because it has an ancestor (in the send snapshot) that has a higher
   inode number (inode 260) which was also renamed in the send snapshot,
   therefore we delay the rename of inode 258 so that it happens after
   inode 260 is renamed;

2) When processing inode 259, we end up delaying its deletion (rmdir
   operation) because it has a child inode (258) that has its rename
   operation delayed. At this point we allocate an orphan_dir_info
   structure and tag inode 258 so that we later attempt to see if we
   can delete (rmdir) inode 259 once inode 258 is renamed;

3) When we process inode 260, after renaming it we finally do the rename
   operation for inode 258. Once we issue the rename operation for inode
   258 we notice that this inode was tagged so that we attempt to see
   if at this point we can delete (rmdir) inode 259. But at this point
   we can not still delete inode 259 because it has 2 children, inodes
   261 and 262, that were not yet processed and therefore not yet
   moved (renamed) away from inode 259. We end up not freeing the
   orphan_dir_info structure allocated in step 2;

4) We process inodes 261 and 262, and once we move/rename inode 262
   we issue the rmdir operation for inode 260;

5) We finish the send stream and notice that red black tree that
   contains orphan_dir_info structures is not empty, so we emit
   a warning and then free any orphan_dir_structures left.

So fix this by freeing an orphan_dir_info structure once we try to
apply a pending rename operation if we can not delete yet the tagged
directory.

A test case for fstests follows soon.

Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Modified changelog to be more detailed and easier to understand]
---
 fs/btrfs/send.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4115aba001ad..e552c33e72c0 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2947,6 +2947,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 		}
 
 		if (loc.objectid > send_progress) {
+			struct orphan_dir_info *odi;
+
+			odi = get_orphan_dir_info(sctx, dir);
+			free_orphan_dir_info(sctx, odi);
 			ret = 0;
 			goto out;
 		}
-- 
cgit v1.2.3


From 764433a12e78e1a2161793c02eceafdfed2d652b Mon Sep 17 00:00:00 2001
From: Robbie Ko <robbieko@synology.com>
Date: Tue, 23 Jun 2015 18:39:50 +0800
Subject: Btrfs: send, fix invalid leaf accesses due to incorrect utimes
 operations

During an incremental send, if we have delayed rename operations for inodes
that were children of directories which were removed in the send snapshot,
we can end up accessing incorrect items in a leaf or accessing beyond the
last item of the leaf due to issuing utimes operations for the removed
inodes. Consider the following example:

  Parent snapshot:
  .                                                             (ino 256)
  |--- a/                                                       (ino 257)
  |    |--- c/                                                  (ino 262)
  |
  |--- b/                                                       (ino 258)
  |    |--- d/                                                  (ino 263)
  |
  |--- del/                                                     (ino 261)
        |--- x/                                                 (ino 259)
        |--- y/                                                 (ino 260)

  Send snapshot:

  .                                                             (ino 256)
  |--- a/                                                       (ino 257)
  |
  |--- b/                                                       (ino 258)
  |
  |--- c/                                                       (ino 262)
  |    |--- y/                                                  (ino 260)
  |
  |--- d/                                                       (ino 263)
       |--- x/                                                  (ino 259)

1) When processing inodes 259 and 260, we end up delaying their rename
   operations because their parents, inodes 263 and 262 respectively, were
   not yet processed and therefore not yet renamed;

2) When processing inode 262, its rename operation is issued and right
   after the rename operation for inode 260 is issued. However right after
   issuing the rename operation for inode 260, at send.c:apply_dir_move(),
   we issue utimes operations for all current and past parents of inode
   260. This means we try to send a utimes operation for its old parent,
   inode 261 (deleted in the send snapshot), which does not cause any
   immediate and deterministic failure, because when the target inode is
   not found in the send snapshot, the send.c:send_utimes() function
   ignores it and uses the leaf region pointed to by path->slots[0],
   which can be any unrelated item (belonging to other inode) or it can
   be a region outside the leaf boundaries, if the leaf is full and
   path->slots[0] matches the number of items in the leaf. So we end
   up either successfully sending a utimes operation, which is fine
   and irrelevant because the old parent (inode 261) will end up being
   deleted later, or we end up doing an invalid memory access tha
   crashes the kernel.

So fix this by making apply_dir_move() issue utimes operations only for
parents that still exist in the send snapshot. In a separate patch we
will make send_utimes() return an error (-ENOENT) if the given inode
does not exists in the send snapshot.

Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
[Rewrote change log to be more detailed and better organized]

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e552c33e72c0..8b653967b163 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3270,8 +3270,18 @@ finish:
 	 * and old parent(s).
 	 */
 	list_for_each_entry(cur, &pm->update_refs, list) {
-		if (cur->dir == rmdir_ino)
+		/*
+		 * The parent inode might have been deleted in the send snapshot
+		 */
+		ret = get_inode_info(sctx->send_root, cur->dir, NULL,
+				     NULL, NULL, NULL, NULL, NULL);
+		if (ret == -ENOENT) {
+			ret = 0;
 			continue;
+		}
+		if (ret < 0)
+			goto out;
+
 		ret = send_utimes(sctx, cur->dir, cur->dir_gen);
 		if (ret < 0)
 			goto out;
-- 
cgit v1.2.3


From 15b253eace1f98dabbc6e03f6793fcf8603b1655 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Sat, 2 Jul 2016 05:43:46 +0100
Subject: Btrfs: send, avoid incorrect leaf accesses when sending utimes
 operations

The caller of send_utimes() is supposed to be sure that the inode number
it passes to this function does actually exists in the send snapshot.
However due to logic/algorithm bugs (such as the one fixed by the patch
titled "Btrfs: send, fix invalid leaf accesses due to incorrect utimes
operations"), this might not be the case and when that happens it makes
send_utimes() access use an unrelated leaf item as the target inode item
or access beyond a leaf's boundaries (when the leaf is full and
path->slots[0] matches the number of items in the leaf).

So if the call to btrfs_search_slot() done by send_utimes() does not find
the inode item, just make sure send_utimes() returns -ENOENT and does not
silently accesses unrelated leaf items or does invalid leaf accesses, also
allowing us to easialy and deterministically catch such algorithmic/logic
bugs.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 8b653967b163..2db8dc89ca41 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2502,6 +2502,8 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
 	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+	if (ret > 0)
+		ret = -ENOENT;
 	if (ret < 0)
 		goto out;
 
-- 
cgit v1.2.3


From 951555856b88aa47bc238de6b4c6e97bfd9d36df Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 1 Aug 2016 01:50:37 +0100
Subject: Btrfs: send, don't bug on inconsistent snapshots

When doing an incremental send, if we find a new/modified/deleted extent,
reference or xattr without having previously processed the corresponding
inode item we end up exexuting a BUG_ON(). This is because whenever an
extent, xattr or reference is added, modified or deleted, we always expect
to have the corresponding inode item updated. However there are situations
where this will not happen due to transient -ENOMEM or -ENOSPC errors when
doing delayed inode updates.

For example, when punching holes we can succeed in deleting and modifying
(shrinking) extents but later fail to do the delayed inode update. So after
such failure we close our transaction handle and right after a snapshot of
the fs/subvol tree can be made and used later for a send operation. The
same thing can happen during truncate, link, unlink, and xattr related
operations.

So instead of executing a BUG_ON, make send return an -EIO error and print
an informative error message do dmesg/syslog.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 2db8dc89ca41..efe129fe2678 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -273,6 +273,39 @@ struct name_cache_entry {
 	char name[];
 };
 
+static void inconsistent_snapshot_error(struct send_ctx *sctx,
+					enum btrfs_compare_tree_result result,
+					const char *what)
+{
+	const char *result_string;
+
+	switch (result) {
+	case BTRFS_COMPARE_TREE_NEW:
+		result_string = "new";
+		break;
+	case BTRFS_COMPARE_TREE_DELETED:
+		result_string = "deleted";
+		break;
+	case BTRFS_COMPARE_TREE_CHANGED:
+		result_string = "updated";
+		break;
+	case BTRFS_COMPARE_TREE_SAME:
+		ASSERT(0);
+		result_string = "unchanged";
+		break;
+	default:
+		ASSERT(0);
+		result_string = "unexpected";
+	}
+
+	btrfs_err(sctx->send_root->fs_info,
+		  "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
+		  result_string, what, sctx->cmp_key->objectid,
+		  sctx->send_root->root_key.objectid,
+		  (sctx->parent_root ?
+		   sctx->parent_root->root_key.objectid : 0));
+}
+
 static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
 
 static struct waiting_dir_move *
@@ -5711,7 +5744,10 @@ static int changed_ref(struct send_ctx *sctx,
 {
 	int ret = 0;
 
-	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+	if (sctx->cur_ino != sctx->cmp_key->objectid) {
+		inconsistent_snapshot_error(sctx, result, "reference");
+		return -EIO;
+	}
 
 	if (!sctx->cur_inode_new_gen &&
 	    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
@@ -5736,7 +5772,10 @@ static int changed_xattr(struct send_ctx *sctx,
 {
 	int ret = 0;
 
-	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+	if (sctx->cur_ino != sctx->cmp_key->objectid) {
+		inconsistent_snapshot_error(sctx, result, "xattr");
+		return -EIO;
+	}
 
 	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
 		if (result == BTRFS_COMPARE_TREE_NEW)
@@ -5760,7 +5799,10 @@ static int changed_extent(struct send_ctx *sctx,
 {
 	int ret = 0;
 
-	BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+	if (sctx->cur_ino != sctx->cmp_key->objectid) {
+		inconsistent_snapshot_error(sctx, result, "extent");
+		return -EIO;
+	}
 
 	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
 		if (result != BTRFS_COMPARE_TREE_DELETED)
-- 
cgit v1.2.3


From 67710892ec983aa79ad1e2a2642fe8e3a4a194ea Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 Jun 2016 11:51:25 +0100
Subject: Btrfs: be more precise on errors when getting an inode from disk

When we attempt to read an inode from disk, we end up always returning an
-ESTALE error to the caller regardless of the actual failure reason, which
can be an out of memory problem (when allocating a path), some error found
when reading from the fs/subvolume btree (like a genuine IO error) or the
inode does not exists. So lets start returning the real error code to the
callers so that they don't treat all -ESTALE errors as meaning that the
inode does not exists (such as during orphan cleanup). This will also be
needed for a subsequent patch in the same series dealing with a special
fsync case.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/inode.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3e61bd1f3f65..f9686541997b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3428,10 +3428,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.offset = 0;
 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
 		ret = PTR_ERR_OR_ZERO(inode);
-		if (ret && ret != -ESTALE)
+		if (ret && ret != -ENOENT)
 			goto out;
 
-		if (ret == -ESTALE && root == root->fs_info->tree_root) {
+		if (ret == -ENOENT && root == root->fs_info->tree_root) {
 			struct btrfs_root *dead_root;
 			struct btrfs_fs_info *fs_info = root->fs_info;
 			int is_dead_root = 0;
@@ -3467,7 +3467,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * Inode is already gone but the orphan item is still there,
 		 * kill the orphan item.
 		 */
-		if (ret == -ESTALE) {
+		if (ret == -ENOENT) {
 			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
 				ret = PTR_ERR(trans);
@@ -3626,7 +3626,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 /*
  * read an inode from the btree into the in-memory inode
  */
-static void btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
@@ -3645,14 +3645,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		filled = true;
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		ret = -ENOMEM;
 		goto make_bad;
+	}
 
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
-	if (ret)
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
 		goto make_bad;
+	}
 
 	leaf = path->nodes[0];
 
@@ -3805,11 +3810,12 @@ cache_acl:
 	}
 
 	btrfs_update_iflags(inode);
-	return;
+	return 0;
 
 make_bad:
 	btrfs_free_path(path);
 	make_bad_inode(inode);
+	return ret;
 }
 
 /*
@@ -5595,7 +5601,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 		return ERR_PTR(-ENOMEM);
 
 	if (inode->i_state & I_NEW) {
-		btrfs_read_locked_inode(inode);
+		int ret;
+
+		ret = btrfs_read_locked_inode(inode);
 		if (!is_bad_inode(inode)) {
 			inode_tree_add(inode);
 			unlock_new_inode(inode);
@@ -5604,7 +5612,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 		} else {
 			unlock_new_inode(inode);
 			iput(inode);
-			inode = ERR_PTR(-ESTALE);
+			ASSERT(ret < 0);
+			inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
 		}
 	}
 
-- 
cgit v1.2.3


From 44f714dae50a2e795d3268a6831762aa6fa54f55 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 Jun 2016 16:11:13 +0100
Subject: Btrfs: improve performance on fsync against new inode after
 rename/unlink

With commit 56f23fdbb600 ("Btrfs: fix file/data loss caused by fsync after
rename and new inode") we got simple fix for a functional issue when the
following sequence of actions is done:

  at transaction N
  create file A at directory D
  at transaction N + M (where M >= 1)
  move/rename existing file A from directory D to directory E
  create a new file named A at directory D
  fsync the new file
  power fail

The solution was to simply detect such scenario and fallback to a full
transaction commit when we detect it. However this turned out to had a
significant impact on throughput (and a bit on latency too) for benchmarks
using the dbench tool, which simulates real workloads from smbd (Samba)
servers. For example on a test vm (with a debug kernel):

Unpatched:
Throughput 19.1572 MB/sec  32 clients  32 procs  max_latency=1005.229 ms

Patched:
Throughput 23.7015 MB/sec  32 clients  32 procs  max_latency=809.206 ms

The patched results (this patch is applied) are similar to the results of
a kernel with the commit 56f23fdbb600 ("Btrfs: fix file/data loss caused
by fsync after rename and new inode") reverted.

This change avoids the fallback to a transaction commit and instead makes
sure all the names of the conflicting inode (the one that had a name in a
past transaction that matches the name of the new file in the same parent
directory) are logged so that at log replay time we don't lose neither the
new file nor the old file, and the old file gets the name it was renamed
to.

This also ends up avoiding a full transaction commit for a similar case
that involves an unlink instead of a rename of the old file:

  at transaction N
  create file A at directory D
  at transaction N + M (where M >= 1)
  remove file A
  create a new file named A at directory D
  fsync the new file
  power fail

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/inode.c    | 19 +++++++++++-
 fs/btrfs/tree-log.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f9686541997b..453b9d0da5f1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4203,6 +4203,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err = 0;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
+	u64 last_unlink_trans;
 
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
@@ -4225,11 +4226,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (err)
 		goto out;
 
+	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
+
 	/* now the directory is empty */
 	err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
 				 dentry->d_name.name, dentry->d_name.len);
-	if (!err)
+	if (!err) {
 		btrfs_i_size_write(inode, 0);
+		/*
+		 * Propagate the last_unlink_trans value of the deleted dir to
+		 * its parent directory. This is to prevent an unrecoverable
+		 * log tree in the case we do something like this:
+		 * 1) create dir foo
+		 * 2) create snapshot under dir foo
+		 * 3) delete the snapshot
+		 * 4) rmdir foo
+		 * 5) mkdir foo
+		 * 6) fsync foo or some file inside foo
+		 */
+		if (last_unlink_trans >= trans->transid)
+			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+	}
 out:
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c05f69a8ec42..5fa624cd815d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4469,7 +4469,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
 					 const int slot,
 					 const struct btrfs_key *key,
-					 struct inode *inode)
+					 struct inode *inode,
+					 u64 *other_ino)
 {
 	int ret;
 	struct btrfs_path *search_path;
@@ -4528,7 +4529,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
 					   search_path, parent,
 					   name, this_name_len, 0);
 		if (di && !IS_ERR(di)) {
-			ret = 1;
+			struct btrfs_key di_key;
+
+			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+						  di, &di_key);
+			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+				ret = 1;
+				*other_ino = di_key.objectid;
+			} else {
+				ret = -EAGAIN;
+			}
 			goto out;
 		} else if (IS_ERR(di)) {
 			ret = PTR_ERR(di);
@@ -4718,16 +4728,71 @@ again:
 		if ((min_key.type == BTRFS_INODE_REF_KEY ||
 		     min_key.type == BTRFS_INODE_EXTREF_KEY) &&
 		    BTRFS_I(inode)->generation == trans->transid) {
+			u64 other_ino = 0;
+
 			ret = btrfs_check_ref_name_override(path->nodes[0],
 							    path->slots[0],
-							    &min_key, inode);
+							    &min_key, inode,
+							    &other_ino);
 			if (ret < 0) {
 				err = ret;
 				goto out_unlock;
 			} else if (ret > 0) {
-				err = 1;
-				btrfs_set_log_full_commit(root->fs_info, trans);
-				goto out_unlock;
+				struct btrfs_key inode_key;
+				struct inode *other_inode;
+
+				if (ins_nr > 0) {
+					ins_nr++;
+				} else {
+					ins_nr = 1;
+					ins_start_slot = path->slots[0];
+				}
+				ret = copy_items(trans, inode, dst_path, path,
+						 &last_extent, ins_start_slot,
+						 ins_nr, inode_only,
+						 logged_isize);
+				if (ret < 0) {
+					err = ret;
+					goto out_unlock;
+				}
+				ins_nr = 0;
+				btrfs_release_path(path);
+				inode_key.objectid = other_ino;
+				inode_key.type = BTRFS_INODE_ITEM_KEY;
+				inode_key.offset = 0;
+				other_inode = btrfs_iget(root->fs_info->sb,
+							 &inode_key, root,
+							 NULL);
+				/*
+				 * If the other inode that had a conflicting dir
+				 * entry was deleted in the current transaction,
+				 * we don't need to do more work nor fallback to
+				 * a transaction commit.
+				 */
+				if (IS_ERR(other_inode) &&
+				    PTR_ERR(other_inode) == -ENOENT) {
+					goto next_key;
+				} else if (IS_ERR(other_inode)) {
+					err = PTR_ERR(other_inode);
+					goto out_unlock;
+				}
+				/*
+				 * We are safe logging the other inode without
+				 * acquiring its i_mutex as long as we log with
+				 * the LOG_INODE_EXISTS mode. We're safe against
+				 * concurrent renames of the other inode as well
+				 * because during a rename we pin the log and
+				 * update the log with the new name before we
+				 * unpin it.
+				 */
+				err = btrfs_log_inode(trans, root, other_inode,
+						      LOG_INODE_EXISTS,
+						      0, LLONG_MAX, ctx);
+				iput(other_inode);
+				if (err)
+					goto out_unlock;
+				else
+					goto next_key;
 			}
 		}
 
@@ -4795,7 +4860,7 @@ next_slot:
 			ins_nr = 0;
 		}
 		btrfs_release_path(path);
-
+next_key:
 		if (min_key.offset < (u64)-1) {
 			min_key.offset++;
 		} else if (min_key.type < max_key.type) {
@@ -4989,8 +5054,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
 			break;
 
-		if (IS_ROOT(parent))
+		if (IS_ROOT(parent)) {
+			inode = d_inode(parent);
+			if (btrfs_must_commit_transaction(trans, inode))
+				ret = 1;
 			break;
+		}
 
 		parent = dget_parent(parent);
 		dput(old_parent);
-- 
cgit v1.2.3


From f005bd7e3b84a353475a2895e2c7686a66297d87 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 1 Aug 2016 10:54:15 +0100
Subject: clocksource/arm_arch_timer: Force per-CPU interrupt to be
 level-triggered

The ARM architected timer produces level-triggered interrupts (this
is mandated by the architecture). Unfortunately, a number of
device-trees get this wrong, and expose an edge-triggered interrupt.

Until now, this wasn't too much an issue, as the programming of the
trigger would fail (the corresponding PPI cannot be reconfigured),
and the kernel would be happy with this. But we're about to change
this, and trust DT a lot if the driver doesn't provide its own
trigger information. In that context, the timer breaks badly.

While we do need to fix the DTs, there is also some userspace out
there (kvmtool) that generates the same kind of broken DT on the
fly, and that will completely break with newer kernels.

As a safety measure, and to keep buggy software alive as well as
buying us some time to fix DTs all over the place, let's check
what trigger configuration has been given us by the firmware.
If this is not a level configuration, then we know that the
DT/ACPI configuration is bust, and we pick some defaults which
won't be worse than the existing setup.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Liu Gang <Gang.Liu@nxp.com>
Cc: Mark Rutland <marc.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Wenbin Song <Wenbin.Song@freescale.com>
Cc: Mingkai Hu <Mingkai.Hu@freescale.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Kevin Hilman <khilman@baylibre.com>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Michal Simek <michal.simek@xilinx.com>
Cc: Jon Hunter <jonathanh@nvidia.com>
Cc: arm@kernel.org
Cc: bcm-kernel-feedback-list@broadcom.com
Cc: linux-arm-kernel@lists.infradead.org
Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Ray Jui <rjui@broadcom.com>
Cc: "Hou Zhiqiang" <B48286@freescale.com>
Cc: Tirumalesh Chalamarla <tchalamarla@cavium.com>
Cc: linux-samsung-soc@vger.kernel.org
Cc: Yuan Yao <yao.yuan@nxp.com>
Cc: Jan Glauber <jglauber@cavium.com>
Cc: Gregory Clement <gregory.clement@free-electrons.com>
Cc: linux-amlogic@lists.infradead.org
Cc: soren.brinkmann@xilinx.com
Cc: Rajesh Bhagat <rajesh.bhagat@freescale.com>
Cc: Scott Branden <sbranden@broadcom.com>
Cc: Duc Dang <dhdang@apm.com>
Cc: Kukjin Kim <kgene@kernel.org>
Cc: Carlo Caione <carlo@caione.org>
Cc: Dinh Nguyen <dinguyen@opensource.altera.com>
Link: http://lkml.kernel.org/r/1470045256-9032-2-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/clocksource/arm_arch_timer.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 28bce3f4f81d..57700541f951 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -8,6 +8,9 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+
+#define pr_fmt(fmt)	"arm_arch_timer: " fmt
+
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/device.h>
@@ -370,16 +373,33 @@ static bool arch_timer_has_nonsecure_ppi(void)
 		arch_timer_ppi[PHYS_NONSECURE_PPI]);
 }
 
+static u32 check_ppi_trigger(int irq)
+{
+	u32 flags = irq_get_trigger_type(irq);
+
+	if (flags != IRQF_TRIGGER_HIGH && flags != IRQF_TRIGGER_LOW) {
+		pr_warn("WARNING: Invalid trigger for IRQ%d, assuming level low\n", irq);
+		pr_warn("WARNING: Please fix your firmware\n");
+		flags = IRQF_TRIGGER_LOW;
+	}
+
+	return flags;
+}
+
 static int arch_timer_starting_cpu(unsigned int cpu)
 {
 	struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt);
+	u32 flags;
 
 	__arch_timer_setup(ARCH_CP15_TIMER, clk);
 
-	enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], 0);
+	flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]);
+	enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags);
 
-	if (arch_timer_has_nonsecure_ppi())
-		enable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI], 0);
+	if (arch_timer_has_nonsecure_ppi()) {
+		flags = check_ppi_trigger(arch_timer_ppi[PHYS_NONSECURE_PPI]);
+		enable_percpu_irq(arch_timer_ppi[PHYS_NONSECURE_PPI], flags);
+	}
 
 	arch_counter_set_user_access();
 	if (evtstrm_enable)
-- 
cgit v1.2.3


From 7f555c8e5a84b348c2b76f4ca78eae7222354c03 Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Fri, 29 Jul 2016 18:03:42 -0400
Subject: drm/amdgpu/gmc7: add missing mullins case

Looks like this got missed when we ported the code from radeon.

Reviewed-by: Edward O'Callaghan <funfunctor@folklore1984.net>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index d24a82bd0c7a..0b0f08641eed 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -144,6 +144,7 @@ static int gmc_v7_0_init_microcode(struct amdgpu_device *adev)
 		break;
 	case CHIP_KAVERI:
 	case CHIP_KABINI:
+	case CHIP_MULLINS:
 		return 0;
 	default: BUG();
 	}
-- 
cgit v1.2.3


From 903f75c8958e21f0fba2520467c11eaca824226d Mon Sep 17 00:00:00 2001
From: Alex Deucher <alexander.deucher@amd.com>
Date: Fri, 29 Jul 2016 18:14:39 -0400
Subject: drm/amdgpu/ci: add mullins to default case for smc ucode

It's already covered by the default case, but add it for
consistency.

Reviewed-by: Alexandre Demers <alexandre.f.demers@gmail.com>
Reviewed-by: Edward O'Callaghan <funfunctor@folklore1984.net>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/ci_dpm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/ci_dpm.c b/drivers/gpu/drm/amd/amdgpu/ci_dpm.c
index e2f0e5d58d5c..a5c94b482459 100644
--- a/drivers/gpu/drm/amd/amdgpu/ci_dpm.c
+++ b/drivers/gpu/drm/amd/amdgpu/ci_dpm.c
@@ -5779,6 +5779,7 @@ static int ci_dpm_init_microcode(struct amdgpu_device *adev)
 		break;
 	case CHIP_KAVERI:
 	case CHIP_KABINI:
+	case CHIP_MULLINS:
 	default: BUG();
 	}
 
-- 
cgit v1.2.3


From 4f7ec157cb6dfb1e2cd18abf21e6cb1107842c5f Mon Sep 17 00:00:00 2001
From: Eric Huang <JinHuiEric.Huang@amd.com>
Date: Thu, 28 Jul 2016 17:25:01 -0400
Subject: drm/amd/amdgpu: change pptable output format from ASCII to binary

Reviewed-by: Edward O'Callaghan <funfunctor@folklore1984.net>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Eric Huang <JinHuiEric.Huang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index ff63b88b0ffa..5cc7052e391d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -305,7 +305,7 @@ static ssize_t amdgpu_get_pp_table(struct device *dev,
 	struct drm_device *ddev = dev_get_drvdata(dev);
 	struct amdgpu_device *adev = ddev->dev_private;
 	char *table = NULL;
-	int size, i;
+	int size;
 
 	if (adev->pp_enabled)
 		size = amdgpu_dpm_get_pp_table(adev, &table);
@@ -315,10 +315,7 @@ static ssize_t amdgpu_get_pp_table(struct device *dev,
 	if (size >= PAGE_SIZE)
 		size = PAGE_SIZE - 1;
 
-	for (i = 0; i < size; i++) {
-		sprintf(buf + i, "%02x", table[i]);
-	}
-	sprintf(buf + i, "\n");
+	memcpy(buf, table, size);
 
 	return size;
 }
-- 
cgit v1.2.3


From 1f4c17a03ba7f430d63dba8c8e08ff1e2712581d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 1 Aug 2016 13:36:08 -0400
Subject: SUNRPC: Handle EADDRNOTAVAIL on connection failures

If the connect attempt immediately fails with an EADDRNOTAVAIL error, then
that means our choice of source port number was bad.
This error is expected when we set the SO_REUSEPORT socket option and we
have 2 sockets sharing the same source and destination address and port
combinations.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Fixes: 402e23b4ed9ed ("SUNRPC: Fix stupid typo in xs_sock_set_reuseport")
Cc: stable@vger.kernel.org # v4.0+
---
 net/sunrpc/xprtsock.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 111767ab124a..c6b1d48c4319 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2295,6 +2295,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		/* SYN_SENT! */
 		if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
 			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+		break;
+	case -EADDRNOTAVAIL:
+		/* Source port number is unavailable. Try a new one! */
+		transport->srcport = 0;
 	}
 out:
 	return ret;
-- 
cgit v1.2.3


From 300108740b659c2380a731f147dd85ca0365db4f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 19 Jul 2016 13:04:14 -0700
Subject: lkdtm: fix false positive warning from -Wmaybe-uninitialized

The variable in use here doesn't matter (it's just used to exercise taking
up stack space), but this changes its use to pass its address instead,
to avoid a compiler warning:

drivers/misc/lkdtm_usercopy.c:54:15: warning: 'bad_stack' may be used uninitialized in this function [-Wmaybe-uninitialized]

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/misc/lkdtm_usercopy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/lkdtm_usercopy.c b/drivers/misc/lkdtm_usercopy.c
index 5a3fd76eec27..5525a204db93 100644
--- a/drivers/misc/lkdtm_usercopy.c
+++ b/drivers/misc/lkdtm_usercopy.c
@@ -49,7 +49,7 @@ static noinline void do_usercopy_stack(bool to_user, bool bad_frame)
 
 	/* This is a pointer to outside our current stack frame. */
 	if (bad_frame) {
-		bad_stack = do_usercopy_stack_callee((uintptr_t)bad_stack);
+		bad_stack = do_usercopy_stack_callee((uintptr_t)&bad_stack);
 	} else {
 		/* Put start address just inside stack. */
 		bad_stack = task_stack_page(current) + THREAD_SIZE;
-- 
cgit v1.2.3


From e50bd2354ced2018eff1c7e06e7db4db1f5ce745 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 1 Aug 2016 14:18:34 -0700
Subject: lkdtm: Fix targets for objcopy usage

The targets for lkdtm's objcopy were missing which caused them to always
be rebuilt. This corrects the problem.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/misc/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile
index 4387ccb79e64..7410c6d9a34d 100644
--- a/drivers/misc/Makefile
+++ b/drivers/misc/Makefile
@@ -69,5 +69,6 @@ OBJCOPYFLAGS :=
 OBJCOPYFLAGS_lkdtm_rodata_objcopy.o := \
 			--set-section-flags .text=alloc,readonly \
 			--rename-section .text=.rodata
-$(obj)/lkdtm_rodata_objcopy.o: $(obj)/lkdtm_rodata.o
+targets += lkdtm_rodata.o lkdtm_rodata_objcopy.o
+$(obj)/lkdtm_rodata_objcopy.o: $(obj)/lkdtm_rodata.o FORCE
 	$(call if_changed,objcopy)
-- 
cgit v1.2.3


From ad3331acb1464024d20a184d3358f3cecc373b54 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 2 Aug 2016 13:47:43 -0400
Subject: SUNRPC: Fix up socket autodisconnect

Ensure that we don't forget to set up the disconnection timer for the
case when a connect request is fulfilled after the RPC request that
initiated it has timed out or been interrupted.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprt.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 8313960cac52..ea244b29138b 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -680,6 +680,20 @@ out:
 	spin_unlock_bh(&xprt->transport_lock);
 }
 
+static bool
+xprt_has_timer(const struct rpc_xprt *xprt)
+{
+	return xprt->idle_timeout != 0;
+}
+
+static void
+xprt_schedule_autodisconnect(struct rpc_xprt *xprt)
+	__must_hold(&xprt->transport_lock)
+{
+	if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
+		mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout);
+}
+
 static void
 xprt_init_autodisconnect(unsigned long data)
 {
@@ -688,6 +702,8 @@ xprt_init_autodisconnect(unsigned long data)
 	spin_lock(&xprt->transport_lock);
 	if (!list_empty(&xprt->recv))
 		goto out_abort;
+	/* Reset xprt->last_used to avoid connect/autodisconnect cycling */
+	xprt->last_used = jiffies;
 	if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
 		goto out_abort;
 	spin_unlock(&xprt->transport_lock);
@@ -725,6 +741,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
 		goto out;
 	xprt->snd_task =NULL;
 	xprt->ops->release_xprt(xprt, NULL);
+	xprt_schedule_autodisconnect(xprt);
 out:
 	spin_unlock_bh(&xprt->transport_lock);
 	wake_up_bit(&xprt->state, XPRT_LOCKED);
@@ -888,11 +905,6 @@ static void xprt_timer(struct rpc_task *task)
 	spin_unlock_bh(&xprt->transport_lock);
 }
 
-static inline int xprt_has_timer(struct rpc_xprt *xprt)
-{
-	return xprt->idle_timeout != 0;
-}
-
 /**
  * xprt_prepare_transmit - reserve the transport before sending a request
  * @task: RPC task about to send a request
@@ -1280,9 +1292,7 @@ void xprt_release(struct rpc_task *task)
 	if (!list_empty(&req->rq_list))
 		list_del(&req->rq_list);
 	xprt->last_used = jiffies;
-	if (list_empty(&xprt->recv) && xprt_has_timer(xprt))
-		mod_timer(&xprt->timer,
-				xprt->last_used + xprt->idle_timeout);
+	xprt_schedule_autodisconnect(xprt);
 	spin_unlock_bh(&xprt->transport_lock);
 	if (req->rq_buffer)
 		xprt->ops->buf_free(req->rq_buffer);
-- 
cgit v1.2.3


From 1fabaddd153e9e2f7e5695266abb56c934629718 Mon Sep 17 00:00:00 2001
From: Alim Akhtar <alim.akhtar@samsung.com>
Date: Mon, 11 Jul 2016 09:34:18 +0200
Subject: arm64: dts: Fix RTC by providing rtc_src clock

Add RTC source clock as Exynos7 needs source (32.768KHz) clock
for RTC block. Without this currently S3C RTC driver probe is broken
on this SoC.

Signed-off-by: Alim Akhtar <alim.akhtar@samsung.com>
Reviewed-by: Javier Martinez Canillas <javier@osg.samsung.com>
Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm64/boot/dts/exynos/exynos7-espresso.dts | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/boot/dts/exynos/exynos7-espresso.dts b/arch/arm64/boot/dts/exynos/exynos7-espresso.dts
index 299f3ce969ab..c528dd52ba2d 100644
--- a/arch/arm64/boot/dts/exynos/exynos7-espresso.dts
+++ b/arch/arm64/boot/dts/exynos/exynos7-espresso.dts
@@ -12,6 +12,7 @@
 /dts-v1/;
 #include "exynos7.dtsi"
 #include <dt-bindings/interrupt-controller/irq.h>
+#include <dt-bindings/clock/samsung,s2mps11.h>
 
 / {
 	model = "Samsung Exynos7 Espresso board based on EXYNOS7";
@@ -43,6 +44,8 @@
 
 &rtc {
 	status = "okay";
+	clocks = <&clock_ccore PCLK_RTC>, <&s2mps15_osc S2MPS11_CLK_AP>;
+	clock-names = "rtc", "rtc_src";
 };
 
 &watchdog {
-- 
cgit v1.2.3


From d97ffe788184313fd83ff7da4b6587156f8388f0 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 2 Aug 2016 09:44:33 +0200
Subject: ARM: do away with final ARCH_REQUIRE_GPIOLIB

A new user of the Kconfig selection of ARCH_REQUIRE_GPIOLIB
has appeared. Replace with just selecting GPIOLIB.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 arch/arm/mach-clps711x/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mach-clps711x/Kconfig b/arch/arm/mach-clps711x/Kconfig
index dc7c6edeab39..61284b9389cf 100644
--- a/arch/arm/mach-clps711x/Kconfig
+++ b/arch/arm/mach-clps711x/Kconfig
@@ -1,13 +1,13 @@
 menuconfig ARCH_CLPS711X
 	bool "Cirrus Logic EP721x/EP731x-based"
 	depends on ARCH_MULTI_V4T
-	select ARCH_REQUIRE_GPIOLIB
 	select AUTO_ZRELADDR
 	select CLKSRC_OF
 	select CLPS711X_TIMER
 	select COMMON_CLK
 	select CPU_ARM720T
 	select GENERIC_CLOCKEVENTS
+	select GPIOLIB
 	select MFD_SYSCON
 	select OF_IRQ
 	select USE_OF
-- 
cgit v1.2.3


From 4d2bf027e8ca499a52224e508f52a59d3858094a Mon Sep 17 00:00:00 2001
From: Antoine Tenart <antoine.tenart@free-electrons.com>
Date: Tue, 26 Jul 2016 10:00:48 +0200
Subject: MAINTAINER: alpine: add a mailing list

Add the linux-arm-kernel mailing list for the Alpine SoCs.

Signed-off-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 25f43204014d..b5c04dfeab22 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -999,6 +999,7 @@ N:	meson
 ARM/Annapurna Labs ALPINE ARCHITECTURE
 M:	Tsahee Zidenberg <tsahee@annapurnalabs.com>
 M:	Antoine Tenart <antoine.tenart@free-electrons.com>
+L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 F:	arch/arm/mach-alpine/
 F:	arch/arm/boot/dts/alpine*
-- 
cgit v1.2.3


From 869ec056fa8450184423c8076e0a342db127795c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 30 Jun 2016 14:25:19 +0200
Subject: ARM: shmobile: don't call platform_can_secondary_boot on UP

For rcar-gen2, we build the SMP files even for UP configurations,
and that just broke:

arch/arm/mach-shmobile/built-in.o: In function `shmobile_smp_init_fallback_ops':
pm-rcar-gen2.c:(.init.text+0x40c): undefined reference to `platform_can_secondary_boot'

This adds an compile-time check before the call to platform_can_secondary_boot,
turning the function into an empty stub for UP configurations.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: c21af444eace ("ARM: shmobile: smp: Add function to prioritize DT SMP")
Acked-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Olof Johansson <olof@lixom.net>
---
 arch/arm/mach-shmobile/platsmp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm/mach-shmobile/platsmp.c b/arch/arm/mach-shmobile/platsmp.c
index f3dba6f356e2..02e21bceb085 100644
--- a/arch/arm/mach-shmobile/platsmp.c
+++ b/arch/arm/mach-shmobile/platsmp.c
@@ -40,5 +40,8 @@ bool shmobile_smp_cpu_can_disable(unsigned int cpu)
 bool __init shmobile_smp_init_fallback_ops(void)
 {
 	/* fallback on PSCI/smp_ops if no other DT based method is detected */
+	if (!IS_ENABLED(CONFIG_SMP))
+		return false;
+
 	return platform_can_secondary_boot() ? true : false;
 }
-- 
cgit v1.2.3


From fe85f07f582751da0a65d4ba7e3b36ebcbe55019 Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Tue, 2 Aug 2016 11:21:35 +0800
Subject: drm/amdgpu: update golden setting of iceland

Signed-off-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index bff8668e9e6d..a4983237baa0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -409,6 +409,7 @@ static const u32 golden_settings_iceland_a11[] =
 	mmPA_SC_LINE_STIPPLE_STATE, 0x0000ff0f, 0x00000000,
 	mmPA_SC_RASTER_CONFIG, 0x3f3fffff, 0x00000002,
 	mmPA_SC_RASTER_CONFIG_1, 0x0000003f, 0x00000000,
+	mmRLC_CGCG_CGLS_CTRL, 0x00000003, 0x0000003c,
 	mmSQ_RANDOM_WAVE_PRI, 0x001fffff, 0x000006fd,
 	mmTA_CNTL_AUX, 0x000f000f, 0x000b0000,
 	mmTCC_CTRL, 0x00100000, 0xf31fff7f,
-- 
cgit v1.2.3


From 3a494b588f724fde00ae3ca0aa1c7e6cef1d0d51 Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Tue, 2 Aug 2016 11:28:47 +0800
Subject: drm/amdgpu: update golden setting of carrizo

Signed-off-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index a4983237baa0..85854ba2cf0c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -506,8 +506,10 @@ static const u32 cz_golden_settings_a11[] =
 	mmGB_GPU_ID, 0x0000000f, 0x00000000,
 	mmPA_SC_ENHANCE, 0xffffffff, 0x00000001,
 	mmPA_SC_LINE_STIPPLE_STATE, 0x0000ff0f, 0x00000000,
+	mmRLC_CGCG_CGLS_CTRL, 0x00000003, 0x0000003c,
 	mmSQ_RANDOM_WAVE_PRI, 0x001fffff, 0x000006fd,
 	mmTA_CNTL_AUX, 0x000f000f, 0x00010000,
+	mmTCC_CTRL, 0x00100000, 0xf31fff7f,
 	mmTCC_EXE_DISABLE, 0x00000002, 0x00000002,
 	mmTCP_ADDR_CONFIG, 0x0000000f, 0x000000f3,
 	mmTCP_CHAN_STEER_LO, 0xffffffff, 0x00001302
-- 
cgit v1.2.3


From 9761bc53c8e1c52f3f0aa4334f38fd87b0cf1613 Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Tue, 2 Aug 2016 11:26:34 +0800
Subject: drm/amdgpu: update golden setting of polaris11

Signed-off-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Ken Wang <Qingqing.Wang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 85854ba2cf0c..3b7f6e87d244 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -270,7 +270,8 @@ static const u32 tonga_mgcg_cgcg_init[] =
 
 static const u32 golden_settings_polaris11_a11[] =
 {
-	mmCB_HW_CONTROL, 0xfffdf3cf, 0x00006208,
+	mmCB_HW_CONTROL, 0x0000f3cf, 0x00007208,
+	mmCB_HW_CONTROL_2, 0x0f000000, 0x0f000000,
 	mmCB_HW_CONTROL_3, 0x000001ff, 0x00000040,
 	mmDB_DEBUG2, 0xf00fffff, 0x00000400,
 	mmPA_SC_ENHANCE, 0xffffffff, 0x20000001,
@@ -279,7 +280,7 @@ static const u32 golden_settings_polaris11_a11[] =
 	mmPA_SC_RASTER_CONFIG_1, 0x0000003f, 0x00000000,
 	mmRLC_CGCG_CGLS_CTRL, 0x00000003, 0x0001003c,
 	mmRLC_CGCG_CGLS_CTRL_3D, 0xffffffff, 0x0001003c,
-	mmSQ_CONFIG, 0x07f80000, 0x07180000,
+	mmSQ_CONFIG, 0x07f80000, 0x01180000,
 	mmTA_CNTL_AUX, 0x000f000f, 0x000b0000,
 	mmTCC_CTRL, 0x00100000, 0xf31fff7f,
 	mmTCP_ADDR_CONFIG, 0x000003ff, 0x000000f3,
-- 
cgit v1.2.3


From 6d51c813b172b4374fe7a6b732b6666f8d77bfea Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Tue, 2 Aug 2016 11:30:05 +0800
Subject: drm/amdgpu: update golden setting of stoney

Signed-off-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 717359d3ba8c..2aee2c6f3cd5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -103,6 +103,11 @@ static const u32 stoney_mgcg_cgcg_init[] =
 	mmMC_MEM_POWER_LS, 0xffffffff, 0x00000104
 };
 
+static const u32 golden_settings_stoney_common[] =
+{
+	mmMC_HUB_RDREQ_UVD, MC_HUB_RDREQ_UVD__PRESCALE_MASK, 0x00000004,
+	mmMC_RD_GRP_OTH, MC_RD_GRP_OTH__UVD_MASK, 0x00600000
+};
 
 static void gmc_v8_0_init_golden_registers(struct amdgpu_device *adev)
 {
@@ -142,6 +147,9 @@ static void gmc_v8_0_init_golden_registers(struct amdgpu_device *adev)
 		amdgpu_program_register_sequence(adev,
 						 stoney_mgcg_cgcg_init,
 						 (const u32)ARRAY_SIZE(stoney_mgcg_cgcg_init));
+		amdgpu_program_register_sequence(adev,
+						 golden_settings_stoney_common,
+						 (const u32)ARRAY_SIZE(golden_settings_stoney_common));
 		break;
 	default:
 		break;
-- 
cgit v1.2.3


From a5a5e3084ec457f234298392dfa2c55db47b603f Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Tue, 2 Aug 2016 13:15:12 +0800
Subject: drm/amdgpu: update golden setting of polaris10

Signed-off-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Ken Wang <Qingqing.Wang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 3b7f6e87d244..b8184617ca25 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -302,8 +302,8 @@ static const u32 polaris11_golden_common_all[] =
 static const u32 golden_settings_polaris10_a11[] =
 {
 	mmATC_MISC_CG, 0x000c0fc0, 0x000c0200,
-	mmCB_HW_CONTROL, 0xfffdf3cf, 0x00007208,
-	mmCB_HW_CONTROL_2, 0, 0x0f000000,
+	mmCB_HW_CONTROL, 0x0001f3cf, 0x00007208,
+	mmCB_HW_CONTROL_2, 0x0f000000, 0x0f000000,
 	mmCB_HW_CONTROL_3, 0x000001ff, 0x00000040,
 	mmDB_DEBUG2, 0xf00fffff, 0x00000400,
 	mmPA_SC_ENHANCE, 0xffffffff, 0x20000001,
-- 
cgit v1.2.3


From 7a376ac11fc2109dfd86442ff79982ecf16dcd6d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 3 Aug 2016 11:07:44 +0200
Subject: Revert "ARM: aspeed: adapt defconfigs for new CONFIG_PRINTK_TIME"

The patch that this was preparing for made it into neither v4.7 nor
v4.8, so we should back this out as well to avoid the opposite
warning:

  arch/arm/configs/aspeed_g5_defconfig:62:warning: symbol value '1' invalid for PRINTK_TIME
  arch/arm/configs/aspeed_g4_defconfig:61:warning: symbol value '1' invalid for PRINTK_TIME

Sorry for not catching this earlier.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 0ef659a30055 ("ARM: aspeed: adapt defconfigs for new CONFIG_PRINTK_TIME")
Cc: stable@vger.kernel.org # v4.7
---
 arch/arm/configs/aspeed_g4_defconfig | 2 +-
 arch/arm/configs/aspeed_g5_defconfig | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/configs/aspeed_g4_defconfig b/arch/arm/configs/aspeed_g4_defconfig
index b6e54ee9bdbd..ca39c04fec6b 100644
--- a/arch/arm/configs/aspeed_g4_defconfig
+++ b/arch/arm/configs/aspeed_g4_defconfig
@@ -58,7 +58,7 @@ CONFIG_SERIAL_OF_PLATFORM=y
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_FIRMWARE_MEMMAP=y
 CONFIG_FANOTIFY=y
-CONFIG_PRINTK_TIME=1
+CONFIG_PRINTK_TIME=y
 CONFIG_DYNAMIC_DEBUG=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_PAGE_POISONING=y
diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig
index 892605167357..4f366b0370e9 100644
--- a/arch/arm/configs/aspeed_g5_defconfig
+++ b/arch/arm/configs/aspeed_g5_defconfig
@@ -59,7 +59,7 @@ CONFIG_SERIAL_OF_PLATFORM=y
 # CONFIG_IOMMU_SUPPORT is not set
 CONFIG_FIRMWARE_MEMMAP=y
 CONFIG_FANOTIFY=y
-CONFIG_PRINTK_TIME=1
+CONFIG_PRINTK_TIME=y
 CONFIG_DYNAMIC_DEBUG=y
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_PAGE_POISONING=y
-- 
cgit v1.2.3


From e6571499336e10f93a77c51a35fd1a96828eea71 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 3 Aug 2016 11:01:32 +0100
Subject: Btrfs: remove unused function btrfs_add_delayed_qgroup_reserve()

No longer used as of commit 5846a3c26873 ("btrfs: qgroup: Fix a race in
delayed_ref which leads to abort trans").

Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/delayed-ref.c | 27 ---------------------------
 fs/btrfs/delayed-ref.h |  3 ---
 2 files changed, 30 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 430b3689b112..02dd2ec7d534 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -861,33 +861,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
-				     struct btrfs_trans_handle *trans,
-				     u64 ref_root, u64 bytenr, u64 num_bytes)
-{
-	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_delayed_ref_head *ref_head;
-	int ret = 0;
-
-	if (!fs_info->quota_enabled || !is_fstree(ref_root))
-		return 0;
-
-	delayed_refs = &trans->transaction->delayed_refs;
-
-	spin_lock(&delayed_refs->lock);
-	ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
-	if (!ref_head) {
-		ret = -ENOENT;
-		goto out;
-	}
-	WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
-	ref_head->qgroup_ref_root = ref_root;
-	ref_head->qgroup_reserved = num_bytes;
-out:
-	spin_unlock(&delayed_refs->lock);
-	return ret;
-}
-
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5fca9534a271..43f3629760e9 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -250,9 +250,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, u64 reserved, int action,
 			       struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
-				     struct btrfs_trans_handle *trans,
-				     u64 ref_root, u64 bytenr, u64 num_bytes);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
-- 
cgit v1.2.3


From 0b857b44b5e445dc850cd91c45ce6edeb7797480 Mon Sep 17 00:00:00 2001
From: Roland Dreier <roland@purestorage.com>
Date: Sun, 31 Jul 2016 00:27:39 -0700
Subject: nvme-rdma: Don't leak uninitialized memory in connect request private
 data

Zero out the full nvme_rdma_cm_req structure before sending it.
Otherwise we end up leaking kernel memory in the reserved field, which
might break forward compatibility in the future.

Signed-off-by: Roland Dreier <roland@purestorage.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 3e3ce2b0424e..b96b88369871 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1269,7 +1269,7 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
 {
 	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
 	struct rdma_conn_param param = { };
-	struct nvme_rdma_cm_req priv;
+	struct nvme_rdma_cm_req priv = { };
 	int ret;
 
 	param.qp_num = queue->qp->qp_num;
-- 
cgit v1.2.3


From 5f372eb3e76317b4fe4ba53ad1547f39fc883350 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Sun, 31 Jul 2016 18:43:15 +0300
Subject: nvme-rdma: Queue ns scanning after a sucessful reconnection

On an ordered target shutdown, the target can send a AEN on a namespace
removal, this will trigger the host to queue ns-list query. The shutdown
will trigger error recovery which will attepmt periodic reconnect.

We can hit a race where the ns rescanning fails (error recovery kicked
in and we're not connected) causing removing all the namespaces and when
we reconnect we won't see any namespaces for this controller.

So, queue a namespace rescan after we successfully reconnected to the target.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b96b88369871..7fd1d735ced5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -748,8 +748,10 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
 	WARN_ON_ONCE(!changed);
 
-	if (ctrl->queue_count > 1)
+	if (ctrl->queue_count > 1) {
 		nvme_start_queues(&ctrl->ctrl);
+		nvme_queue_scan(&ctrl->ctrl);
+	}
 
 	dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
 
-- 
cgit v1.2.3


From 57de5a0a40db97bb390d3ac1f4c2e74b9f3515c3 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 14 Jul 2016 17:39:47 +0300
Subject: nvme-rdma: Fix device removal handling

Device removal sequence may have crashed because the
controller (and admin queue space) was freed before
we destroyed the admin queue resources. Thus we
want to destroy the admin queue and only then queue
controller deletion and wait for it to complete.

More specifically we:
1. own the controller deletion (make sure we are not
   competing with another deletion).
2. get rid of inflight reconnects if exists (which
   also destroy and create queues).
3. destroy the queue.
4. safely queue controller deletion (and wait for it
   to complete).

Reported-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 7fd1d735ced5..3ffec3728abe 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -169,7 +169,6 @@ MODULE_PARM_DESC(register_always,
 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 		struct rdma_cm_event *event);
 static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
-static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl);
 
 /* XXX: really should move to a generic header sooner or later.. */
 static inline void put_unaligned_le24(u32 val, u8 *p)
@@ -1320,37 +1319,39 @@ out_destroy_queue_ib:
  * that caught the event. Since we hold the callout until the controller
  * deletion is completed, we'll deadlock if the controller deletion will
  * call rdma_destroy_id on this queue's cm_id. Thus, we claim ownership
- * of destroying this queue before-hand, destroy the queue resources
- * after the controller deletion completed with the exception of destroying
- * the cm_id implicitely by returning a non-zero rc to the callout.
+ * of destroying this queue before-hand, destroy the queue resources,
+ * then queue the controller deletion which won't destroy this queue and
+ * we destroy the cm_id implicitely by returning a non-zero rc to the callout.
  */
 static int nvme_rdma_device_unplug(struct nvme_rdma_queue *queue)
 {
 	struct nvme_rdma_ctrl *ctrl = queue->ctrl;
-	int ret, ctrl_deleted = 0;
+	int ret;
 
-	/* First disable the queue so ctrl delete won't free it */
-	if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
-		goto out;
+	/* Own the controller deletion */
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
+		return 0;
 
-	/* delete the controller */
-	ret = __nvme_rdma_del_ctrl(ctrl);
-	if (!ret) {
-		dev_warn(ctrl->ctrl.device,
-			"Got rdma device removal event, deleting ctrl\n");
-		flush_work(&ctrl->delete_work);
+	dev_warn(ctrl->ctrl.device,
+		"Got rdma device removal event, deleting ctrl\n");
 
-		/* Return non-zero so the cm_id will destroy implicitly */
-		ctrl_deleted = 1;
+	/* Get rid of reconnect work if its running */
+	cancel_delayed_work_sync(&ctrl->reconnect_work);
 
+	/* Disable the queue so ctrl delete won't free it */
+	if (test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags)) {
 		/* Free this queue ourselves */
-		rdma_disconnect(queue->cm_id);
-		ib_drain_qp(queue->qp);
+		nvme_rdma_stop_queue(queue);
 		nvme_rdma_destroy_queue_ib(queue);
+
+		/* Return non-zero so the cm_id will destroy implicitly */
+		ret = 1;
 	}
 
-out:
-	return ctrl_deleted;
+	/* Queue controller deletion */
+	queue_work(nvme_rdma_wq, &ctrl->delete_work);
+	flush_work(&ctrl->delete_work);
+	return ret;
 }
 
 static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
-- 
cgit v1.2.3


From 2461a8dd38bea3cb5b1c1f0323794483292fb03f Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Sun, 24 Jul 2016 09:29:51 +0300
Subject: nvme-rdma: Remove duplicate call to nvme_remove_namespaces

nvme_uninit_ctrl already does that for us. Note that we reordered
nvme_rdma_shutdown_ctrl and nvme_uninit_ctrl, this is perfectly
fine because we actually want ctrl uninit (aen, scan cancellation
and namespaces removal) to happen before we shutdown the rdma
resources.

Also, centralize the deletion work and the dead controller removal
work code duplication into __nvme_rdma_shutdown_ctrl that accepts
a shutdown boolean.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 3ffec3728abe..1279bc292253 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1660,15 +1660,20 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
 	nvme_rdma_destroy_admin_queue(ctrl);
 }
 
+static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
+{
+	nvme_uninit_ctrl(&ctrl->ctrl);
+	if (shutdown)
+		nvme_rdma_shutdown_ctrl(ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
+}
+
 static void nvme_rdma_del_ctrl_work(struct work_struct *work)
 {
 	struct nvme_rdma_ctrl *ctrl = container_of(work,
 				struct nvme_rdma_ctrl, delete_work);
 
-	nvme_remove_namespaces(&ctrl->ctrl);
-	nvme_rdma_shutdown_ctrl(ctrl);
-	nvme_uninit_ctrl(&ctrl->ctrl);
-	nvme_put_ctrl(&ctrl->ctrl);
+	__nvme_rdma_remove_ctrl(ctrl, true);
 }
 
 static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
@@ -1701,9 +1706,7 @@ static void nvme_rdma_remove_ctrl_work(struct work_struct *work)
 	struct nvme_rdma_ctrl *ctrl = container_of(work,
 				struct nvme_rdma_ctrl, delete_work);
 
-	nvme_remove_namespaces(&ctrl->ctrl);
-	nvme_uninit_ctrl(&ctrl->ctrl);
-	nvme_put_ctrl(&ctrl->ctrl);
+	__nvme_rdma_remove_ctrl(ctrl, false);
 }
 
 static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
-- 
cgit v1.2.3


From a34ca17a9717fe607cd58285a1704cb6526cf561 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Sun, 24 Jul 2016 09:22:19 +0300
Subject: nvme-rdma: Free the I/O tags when we delete the controller

If we wait until we free the controller (free_ctrl) we might
lose our rdma device without any notification while we still
have open resources (tags mrs and dma mappings).

Instead, destroy the tags with their rdma resources once we
delete the device and not when freeing it.

Note that we don't do that in nvme_rdma_shutdown_ctrl because
controller reset uses it as well and we want to give active I/O
a chance to complete successfully.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 1279bc292253..6378dc94aeaf 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -686,11 +686,6 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 	list_del(&ctrl->list);
 	mutex_unlock(&nvme_rdma_ctrl_mutex);
 
-	if (ctrl->ctrl.tagset) {
-		blk_cleanup_queue(ctrl->ctrl.connect_q);
-		blk_mq_free_tag_set(&ctrl->tag_set);
-		nvme_rdma_dev_put(ctrl->device);
-	}
 	kfree(ctrl->queues);
 	nvmf_free_options(nctrl->opts);
 free_ctrl:
@@ -1665,6 +1660,13 @@ static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 	nvme_uninit_ctrl(&ctrl->ctrl);
 	if (shutdown)
 		nvme_rdma_shutdown_ctrl(ctrl);
+
+	if (ctrl->ctrl.tagset) {
+		blk_cleanup_queue(ctrl->ctrl.connect_q);
+		blk_mq_free_tag_set(&ctrl->tag_set);
+		nvme_rdma_dev_put(ctrl->device);
+	}
+
 	nvme_put_ctrl(&ctrl->ctrl);
 }
 
-- 
cgit v1.2.3


From a159c64d936eb0d1da29d8ad384183d8984899c9 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Sun, 24 Jul 2016 09:32:08 +0300
Subject: nvme-loop: Remove duplicate call to nvme_remove_namespaces

nvme_uninit_ctrl already does that for us. Note that we
reordered nvme_loop_shutdown_ctrl with nvme_uninit_ctrl
but its safe because we want controller uninit to happen
before we shutdown the transport resources.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/loop.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 94e782987cc9..7affd40a6b33 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -414,9 +414,8 @@ static void nvme_loop_del_ctrl_work(struct work_struct *work)
 	struct nvme_loop_ctrl *ctrl = container_of(work,
 				struct nvme_loop_ctrl, delete_work);
 
-	nvme_remove_namespaces(&ctrl->ctrl);
-	nvme_loop_shutdown_ctrl(ctrl);
 	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_loop_shutdown_ctrl(ctrl);
 	nvme_put_ctrl(&ctrl->ctrl);
 }
 
@@ -501,7 +500,6 @@ out_free_queues:
 	nvme_loop_destroy_admin_queue(ctrl);
 out_disable:
 	dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
-	nvme_remove_namespaces(&ctrl->ctrl);
 	nvme_uninit_ctrl(&ctrl->ctrl);
 	nvme_put_ctrl(&ctrl->ctrl);
 }
-- 
cgit v1.2.3


From 45862ebcc4883b1b6bc0701cd15cb2b68b140c5d Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Sun, 24 Jul 2016 09:26:16 +0300
Subject: nvme-rdma: Make sure to shutdown the controller if we can

Relying on ctrl state in nvme_rdma_shutdown_ctrl is wrong because
it will never be NVME_CTRL_LIVE (delete_ctrl or reset_ctrl invoked it).

Instead, check that the admin queue is connected. Note that it is safe
because we can never see a copmeting thread trying to destroy the admin
queue (reset or delete controller).

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 6378dc94aeaf..f4b836869d2c 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1646,7 +1646,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
 		nvme_rdma_free_io_queues(ctrl);
 	}
 
-	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
+	if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags))
 		nvme_shutdown_ctrl(&ctrl->ctrl);
 
 	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
-- 
cgit v1.2.3


From c45eb4fed12d278d3619f1904885bd0d7bcbf036 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 13 Jul 2016 18:34:45 +0100
Subject: drm/i915/fbdev: Check for the framebuffer before use

If the fbdev probing fails, and in our error path we fail to clear the
dev_priv->fbdev, then we can try and use a dangling fbdev pointer, and
in particular a NULL fb. This could also happen in pathological cases
where we try to operate on the fbdev prior to it being probed.

Reported-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1468431285-28264-2-git-send-email-chris@chris-wilson.co.uk
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
(cherry picked from commit 6bc265424df02f8162f4a17a37e4982e1c64460e)
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/i915/intel_fbdev.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c
index 86b00c6db1a6..3e3632c18733 100644
--- a/drivers/gpu/drm/i915/intel_fbdev.c
+++ b/drivers/gpu/drm/i915/intel_fbdev.c
@@ -782,7 +782,7 @@ void intel_fbdev_set_suspend(struct drm_device *dev, int state, bool synchronous
 	struct intel_fbdev *ifbdev = dev_priv->fbdev;
 	struct fb_info *info;
 
-	if (!ifbdev)
+	if (!ifbdev || !ifbdev->fb)
 		return;
 
 	info = ifbdev->helper.fbdev;
@@ -827,31 +827,28 @@ void intel_fbdev_set_suspend(struct drm_device *dev, int state, bool synchronous
 
 void intel_fbdev_output_poll_changed(struct drm_device *dev)
 {
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	if (dev_priv->fbdev)
-		drm_fb_helper_hotplug_event(&dev_priv->fbdev->helper);
+	struct intel_fbdev *ifbdev = to_i915(dev)->fbdev;
+
+	if (ifbdev && ifbdev->fb)
+		drm_fb_helper_hotplug_event(&ifbdev->helper);
 }
 
 void intel_fbdev_restore_mode(struct drm_device *dev)
 {
-	int ret;
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	struct intel_fbdev *ifbdev = dev_priv->fbdev;
-	struct drm_fb_helper *fb_helper;
+	struct intel_fbdev *ifbdev = to_i915(dev)->fbdev;
 
 	if (!ifbdev)
 		return;
 
 	intel_fbdev_sync(ifbdev);
+	if (!ifbdev->fb)
+		return;
 
-	fb_helper = &ifbdev->helper;
-
-	ret = drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper);
-	if (ret) {
+	if (drm_fb_helper_restore_fbdev_mode_unlocked(&ifbdev->helper)) {
 		DRM_DEBUG("failed to restore crtc mode\n");
 	} else {
-		mutex_lock(&fb_helper->dev->struct_mutex);
+		mutex_lock(&dev->struct_mutex);
 		intel_fb_obj_invalidate(ifbdev->fb->obj, ORIGIN_GTT);
-		mutex_unlock(&fb_helper->dev->struct_mutex);
+		mutex_unlock(&dev->struct_mutex);
 	}
 }
-- 
cgit v1.2.3


From d8f7750a08968b105056328652d2c332bdfa062d Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 19 May 2016 15:24:55 +0300
Subject: nvmet-rdma: Correctly handle RDMA device hot removal

When configuring a device attached listener, we may
see device removal events. In this case we return a
non-zero return code from the cm event handler which
implicitly destroys the cm_id. It is possible that in
the future the user will remove this listener and by
that trigger a second call to rdma_destroy_id on an
already destroyed cm_id -> BUG.

In addition, when a queue bound (active session) cm_id
generates a DEVICE_REMOVAL event we must guarantee all
resources are cleaned up by the time we return from the
event handler.

Introduce nvmet_rdma_device_removal which addresses
(or at least attempts to) both scenarios.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/rdma.c | 87 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 70 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index e06d504bdf0c..48c811850c29 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -77,6 +77,7 @@ enum nvmet_rdma_queue_state {
 	NVMET_RDMA_Q_CONNECTING,
 	NVMET_RDMA_Q_LIVE,
 	NVMET_RDMA_Q_DISCONNECTING,
+	NVMET_RDMA_IN_DEVICE_REMOVAL,
 };
 
 struct nvmet_rdma_queue {
@@ -984,7 +985,10 @@ static void nvmet_rdma_release_queue_work(struct work_struct *w)
 	struct nvmet_rdma_device *dev = queue->dev;
 
 	nvmet_rdma_free_queue(queue);
-	rdma_destroy_id(cm_id);
+
+	if (queue->state != NVMET_RDMA_IN_DEVICE_REMOVAL)
+		rdma_destroy_id(cm_id);
+
 	kref_put(&dev->ref, nvmet_rdma_free_dev);
 }
 
@@ -1233,8 +1237,9 @@ static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
 	switch (queue->state) {
 	case NVMET_RDMA_Q_CONNECTING:
 	case NVMET_RDMA_Q_LIVE:
-		disconnect = true;
 		queue->state = NVMET_RDMA_Q_DISCONNECTING;
+	case NVMET_RDMA_IN_DEVICE_REMOVAL:
+		disconnect = true;
 		break;
 	case NVMET_RDMA_Q_DISCONNECTING:
 		break;
@@ -1272,6 +1277,62 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
 	schedule_work(&queue->release_work);
 }
 
+/**
+ * nvme_rdma_device_removal() - Handle RDMA device removal
+ * @queue:      nvmet rdma queue (cm id qp_context)
+ * @addr:	nvmet address (cm_id context)
+ *
+ * DEVICE_REMOVAL event notifies us that the RDMA device is about
+ * to unplug so we should take care of destroying our RDMA resources.
+ * This event will be generated for each allocated cm_id.
+ *
+ * Note that this event can be generated on a normal queue cm_id
+ * and/or a device bound listener cm_id (where in this case
+ * queue will be null).
+ *
+ * we claim ownership on destroying the cm_id. For queues we move
+ * the queue state to NVMET_RDMA_IN_DEVICE_REMOVAL and for port
+ * we nullify the priv to prevent double cm_id destruction and destroying
+ * the cm_id implicitely by returning a non-zero rc to the callout.
+ */
+static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
+		struct nvmet_rdma_queue *queue)
+{
+	unsigned long flags;
+
+	if (!queue) {
+		struct nvmet_port *port = cm_id->context;
+
+		/*
+		 * This is a listener cm_id. Make sure that
+		 * future remove_port won't invoke a double
+		 * cm_id destroy. use atomic xchg to make sure
+		 * we don't compete with remove_port.
+		 */
+		if (xchg(&port->priv, NULL) != cm_id)
+			return 0;
+	} else {
+		/*
+		 * This is a queue cm_id. Make sure that
+		 * release queue will not destroy the cm_id
+		 * and schedule all ctrl queues removal (only
+		 * if the queue is not disconnecting already).
+		 */
+		spin_lock_irqsave(&queue->state_lock, flags);
+		if (queue->state != NVMET_RDMA_Q_DISCONNECTING)
+			queue->state = NVMET_RDMA_IN_DEVICE_REMOVAL;
+		spin_unlock_irqrestore(&queue->state_lock, flags);
+		nvmet_rdma_queue_disconnect(queue);
+		flush_scheduled_work();
+	}
+
+	/*
+	 * We need to return 1 so that the core will destroy
+	 * it's own ID.  What a great API design..
+	 */
+	return 1;
+}
+
 static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
 		struct rdma_cm_event *event)
 {
@@ -1294,20 +1355,11 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
 		break;
 	case RDMA_CM_EVENT_ADDR_CHANGE:
 	case RDMA_CM_EVENT_DISCONNECTED:
-	case RDMA_CM_EVENT_DEVICE_REMOVAL:
 	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
-		/*
-		 * We can get the device removal callback even for a
-		 * CM ID that we aren't actually using.  In that case
-		 * the context pointer is NULL, so we shouldn't try
-		 * to disconnect a non-existing queue.  But we also
-		 * need to return 1 so that the core will destroy
-		 * it's own ID.  What a great API design..
-		 */
-		if (queue)
-			nvmet_rdma_queue_disconnect(queue);
-		else
-			ret = 1;
+		nvmet_rdma_queue_disconnect(queue);
+		break;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		ret = nvmet_rdma_device_removal(cm_id, queue);
 		break;
 	case RDMA_CM_EVENT_REJECTED:
 	case RDMA_CM_EVENT_UNREACHABLE:
@@ -1396,9 +1448,10 @@ out_destroy_id:
 
 static void nvmet_rdma_remove_port(struct nvmet_port *port)
 {
-	struct rdma_cm_id *cm_id = port->priv;
+	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);
 
-	rdma_destroy_id(cm_id);
+	if (cm_id)
+		rdma_destroy_id(cm_id);
 }
 
 static struct nvmet_fabrics_ops nvmet_rdma_ops = {
-- 
cgit v1.2.3


From 40e64e07213201710a51e270595d6e6c028f9502 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 28 Jul 2016 18:04:09 +0300
Subject: nvmet-rdma: Don't use the inline buffer in order to avoid allocation
 for small reads

Under extreme conditions this might cause data corruptions. By doing that
we we repost the buffer and then post this buffer for the device to send.
If we happen to use shared receive queues the device might write to the
buffer before it sends it (there is no ordering between send and recv
queues). Without SRQs we probably won't get that if the host doesn't
mis-behave and send more than we allowed it, but relying on that is not
really a good idea.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/rdma.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 48c811850c29..b4d648536c3e 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -616,15 +616,10 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
 	if (!len)
 		return 0;
 
-	/* use the already allocated data buffer if possible */
-	if (len <= NVMET_RDMA_INLINE_DATA_SIZE && rsp->queue->host_qid) {
-		nvmet_rdma_use_inline_sg(rsp, len, 0);
-	} else {
-		status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
-				len);
-		if (status)
-			return status;
-	}
+	status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
+			len);
+	if (status)
+		return status;
 
 	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
 			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
-- 
cgit v1.2.3


From 28b89118539da03f4b188763e1b2fd1aec0f580a Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 4 Aug 2016 11:18:49 +0300
Subject: nvmet: Fix controller serial number inconsistency

The host is allowed to issue identify as many times
as it wants, we need to stay consistent when reporting
the serial number for a given controller.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 6 +-----
 drivers/nvme/target/core.c      | 4 ++++
 drivers/nvme/target/nvmet.h     | 1 +
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 2fac17a5ad53..47c564b5a289 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -13,7 +13,6 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
-#include <linux/random.h>
 #include <generated/utsrelease.h>
 #include "nvmet.h"
 
@@ -83,7 +82,6 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 {
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 	struct nvme_id_ctrl *id;
-	u64 serial;
 	u16 status = 0;
 
 	id = kzalloc(sizeof(*id), GFP_KERNEL);
@@ -96,10 +94,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	id->vid = 0;
 	id->ssvid = 0;
 
-	/* generate a random serial number as our controllers are ephemeral: */
-	get_random_bytes(&serial, sizeof(serial));
 	memset(id->sn, ' ', sizeof(id->sn));
-	snprintf(id->sn, sizeof(id->sn), "%llx", serial);
+	snprintf(id->sn, sizeof(id->sn), "%llx", ctrl->serial);
 
 	memset(id->mn, ' ', sizeof(id->mn));
 	strncpy((char *)id->mn, "Linux", sizeof(id->mn));
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 8a891ca53367..6559d5afa7bf 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -13,6 +13,7 @@
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/module.h>
+#include <linux/random.h>
 #include "nvmet.h"
 
 static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
@@ -728,6 +729,9 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE);
 	memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE);
 
+	/* generate a random serial number as our controllers are ephemeral: */
+	get_random_bytes(&ctrl->serial, sizeof(ctrl->serial));
+
 	kref_init(&ctrl->ref);
 	ctrl->subsys = subsys;
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 57dd6d834c28..76b6eedccaf9 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -113,6 +113,7 @@ struct nvmet_ctrl {
 
 	struct mutex		lock;
 	u64			cap;
+	u64			serial;
 	u32			cc;
 	u32			csts;
 
-- 
cgit v1.2.3


From 3ef1b4b298d98ccb3cc895baf1b18f7f9d073bee Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 4 Aug 2016 13:46:19 +0300
Subject: nvme-rdma: start async event handler after reconnecting to a
 controller

When we reset or reconnect to a controller, we are cancelling the
async event handler so we can safely re-establish resources, but we
need to remember to start it again when we successfully reconnect.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f4b836869d2c..e82434ab3a28 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -745,6 +745,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 	if (ctrl->queue_count > 1) {
 		nvme_start_queues(&ctrl->ctrl);
 		nvme_queue_scan(&ctrl->ctrl);
+		nvme_queue_async_events(&ctrl->ctrl);
 	}
 
 	dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
@@ -1747,6 +1748,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 	if (ctrl->queue_count > 1) {
 		nvme_start_queues(&ctrl->ctrl);
 		nvme_queue_scan(&ctrl->ctrl);
+		nvme_queue_async_events(&ctrl->ctrl);
 	}
 
 	return;
-- 
cgit v1.2.3


From e3266378bdbca82c2854fc612fa9a391eba1f173 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 4 Aug 2016 17:37:40 +0300
Subject: nvme-rdma: Remove unused includes

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
---
 drivers/nvme/host/rdma.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index e82434ab3a28..8d2875b4c56d 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -12,13 +12,11 @@
  * more details.
  */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/string.h>
-#include <linux/jiffies.h>
 #include <linux/atomic.h>
 #include <linux/blk-mq.h>
 #include <linux/types.h>
@@ -26,7 +24,6 @@
 #include <linux/mutex.h>
 #include <linux/scatterlist.h>
 #include <linux/nvme.h>
-#include <linux/t10-pi.h>
 #include <asm/unaligned.h>
 
 #include <rdma/ib_verbs.h>
-- 
cgit v1.2.3


From 9130b8dbc6ac20f2dc5846e1647f5b60eafab6e3 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 3 Aug 2016 20:19:48 -0400
Subject: SUNRPC: allow for upcalls for same uid but different gss service

It's possible to have simultaneous upcalls for the same UIDs but
different GSS service. In that case, we need to allow for the
upcall to gssd to proceed so that not the same context is used
by two different GSS services. Some servers lock the use of context
to the GSS service.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Cc: stable@vger.kernel.org # v3.9+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/auth_gss/auth_gss.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 23c8e7c39656..976c7812bbd5 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -340,12 +340,14 @@ gss_release_msg(struct gss_upcall_msg *gss_msg)
 }
 
 static struct gss_upcall_msg *
-__gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid)
+__gss_find_upcall(struct rpc_pipe *pipe, kuid_t uid, const struct gss_auth *auth)
 {
 	struct gss_upcall_msg *pos;
 	list_for_each_entry(pos, &pipe->in_downcall, list) {
 		if (!uid_eq(pos->uid, uid))
 			continue;
+		if (auth && pos->auth->service != auth->service)
+			continue;
 		atomic_inc(&pos->count);
 		dprintk("RPC:       %s found msg %p\n", __func__, pos);
 		return pos;
@@ -365,7 +367,7 @@ gss_add_msg(struct gss_upcall_msg *gss_msg)
 	struct gss_upcall_msg *old;
 
 	spin_lock(&pipe->lock);
-	old = __gss_find_upcall(pipe, gss_msg->uid);
+	old = __gss_find_upcall(pipe, gss_msg->uid, gss_msg->auth);
 	if (old == NULL) {
 		atomic_inc(&gss_msg->count);
 		list_add(&gss_msg->list, &pipe->in_downcall);
@@ -714,7 +716,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	err = -ENOENT;
 	/* Find a matching upcall */
 	spin_lock(&pipe->lock);
-	gss_msg = __gss_find_upcall(pipe, uid);
+	gss_msg = __gss_find_upcall(pipe, uid, NULL);
 	if (gss_msg == NULL) {
 		spin_unlock(&pipe->lock);
 		goto err_put_ctx;
-- 
cgit v1.2.3


From d88e4d82efc7aca47f1e31808717c9718e466d93 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 4 Aug 2016 16:24:28 +1000
Subject: SUNRPC: disable the use of IPv6 temporary addresses.

If the net.ipv6.conf.*.use_temp_addr sysctl is set to '2',
then TCP connections over IPv6 will prefer a 'private' source
address.
These eventually expire and become invalid, typically after a week,
but the time is configurable.

When the local address becomes invalid the client will not be able to
receive replies from the server.  Eventually the connection will timeout
or break and a new connection will be established, but this can take
half an hour (typically TCP connection break time).

RFC 4941, which describes private IPv6 addresses, acknowledges that some
applications might not work well with them and that the application may
explicitly a request non-temporary (i.e. "public") address.

I believe this is correct for SUNRPC clients.  Without this change, a
client will occasionally experience a long delay if private addresses
have been enabled.

The privacy offered by private addresses is of little value for an NFS
server which requires client authentication.

For NFSv3 this will often not be a problem because idle connections are
closed after 5 minutes.  For NFSv4 connections never go idle due to the
period RENEW (or equivalent) request.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c6b1d48c4319..bf79212fdbf8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2236,6 +2236,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		unsigned int keepcnt = xprt->timeout->to_retries + 1;
 		unsigned int opt_on = 1;
 		unsigned int timeo;
+		unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
 
 		/* TCP Keepalive options */
 		kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2247,6 +2248,16 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
 				(char *)&keepcnt, sizeof(keepcnt));
 
+		/* Avoid temporary address, they are bad for long-lived
+		 * connections such as NFS mounts.
+		 * RFC4941, section 3.6 suggests that:
+		 *    Individual applications, which have specific
+		 *    knowledge about the normal duration of connections,
+		 *    MAY override this as appropriate.
+		 */
+		kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
+				(char *)&addr_pref, sizeof(addr_pref));
+
 		/* TCP user timeout (see RFC5482) */
 		timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
 			(xprt->timeout->to_retries + 1);
-- 
cgit v1.2.3


From 206b3bb57481758b5fe98e90f47abb0f3fc0954e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 5 Aug 2016 12:16:19 -0400
Subject: NFSv4.2: LAYOUTSTATS may return NFS4ERR_ADMIN/DELEG_REVOKED

We should handle those errors in the same way we handle the other
stateid errors: by invalidating the faulty layout stateid.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42proc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 33da841a21bb..6f4752734804 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -338,6 +338,8 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 	case 0:
 		break;
 	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_DELEG_REVOKED:
 	case -NFS4ERR_STALE_STATEID:
 	case -NFS4ERR_OLD_STATEID:
 	case -NFS4ERR_BAD_STATEID:
-- 
cgit v1.2.3


From 02910177aede34d6f49e2dc14b1c5c6cd468d94f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 4 Aug 2016 00:00:33 -0400
Subject: SUNRPC: Fix reconnection timeouts

When the connect attempt fails and backs off, we should start the clock
at the last connection attempt, not time at which we queue up the
reconnect job.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/xprtsock.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf79212fdbf8..04b0c4190dd7 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2173,6 +2173,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		write_unlock_bh(&sk->sk_callback_lock);
 	}
 	xs_udp_do_set_buffer_size(xprt);
+
+	xprt->stat.connect_start = jiffies;
 }
 
 static void xs_udp_setup_socket(struct work_struct *work)
@@ -2384,6 +2386,16 @@ out:
 	xprt_wake_pending_tasks(xprt, status);
 }
 
+static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt)
+{
+	unsigned long start, now = jiffies;
+
+	start = xprt->stat.connect_start + xprt->reestablish_timeout;
+	if (time_after(start, now))
+		return start - now;
+	return 0;
+}
+
 /**
  * xs_connect - connect a socket to a remote endpoint
  * @xprt: pointer to transport structure
@@ -2401,6 +2413,7 @@ out:
 static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	unsigned long delay = 0;
 
 	WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
 
@@ -2412,19 +2425,19 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 		/* Start by resetting any existing state */
 		xs_reset_transport(transport);
 
-		queue_delayed_work(xprtiod_workqueue,
-				   &transport->connect_worker,
-				   xprt->reestablish_timeout);
+		delay = xs_reconnect_delay(xprt);
+
 		xprt->reestablish_timeout <<= 1;
 		if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
 			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
 		if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
 			xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
-	} else {
+	} else
 		dprintk("RPC:       xs_connect scheduled xprt %p\n", xprt);
-		queue_delayed_work(xprtiod_workqueue,
-				   &transport->connect_worker, 0);
-	}
+
+	queue_delayed_work(xprtiod_workqueue,
+			&transport->connect_worker,
+			delay);
 }
 
 /**
-- 
cgit v1.2.3


From 34b58355ad1d9987267f071265a7de6c8e00662a Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel.daenzer@amd.com>
Date: Fri, 5 Aug 2016 18:36:10 +0900
Subject: drm/ttm: Wait for a BO to become idle before unbinding it from GTT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes hangs under memory pressure, e.g. running the piglit test
tex3d-maxsize concurrently with other tests.

Fixes: 17d33bc9d6ef ("drm/ttm: drop waiting for idle in ttm_bo_evict.")
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Michel Dänzer <michel.daenzer@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  4 ++--
 drivers/gpu/drm/nouveau/nouveau_bo.c    |  4 ++--
 drivers/gpu/drm/radeon/radeon_ttm.c     |  4 ++--
 drivers/gpu/drm/ttm/ttm_bo.c            |  3 ++-
 drivers/gpu/drm/ttm/ttm_bo_util.c       | 10 +++++++++-
 include/drm/ttm/ttm_bo_driver.h         |  3 ++-
 6 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index b7742e62972a..9b61c8ba7aaf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -335,7 +335,7 @@ static int amdgpu_move_vram_ram(struct ttm_buffer_object *bo,
 	if (unlikely(r)) {
 		goto out_cleanup;
 	}
-	r = ttm_bo_move_ttm(bo, true, no_wait_gpu, new_mem);
+	r = ttm_bo_move_ttm(bo, true, interruptible, no_wait_gpu, new_mem);
 out_cleanup:
 	ttm_bo_mem_put(bo, &tmp_mem);
 	return r;
@@ -368,7 +368,7 @@ static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo,
 	if (unlikely(r)) {
 		return r;
 	}
-	r = ttm_bo_move_ttm(bo, true, no_wait_gpu, &tmp_mem);
+	r = ttm_bo_move_ttm(bo, true, interruptible, no_wait_gpu, &tmp_mem);
 	if (unlikely(r)) {
 		goto out_cleanup;
 	}
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 528bdeffb339..6190035edfea 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1151,7 +1151,7 @@ nouveau_bo_move_flipd(struct ttm_buffer_object *bo, bool evict, bool intr,
 	if (ret)
 		goto out;
 
-	ret = ttm_bo_move_ttm(bo, true, no_wait_gpu, new_mem);
+	ret = ttm_bo_move_ttm(bo, true, intr, no_wait_gpu, new_mem);
 out:
 	ttm_bo_mem_put(bo, &tmp_mem);
 	return ret;
@@ -1179,7 +1179,7 @@ nouveau_bo_move_flips(struct ttm_buffer_object *bo, bool evict, bool intr,
 	if (ret)
 		return ret;
 
-	ret = ttm_bo_move_ttm(bo, true, no_wait_gpu, &tmp_mem);
+	ret = ttm_bo_move_ttm(bo, true, intr, no_wait_gpu, &tmp_mem);
 	if (ret)
 		goto out;
 
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index ffdad81ef964..0c00e192c845 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -346,7 +346,7 @@ static int radeon_move_vram_ram(struct ttm_buffer_object *bo,
 	if (unlikely(r)) {
 		goto out_cleanup;
 	}
-	r = ttm_bo_move_ttm(bo, true, no_wait_gpu, new_mem);
+	r = ttm_bo_move_ttm(bo, true, interruptible, no_wait_gpu, new_mem);
 out_cleanup:
 	ttm_bo_mem_put(bo, &tmp_mem);
 	return r;
@@ -379,7 +379,7 @@ static int radeon_move_ram_vram(struct ttm_buffer_object *bo,
 	if (unlikely(r)) {
 		return r;
 	}
-	r = ttm_bo_move_ttm(bo, true, no_wait_gpu, &tmp_mem);
+	r = ttm_bo_move_ttm(bo, true, interruptible, no_wait_gpu, &tmp_mem);
 	if (unlikely(r)) {
 		goto out_cleanup;
 	}
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 4054d804fe06..42c074a9c955 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -354,7 +354,8 @@ static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
 
 	if (!(old_man->flags & TTM_MEMTYPE_FLAG_FIXED) &&
 	    !(new_man->flags & TTM_MEMTYPE_FLAG_FIXED))
-		ret = ttm_bo_move_ttm(bo, evict, no_wait_gpu, mem);
+		ret = ttm_bo_move_ttm(bo, evict, interruptible, no_wait_gpu,
+				      mem);
 	else if (bdev->driver->move)
 		ret = bdev->driver->move(bo, evict, interruptible,
 					 no_wait_gpu, mem);
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index 2df602a35f92..f157a9efd220 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -45,7 +45,7 @@ void ttm_bo_free_old_node(struct ttm_buffer_object *bo)
 }
 
 int ttm_bo_move_ttm(struct ttm_buffer_object *bo,
-		    bool evict,
+		    bool evict, bool interruptible,
 		    bool no_wait_gpu, struct ttm_mem_reg *new_mem)
 {
 	struct ttm_tt *ttm = bo->ttm;
@@ -53,6 +53,14 @@ int ttm_bo_move_ttm(struct ttm_buffer_object *bo,
 	int ret;
 
 	if (old_mem->mem_type != TTM_PL_SYSTEM) {
+		ret = ttm_bo_wait(bo, interruptible, no_wait_gpu);
+
+		if (unlikely(ret != 0)) {
+			if (ret != -ERESTARTSYS)
+				pr_err("Failed to expire sync object before unbinding TTM\n");
+			return ret;
+		}
+
 		ttm_tt_unbind(ttm);
 		ttm_bo_free_old_node(bo);
 		ttm_flag_masked(&old_mem->placement, TTM_PL_FLAG_SYSTEM,
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 4348d6d5877a..99c6d01d24f2 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -962,6 +962,7 @@ void ttm_mem_io_free(struct ttm_bo_device *bdev,
  *
  * @bo: A pointer to a struct ttm_buffer_object.
  * @evict: 1: This is an eviction. Don't try to pipeline.
+ * @interruptible: Sleep interruptible if waiting.
  * @no_wait_gpu: Return immediately if the GPU is busy.
  * @new_mem: struct ttm_mem_reg indicating where to move.
  *
@@ -976,7 +977,7 @@ void ttm_mem_io_free(struct ttm_bo_device *bdev,
  */
 
 extern int ttm_bo_move_ttm(struct ttm_buffer_object *bo,
-			   bool evict, bool no_wait_gpu,
+			   bool evict, bool interruptible, bool no_wait_gpu,
 			   struct ttm_mem_reg *new_mem);
 
 /**
-- 
cgit v1.2.3


From 3851f1cdb2b8d507b10395fc110d4c37d6121285 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 4 Aug 2016 00:08:45 -0400
Subject: SUNRPC: Limit the reconnect backoff timer to the max RPC message
 timeout

...and ensure that we propagate it to new transports on the same
client.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h |  3 ++-
 net/sunrpc/clnt.c           |  3 +++
 net/sunrpc/xprtsock.c       | 18 ++++++++++++------
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 5e3e1b63dbb3..a16070dd03ee 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -218,7 +218,8 @@ struct rpc_xprt {
 	struct work_struct	task_cleanup;
 	struct timer_list	timer;
 	unsigned long		last_used,
-				idle_timeout;
+				idle_timeout,
+				max_reconnect_timeout;
 
 	/*
 	 * Send stuff
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index cb49898a5a58..faac5472d14d 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2638,6 +2638,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
 {
 	struct rpc_xprt_switch *xps;
 	struct rpc_xprt *xprt;
+	unsigned long reconnect_timeout;
 	unsigned char resvport;
 	int ret = 0;
 
@@ -2649,6 +2650,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
 		return -EAGAIN;
 	}
 	resvport = xprt->resvport;
+	reconnect_timeout = xprt->max_reconnect_timeout;
 	rcu_read_unlock();
 
 	xprt = xprt_create_transport(xprtargs);
@@ -2657,6 +2659,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
 		goto out_put_switch;
 	}
 	xprt->resvport = resvport;
+	xprt->max_reconnect_timeout = reconnect_timeout;
 
 	rpc_xprt_switch_set_roundrobin(xps);
 	if (setup) {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 04b0c4190dd7..8ede3bc52481 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -177,7 +177,6 @@ static struct ctl_table sunrpc_table[] = {
  * increase over time if the server is down or not responding.
  */
 #define XS_TCP_INIT_REEST_TO	(3U * HZ)
-#define XS_TCP_MAX_REEST_TO	(5U * 60 * HZ)
 
 /*
  * TCP idle timeout; client drops the transport socket if it is idle
@@ -2396,6 +2395,15 @@ static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt)
 	return 0;
 }
 
+static void xs_reconnect_backoff(struct rpc_xprt *xprt)
+{
+	xprt->reestablish_timeout <<= 1;
+	if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
+		xprt->reestablish_timeout = xprt->max_reconnect_timeout;
+	if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
+		xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
+}
+
 /**
  * xs_connect - connect a socket to a remote endpoint
  * @xprt: pointer to transport structure
@@ -2426,12 +2434,8 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 		xs_reset_transport(transport);
 
 		delay = xs_reconnect_delay(xprt);
+		xs_reconnect_backoff(xprt);
 
-		xprt->reestablish_timeout <<= 1;
-		if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
-			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
-		if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO)
-			xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO;
 	} else
 		dprintk("RPC:       xs_connect scheduled xprt %p\n", xprt);
 
@@ -2989,6 +2993,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 	xprt->ops = &xs_tcp_ops;
 	xprt->timeout = &xs_tcp_default_timeout;
 
+	xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
+
 	INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
 	INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
 
-- 
cgit v1.2.3


From fb10fb67ad2ce43d5e5b8ad22d2ba826844acc56 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 5 Aug 2016 19:13:08 -0400
Subject: NFSv4: Cleanup the setting of the nfs4 lease period

Make a helper function nfs4_set_lease_period() and have
nfs41_setup_state_renewal() and nfs4_do_fsinfo() use it.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4_fs.h    |  4 ++++
 fs/nfs/nfs4proc.c   |  9 +++------
 fs/nfs/nfs4renewd.c | 17 +++++++++++++++++
 fs/nfs/nfs4state.c  |  9 +++------
 4 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4be567a54958..74a0e34e5ded 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -395,6 +395,10 @@ extern void nfs4_schedule_state_renewal(struct nfs_client *);
 extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
 extern void nfs4_kill_renewd(struct nfs_client *);
 extern void nfs4_renew_state(struct work_struct *);
+extern void nfs4_set_lease_period(struct nfs_client *clp,
+		unsigned long lease,
+		unsigned long lastrenewed);
+
 
 /* nfs4state.c */
 struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index da5c9e58e907..b9e18679af50 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4237,12 +4237,9 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
 		err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
 		trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
 		if (err == 0) {
-			struct nfs_client *clp = server->nfs_client;
-
-			spin_lock(&clp->cl_lock);
-			clp->cl_lease_time = fsinfo->lease_time * HZ;
-			clp->cl_last_renewal = now;
-			spin_unlock(&clp->cl_lock);
+			nfs4_set_lease_period(server->nfs_client,
+					fsinfo->lease_time * HZ,
+					now);
 			break;
 		}
 		err = nfs4_handle_exception(server, err, &exception);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e1ba58c3d1ad..062029ab344a 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -136,6 +136,23 @@ nfs4_kill_renewd(struct nfs_client *clp)
 	cancel_delayed_work_sync(&clp->cl_renewd);
 }
 
+/**
+ * nfs4_set_lease_period - Sets the lease period on a nfs_client
+ *
+ * @clp: pointer to nfs_client
+ * @lease: new value for lease period
+ * @lastrenewed: time at which lease was last renewed
+ */
+void nfs4_set_lease_period(struct nfs_client *clp,
+		unsigned long lease,
+		unsigned long lastrenewed)
+{
+	spin_lock(&clp->cl_lock);
+	clp->cl_lease_time = lease;
+	clp->cl_last_renewal = lastrenewed;
+	spin_unlock(&clp->cl_lock);
+}
+
 /*
  * Local variables:
  *   c-basic-offset: 8
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 834b875900d6..cada00aa5096 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -277,20 +277,17 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
 {
 	int status;
 	struct nfs_fsinfo fsinfo;
+	unsigned long now;
 
 	if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
 		nfs4_schedule_state_renewal(clp);
 		return 0;
 	}
 
+	now = jiffies;
 	status = nfs4_proc_get_lease_time(clp, &fsinfo);
 	if (status == 0) {
-		/* Update lease time and schedule renewal */
-		spin_lock(&clp->cl_lock);
-		clp->cl_lease_time = fsinfo.lease_time * HZ;
-		clp->cl_last_renewal = jiffies;
-		spin_unlock(&clp->cl_lock);
-
+		nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now);
 		nfs4_schedule_state_renewal(clp);
 	}
 
-- 
cgit v1.2.3


From 8d480326c3d6921ff5f1cc988c993bd572248deb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 5 Aug 2016 19:03:31 -0400
Subject: NFSv4: Cap the transport reconnection timer at 1/2 lease period

We don't want to miss a lease period renewal due to the TCP connection
failing to reconnect in a timely fashion. To ensure this doesn't happen,
cap the reconnection timer so that we retry the connection attempt
at least every 1/2 lease period.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4renewd.c         |  3 +++
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 21 +++++++++++++++++++++
 3 files changed, 26 insertions(+)

diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 062029ab344a..82e77198d17e 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -151,6 +151,9 @@ void nfs4_set_lease_period(struct nfs_client *clp,
 	clp->cl_lease_time = lease;
 	clp->cl_last_renewal = lastrenewed;
 	spin_unlock(&clp->cl_lock);
+
+	/* Cap maximum reconnect timeout at 1/2 lease period */
+	rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1);
 }
 
 /*
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index b6810c92b8bb..5c02b0691587 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -195,6 +195,8 @@ int		rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *,
 				struct rpc_xprt *,
 				void *),
 			void *data);
+void		rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
+			unsigned long timeo);
 
 const char *rpc_proc_name(const struct rpc_task *task);
 #endif /* __KERNEL__ */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index faac5472d14d..7f79fb7dc6a0 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2676,6 +2676,27 @@ out_put_switch:
 }
 EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
 
+static int
+rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
+		struct rpc_xprt *xprt,
+		void *data)
+{
+	unsigned long timeout = *((unsigned long *)data);
+
+	if (timeout < xprt->max_reconnect_timeout)
+		xprt->max_reconnect_timeout = timeout;
+	return 0;
+}
+
+void
+rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
+{
+	rpc_clnt_iterate_for_each_xprt(clnt,
+			rpc_xprt_cap_max_reconnect_timeout,
+			&timeo);
+}
+EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static void rpc_show_header(void)
 {
-- 
cgit v1.2.3


From 8e85946777d3f667085f298c0acfec12fe105078 Mon Sep 17 00:00:00 2001
From: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Date: Thu, 4 Aug 2016 20:59:17 +0530
Subject: cpufreq: powernv: Fix crash in gpstate_timer_handler()

Commit 09ca4c9b5958 (cpufreq: powernv: Replacing pstate_id with
frequency table index) changes calc_global_pstate() to use
cpufreq_table index instead of pstate_id.

But in gpstate_timer_handler(), pstate_id was being passed instead
of cpufreq_table index, which caused index_to_pstate() to access
out of bound indices, leading to this crash.

Adding sanity check for index and pstate, to ensure only valid pstate
and index values are returned.

Call Trace:
[c00000078d66b130] [c00000000011d224] __free_irq+0x234/0x360
(unreliable)
[c00000078d66b1c0] [c00000000011d44c] free_irq+0x6c/0xa0
[c00000078d66b1f0] [c00000000006c4f8] opal_event_shutdown+0x88/0xd0
[c00000078d66b230] [c000000000067a4c] opal_shutdown+0x1c/0x90
[c00000078d66b260] [c000000000063a00] pnv_shutdown+0x20/0x40
[c00000078d66b280] [c000000000021538] machine_restart+0x38/0x90
[c0000000078d66b310] [c000000000965ea0] panic+0x284/0x300
[c00000078d66b3a0] [c00000000001f508] die+0x388/0x450
[c00000078d66b430] [c000000000045a50] bad_page_fault+0xd0/0x140
[c00000078d66b4a0] [c000000000008964] handle_page_fault+0x2c/0x30
   interrupt: 300 at gpstate_timer_handler+0x150/0x260
    LR = gpstate_timer_handler+0x130/0x260
[c00000078d66b7f0] [c000000000132b58] call_timer_fn+0x58/0x1c0
[c00000078d66b880] [c000000000132e20] expire_timers+0x130/0x1d0
[c00000078d66b8f0] [c000000000133068] run_timer_softirq+0x1a8/0x230
[c00000078d66b980] [c0000000000b535c] __do_softirq+0x18c/0x400
[c00000078d66ba70] [c0000000000b5828] irq_exit+0xc8/0x100
[c00000078d66ba90] [c00000000001e214] timer_interrupt+0xa4/0xe0
[c00000078d66bac0] [c0000000000027d0] decrementer_common+0x150/0x180
   interrupt: 901 at arch_local_irq_restore+0x74/0x90
  0] [c000000000106b34] call_cpuidle+0x44/0x90
[c00000078d66be50] [c00000000010708c] cpu_startup_entry+0x38c/0x460
[c00000078d66bf20] [c00000000003d930] start_secondary+0x330/0x380
[c00000078d66bf90] [c000000000008e6c] start_secondary_prolog+0x10/0x14

Fixes: 09ca4c9b5958 (cpufreq: powernv: Replacing pstate_id with frequency table index)
Reported-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 0f138b050e9a..2fcc879d2b97 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -145,11 +145,30 @@ static struct powernv_pstate_info {
 /* Use following macros for conversions between pstate_id and index */
 static inline int idx_to_pstate(unsigned int i)
 {
+	if (unlikely(i >= powernv_pstate_info.nr_pstates)) {
+		pr_warn_once("index %u is out of bound\n", i);
+		return powernv_freqs[powernv_pstate_info.nominal].driver_data;
+	}
+
 	return powernv_freqs[i].driver_data;
 }
 
 static inline unsigned int pstate_to_idx(int pstate)
 {
+	int min = powernv_freqs[powernv_pstate_info.min].driver_data;
+	int max = powernv_freqs[powernv_pstate_info.max].driver_data;
+
+	if (min > 0) {
+		if (unlikely((pstate < max) || (pstate > min))) {
+			pr_warn_once("pstate %d is out of bound\n", pstate);
+			return powernv_pstate_info.nominal;
+		}
+	} else {
+		if (unlikely((pstate > max) || (pstate < min))) {
+			pr_warn_once("pstate %d is out of bound\n", pstate);
+			return powernv_pstate_info.nominal;
+		}
+	}
 	/*
 	 * abs() is deliberately used so that is works with
 	 * both monotonically increasing and decreasing
@@ -594,7 +613,7 @@ void gpstate_timer_handler(unsigned long data)
 	} else {
 		gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
 						 gpstates->highest_lpstate_idx,
-						 freq_data.pstate_id);
+						 gpstates->last_lpstate_idx);
 	}
 
 	/*
-- 
cgit v1.2.3


From 0b98027122d0715aa2dc39882e4c295112275785 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@linux-m68k.org>
Date: Fri, 29 Jul 2016 14:26:53 +1000
Subject: m68knommu: fix user a5 register being overwritten

On no-MMU systems the application a5 register can be overwitten with the
address of the process data segment when processing application signals.
For flat format applications compiled with full absolute relocation this
effectively corrupts the a5 register on signal processing - and this very
quickly leads to process crash and often takes out the whole system with
a panic as well.

This has no effect on flat format applications compiled with the more
common PIC methods (such as -msep-data). These format applications reserve
a5 for the pointer to the data segment anyway - so it doesn't change it.

A long time ago the a5 register was used in the code packed into the user
stack to enable signal return processing. And so it had to be restored on
end of signal cleanup processing back to the original a5 user value. This
was historically done by saving away a5 in the sigcontext structure. At
some point (a long time back it seems) the a5 restore process was changed
and it was hard coded to put the user data segment address directly into a5.
Which is ok for the common PIC compiled application case, but breaks the
full relocation application code.

We no longer use this type of signal handling mechanism and so we don't
need to do anything special to save and restore a5 at all now. So remove the
code that hard codes a5 to the address of the user data segment.

Signed-off-by: Greg Ungerer <gerg@linux-m68k.org>
---
 arch/m68k/kernel/signal.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 2dcee3a88867..9202f82dfce6 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -213,7 +213,6 @@ static inline int frame_extra_sizes(int f)
 
 static inline void adjustformat(struct pt_regs *regs)
 {
-	((struct switch_stack *)regs - 1)->a5 = current->mm->start_data;
 	/*
 	 * set format byte to make stack appear modulo 4, which it will
 	 * be when doing the rte
-- 
cgit v1.2.3


From 176b1ec21306f0775e6d7a03f42d82a0b144ef0e Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Thu, 23 Jun 2016 15:02:46 -0700
Subject: thermal: intel_pch_thermal: Add suspend/resume callback

Added suspend/resume callback to disable/enable PCH thermal sensor
respectively. If the sensor is enabled by the BIOS, then the sensor status
will not be changed during suspend/resume.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/thermal/intel_pch_thermal.c | 60 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/drivers/thermal/intel_pch_thermal.c b/drivers/thermal/intel_pch_thermal.c
index 6a6ec1c95a7a..9b4815e81b0d 100644
--- a/drivers/thermal/intel_pch_thermal.c
+++ b/drivers/thermal/intel_pch_thermal.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/thermal.h>
+#include <linux/pm.h>
 
 /* Intel PCH thermal Device IDs */
 #define PCH_THERMAL_DID_WPT	0x9CA4 /* Wildcat Point */
@@ -65,6 +66,7 @@ struct pch_thermal_device {
 	unsigned long crt_temp;
 	int hot_trip_id;
 	unsigned long hot_temp;
+	bool bios_enabled;
 };
 
 static int pch_wpt_init(struct pch_thermal_device *ptd, int *nr_trips)
@@ -75,8 +77,10 @@ static int pch_wpt_init(struct pch_thermal_device *ptd, int *nr_trips)
 	*nr_trips = 0;
 
 	/* Check if BIOS has already enabled thermal sensor */
-	if (WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS))
+	if (WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS)) {
+		ptd->bios_enabled = true;
 		goto read_trips;
+	}
 
 	tsel = readb(ptd->hw_base + WPT_TSEL);
 	/*
@@ -130,9 +134,39 @@ static int pch_wpt_get_temp(struct pch_thermal_device *ptd, int *temp)
 	return 0;
 }
 
+static int pch_wpt_suspend(struct pch_thermal_device *ptd)
+{
+	u8 tsel;
+
+	if (ptd->bios_enabled)
+		return 0;
+
+	tsel = readb(ptd->hw_base + WPT_TSEL);
+
+	writeb(tsel & 0xFE, ptd->hw_base + WPT_TSEL);
+
+	return 0;
+}
+
+static int pch_wpt_resume(struct pch_thermal_device *ptd)
+{
+	u8 tsel;
+
+	if (ptd->bios_enabled)
+		return 0;
+
+	tsel = readb(ptd->hw_base + WPT_TSEL);
+
+	writeb(tsel | WPT_TSEL_ETS, ptd->hw_base + WPT_TSEL);
+
+	return 0;
+}
+
 struct pch_dev_ops {
 	int (*hw_init)(struct pch_thermal_device *ptd, int *nr_trips);
 	int (*get_temp)(struct pch_thermal_device *ptd, int *temp);
+	int (*suspend)(struct pch_thermal_device *ptd);
+	int (*resume)(struct pch_thermal_device *ptd);
 };
 
 
@@ -140,6 +174,8 @@ struct pch_dev_ops {
 static const struct pch_dev_ops pch_dev_ops_wpt = {
 	.hw_init = pch_wpt_init,
 	.get_temp = pch_wpt_get_temp,
+	.suspend = pch_wpt_suspend,
+	.resume = pch_wpt_resume,
 };
 
 static int pch_thermal_get_temp(struct thermal_zone_device *tzd, int *temp)
@@ -269,6 +305,22 @@ static void intel_pch_thermal_remove(struct pci_dev *pdev)
 	pci_disable_device(pdev);
 }
 
+static int intel_pch_thermal_suspend(struct device *device)
+{
+	struct pci_dev *pdev = to_pci_dev(device);
+	struct pch_thermal_device *ptd = pci_get_drvdata(pdev);
+
+	return ptd->ops->suspend(ptd);
+}
+
+static int intel_pch_thermal_resume(struct device *device)
+{
+	struct pci_dev *pdev = to_pci_dev(device);
+	struct pch_thermal_device *ptd = pci_get_drvdata(pdev);
+
+	return ptd->ops->resume(ptd);
+}
+
 static struct pci_device_id intel_pch_thermal_id[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCH_THERMAL_DID_WPT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCH_THERMAL_DID_SKL) },
@@ -276,11 +328,17 @@ static struct pci_device_id intel_pch_thermal_id[] = {
 };
 MODULE_DEVICE_TABLE(pci, intel_pch_thermal_id);
 
+static const struct dev_pm_ops intel_pch_pm_ops = {
+	.suspend = intel_pch_thermal_suspend,
+	.resume = intel_pch_thermal_resume,
+};
+
 static struct pci_driver intel_pch_thermal_driver = {
 	.name		= "intel_pch_thermal",
 	.id_table	= intel_pch_thermal_id,
 	.probe		= intel_pch_thermal_probe,
 	.remove		= intel_pch_thermal_remove,
+	.driver.pm	= &intel_pch_pm_ops,
 };
 
 module_pci_driver(intel_pch_thermal_driver);
-- 
cgit v1.2.3


From 70c50ee72eebb3d1e3aae7450269fd8d8074c610 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 5 Aug 2016 15:20:41 +0200
Subject: thermal/powerclamp: Prevent division by zero when counting interval

I have got a zero division error when disabling the forced
idle injection from the intel powerclamp. I did

  echo 0 >/sys/class/thermal/cooling_device48/cur_state

and got

[  986.072632] divide error: 0000 [#1] PREEMPT SMP
[  986.078989] Modules linked in:
[  986.083618] CPU: 17 PID: 24967 Comm: kidle_inject/17 Not tainted 4.7.0-1-default+ #3055
[  986.093781] Hardware name: Intel Corporation S2600CP/S2600CP, BIOS RMLSDP.86I.R3.27.D685.1305151734 05/15/2013
[  986.106227] task: ffff880430e1c080 task.stack: ffff880427ef0000
[  986.114122] RIP: 0010:[<ffffffff81794859>]  [<ffffffff81794859>] clamp_thread+0x1d9/0x600
[  986.124609] RSP: 0018:ffff880427ef3e20  EFLAGS: 00010246
[  986.131860] RAX: 0000000000000258 RBX: 0000000000000006 RCX: 0000000000000001
[  986.141179] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000018
[  986.150478] RBP: ffff880427ef3ec8 R08: ffff880427ef0000 R09: 0000000000000002
[  986.159779] R10: 0000000000003df2 R11: 0000000000000018 R12: 0000000000000002
[  986.169089] R13: 0000000000000000 R14: ffff880427ef0000 R15: ffff880427ef0000
[  986.178388] FS:  0000000000000000(0000) GS:ffff880435940000(0000) knlGS:0000000000000000
[  986.188785] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  986.196559] CR2: 00007f1d0caf0000 CR3: 0000000002006000 CR4: 00000000001406e0
[  986.205909] Stack:
[  986.209524]  ffff8802be897b00 ffff880430e1c080 0000000000000011 0000006a35959780
[  986.219236]  0000000000000011 ffff880427ef0008 0000000000000000 ffff8804359503d0
[  986.228966]  0000000100029d93 ffffffff81794140 0000000000000000 ffffffff05000011
[  986.238686] Call Trace:
[  986.242825]  [<ffffffff81794140>] ? pkg_state_counter+0x80/0x80
[  986.250866]  [<ffffffff81794680>] ? powerclamp_set_cur_state+0x180/0x180
[  986.259797]  [<ffffffff8111d1a9>] kthread+0xc9/0xe0
[  986.266682]  [<ffffffff8193d69f>] ret_from_fork+0x1f/0x40
[  986.274142]  [<ffffffff8111d0e0>] ? kthread_create_on_node+0x180/0x180
[  986.282869] Code: d1 ea 48 89 d6 80 3d 6a d0 d4 00 00 ba 64 00 00 00 89 d8 41 0f 45 f5 0f af c2 42 8d 14 2e be 31 00 00 00 83 fa 31 0f 42 f2 31 d2 <f7> f6 48 8b 15 9e 07 87 00 48 8b 3d 97 07 87 00 48 63 f0 83 e8
[  986.307806] RIP  [<ffffffff81794859>] clamp_thread+0x1d9/0x600
[  986.315871]  RSP <ffff880427ef3e20>

RIP points to the following lines:

	compensation = get_compensation(target_ratio);
	interval = duration_jiffies*100/(target_ratio+compensation);

A solution would be to switch the following two commands in
powerclamp_set_cur_state():

	set_target_ratio = 0;
	end_power_clamp();

But I think that the zero division might happen also when target_ratio
is non-zero because the compensation might be negative. Therefore
we also check the sum of target_ratio and compensation explicitly.

Also the compensated_ratio variable is always set. Therefore there
is no need to initialize it.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Acked-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/thermal/intel_powerclamp.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 015ce2eb6eb7..0e4dc0afcfd2 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -388,7 +388,7 @@ static int clamp_thread(void *arg)
 		int sleeptime;
 		unsigned long target_jiffies;
 		unsigned int guard;
-		unsigned int compensation = 0;
+		unsigned int compensated_ratio;
 		int interval; /* jiffies to sleep for each attempt */
 		unsigned int duration_jiffies = msecs_to_jiffies(duration);
 		unsigned int window_size_now;
@@ -409,8 +409,11 @@ static int clamp_thread(void *arg)
 		 * c-states, thus we need to compensate the injected idle ratio
 		 * to achieve the actual target reported by the HW.
 		 */
-		compensation = get_compensation(target_ratio);
-		interval = duration_jiffies*100/(target_ratio+compensation);
+		compensated_ratio = target_ratio +
+			get_compensation(target_ratio);
+		if (compensated_ratio <= 0)
+			compensated_ratio = 1;
+		interval = duration_jiffies * 100 / compensated_ratio;
 
 		/* align idle time */
 		target_jiffies = roundup(jiffies, interval);
@@ -647,8 +650,8 @@ static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
 		goto exit_set;
 	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
 		pr_info("Stop forced idle injection\n");
-		set_target_ratio = 0;
 		end_power_clamp();
+		set_target_ratio = 0;
 	} else	/* adjust currently running */ {
 		set_target_ratio = new_target_ratio;
 		/* make new set_target_ratio visible to other cpus */
-- 
cgit v1.2.3


From d0b7306d203c82e7c04d6eb066ca4898f016ebdd Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio <michele.digiorgio@arm.com>
Date: Thu, 2 Jun 2016 15:25:31 +0100
Subject: thermal: fix race condition when updating cooling device

When multiple thermal zones are bound to the same cooling device, multiple
kernel threads may want to update the cooling device state by calling
thermal_cdev_update(). Having cdev not protected by a mutex can lead to a race
condition. Consider the following situation with two kernel threads k1 and k2:

	    Thread k1				Thread k2
                                    ||
                                    ||  call thermal_cdev_update()
                                    ||      ...
                                    ||      set_cur_state(cdev, target);
    call power_actor_set_power()    ||
        ...                         ||
        instance->target = state;   ||
        cdev->updated = false;      ||
                                    ||      cdev->updated = true;
                                    ||      // completes execution
    call thermal_cdev_update()      ||
        // cdev->updated == true    ||
        return;                     ||
                                    \/
                                    time

k2 has already looped through the thermal instances looking for the deepest
cooling device state and is preempted right before setting cdev->updated to
true. Now, k1 runs, modifies the thermal instance state and sets cdev->updated
to false. Then, k1 is preempted and k2 continues the execution by setting
cdev->updated to true, therefore preventing k1 from performing the update.
Notice that this is not an issue if k2 looks at the instance->target modified by
k1 "after" it is assigned by k1. In fact, in this case the update will happen
anyway and k1 can safely return immediately from thermal_cdev_update().

This may lead to a situation where a thermal governor never updates the cooling
device. For example, this is the case for the step_wise governor: when calling
the function thermal_zone_trip_update(), the governor may always get a new state
equal to the old one (which, however, wasn't notified to the cooling device) and
will therefore skip the update.

CC: Zhang Rui <rui.zhang@intel.com>
CC: Eduardo Valentin <edubezval@gmail.com>
CC: Peter Feuerer <peter@piie.net>
Reported-by: Toby Huang <toby.huang@arm.com>
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Javi Merino <javi.merino@arm.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/thermal/fair_share.c      |  2 ++
 drivers/thermal/gov_bang_bang.c   |  2 ++
 drivers/thermal/power_allocator.c |  2 ++
 drivers/thermal/step_wise.c       |  2 ++
 drivers/thermal/thermal_core.c    | 10 +++++++---
 5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c
index 34fe36504a55..68bd1b569118 100644
--- a/drivers/thermal/fair_share.c
+++ b/drivers/thermal/fair_share.c
@@ -116,7 +116,9 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
 		instance->target = get_target_state(tz, cdev, percentage,
 						    cur_trip_level);
 
+		mutex_lock(&instance->cdev->lock);
 		instance->cdev->updated = false;
+		mutex_unlock(&instance->cdev->lock);
 		thermal_cdev_update(cdev);
 	}
 	return 0;
diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
index fc52016d4e85..bb118a152cbb 100644
--- a/drivers/thermal/gov_bang_bang.c
+++ b/drivers/thermal/gov_bang_bang.c
@@ -71,7 +71,9 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 		dev_dbg(&instance->cdev->device, "target=%d\n",
 					(int)instance->target);
 
+		mutex_lock(&instance->cdev->lock);
 		instance->cdev->updated = false; /* cdev needs update */
+		mutex_unlock(&instance->cdev->lock);
 	}
 
 	mutex_unlock(&tz->lock);
diff --git a/drivers/thermal/power_allocator.c b/drivers/thermal/power_allocator.c
index 2f1a863a8e15..b4d3116cfdaf 100644
--- a/drivers/thermal/power_allocator.c
+++ b/drivers/thermal/power_allocator.c
@@ -529,7 +529,9 @@ static void allow_maximum_power(struct thermal_zone_device *tz)
 			continue;
 
 		instance->target = 0;
+		mutex_lock(&instance->cdev->lock);
 		instance->cdev->updated = false;
+		mutex_unlock(&instance->cdev->lock);
 		thermal_cdev_update(instance->cdev);
 	}
 }
diff --git a/drivers/thermal/step_wise.c b/drivers/thermal/step_wise.c
index ea9366ad3e6b..bcef2e7c4ec9 100644
--- a/drivers/thermal/step_wise.c
+++ b/drivers/thermal/step_wise.c
@@ -175,7 +175,9 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 			update_passive_instance(tz, trip_type, -1);
 
 		instance->initialized = true;
+		mutex_lock(&instance->cdev->lock);
 		instance->cdev->updated = false; /* cdev needs update */
+		mutex_unlock(&instance->cdev->lock);
 	}
 
 	mutex_unlock(&tz->lock);
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 5133cd1e10b7..e2fc6161dded 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1093,7 +1093,9 @@ int power_actor_set_power(struct thermal_cooling_device *cdev,
 		return ret;
 
 	instance->target = state;
+	mutex_lock(&cdev->lock);
 	cdev->updated = false;
+	mutex_unlock(&cdev->lock);
 	thermal_cdev_update(cdev);
 
 	return 0;
@@ -1623,11 +1625,13 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev)
 	struct thermal_instance *instance;
 	unsigned long target = 0;
 
+	mutex_lock(&cdev->lock);
 	/* cooling device is updated*/
-	if (cdev->updated)
+	if (cdev->updated) {
+		mutex_unlock(&cdev->lock);
 		return;
+	}
 
-	mutex_lock(&cdev->lock);
 	/* Make sure cdev enters the deepest cooling state */
 	list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) {
 		dev_dbg(&cdev->device, "zone%d->target=%lu\n",
@@ -1637,9 +1641,9 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev)
 		if (instance->target > target)
 			target = instance->target;
 	}
-	mutex_unlock(&cdev->lock);
 	cdev->ops->set_cur_state(cdev, target);
 	cdev->updated = true;
+	mutex_unlock(&cdev->lock);
 	trace_cdev_update(cdev, target);
 	dev_dbg(&cdev->device, "set to state %lu\n", target);
 }
-- 
cgit v1.2.3


From f4c592439b144ef1de66eac764140643a54099eb Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Mon, 4 Jul 2016 07:19:32 +0000
Subject: thermal: hwmon: EXPORT_SYMBOL_GPL for thermal hwmon sysfs

thermal_add_hwmon_sysfs()/thermal_remove_hwmon_sysfs() need
EXPORT_SYMBOL_GPL(). Otherwise we will have ERROR

>> ERROR: "thermal_remove_hwmon_sysfs" [drivers/thermal/rcar_thermal.ko] undefined!
>> ERROR: "thermal_add_hwmon_sysfs" [drivers/thermal/rcar_thermal.ko] undefined!

Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/thermal/thermal_hwmon.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/thermal/thermal_hwmon.c b/drivers/thermal/thermal_hwmon.c
index 06fd2ed9ef9d..c41c7742903a 100644
--- a/drivers/thermal/thermal_hwmon.c
+++ b/drivers/thermal/thermal_hwmon.c
@@ -232,6 +232,7 @@ int thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 
 	return result;
 }
+EXPORT_SYMBOL_GPL(thermal_add_hwmon_sysfs);
 
 void thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
 {
@@ -270,3 +271,4 @@ void thermal_remove_hwmon_sysfs(struct thermal_zone_device *tz)
 	hwmon_device_unregister(hwmon->device);
 	kfree(hwmon);
 }
+EXPORT_SYMBOL_GPL(thermal_remove_hwmon_sysfs);
-- 
cgit v1.2.3


From 165989a5b667b90589f21c8affe496ad21f08591 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyj.lk@gmail.com>
Date: Mon, 25 Jul 2016 07:01:19 +0000
Subject: thermal: clock_cooling: Fix missing mutex_init()

The driver allocates the mutex but not initialize it.
Use mutex_init() on it to initialize it correctly.

This is detected by Coccinelle semantic patch.

Signed-off-by: Wei Yongjun <weiyj.lk@gmail.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
---
 drivers/thermal/clock_cooling.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/thermal/clock_cooling.c b/drivers/thermal/clock_cooling.c
index 1b4ff0f4c716..ed5dd0e88657 100644
--- a/drivers/thermal/clock_cooling.c
+++ b/drivers/thermal/clock_cooling.c
@@ -426,6 +426,7 @@ clock_cooling_register(struct device *dev, const char *clock_name)
 	if (!ccdev)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&ccdev->lock);
 	ccdev->dev = dev;
 	ccdev->clk = devm_clk_get(dev, clock_name);
 	if (IS_ERR(ccdev->clk))
-- 
cgit v1.2.3


From 29986cc8a66fcc4b8ea8a01216a72f06b865360b Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Wed, 3 Aug 2016 23:31:36 +0300
Subject: drm: rcar-du: Link HDMI encoder with bridge

The conversion of the rcar-du driver from the I2C slave encoder to the
DRM bridge API left the HDMI encoder's bridge pointer NULL, preventing
the bridge from being handled automatically by the DRM core. Fix it.

Fixes: 1d926114d8f4 ("drm: rcar-du: Remove i2c slave encoder interface for hdmi encoder")
Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/rcar-du/rcar_du_hdmienc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/rcar-du/rcar_du_hdmienc.c b/drivers/gpu/drm/rcar-du/rcar_du_hdmienc.c
index 4de3ff0dbebd..e03004f4588d 100644
--- a/drivers/gpu/drm/rcar-du/rcar_du_hdmienc.c
+++ b/drivers/gpu/drm/rcar-du/rcar_du_hdmienc.c
@@ -125,6 +125,7 @@ int rcar_du_hdmienc_init(struct rcar_du_device *rcdu,
 
 	/* Link drm_bridge to encoder */
 	bridge->encoder = encoder;
+	encoder->bridge = bridge;
 
 	ret = drm_bridge_attach(rcdu->ddev, bridge);
 	if (ret) {
-- 
cgit v1.2.3


From 5c6c201ccbaf9d3901f829441d457293f7ca8ef4 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Thu, 4 Aug 2016 18:35:48 +0200
Subject: drm: Paper over locking inversion after registration rework

drm_connector_register_all requires a few too many locks because our
connector_list locking is busted. Add another FIXME+hack to work
around this. This should address the below lockdep splat:

======================================================
[ INFO: possible circular locking dependency detected ]
4.7.0-rc5+ #524 Tainted: G           O
-------------------------------------------------------
kworker/u8:0/6 is trying to acquire lock:
 (&dev->mode_config.mutex){+.+.+.}, at: [<ffffffff815afde0>] drm_modeset_lock_all+0x40/0x120

but task is already holding lock:
 ((fb_notifier_list).rwsem){++++.+}, at: [<ffffffff810ac195>] __blocking_notifier_call_chain+0x35/0x70

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 ((fb_notifier_list).rwsem){++++.+}:
       [<ffffffff810df611>] lock_acquire+0xb1/0x200
       [<ffffffff819a55b4>] down_write+0x44/0x80
       [<ffffffff810abf91>] blocking_notifier_chain_register+0x21/0xb0
       [<ffffffff814c7448>] fb_register_client+0x18/0x20
       [<ffffffff814c6c86>] backlight_device_register+0x136/0x260
       [<ffffffffa0127eb2>] intel_backlight_device_register+0xa2/0x160 [i915]
       [<ffffffffa00f46be>] intel_connector_register+0xe/0x10 [i915]
       [<ffffffffa0112bfb>] intel_dp_connector_register+0x1b/0x80 [i915]
       [<ffffffff8159dfea>] drm_connector_register+0x4a/0x80
       [<ffffffff8159fe44>] drm_connector_register_all+0x64/0xf0
       [<ffffffff815a2a64>] drm_modeset_register_all+0x174/0x1c0
       [<ffffffff81599b72>] drm_dev_register+0xc2/0xd0
       [<ffffffffa00621d7>] i915_driver_load+0x1547/0x2200 [i915]
       [<ffffffffa006d80f>] i915_pci_probe+0x4f/0x70 [i915]
       [<ffffffff814a2135>] local_pci_probe+0x45/0xa0
       [<ffffffff814a349b>] pci_device_probe+0xdb/0x130
       [<ffffffff815c07e3>] driver_probe_device+0x223/0x440
       [<ffffffff815c0ad5>] __driver_attach+0xd5/0x100
       [<ffffffff815be386>] bus_for_each_dev+0x66/0xa0
       [<ffffffff815c002e>] driver_attach+0x1e/0x20
       [<ffffffff815bf9be>] bus_add_driver+0x1ee/0x280
       [<ffffffff815c1810>] driver_register+0x60/0xe0
       [<ffffffff814a1a10>] __pci_register_driver+0x60/0x70
       [<ffffffffa01a905b>] i915_init+0x5b/0x62 [i915]
       [<ffffffff8100042d>] do_one_initcall+0x3d/0x150
       [<ffffffff811a935b>] do_init_module+0x5f/0x1d9
       [<ffffffff81124416>] load_module+0x20e6/0x27e0
       [<ffffffff81124d63>] SYSC_finit_module+0xc3/0xf0
       [<ffffffff81124dae>] SyS_finit_module+0xe/0x10
       [<ffffffff819a83a9>] entry_SYSCALL_64_fastpath+0x1c/0xac

-> #0 (&dev->mode_config.mutex){+.+.+.}:
       [<ffffffff810df0ac>] __lock_acquire+0x10fc/0x1260
       [<ffffffff810df611>] lock_acquire+0xb1/0x200
       [<ffffffff819a3097>] mutex_lock_nested+0x67/0x3c0
       [<ffffffff815afde0>] drm_modeset_lock_all+0x40/0x120
       [<ffffffff8158f79b>] drm_fb_helper_restore_fbdev_mode_unlocked+0x2b/0x80
       [<ffffffff8158f81d>] drm_fb_helper_set_par+0x2d/0x50
       [<ffffffffa0105f7a>] intel_fbdev_set_par+0x1a/0x60 [i915]
       [<ffffffff814c13c6>] fbcon_init+0x586/0x610
       [<ffffffff8154d16a>] visual_init+0xca/0x130
       [<ffffffff8154e611>] do_bind_con_driver+0x1c1/0x3a0
       [<ffffffff8154eaf6>] do_take_over_console+0x116/0x180
       [<ffffffff814bd3a7>] do_fbcon_takeover+0x57/0xb0
       [<ffffffff814c1e48>] fbcon_event_notify+0x658/0x750
       [<ffffffff810abcae>] notifier_call_chain+0x3e/0xb0
       [<ffffffff810ac1ad>] __blocking_notifier_call_chain+0x4d/0x70
       [<ffffffff810ac1e6>] blocking_notifier_call_chain+0x16/0x20
       [<ffffffff814c748b>] fb_notifier_call_chain+0x1b/0x20
       [<ffffffff814c86b1>] register_framebuffer+0x251/0x330
       [<ffffffff8158fa9f>] drm_fb_helper_initial_config+0x25f/0x3f0
       [<ffffffffa0106b48>] intel_fbdev_initial_config+0x18/0x30 [i915]
       [<ffffffff810adfd8>] async_run_entry_fn+0x48/0x150
       [<ffffffff810a3947>] process_one_work+0x1e7/0x750
       [<ffffffff810a3efb>] worker_thread+0x4b/0x4f0
       [<ffffffff810aad4f>] kthread+0xef/0x110
       [<ffffffff819a85ef>] ret_from_fork+0x1f/0x40

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock((fb_notifier_list).rwsem);
                               lock(&dev->mode_config.mutex);
                               lock((fb_notifier_list).rwsem);
  lock(&dev->mode_config.mutex);

 *** DEADLOCK ***

6 locks held by kworker/u8:0/6:
 #0:  ("events_unbound"){.+.+.+}, at: [<ffffffff810a38c9>] process_one_work+0x169/0x750
 #1:  ((&entry->work)){+.+.+.}, at: [<ffffffff810a38c9>] process_one_work+0x169/0x750
 #2:  (registration_lock){+.+.+.}, at: [<ffffffff814c8487>] register_framebuffer+0x27/0x330
 #3:  (console_lock){+.+.+.}, at: [<ffffffff814c86ce>] register_framebuffer+0x26e/0x330
 #4:  (&fb_info->lock){+.+.+.}, at: [<ffffffff814c78dd>] lock_fb_info+0x1d/0x40
 #5:  ((fb_notifier_list).rwsem){++++.+}, at: [<ffffffff810ac195>] __blocking_notifier_call_chain+0x35/0x70

stack backtrace:
CPU: 2 PID: 6 Comm: kworker/u8:0 Tainted: G           O    4.7.0-rc5+ #524
Hardware name: Intel Corp. Broxton P/NOTEBOOK, BIOS APLKRVPA.X64.0138.B33.1606250842 06/25/2016
Workqueue: events_unbound async_run_entry_fn
 0000000000000000 ffff8800758577f0 ffffffff814507a5 ffffffff828b9900
 ffffffff828b9900 ffff880075857830 ffffffff810dc6fa ffff880075857880
 ffff88007584d688 0000000000000005 0000000000000006 ffff88007584d6b0
Call Trace:
 [<ffffffff814507a5>] dump_stack+0x67/0x92
 [<ffffffff810dc6fa>] print_circular_bug+0x1aa/0x200
 [<ffffffff810df0ac>] __lock_acquire+0x10fc/0x1260
 [<ffffffff810df611>] lock_acquire+0xb1/0x200
 [<ffffffff815afde0>] ? drm_modeset_lock_all+0x40/0x120
 [<ffffffff815afde0>] ? drm_modeset_lock_all+0x40/0x120
 [<ffffffff819a3097>] mutex_lock_nested+0x67/0x3c0
 [<ffffffff815afde0>] ? drm_modeset_lock_all+0x40/0x120
 [<ffffffff810fa85f>] ? rcu_read_lock_sched_held+0x7f/0x90
 [<ffffffff81208218>] ? kmem_cache_alloc_trace+0x248/0x2b0
 [<ffffffff815afdc5>] ? drm_modeset_lock_all+0x25/0x120
 [<ffffffff815afde0>] drm_modeset_lock_all+0x40/0x120
 [<ffffffff8158f79b>] drm_fb_helper_restore_fbdev_mode_unlocked+0x2b/0x80
 [<ffffffff8158f81d>] drm_fb_helper_set_par+0x2d/0x50
 [<ffffffffa0105f7a>] intel_fbdev_set_par+0x1a/0x60 [i915]
 [<ffffffff814c13c6>] fbcon_init+0x586/0x610
 [<ffffffff8154d16a>] visual_init+0xca/0x130
 [<ffffffff8154e611>] do_bind_con_driver+0x1c1/0x3a0
 [<ffffffff8154eaf6>] do_take_over_console+0x116/0x180
 [<ffffffff814bd3a7>] do_fbcon_takeover+0x57/0xb0
 [<ffffffff814c1e48>] fbcon_event_notify+0x658/0x750
 [<ffffffff810abcae>] notifier_call_chain+0x3e/0xb0
 [<ffffffff810ac1ad>] __blocking_notifier_call_chain+0x4d/0x70
 [<ffffffff810ac1e6>] blocking_notifier_call_chain+0x16/0x20
 [<ffffffff814c748b>] fb_notifier_call_chain+0x1b/0x20
 [<ffffffff814c86b1>] register_framebuffer+0x251/0x330
 [<ffffffff815b7e8d>] ? vga_switcheroo_client_fb_set+0x5d/0x70
 [<ffffffff8158fa9f>] drm_fb_helper_initial_config+0x25f/0x3f0
 [<ffffffffa0106b48>] intel_fbdev_initial_config+0x18/0x30 [i915]
 [<ffffffff810adfd8>] async_run_entry_fn+0x48/0x150
 [<ffffffff810a3947>] process_one_work+0x1e7/0x750
 [<ffffffff810a38c9>] ? process_one_work+0x169/0x750
 [<ffffffff810a3efb>] worker_thread+0x4b/0x4f0
 [<ffffffff810a3eb0>] ? process_one_work+0x750/0x750
 [<ffffffff810aad4f>] kthread+0xef/0x110
 [<ffffffff819a85ef>] ret_from_fork+0x1f/0x40
 [<ffffffff810aac60>] ? kthread_stop+0x2e0/0x2e0

v2: Rebase onto the right branch (hand-editing patches ftw) and add more
reporters.

Reported-by: Imre Deak <imre.deak@intel.com>
Cc: Imre Deak <imre.deak@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Chris Wilson <chris@chris-wilson.co.uk>
Reported-by: Jiri Kosina <jikos@kernel.org>
Cc: Jiri Kosina <jikos@kernel.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/drm_crtc.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
index f1d9f0569d7f..b1dbb60af99f 100644
--- a/drivers/gpu/drm/drm_crtc.c
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -1121,16 +1121,14 @@ static int drm_connector_register_all(struct drm_device *dev)
 	struct drm_connector *connector;
 	int ret;
 
-	mutex_lock(&dev->mode_config.mutex);
-
-	drm_for_each_connector(connector, dev) {
+	/* FIXME: taking the mode config mutex ends up in a clash with
+	 * fbcon/backlight registration */
+	list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
 		ret = drm_connector_register(connector);
 		if (ret)
 			goto err;
 	}
 
-	mutex_unlock(&dev->mode_config.mutex);
-
 	return 0;
 
 err:
-- 
cgit v1.2.3


From 416f37d0816b9720b8227953e55954d81456f991 Mon Sep 17 00:00:00 2001
From: Darren Stevens <darren@stevens-zone.net>
Date: Wed, 27 Jul 2016 15:41:29 +0100
Subject: powerpc/pasemi: Fix coherent_dma_mask for dma engine

Commit 817820b0226a ("powerpc/iommu: Support "hybrid" iommu/direct DMA
ops for coherent_mask < dma_mask) adds a check of coherent_dma_mask for
dma allocations.

Unfortunately current PASemi code does not set this value for the DMA
engine, which ends up with the default value of 0xffffffff, the result
is on a PASemi system with >2Gb ram and iommu enabled the the onboard
ethernet stops working due to an inability to allocate memory. Add an
initialisation to pci_dma_dev_setup_pasemi().

Signed-off-by: Darren Stevens <darren@stevens-zone.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pasemi/iommu.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 309d9ccccd50..c61667e8bb06 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -187,6 +187,11 @@ static void pci_dma_dev_setup_pasemi(struct pci_dev *dev)
 	if (dev->vendor == 0x1959 && dev->device == 0xa007 &&
 	    !firmware_has_feature(FW_FEATURE_LPAR)) {
 		dev->dev.archdata.dma_ops = &dma_direct_ops;
+		/*
+		 * Set the coherent DMA mask to prevent the iommu
+		 * being used unnecessarily
+		 */
+		dev->dev.coherent_dma_mask = DMA_BIT_MASK(44);
 		return;
 	}
 #endif
-- 
cgit v1.2.3


From 44fb25d031065fc846c0f97e37456a792312a7d1 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Fri, 8 Jul 2016 22:33:36 +0800
Subject: mfd: ac100: Add device tree bindings for X-Powers AC100 codec/RTC
 combo IC

The AC100 is a multifunction device with an audio codec subsystem and
an RTC subsystem. These two subsystems share a common register space
and host interface.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 Documentation/devicetree/bindings/mfd/ac100.txt | 54 +++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mfd/ac100.txt

diff --git a/Documentation/devicetree/bindings/mfd/ac100.txt b/Documentation/devicetree/bindings/mfd/ac100.txt
new file mode 100644
index 000000000000..b8ef00667599
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/ac100.txt
@@ -0,0 +1,54 @@
+X-Powers AC100 Codec/RTC IC Device Tree bindings
+
+AC100 is a audio codec and RTC subsystem combo IC. The 2 parts are
+separated, including power supplies and interrupt lines, but share
+a common register address space and host interface.
+
+Required properties:
+- compatible: "x-powers,ac100"
+- reg: The I2C slave address or RSB hardware address for the chip
+- sub-nodes:
+  - codec
+    - compatible:		"x-powers,ac100-codec"
+    - interrupt-parent:		The parent interrupt controller
+    - interrupts:		SoC NMI / GPIO interrupt connected to the
+    				IRQ_AUDIO pin
+    - #clock-cells:		Shall be 0
+    - clock-output-names:	"4M_adda"
+
+    - see clock/clock-bindings.txt for common clock bindings
+
+  - rtc
+    - compatible:		"x-powers,ac100-rtc"
+    - interrupt-parent:		The parent interrupt controller
+    - interrupts:		SoC NMI / GPIO interrupt connected to the
+    				IRQ_RTC pin
+    - clocks:			A phandle to the codec's "4M_adda" clock
+    - #clock-cells:		Shall be 1
+    - clock-output-names:	"cko1_rtc", "cko2_rtc", "cko3_rtc"
+
+    - see clock/clock-bindings.txt for common clock bindings
+
+Example:
+
+ac100: codec@e89 {
+	compatible = "x-powers,ac100";
+	reg = <0xe89>;
+
+	ac100_codec: codec {
+		compatible = "x-powers,ac100-codec";
+		interrupt-parent = <&r_pio>;
+		interrupts = <0 9 IRQ_TYPE_LEVEL_LOW>; /* PL9 */
+		#clock-cells = <0>;
+		clock-output-names = "4M_adda";
+	};
+
+	ac100_rtc: rtc {
+		compatible = "x-powers,ac100-rtc";
+		interrupt-parent = <&nmi_intc>;
+		interrupts = <0 IRQ_TYPE_LEVEL_LOW>;
+		clocks = <&ac100_codec>;
+		#clock-cells = <1>;
+		clock-output-names = "cko1_rtc", "cko2_rtc", "cko3_rtc";
+	};
+};
-- 
cgit v1.2.3


From 585083c539ca3f5fb3d00057b25f9be3304d54c6 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Fri, 8 Jul 2016 22:33:37 +0800
Subject: mfd: ac100: Add driver for X-Powers AC100 audio codec / RTC combo IC

The AC100 is a multifunction device with an audio codec subsystem and
an RTC subsystem. These two subsystems share a common register space
and host interface.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig       |  10 +++
 drivers/mfd/Makefile      |   2 +
 drivers/mfd/ac100.c       | 137 +++++++++++++++++++++++++++++++++++
 include/linux/mfd/ac100.h | 178 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 327 insertions(+)
 create mode 100644 drivers/mfd/ac100.c
 create mode 100644 include/linux/mfd/ac100.h

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 2d1fb6420592..ccf73aab5384 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -112,6 +112,16 @@ config MFD_BCM590XX
 	help
 	  Support for the BCM590xx PMUs from Broadcom
 
+config MFD_AC100
+	tristate "X-Powers AC100"
+	select MFD_CORE
+	depends on SUNXI_RSB
+	help
+	  If you say Y here you get support for the X-Powers AC100 audio codec
+	  IC.
+	  This driver include only the core APIs. You have to select individual
+	  components like codecs or RTC under the corresponding menus.
+
 config MFD_AXP20X
 	tristate
 	select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 2ba3ba35f745..bca83dbb5ed8 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -113,6 +113,8 @@ obj-$(CONFIG_PMIC_DA9052)	+= da9052-irq.o
 obj-$(CONFIG_PMIC_DA9052)	+= da9052-core.o
 obj-$(CONFIG_MFD_DA9052_SPI)	+= da9052-spi.o
 obj-$(CONFIG_MFD_DA9052_I2C)	+= da9052-i2c.o
+
+obj-$(CONFIG_MFD_AC100)		+= ac100.o
 obj-$(CONFIG_MFD_AXP20X)	+= axp20x.o
 obj-$(CONFIG_MFD_AXP20X_I2C)	+= axp20x-i2c.o
 obj-$(CONFIG_MFD_AXP20X_RSB)	+= axp20x-rsb.o
diff --git a/drivers/mfd/ac100.c b/drivers/mfd/ac100.c
new file mode 100644
index 000000000000..9bc69cd7807d
--- /dev/null
+++ b/drivers/mfd/ac100.c
@@ -0,0 +1,137 @@
+/*
+ * MFD core driver for X-Powers' AC100 Audio Codec IC
+ *
+ * The AC100 is a highly integrated audio codec and RTC subsystem designed
+ * for mobile applications. It has 3 I2S/PCM interfaces, a 2 channel DAC,
+ * a 2 channel ADC with 5 inputs and a builtin mixer. The RTC subsystem has
+ * 3 clock outputs.
+ *
+ * The audio codec and RTC parts are completely separate, sharing only the
+ * host interface for access to its registers.
+ *
+ * Copyright (2016) Chen-Yu Tsai
+ *
+ * Author: Chen-Yu Tsai <wens@csie.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mfd/core.h>
+#include <linux/mfd/ac100.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/regmap.h>
+#include <linux/sunxi-rsb.h>
+
+static const struct regmap_range ac100_writeable_ranges[] = {
+	regmap_reg_range(AC100_CHIP_AUDIO_RST, AC100_I2S_SR_CTRL),
+	regmap_reg_range(AC100_I2S1_CLK_CTRL, AC100_I2S1_MXR_GAIN),
+	regmap_reg_range(AC100_I2S2_CLK_CTRL, AC100_I2S2_MXR_GAIN),
+	regmap_reg_range(AC100_I2S3_CLK_CTRL, AC100_I2S3_SIG_PATH_CTRL),
+	regmap_reg_range(AC100_ADC_DIG_CTRL, AC100_ADC_VOL_CTRL),
+	regmap_reg_range(AC100_HMIC_CTRL1, AC100_HMIC_STATUS),
+	regmap_reg_range(AC100_DAC_DIG_CTRL, AC100_DAC_MXR_GAIN),
+	regmap_reg_range(AC100_ADC_APC_CTRL, AC100_LINEOUT_CTRL),
+	regmap_reg_range(AC100_ADC_DAP_L_CTRL, AC100_ADC_DAP_OPT),
+	regmap_reg_range(AC100_DAC_DAP_CTRL, AC100_DAC_DAP_OPT),
+	regmap_reg_range(AC100_ADC_DAP_ENA, AC100_DAC_DAP_ENA),
+	regmap_reg_range(AC100_SRC1_CTRL1, AC100_SRC1_CTRL2),
+	regmap_reg_range(AC100_SRC2_CTRL1, AC100_SRC2_CTRL2),
+	regmap_reg_range(AC100_CLK32K_ANALOG_CTRL, AC100_CLKOUT_CTRL3),
+	regmap_reg_range(AC100_RTC_RST, AC100_RTC_UPD),
+	regmap_reg_range(AC100_ALM_INT_ENA, AC100_ALM_INT_STA),
+	regmap_reg_range(AC100_ALM_SEC, AC100_RTC_GP(15)),
+};
+
+static const struct regmap_range ac100_volatile_ranges[] = {
+	regmap_reg_range(AC100_CHIP_AUDIO_RST, AC100_PLL_CTRL2),
+	regmap_reg_range(AC100_HMIC_STATUS, AC100_HMIC_STATUS),
+	regmap_reg_range(AC100_ADC_DAP_L_STA, AC100_ADC_DAP_L_STA),
+	regmap_reg_range(AC100_SRC1_CTRL1, AC100_SRC1_CTRL1),
+	regmap_reg_range(AC100_SRC1_CTRL3, AC100_SRC2_CTRL1),
+	regmap_reg_range(AC100_SRC2_CTRL3, AC100_SRC2_CTRL4),
+	regmap_reg_range(AC100_RTC_RST, AC100_RTC_RST),
+	regmap_reg_range(AC100_RTC_SEC, AC100_ALM_INT_STA),
+	regmap_reg_range(AC100_ALM_SEC, AC100_ALM_UPD),
+};
+
+static const struct regmap_access_table ac100_writeable_table = {
+	.yes_ranges	= ac100_writeable_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(ac100_writeable_ranges),
+};
+
+static const struct regmap_access_table ac100_volatile_table = {
+	.yes_ranges	= ac100_volatile_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(ac100_volatile_ranges),
+};
+
+static const struct regmap_config ac100_regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 16,
+	.wr_table	= &ac100_writeable_table,
+	.volatile_table	= &ac100_volatile_table,
+	.max_register	= AC100_RTC_GP(15),
+	.cache_type	= REGCACHE_RBTREE,
+};
+
+static struct mfd_cell ac100_cells[] = {
+	{
+		.name		= "ac100-codec",
+		.of_compatible	= "x-powers,ac100-codec",
+	}, {
+		.name		= "ac100-rtc",
+		.of_compatible	= "x-powers,ac100-rtc",
+	},
+};
+
+static int ac100_rsb_probe(struct sunxi_rsb_device *rdev)
+{
+	struct ac100_dev *ac100;
+	int ret;
+
+	ac100 = devm_kzalloc(&rdev->dev, sizeof(*ac100), GFP_KERNEL);
+	if (!ac100)
+		return -ENOMEM;
+
+	ac100->dev = &rdev->dev;
+	sunxi_rsb_device_set_drvdata(rdev, ac100);
+
+	ac100->regmap = devm_regmap_init_sunxi_rsb(rdev, &ac100_regmap_config);
+	if (IS_ERR(ac100->regmap)) {
+		ret = PTR_ERR(ac100->regmap);
+		dev_err(ac100->dev, "regmap init failed: %d\n", ret);
+		return ret;
+	}
+
+	ret = devm_mfd_add_devices(ac100->dev, PLATFORM_DEVID_NONE, ac100_cells,
+				   ARRAY_SIZE(ac100_cells), NULL, 0, NULL);
+	if (ret) {
+		dev_err(ac100->dev, "failed to add MFD devices: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static const struct of_device_id ac100_of_match[] = {
+	{ .compatible = "x-powers,ac100" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, ac100_of_match);
+
+static struct sunxi_rsb_driver ac100_rsb_driver = {
+	.driver = {
+		.name	= "ac100",
+		.of_match_table	= of_match_ptr(ac100_of_match),
+	},
+	.probe	= ac100_rsb_probe,
+};
+module_sunxi_rsb_driver(ac100_rsb_driver);
+
+MODULE_DESCRIPTION("Audio codec MFD core driver for AC100");
+MODULE_AUTHOR("Chen-Yu Tsai <wens@csie.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/ac100.h b/include/linux/mfd/ac100.h
new file mode 100644
index 000000000000..3c148f196b9f
--- /dev/null
+++ b/include/linux/mfd/ac100.h
@@ -0,0 +1,178 @@
+/*
+ * Functions and registers to access AC100 codec / RTC combo IC.
+ *
+ * Copyright (C) 2016 Chen-Yu Tsai
+ *
+ * Chen-Yu Tsai <wens@csie.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_MFD_AC100_H
+#define __LINUX_MFD_AC100_H
+
+#include <linux/regmap.h>
+
+struct ac100_dev {
+	struct device			*dev;
+	struct regmap			*regmap;
+};
+
+/* Audio codec related registers */
+#define AC100_CHIP_AUDIO_RST		0x00
+#define AC100_PLL_CTRL1			0x01
+#define AC100_PLL_CTRL2			0x02
+#define AC100_SYSCLK_CTRL		0x03
+#define AC100_MOD_CLK_ENA		0x04
+#define AC100_MOD_RST_CTRL		0x05
+#define AC100_I2S_SR_CTRL		0x06
+
+/* I2S1 interface */
+#define AC100_I2S1_CLK_CTRL		0x10
+#define AC100_I2S1_SND_OUT_CTRL		0x11
+#define AC100_I2S1_SND_IN_CTRL		0x12
+#define AC100_I2S1_MXR_SRC		0x13
+#define AC100_I2S1_VOL_CTRL1		0x14
+#define AC100_I2S1_VOL_CTRL2		0x15
+#define AC100_I2S1_VOL_CTRL3		0x16
+#define AC100_I2S1_VOL_CTRL4		0x17
+#define AC100_I2S1_MXR_GAIN		0x18
+
+/* I2S2 interface */
+#define AC100_I2S2_CLK_CTRL		0x20
+#define AC100_I2S2_SND_OUT_CTRL		0x21
+#define AC100_I2S2_SND_IN_CTRL		0x22
+#define AC100_I2S2_MXR_SRC		0x23
+#define AC100_I2S2_VOL_CTRL1		0x24
+#define AC100_I2S2_VOL_CTRL2		0x25
+#define AC100_I2S2_VOL_CTRL3		0x26
+#define AC100_I2S2_VOL_CTRL4		0x27
+#define AC100_I2S2_MXR_GAIN		0x28
+
+/* I2S3 interface */
+#define AC100_I2S3_CLK_CTRL		0x30
+#define AC100_I2S3_SND_OUT_CTRL		0x31
+#define AC100_I2S3_SND_IN_CTRL		0x32
+#define AC100_I2S3_SIG_PATH_CTRL	0x33
+
+/* ADC digital controls */
+#define AC100_ADC_DIG_CTRL		0x40
+#define AC100_ADC_VOL_CTRL		0x41
+
+/* HMIC plug sensing / key detection */
+#define AC100_HMIC_CTRL1		0x44
+#define AC100_HMIC_CTRL2		0x45
+#define AC100_HMIC_STATUS		0x46
+
+/* DAC digital controls */
+#define AC100_DAC_DIG_CTRL		0x48
+#define AC100_DAC_VOL_CTRL		0x49
+#define AC100_DAC_MXR_SRC		0x4c
+#define AC100_DAC_MXR_GAIN		0x4d
+
+/* Analog controls */
+#define AC100_ADC_APC_CTRL		0x50
+#define AC100_ADC_SRC			0x51
+#define AC100_ADC_SRC_BST_CTRL		0x52
+#define AC100_OUT_MXR_DAC_A_CTRL	0x53
+#define AC100_OUT_MXR_SRC		0x54
+#define AC100_OUT_MXR_SRC_BST		0x55
+#define AC100_HPOUT_CTRL		0x56
+#define AC100_ERPOUT_CTRL		0x57
+#define AC100_SPKOUT_CTRL		0x58
+#define AC100_LINEOUT_CTRL		0x59
+
+/* ADC digital audio processing (high pass filter & auto gain control */
+#define AC100_ADC_DAP_L_STA		0x80
+#define AC100_ADC_DAP_R_STA		0x81
+#define AC100_ADC_DAP_L_CTRL		0x82
+#define AC100_ADC_DAP_R_CTRL		0x83
+#define AC100_ADC_DAP_L_T_L		0x84 /* Left Target Level */
+#define AC100_ADC_DAP_R_T_L		0x85 /* Right Target Level */
+#define AC100_ADC_DAP_L_H_A_C		0x86 /* Left High Avg. Coef */
+#define AC100_ADC_DAP_L_L_A_C		0x87 /* Left Low Avg. Coef */
+#define AC100_ADC_DAP_R_H_A_C		0x88 /* Right High Avg. Coef */
+#define AC100_ADC_DAP_R_L_A_C		0x89 /* Right Low Avg. Coef */
+#define AC100_ADC_DAP_L_D_T		0x8a /* Left Decay Time */
+#define AC100_ADC_DAP_L_A_T		0x8b /* Left Attack Time */
+#define AC100_ADC_DAP_R_D_T		0x8c /* Right Decay Time */
+#define AC100_ADC_DAP_R_A_T		0x8d /* Right Attack Time */
+#define AC100_ADC_DAP_N_TH		0x8e /* Noise Threshold */
+#define AC100_ADC_DAP_L_H_N_A_C		0x8f /* Left High Noise Avg. Coef */
+#define AC100_ADC_DAP_L_L_N_A_C		0x90 /* Left Low Noise Avg. Coef */
+#define AC100_ADC_DAP_R_H_N_A_C		0x91 /* Right High Noise Avg. Coef */
+#define AC100_ADC_DAP_R_L_N_A_C		0x92 /* Right Low Noise Avg. Coef */
+#define AC100_ADC_DAP_H_HPF_C		0x93 /* High High-Pass-Filter Coef */
+#define AC100_ADC_DAP_L_HPF_C		0x94 /* Low High-Pass-Filter Coef */
+#define AC100_ADC_DAP_OPT		0x95 /* AGC Optimum */
+
+/* DAC digital audio processing (high pass filter & dynamic range control) */
+#define AC100_DAC_DAP_CTRL		0xa0
+#define AC100_DAC_DAP_H_HPF_C		0xa1 /* High High-Pass-Filter Coef */
+#define AC100_DAC_DAP_L_HPF_C		0xa2 /* Low High-Pass-Filter Coef */
+#define AC100_DAC_DAP_L_H_E_A_C		0xa3 /* Left High Energy Avg Coef */
+#define AC100_DAC_DAP_L_L_E_A_C		0xa4 /* Left Low Energy Avg Coef */
+#define AC100_DAC_DAP_R_H_E_A_C		0xa5 /* Right High Energy Avg Coef */
+#define AC100_DAC_DAP_R_L_E_A_C		0xa6 /* Right Low Energy Avg Coef */
+#define AC100_DAC_DAP_H_G_D_T_C		0xa7 /* High Gain Delay Time Coef */
+#define AC100_DAC_DAP_L_G_D_T_C		0xa8 /* Low Gain Delay Time Coef */
+#define AC100_DAC_DAP_H_G_A_T_C		0xa9 /* High Gain Attack Time Coef */
+#define AC100_DAC_DAP_L_G_A_T_C		0xaa /* Low Gain Attack Time Coef */
+#define AC100_DAC_DAP_H_E_TH		0xab /* High Energy Threshold */
+#define AC100_DAC_DAP_L_E_TH		0xac /* Low Energy Threshold */
+#define AC100_DAC_DAP_H_G_K		0xad /* High Gain K parameter */
+#define AC100_DAC_DAP_L_G_K		0xae /* Low Gain K parameter */
+#define AC100_DAC_DAP_H_G_OFF		0xaf /* High Gain offset */
+#define AC100_DAC_DAP_L_G_OFF		0xb0 /* Low Gain offset */
+#define AC100_DAC_DAP_OPT		0xb1 /* DRC optimum */
+
+/* Digital audio processing enable */
+#define AC100_ADC_DAP_ENA		0xb4
+#define AC100_DAC_DAP_ENA		0xb5
+
+/* SRC control */
+#define AC100_SRC1_CTRL1		0xb8
+#define AC100_SRC1_CTRL2		0xb9
+#define AC100_SRC1_CTRL3		0xba
+#define AC100_SRC1_CTRL4		0xbb
+#define AC100_SRC2_CTRL1		0xbc
+#define AC100_SRC2_CTRL2		0xbd
+#define AC100_SRC2_CTRL3		0xbe
+#define AC100_SRC2_CTRL4		0xbf
+
+/* RTC clk control */
+#define AC100_CLK32K_ANALOG_CTRL	0xc0
+#define AC100_CLKOUT_CTRL1		0xc1
+#define AC100_CLKOUT_CTRL2		0xc2
+#define AC100_CLKOUT_CTRL3		0xc3
+
+/* RTC module */
+#define AC100_RTC_RST			0xc6
+#define AC100_RTC_CTRL			0xc7
+#define AC100_RTC_SEC			0xc8 /* second */
+#define AC100_RTC_MIN			0xc9 /* minute */
+#define AC100_RTC_HOU			0xca /* hour */
+#define AC100_RTC_WEE			0xcb /* weekday */
+#define AC100_RTC_DAY			0xcc /* day */
+#define AC100_RTC_MON			0xcd /* month */
+#define AC100_RTC_YEA			0xce /* year */
+#define AC100_RTC_UPD			0xcf /* update trigger */
+
+/* RTC alarm */
+#define AC100_ALM_INT_ENA		0xd0
+#define	AC100_ALM_INT_STA		0xd1
+#define AC100_ALM_SEC			0xd8
+#define AC100_ALM_MIN			0xd9
+#define AC100_ALM_HOU			0xda
+#define AC100_ALM_WEE			0xdb
+#define AC100_ALM_DAY			0xdc
+#define AC100_ALM_MON			0xdd
+#define AC100_ALM_YEA			0xde
+#define AC100_ALM_UPD			0xdf
+
+/* RTC general purpose register 0 ~ 15 */
+#define AC100_RTC_GP(x)			(0xe0 + (x))
+
+#endif /* __LINUX_MFD_AC100_H */
-- 
cgit v1.2.3


From d00a18a42c1483924de086ddfb0efe8da1dba3ac Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Fri, 8 Jul 2016 22:33:38 +0800
Subject: rtc: ac100: Add RTC driver for X-Powers AC100

X-Powers AC100 is a codec / RTC combo chip. This driver supports
the RTC sub-device.

The RTC block also has clock outputs and non-volatile storage.
Non-volatile storage wthin the RTC hardware is not supported.
Clock output support is added in the next patch.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/rtc/Kconfig     |  10 ++
 drivers/rtc/Makefile    |   1 +
 drivers/rtc/rtc-ac100.c | 325 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 336 insertions(+)
 create mode 100644 drivers/rtc/rtc-ac100.c

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index e215f50794b6..7fc11cdfd27e 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -187,6 +187,16 @@ config RTC_DRV_ABX80X
 	  This driver can also be built as a module. If so, the module
 	  will be called rtc-abx80x.
 
+config RTC_DRV_AC100
+	tristate "X-Powers AC100"
+	depends on MFD_AC100
+	help
+	  If you say yes here you get support for the real-time clock found
+	  in X-Powers AC100 family peripheral ICs.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called rtc-ac100.
+
 config RTC_DRV_AS3722
 	tristate "ams AS3722 RTC driver"
 	depends on MFD_AS3722
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 7cf7ad559c79..8fb994bacdf7 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_RTC_DRV_AB3100)	+= rtc-ab3100.o
 obj-$(CONFIG_RTC_DRV_AB8500)	+= rtc-ab8500.o
 obj-$(CONFIG_RTC_DRV_ABB5ZES3)	+= rtc-ab-b5ze-s3.o
 obj-$(CONFIG_RTC_DRV_ABX80X)	+= rtc-abx80x.o
+obj-$(CONFIG_RTC_DRV_AC100)	+= rtc-ac100.o
 obj-$(CONFIG_RTC_DRV_ARMADA38X)	+= rtc-armada38x.o
 obj-$(CONFIG_RTC_DRV_AS3722)	+= rtc-as3722.o
 obj-$(CONFIG_RTC_DRV_ASM9260)	+= rtc-asm9260.o
diff --git a/drivers/rtc/rtc-ac100.c b/drivers/rtc/rtc-ac100.c
new file mode 100644
index 000000000000..5a9ca89d04c7
--- /dev/null
+++ b/drivers/rtc/rtc-ac100.c
@@ -0,0 +1,325 @@
+/*
+ * RTC Driver for X-Powers AC100
+ *
+ * Copyright (c) 2016 Chen-Yu Tsai
+ *
+ * Chen-Yu Tsai <wens@csie.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/bcd.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mfd/ac100.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/rtc.h>
+#include <linux/types.h>
+
+/* Control register */
+#define AC100_RTC_CTRL_24HOUR	BIT(0)
+
+/* RTC */
+#define AC100_RTC_SEC_MASK	GENMASK(6, 0)
+#define AC100_RTC_MIN_MASK	GENMASK(6, 0)
+#define AC100_RTC_HOU_MASK	GENMASK(5, 0)
+#define AC100_RTC_WEE_MASK	GENMASK(2, 0)
+#define AC100_RTC_DAY_MASK	GENMASK(5, 0)
+#define AC100_RTC_MON_MASK	GENMASK(4, 0)
+#define AC100_RTC_YEA_MASK	GENMASK(7, 0)
+#define AC100_RTC_YEA_LEAP	BIT(15)
+#define AC100_RTC_UPD_TRIGGER	BIT(15)
+
+/* Alarm (wall clock) */
+#define AC100_ALM_INT_ENABLE	BIT(0)
+
+#define AC100_ALM_SEC_MASK	GENMASK(6, 0)
+#define AC100_ALM_MIN_MASK	GENMASK(6, 0)
+#define AC100_ALM_HOU_MASK	GENMASK(5, 0)
+#define AC100_ALM_WEE_MASK	GENMASK(2, 0)
+#define AC100_ALM_DAY_MASK	GENMASK(5, 0)
+#define AC100_ALM_MON_MASK	GENMASK(4, 0)
+#define AC100_ALM_YEA_MASK	GENMASK(7, 0)
+#define AC100_ALM_ENABLE_FLAG	BIT(15)
+#define AC100_ALM_UPD_TRIGGER	BIT(15)
+
+/*
+ * The year parameter passed to the driver is usually an offset relative to
+ * the year 1900. This macro is used to convert this offset to another one
+ * relative to the minimum year allowed by the hardware.
+ *
+ * The year range is 1970 - 2069. This range is selected to match Allwinner's
+ * driver.
+ */
+#define AC100_YEAR_MIN				1970
+#define AC100_YEAR_MAX				2069
+#define AC100_YEAR_OFF				(AC100_YEAR_MIN - 1900)
+
+struct ac100_rtc_dev {
+	struct rtc_device *rtc;
+	struct device *dev;
+	struct regmap *regmap;
+	int irq;
+	unsigned long alarm;
+};
+
+static int ac100_rtc_get_time(struct device *dev, struct rtc_time *rtc_tm)
+{
+	struct ac100_rtc_dev *chip = dev_get_drvdata(dev);
+	struct regmap *regmap = chip->regmap;
+	u16 reg[7];
+	int ret;
+
+	ret = regmap_bulk_read(regmap, AC100_RTC_SEC, reg, 7);
+	if (ret)
+		return ret;
+
+	rtc_tm->tm_sec  = bcd2bin(reg[0] & AC100_RTC_SEC_MASK);
+	rtc_tm->tm_min  = bcd2bin(reg[1] & AC100_RTC_MIN_MASK);
+	rtc_tm->tm_hour = bcd2bin(reg[2] & AC100_RTC_HOU_MASK);
+	rtc_tm->tm_wday = bcd2bin(reg[3] & AC100_RTC_WEE_MASK);
+	rtc_tm->tm_mday = bcd2bin(reg[4] & AC100_RTC_DAY_MASK);
+	rtc_tm->tm_mon  = bcd2bin(reg[5] & AC100_RTC_MON_MASK) - 1;
+	rtc_tm->tm_year = bcd2bin(reg[6] & AC100_RTC_YEA_MASK) +
+			  AC100_YEAR_OFF;
+
+	return rtc_valid_tm(rtc_tm);
+}
+
+static int ac100_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm)
+{
+	struct ac100_rtc_dev *chip = dev_get_drvdata(dev);
+	struct regmap *regmap = chip->regmap;
+	int year;
+	u16 reg[8];
+
+	/* our RTC has a limited year range... */
+	year = rtc_tm->tm_year - AC100_YEAR_OFF;
+	if (year < 0 || year > (AC100_YEAR_MAX - 1900)) {
+		dev_err(dev, "rtc only supports year in range %d - %d\n",
+			AC100_YEAR_MIN, AC100_YEAR_MAX);
+		return -EINVAL;
+	}
+
+	/* convert to BCD */
+	reg[0] = bin2bcd(rtc_tm->tm_sec)     & AC100_RTC_SEC_MASK;
+	reg[1] = bin2bcd(rtc_tm->tm_min)     & AC100_RTC_MIN_MASK;
+	reg[2] = bin2bcd(rtc_tm->tm_hour)    & AC100_RTC_HOU_MASK;
+	reg[3] = bin2bcd(rtc_tm->tm_wday)    & AC100_RTC_WEE_MASK;
+	reg[4] = bin2bcd(rtc_tm->tm_mday)    & AC100_RTC_DAY_MASK;
+	reg[5] = bin2bcd(rtc_tm->tm_mon + 1) & AC100_RTC_MON_MASK;
+	reg[6] = bin2bcd(year)		     & AC100_RTC_YEA_MASK;
+	/* trigger write */
+	reg[7] = AC100_RTC_UPD_TRIGGER;
+
+	/* Is it a leap year? */
+	if (is_leap_year(year + AC100_YEAR_OFF + 1900))
+		reg[6] |= AC100_RTC_YEA_LEAP;
+
+	return regmap_bulk_write(regmap, AC100_RTC_SEC, reg, 8);
+}
+
+static int ac100_rtc_alarm_irq_enable(struct device *dev, unsigned int en)
+{
+	struct ac100_rtc_dev *chip = dev_get_drvdata(dev);
+	struct regmap *regmap = chip->regmap;
+	unsigned int val;
+
+	val = en ? AC100_ALM_INT_ENABLE : 0;
+
+	return regmap_write(regmap, AC100_ALM_INT_ENA, val);
+}
+
+static int ac100_rtc_get_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct ac100_rtc_dev *chip = dev_get_drvdata(dev);
+	struct regmap *regmap = chip->regmap;
+	struct rtc_time *alrm_tm = &alrm->time;
+	u16 reg[7];
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(regmap, AC100_ALM_INT_ENA, &val);
+	if (ret)
+		return ret;
+
+	alrm->enabled = !!(val & AC100_ALM_INT_ENABLE);
+
+	ret = regmap_bulk_read(regmap, AC100_ALM_SEC, reg, 7);
+	if (ret)
+		return ret;
+
+	alrm_tm->tm_sec  = bcd2bin(reg[0] & AC100_ALM_SEC_MASK);
+	alrm_tm->tm_min  = bcd2bin(reg[1] & AC100_ALM_MIN_MASK);
+	alrm_tm->tm_hour = bcd2bin(reg[2] & AC100_ALM_HOU_MASK);
+	alrm_tm->tm_wday = bcd2bin(reg[3] & AC100_ALM_WEE_MASK);
+	alrm_tm->tm_mday = bcd2bin(reg[4] & AC100_ALM_DAY_MASK);
+	alrm_tm->tm_mon  = bcd2bin(reg[5] & AC100_ALM_MON_MASK) - 1;
+	alrm_tm->tm_year = bcd2bin(reg[6] & AC100_ALM_YEA_MASK) +
+			   AC100_YEAR_OFF;
+
+	return 0;
+}
+
+static int ac100_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct ac100_rtc_dev *chip = dev_get_drvdata(dev);
+	struct regmap *regmap = chip->regmap;
+	struct rtc_time *alrm_tm = &alrm->time;
+	u16 reg[8];
+	int year;
+	int ret;
+
+	/* our alarm has a limited year range... */
+	year = alrm_tm->tm_year - AC100_YEAR_OFF;
+	if (year < 0 || year > (AC100_YEAR_MAX - 1900)) {
+		dev_err(dev, "alarm only supports year in range %d - %d\n",
+			AC100_YEAR_MIN, AC100_YEAR_MAX);
+		return -EINVAL;
+	}
+
+	/* convert to BCD */
+	reg[0] = (bin2bcd(alrm_tm->tm_sec)  & AC100_ALM_SEC_MASK) |
+			AC100_ALM_ENABLE_FLAG;
+	reg[1] = (bin2bcd(alrm_tm->tm_min)  & AC100_ALM_MIN_MASK) |
+			AC100_ALM_ENABLE_FLAG;
+	reg[2] = (bin2bcd(alrm_tm->tm_hour) & AC100_ALM_HOU_MASK) |
+			AC100_ALM_ENABLE_FLAG;
+	/* Do not enable weekday alarm */
+	reg[3] = bin2bcd(alrm_tm->tm_wday) & AC100_ALM_WEE_MASK;
+	reg[4] = (bin2bcd(alrm_tm->tm_mday) & AC100_ALM_DAY_MASK) |
+			AC100_ALM_ENABLE_FLAG;
+	reg[5] = (bin2bcd(alrm_tm->tm_mon + 1)  & AC100_ALM_MON_MASK) |
+			AC100_ALM_ENABLE_FLAG;
+	reg[6] = (bin2bcd(year) & AC100_ALM_YEA_MASK) |
+			AC100_ALM_ENABLE_FLAG;
+	/* trigger write */
+	reg[7] = AC100_ALM_UPD_TRIGGER;
+
+	ret = regmap_bulk_write(regmap, AC100_ALM_SEC, reg, 8);
+	if (ret)
+		return ret;
+
+	return ac100_rtc_alarm_irq_enable(dev, alrm->enabled);
+}
+
+static irqreturn_t ac100_rtc_irq(int irq, void *data)
+{
+	struct ac100_rtc_dev *chip = data;
+	struct regmap *regmap = chip->regmap;
+	unsigned int val = 0;
+	int ret;
+
+	mutex_lock(&chip->rtc->ops_lock);
+
+	/* read status */
+	ret = regmap_read(regmap, AC100_ALM_INT_STA, &val);
+	if (ret)
+		goto out;
+
+	if (val & AC100_ALM_INT_ENABLE) {
+		/* signal rtc framework */
+		rtc_update_irq(chip->rtc, 1, RTC_AF | RTC_IRQF);
+
+		/* clear status */
+		ret = regmap_write(regmap, AC100_ALM_INT_STA, val);
+		if (ret)
+			goto out;
+
+		/* disable interrupt */
+		ret = ac100_rtc_alarm_irq_enable(chip->dev, 0);
+		if (ret)
+			goto out;
+	}
+
+out:
+	mutex_unlock(&chip->rtc->ops_lock);
+	return IRQ_HANDLED;
+}
+
+static const struct rtc_class_ops ac100_rtc_ops = {
+	.read_time	  = ac100_rtc_get_time,
+	.set_time	  = ac100_rtc_set_time,
+	.read_alarm	  = ac100_rtc_get_alarm,
+	.set_alarm	  = ac100_rtc_set_alarm,
+	.alarm_irq_enable = ac100_rtc_alarm_irq_enable,
+};
+
+static int ac100_rtc_probe(struct platform_device *pdev)
+{
+	struct ac100_dev *ac100 = dev_get_drvdata(pdev->dev.parent);
+	struct ac100_rtc_dev *chip;
+	int ret;
+
+	chip = devm_kzalloc(&pdev->dev, sizeof(*chip), GFP_KERNEL);
+	platform_set_drvdata(pdev, chip);
+	chip->dev = &pdev->dev;
+	chip->regmap = ac100->regmap;
+
+	chip->irq = platform_get_irq(pdev, 0);
+	if (chip->irq < 0) {
+		dev_err(&pdev->dev, "No IRQ resource\n");
+		return chip->irq;
+	}
+
+	ret = devm_request_threaded_irq(&pdev->dev, chip->irq, NULL,
+					ac100_rtc_irq,
+					IRQF_SHARED | IRQF_ONESHOT,
+					dev_name(&pdev->dev), chip);
+	if (ret) {
+		dev_err(&pdev->dev, "Could not request IRQ\n");
+		return ret;
+	}
+
+	/* always use 24 hour mode */
+	regmap_write_bits(chip->regmap, AC100_RTC_CTRL, AC100_RTC_CTRL_24HOUR,
+			  AC100_RTC_CTRL_24HOUR);
+
+	/* disable counter alarm interrupt */
+	regmap_write(chip->regmap, AC100_ALM_INT_ENA, 0);
+
+	/* clear counter alarm pending interrupts */
+	regmap_write(chip->regmap, AC100_ALM_INT_STA, AC100_ALM_INT_ENABLE);
+
+	chip->rtc = devm_rtc_device_register(&pdev->dev, "rtc-ac100",
+					     &ac100_rtc_ops, THIS_MODULE);
+	if (IS_ERR(chip->rtc)) {
+		dev_err(&pdev->dev, "unable to register device\n");
+		return PTR_ERR(chip->rtc);
+	}
+
+	dev_info(&pdev->dev, "RTC enabled\n");
+
+	return 0;
+}
+
+static const struct of_device_id ac100_rtc_match[] = {
+	{ .compatible = "x-powers,ac100-rtc" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, ac100_rtc_match);
+
+static struct platform_driver ac100_rtc_driver = {
+	.probe		= ac100_rtc_probe,
+	.driver		= {
+		.name		= "ac100-rtc",
+		.of_match_table	= of_match_ptr(ac100_rtc_match),
+	},
+};
+module_platform_driver(ac100_rtc_driver);
+
+MODULE_DESCRIPTION("X-Powers AC100 RTC driver");
+MODULE_AUTHOR("Chen-Yu Tsai <wens@csie.org>");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From 04940631b8d2b2e57a13b6d4ca50dfe5994b514f Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Fri, 8 Jul 2016 22:33:39 +0800
Subject: rtc: ac100: Add clk output support

The AC100's RTC side has 3 clock outputs on external pins, which can
provide a clock signal to the SoC or other modules, such as WiFi or
GSM modules.

Support this with a custom clk driver integrated with the rtc driver.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/rtc/rtc-ac100.c | 302 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 302 insertions(+)

diff --git a/drivers/rtc/rtc-ac100.c b/drivers/rtc/rtc-ac100.c
index 5a9ca89d04c7..70b4fd0f6122 100644
--- a/drivers/rtc/rtc-ac100.c
+++ b/drivers/rtc/rtc-ac100.c
@@ -16,6 +16,7 @@
  */
 
 #include <linux/bcd.h>
+#include <linux/clk-provider.h>
 #include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
@@ -31,6 +32,15 @@
 /* Control register */
 #define AC100_RTC_CTRL_24HOUR	BIT(0)
 
+/* Clock output register bits */
+#define AC100_CLKOUT_PRE_DIV_SHIFT	5
+#define AC100_CLKOUT_PRE_DIV_WIDTH	3
+#define AC100_CLKOUT_MUX_SHIFT		4
+#define AC100_CLKOUT_MUX_WIDTH		1
+#define AC100_CLKOUT_DIV_SHIFT		1
+#define AC100_CLKOUT_DIV_WIDTH		3
+#define AC100_CLKOUT_EN			BIT(0)
+
 /* RTC */
 #define AC100_RTC_SEC_MASK	GENMASK(6, 0)
 #define AC100_RTC_MIN_MASK	GENMASK(6, 0)
@@ -67,14 +77,292 @@
 #define AC100_YEAR_MAX				2069
 #define AC100_YEAR_OFF				(AC100_YEAR_MIN - 1900)
 
+struct ac100_clkout {
+	struct clk_hw hw;
+	struct regmap *regmap;
+	u8 offset;
+};
+
+#define to_ac100_clkout(_hw) container_of(_hw, struct ac100_clkout, hw)
+
+#define AC100_RTC_32K_NAME	"ac100-rtc-32k"
+#define AC100_RTC_32K_RATE	32768
+#define AC100_CLKOUT_NUM	3
+
+static const char * const ac100_clkout_names[AC100_CLKOUT_NUM] = {
+	"ac100-cko1-rtc",
+	"ac100-cko2-rtc",
+	"ac100-cko3-rtc",
+};
+
 struct ac100_rtc_dev {
 	struct rtc_device *rtc;
 	struct device *dev;
 	struct regmap *regmap;
 	int irq;
 	unsigned long alarm;
+
+	struct clk_hw *rtc_32k_clk;
+	struct ac100_clkout clks[AC100_CLKOUT_NUM];
+	struct clk_hw_onecell_data *clk_data;
 };
 
+/**
+ * Clock controls for 3 clock output pins
+ */
+
+static const struct clk_div_table ac100_clkout_prediv[] = {
+	{ .val = 0, .div = 1 },
+	{ .val = 1, .div = 2 },
+	{ .val = 2, .div = 4 },
+	{ .val = 3, .div = 8 },
+	{ .val = 4, .div = 16 },
+	{ .val = 5, .div = 32 },
+	{ .val = 6, .div = 64 },
+	{ .val = 7, .div = 122 },
+	{ },
+};
+
+/* Abuse the fact that one parent is 32768 Hz, and the other is 4 MHz */
+static unsigned long ac100_clkout_recalc_rate(struct clk_hw *hw,
+					      unsigned long prate)
+{
+	struct ac100_clkout *clk = to_ac100_clkout(hw);
+	unsigned int reg, div;
+
+	regmap_read(clk->regmap, clk->offset, &reg);
+
+	/* Handle pre-divider first */
+	if (prate != AC100_RTC_32K_RATE) {
+		div = (reg >> AC100_CLKOUT_PRE_DIV_SHIFT) &
+			((1 << AC100_CLKOUT_PRE_DIV_WIDTH) - 1);
+		prate = divider_recalc_rate(hw, prate, div,
+					    ac100_clkout_prediv, 0);
+	}
+
+	div = (reg >> AC100_CLKOUT_DIV_SHIFT) &
+		(BIT(AC100_CLKOUT_DIV_WIDTH) - 1);
+	return divider_recalc_rate(hw, prate, div, NULL,
+				   CLK_DIVIDER_POWER_OF_TWO);
+}
+
+static long ac100_clkout_round_rate(struct clk_hw *hw, unsigned long rate,
+				    unsigned long prate)
+{
+	unsigned long best_rate = 0, tmp_rate, tmp_prate;
+	int i;
+
+	if (prate == AC100_RTC_32K_RATE)
+		return divider_round_rate(hw, rate, &prate, NULL,
+					  AC100_CLKOUT_DIV_WIDTH,
+					  CLK_DIVIDER_POWER_OF_TWO);
+
+	for (i = 0; ac100_clkout_prediv[i].div; i++) {
+		tmp_prate = DIV_ROUND_UP(prate, ac100_clkout_prediv[i].val);
+		tmp_rate = divider_round_rate(hw, rate, &tmp_prate, NULL,
+					      AC100_CLKOUT_DIV_WIDTH,
+					      CLK_DIVIDER_POWER_OF_TWO);
+
+		if (tmp_rate > rate)
+			continue;
+		if (rate - tmp_rate < best_rate - tmp_rate)
+			best_rate = tmp_rate;
+	}
+
+	return best_rate;
+}
+
+static int ac100_clkout_determine_rate(struct clk_hw *hw,
+				       struct clk_rate_request *req)
+{
+	struct clk_hw *best_parent;
+	unsigned long best = 0;
+	int i, num_parents = clk_hw_get_num_parents(hw);
+
+	for (i = 0; i < num_parents; i++) {
+		struct clk_hw *parent = clk_hw_get_parent_by_index(hw, i);
+		unsigned long tmp, prate = clk_hw_get_rate(parent);
+
+		tmp = ac100_clkout_round_rate(hw, req->rate, prate);
+
+		if (tmp > req->rate)
+			continue;
+		if (req->rate - tmp < req->rate - best) {
+			best = tmp;
+			best_parent = parent;
+		}
+	}
+
+	if (!best)
+		return -EINVAL;
+
+	req->best_parent_hw = best_parent;
+	req->best_parent_rate = best;
+	req->rate = best;
+
+	return 0;
+}
+
+static int ac100_clkout_set_rate(struct clk_hw *hw, unsigned long rate,
+				 unsigned long prate)
+{
+	struct ac100_clkout *clk = to_ac100_clkout(hw);
+	int div = 0, pre_div = 0;
+
+	do {
+		div = divider_get_val(rate * ac100_clkout_prediv[pre_div].div,
+				      prate, NULL, AC100_CLKOUT_DIV_WIDTH,
+				      CLK_DIVIDER_POWER_OF_TWO);
+		if (div >= 0)
+			break;
+	} while (prate != AC100_RTC_32K_RATE &&
+		 ac100_clkout_prediv[++pre_div].div);
+
+	if (div < 0)
+		return div;
+
+	pre_div = ac100_clkout_prediv[pre_div].val;
+
+	regmap_update_bits(clk->regmap, clk->offset,
+			   ((1 << AC100_CLKOUT_DIV_WIDTH) - 1) << AC100_CLKOUT_DIV_SHIFT |
+			   ((1 << AC100_CLKOUT_PRE_DIV_WIDTH) - 1) << AC100_CLKOUT_PRE_DIV_SHIFT,
+			   (div - 1) << AC100_CLKOUT_DIV_SHIFT |
+			   (pre_div - 1) << AC100_CLKOUT_PRE_DIV_SHIFT);
+
+	return 0;
+}
+
+static int ac100_clkout_prepare(struct clk_hw *hw)
+{
+	struct ac100_clkout *clk = to_ac100_clkout(hw);
+
+	return regmap_update_bits(clk->regmap, clk->offset, AC100_CLKOUT_EN,
+				  AC100_CLKOUT_EN);
+}
+
+static void ac100_clkout_unprepare(struct clk_hw *hw)
+{
+	struct ac100_clkout *clk = to_ac100_clkout(hw);
+
+	regmap_update_bits(clk->regmap, clk->offset, AC100_CLKOUT_EN, 0);
+}
+
+static int ac100_clkout_is_prepared(struct clk_hw *hw)
+{
+	struct ac100_clkout *clk = to_ac100_clkout(hw);
+	unsigned int reg;
+
+	regmap_read(clk->regmap, clk->offset, &reg);
+
+	return reg & AC100_CLKOUT_EN;
+}
+
+static u8 ac100_clkout_get_parent(struct clk_hw *hw)
+{
+	struct ac100_clkout *clk = to_ac100_clkout(hw);
+	unsigned int reg;
+
+	regmap_read(clk->regmap, clk->offset, &reg);
+
+	return (reg >> AC100_CLKOUT_MUX_SHIFT) & 0x1;
+}
+
+static int ac100_clkout_set_parent(struct clk_hw *hw, u8 index)
+{
+	struct ac100_clkout *clk = to_ac100_clkout(hw);
+
+	return regmap_update_bits(clk->regmap, clk->offset,
+				  BIT(AC100_CLKOUT_MUX_SHIFT),
+				  index ? BIT(AC100_CLKOUT_MUX_SHIFT) : 0);
+}
+
+static const struct clk_ops ac100_clkout_ops = {
+	.prepare	= ac100_clkout_prepare,
+	.unprepare	= ac100_clkout_unprepare,
+	.is_prepared	= ac100_clkout_is_prepared,
+	.recalc_rate	= ac100_clkout_recalc_rate,
+	.determine_rate	= ac100_clkout_determine_rate,
+	.get_parent	= ac100_clkout_get_parent,
+	.set_parent	= ac100_clkout_set_parent,
+	.set_rate	= ac100_clkout_set_rate,
+};
+
+static int ac100_rtc_register_clks(struct ac100_rtc_dev *chip)
+{
+	struct device_node *np = chip->dev->of_node;
+	const char *parents[2] = {AC100_RTC_32K_NAME};
+	int i, ret;
+
+	chip->clk_data = devm_kzalloc(chip->dev, sizeof(*chip->clk_data) +
+						 sizeof(*chip->clk_data->hws) *
+						 AC100_CLKOUT_NUM,
+						 GFP_KERNEL);
+	if (!chip->clk_data)
+		return -ENOMEM;
+
+	chip->rtc_32k_clk = clk_hw_register_fixed_rate(chip->dev,
+						       AC100_RTC_32K_NAME,
+						       NULL, 0,
+						       AC100_RTC_32K_RATE);
+	if (IS_ERR(chip->rtc_32k_clk)) {
+		ret = PTR_ERR(chip->rtc_32k_clk);
+		dev_err(chip->dev, "Failed to register RTC-32k clock: %d\n",
+			ret);
+		return ret;
+	}
+
+	parents[1] = of_clk_get_parent_name(np, 0);
+	if (!parents[1]) {
+		dev_err(chip->dev, "Failed to get ADDA 4M clock\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < AC100_CLKOUT_NUM; i++) {
+		struct ac100_clkout *clk = &chip->clks[i];
+		struct clk_init_data init = {
+			.name = ac100_clkout_names[i],
+			.ops = &ac100_clkout_ops,
+			.parent_names = parents,
+			.num_parents = ARRAY_SIZE(parents),
+			.flags = 0,
+		};
+
+		clk->regmap = chip->regmap;
+		clk->offset = AC100_CLKOUT_CTRL1 + i;
+		clk->hw.init = &init;
+
+		ret = devm_clk_hw_register(chip->dev, &clk->hw);
+		if (ret) {
+			dev_err(chip->dev, "Failed to register clk '%s': %d\n",
+				init.name, ret);
+			goto err_unregister_rtc_32k;
+		}
+
+		chip->clk_data->hws[i] = &clk->hw;
+	}
+
+	chip->clk_data->num = i;
+	ret = of_clk_add_hw_provider(np, of_clk_hw_onecell_get, chip->clk_data);
+	if (ret)
+		goto err_unregister_rtc_32k;
+
+	return 0;
+
+err_unregister_rtc_32k:
+	clk_unregister_fixed_rate(chip->rtc_32k_clk->clk);
+
+	return ret;
+}
+
+static void ac100_rtc_unregister_clks(struct ac100_rtc_dev *chip)
+{
+	of_clk_del_provider(chip->dev->of_node);
+	clk_unregister_fixed_rate(chip->rtc_32k_clk->clk);
+}
+
+/**
+ * RTC related bits
+ */
 static int ac100_rtc_get_time(struct device *dev, struct rtc_time *rtc_tm)
 {
 	struct ac100_rtc_dev *chip = dev_get_drvdata(dev);
@@ -300,11 +588,24 @@ static int ac100_rtc_probe(struct platform_device *pdev)
 		return PTR_ERR(chip->rtc);
 	}
 
+	ret = ac100_rtc_register_clks(chip);
+	if (ret)
+		return ret;
+
 	dev_info(&pdev->dev, "RTC enabled\n");
 
 	return 0;
 }
 
+static int ac100_rtc_remove(struct platform_device *pdev)
+{
+	struct ac100_rtc_dev *chip = platform_get_drvdata(pdev);
+
+	ac100_rtc_unregister_clks(chip);
+
+	return 0;
+}
+
 static const struct of_device_id ac100_rtc_match[] = {
 	{ .compatible = "x-powers,ac100-rtc" },
 	{ },
@@ -313,6 +614,7 @@ MODULE_DEVICE_TABLE(of, ac100_rtc_match);
 
 static struct platform_driver ac100_rtc_driver = {
 	.probe		= ac100_rtc_probe,
+	.remove		= ac100_rtc_remove,
 	.driver		= {
 		.name		= "ac100-rtc",
 		.of_match_table	= of_match_ptr(ac100_rtc_match),
-- 
cgit v1.2.3


From 68202c9f0ad6e16ee806fbadbc5838d55fe5aa5c Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Fri, 29 Jul 2016 14:59:12 -0600
Subject: libnvdimm, nd_blk: mask off reserved status bits

The "NVDIMM Block Window Driver Writer's Guide":

    http://pmem.io/documents/NVDIMM_DriverWritersGuide-July-2016.pdf

...defines the layout of the block window status register.  For the July
2016 version of the spec linked to above, this happens in Figure 4 on
page 26.

The only bits defined in this spec are bits 31, 5, 4, 2, 1 and 0.  The
rest of the bits in the status register are reserved, and there is a
warning following the diagram that says:

    Note: The driver cannot assume the value of the RESERVED bits in the
    status register are zero. These reserved bits need to be masked off, and
    the driver must avoid checking the state of those bits.

This change ensures that for hardware implementations that set these
reserved bits in the status register, the driver won't incorrectly fail the
block I/Os.

Cc: <stable@vger.kernel.org> #v4.2+
Reviewed-by: Lee, Chun-Yi <jlee@suse.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 8c234dd9b8bc..80cc7c089a15 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1527,11 +1527,12 @@ static u32 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw)
 {
 	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
 	u64 offset = nfit_blk->stat_offset + mmio->size * bw;
+	const u32 STATUS_MASK = 0x80000037;
 
 	if (mmio->num_lines)
 		offset = to_interleave_offset(offset, mmio);
 
-	return readl(mmio->addr.base + offset);
+	return readl(mmio->addr.base + offset) & STATUS_MASK;
 }
 
 static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
-- 
cgit v1.2.3


From abe8b4e3cef88b8202641d63f5ad58141b970b0f Mon Sep 17 00:00:00 2001
From: Vishal Verma <vishal.l.verma@intel.com>
Date: Wed, 27 Jul 2016 16:38:59 -0600
Subject: nvdimm, btt: add a size attribute for BTTs

To be consistent with other namespaces, expose a 'size' attribute for
BTT devices also.

Cc: Dan Williams <dan.j.williams@intel.com>
Reported-by: Linda Knippers <linda.knippers@hpe.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/btt.c      |  1 +
 drivers/nvdimm/btt_devs.c | 20 ++++++++++++++++++++
 drivers/nvdimm/nd.h       |  1 +
 3 files changed, 22 insertions(+)

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 88e91666f145..368795aad5c9 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1269,6 +1269,7 @@ static int btt_blk_init(struct btt *btt)
 		}
 	}
 	set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
+	btt->nd_btt->size = btt->nlba * (u64)btt->sector_size;
 	revalidate_disk(btt->btt_disk);
 
 	return 0;
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 3fa7919f94a8..97dd2925ed6e 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -140,10 +140,30 @@ static ssize_t namespace_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(namespace);
 
+static ssize_t size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_btt *nd_btt = to_nd_btt(dev);
+	ssize_t rc;
+
+	device_lock(dev);
+	if (dev->driver)
+		rc = sprintf(buf, "%llu\n", nd_btt->size);
+	else {
+		/* no size to convey if the btt instance is disabled */
+		rc = -ENXIO;
+	}
+	device_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(size);
+
 static struct attribute *nd_btt_attributes[] = {
 	&dev_attr_sector_size.attr,
 	&dev_attr_namespace.attr,
 	&dev_attr_uuid.attr,
+	&dev_attr_size.attr,
 	NULL,
 };
 
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 40476399d227..8024a0ef86d3 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -143,6 +143,7 @@ struct nd_btt {
 	struct nd_namespace_common *ndns;
 	struct btt *btt;
 	unsigned long lbasize;
+	u64 size;
 	u8 *uuid;
 	int id;
 };
-- 
cgit v1.2.3


From 65ea11ec6a82b1d44aba62b59e9eb20247e57c6e Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Mon, 8 Aug 2016 20:35:29 +0300
Subject: x86/hweight: Don't clobber %rdi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The caller expects %rdi to remain intact, push+pop it make that happen.

Fixes the following kind of explosions on my core2duo machine when
trying to reboot or shut down:

  general protection fault: 0000 [#1] PREEMPT SMP
  Modules linked in: i915 i2c_algo_bit drm_kms_helper cfbfillrect syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea drm netconsole configfs binfmt_misc iTCO_wdt psmouse pcspkr snd_hda_codec_idt e100 coretemp hwmon snd_hda_codec_generic i2c_i801 mii i2c_smbus lpc_ich mfd_core snd_hda_intel uhci_hcd snd_hda_codec snd_hwdep snd_hda_core ehci_pci 8250 ehci_hcd snd_pcm 8250_base usbcore evdev serial_core usb_common parport_pc parport snd_timer snd soundcore
  CPU: 0 PID: 3070 Comm: reboot Not tainted 4.8.0-rc1-perf-dirty #69
  Hardware name:                  /D946GZIS, BIOS TS94610J.86A.0087.2007.1107.1049 11/07/2007
  task: ffff88012a0b4080 task.stack: ffff880123850000
  RIP: 0010:[<ffffffff81003c92>]  [<ffffffff81003c92>] x86_perf_event_update+0x52/0xc0
  RSP: 0018:ffff880123853b60  EFLAGS: 00010087
  RAX: 0000000000000001 RBX: ffff88012fc0a3c0 RCX: 000000000000001e
  RDX: 0000000000000000 RSI: 0000000040000000 RDI: ffff88012b014800
  RBP: ffff880123853b88 R08: ffffffffffffffff R09: 0000000000000000
  R10: ffffea0004a012c0 R11: ffffea0004acedc0 R12: ffffffff80000001
  R13: ffff88012b0149c0 R14: ffff88012b014800 R15: 0000000000000018
  FS:  00007f8b155cd700(0000) GS:ffff88012fc00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f8b155f5000 CR3: 000000012a2d7000 CR4: 00000000000006f0
  Stack:
   ffff88012fc0a3c0 ffff88012b014800 0000000000000004 0000000000000001
   ffff88012fc1b750 ffff880123853bb0 ffffffff81003d59 ffff88012b014800
   ffff88012fc0a3c0 ffff88012b014800 ffff880123853bd8 ffffffff81003e13
  Call Trace:
   [<ffffffff81003d59>] x86_pmu_stop+0x59/0xd0
   [<ffffffff81003e13>] x86_pmu_del+0x43/0x140
   [<ffffffff8111705d>] event_sched_out.isra.105+0xbd/0x260
   [<ffffffff8111738d>] __perf_remove_from_context+0x2d/0xb0
   [<ffffffff8111745d>] __perf_event_exit_context+0x4d/0x70
   [<ffffffff810c8826>] generic_exec_single+0xb6/0x140
   [<ffffffff81117410>] ? __perf_remove_from_context+0xb0/0xb0
   [<ffffffff81117410>] ? __perf_remove_from_context+0xb0/0xb0
   [<ffffffff810c898f>] smp_call_function_single+0xdf/0x140
   [<ffffffff81113d27>] perf_event_exit_cpu_context+0x87/0xc0
   [<ffffffff81113d73>] perf_reboot+0x13/0x40
   [<ffffffff8107578a>] notifier_call_chain+0x4a/0x70
   [<ffffffff81075ad7>] __blocking_notifier_call_chain+0x47/0x60
   [<ffffffff81075b06>] blocking_notifier_call_chain+0x16/0x20
   [<ffffffff81076a1d>] kernel_restart_prepare+0x1d/0x40
   [<ffffffff81076ae2>] kernel_restart+0x12/0x60
   [<ffffffff81076d56>] SYSC_reboot+0xf6/0x1b0
   [<ffffffff811a823c>] ? mntput_no_expire+0x2c/0x1b0
   [<ffffffff811a83e4>] ? mntput+0x24/0x40
   [<ffffffff811894fc>] ? __fput+0x16c/0x1e0
   [<ffffffff811895ae>] ? ____fput+0xe/0x10
   [<ffffffff81072fc3>] ? task_work_run+0x83/0xa0
   [<ffffffff81001623>] ? exit_to_usermode_loop+0x53/0xc0
   [<ffffffff8100105a>] ? trace_hardirqs_on_thunk+0x1a/0x1c
   [<ffffffff81076e6e>] SyS_reboot+0xe/0x10
   [<ffffffff814c4ba5>] entry_SYSCALL_64_fastpath+0x18/0xa3
  Code: 7c 4c 8d af c0 01 00 00 49 89 fe eb 10 48 09 c2 4c 89 e0 49 0f b1 55 00 4c 39 e0 74 35 4d 8b a6 c0 01 00 00 41 8b 8e 60 01 00 00 <0f> 33 8b 35 6e 02 8c 00 48 c1 e2 20 85 f6 7e d2 48 89 d3 89 cf
  RIP  [<ffffffff81003c92>] x86_perf_event_update+0x52/0xc0
   RSP <ffff880123853b60>
  ---[ end trace 7ec95181faf211be ]---
  note: reboot[3070] exited with preempt_count 2

Cc: Borislav Petkov <bp@suse.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Fixes: f5967101e9de ("x86/hweight: Get rid of the special calling convention")
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/lib/hweight.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 02de3d74d2c5..8a602a1e404a 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -35,6 +35,7 @@ ENDPROC(__sw_hweight32)
 
 ENTRY(__sw_hweight64)
 #ifdef CONFIG_X86_64
+	pushq   %rdi
 	pushq   %rdx
 
 	movq    %rdi, %rdx                      # w -> t
@@ -60,6 +61,7 @@ ENTRY(__sw_hweight64)
 	shrq    $56, %rax                       # w = w_tmp >> 56
 
 	popq    %rdx
+	popq    %rdi
 	ret
 #else /* CONFIG_X86_32 */
 	/* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
-- 
cgit v1.2.3


From 65a97a67a7d7b7a953492573d5740d32510b9daa Mon Sep 17 00:00:00 2001
From: Pali Rohár <pali.rohar@gmail.com>
Date: Wed, 27 Jul 2016 09:34:42 +0200
Subject: dell-wmi: Ignore WMI event 0xe00e
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

WMI event 0xe00e is received when battery was removed or inserted.

Signed-off-by: Pali Rohár <pali.rohar@gmail.com>
Signed-off-by: Darren Hart <dvhart@linux.intel.com>
---
 drivers/platform/x86/dell-wmi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c
index d2bc092defd7..da2fe18162e1 100644
--- a/drivers/platform/x86/dell-wmi.c
+++ b/drivers/platform/x86/dell-wmi.c
@@ -110,8 +110,8 @@ static const struct key_entry dell_wmi_keymap_type_0000[] __initconst = {
 	/* BIOS error detected */
 	{ KE_IGNORE, 0xe00d, { KEY_RESERVED } },
 
-	/* Unknown, defined in ACPI DSDT */
-	/* { KE_IGNORE, 0xe00e, { KEY_RESERVED } }, */
+	/* Battery was removed or inserted */
+	{ KE_IGNORE, 0xe00e, { KEY_RESERVED } },
 
 	/* Wifi Catcher */
 	{ KE_KEY,    0xe011, { KEY_PROG2 } },
-- 
cgit v1.2.3


From 574673c231a5fad1560249cc3a598907acb36cf9 Mon Sep 17 00:00:00 2001
From: Andreas Ziegler <andreas.ziegler@fau.de>
Date: Thu, 4 Aug 2016 09:52:09 +0200
Subject: printk: Remove unnecessary #ifdef CONFIG_PRINTK

In commit 874f9c7da9a4 ("printk: create pr_<level> functions"), new
pr_level defines were added to printk.c.

These new defines are guarded by an #ifdef CONFIG_PRINTK - however,
there is already a surrounding #ifdef CONFIG_PRINTK starting a lot
earlier in line 249 which means the newly introduced #ifdef is
unnecessary.

Let's remove it to avoid confusion.

Signed-off-by: Andreas Ziegler <andreas.ziegler@fau.de>
Cc: Joe Perches <joe@perches.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/printk.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a5ef95ca18c9..a37fc8cf8e84 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1930,7 +1930,6 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
-#ifdef CONFIG_PRINTK
 #define define_pr_level(func, loglevel)				\
 asmlinkage __visible void func(const char *fmt, ...)		\
 {								\
@@ -1949,7 +1948,6 @@ define_pr_level(__pr_err, LOGLEVEL_ERR);
 define_pr_level(__pr_warn, LOGLEVEL_WARNING);
 define_pr_level(__pr_notice, LOGLEVEL_NOTICE);
 define_pr_level(__pr_info, LOGLEVEL_INFO);
-#endif
 
 int vprintk_default(int level, const char *fmt, va_list args)
 {
-- 
cgit v1.2.3


From c22e853a2ed19321d00c1eae339ffdc4f5e7757e Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyj.lk@gmail.com>
Date: Sat, 30 Jul 2016 00:37:57 +0000
Subject: libceph: fix return value check in alloc_msg_with_page_vector()

In case of error, the function ceph_alloc_page_vector() returns
ERR_PTR() and never returns NULL. The NULL test in the return value
check should be replaced with IS_ERR().

Fixes: 1907920324f1 ('libceph: support for sending notifies')
Signed-off-by: Wei Yongjun <weiyj.lk@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b5ec09612ff7..a97e7b506612 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -4220,7 +4220,7 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
 
 		pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
 					       GFP_NOIO);
-		if (!pages) {
+		if (IS_ERR(pages)) {
 			ceph_msg_put(m);
 			return NULL;
 		}
-- 
cgit v1.2.3


From f52ec33cbd848632559c87c9305a70fb6eb97f18 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyj.lk@gmail.com>
Date: Sat, 30 Jul 2016 00:37:31 +0000
Subject: libceph: make cancel_generic_request() static

Fixes the following sparse warning:

net/ceph/mon_client.c:577:6: warning:
 symbol 'cancel_generic_request' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyj.lk@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/mon_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index c83326c5ba58..ef34a02719d7 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -574,7 +574,7 @@ static void complete_generic_request(struct ceph_mon_generic_request *req)
 	put_generic_request(req);
 }
 
-void cancel_generic_request(struct ceph_mon_generic_request *req)
+static void cancel_generic_request(struct ceph_mon_generic_request *req)
 {
 	struct ceph_mon_client *monc = req->monc;
 	struct ceph_mon_generic_request *lookup_req;
-- 
cgit v1.2.3


From 864364a29c26ed83b3eeca5fa278468dc3ae9ed4 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyj.lk@gmail.com>
Date: Sat, 30 Jul 2016 00:38:26 +0000
Subject: libceph: using kfree_rcu() to simplify the code

The callback function of call_rcu() just calls a kfree(), so we
can use kfree_rcu() instead of call_rcu() + callback function.

Signed-off-by: Wei Yongjun <weiyj.lk@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/string_table.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
index ca53c8319209..22fb96efcf34 100644
--- a/net/ceph/string_table.c
+++ b/net/ceph/string_table.c
@@ -84,12 +84,6 @@ retry:
 }
 EXPORT_SYMBOL(ceph_find_or_create_string);
 
-static void ceph_free_string(struct rcu_head *head)
-{
-	struct ceph_string *cs = container_of(head, struct ceph_string, rcu);
-	kfree(cs);
-}
-
 void ceph_release_string(struct kref *ref)
 {
 	struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
@@ -101,7 +95,7 @@ void ceph_release_string(struct kref *ref)
 	}
 	spin_unlock(&string_tree_lock);
 
-	call_rcu(&cs->rcu, ceph_free_string);
+	kfree_rcu(cs, rcu);
 }
 EXPORT_SYMBOL(ceph_release_string);
 
-- 
cgit v1.2.3


From e4d2b16a445f85b7dc3fc67b21756eca515e0c74 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Thu, 4 Aug 2016 08:43:33 +0800
Subject: ceph: fix null pointer dereference in ceph_flush_snaps()

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/caps.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 99115cae1652..16e6ded0b7f2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1347,9 +1347,12 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
 {
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
-	struct ceph_mds_session *session = *psession;
+	struct ceph_mds_session *session = NULL;
 	int mds;
+
 	dout("ceph_flush_snaps %p\n", inode);
+	if (psession)
+		session = *psession;
 retry:
 	spin_lock(&ci->i_ceph_lock);
 	if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
-- 
cgit v1.2.3


From 6b6dddbe11b13bb00e0f9a1af2021e266811be85 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 5 Aug 2016 16:15:38 +0200
Subject: rbd: destroy header_oloc in rbd_dev_release()

Purely cosmetic at this point, as rbd doesn't use RADOS namespaces and
hence rbd_dev->header_oloc->pool_ns is always NULL.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1a04af6d2421..07668a6f0607 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3950,6 +3950,7 @@ static void rbd_dev_release(struct device *dev)
 	bool need_put = !!rbd_dev->opts;
 
 	ceph_oid_destroy(&rbd_dev->header_oid);
+	ceph_oloc_destroy(&rbd_dev->header_oloc);
 
 	rbd_put_client(rbd_dev->rbd_client);
 	rbd_spec_put(rbd_dev->spec);
-- 
cgit v1.2.3


From 1bd4403d86a1c06cb6cc9ac87664a0c9d3413d51 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 8 Aug 2016 13:02:01 -0700
Subject: unsafe_[get|put]_user: change interface to use a error target label

When I initially added the unsafe_[get|put]_user() helpers in commit
5b24a7a2aa20 ("Add 'unsafe' user access functions for batched
accesses"), I made the mistake of modeling the interface on our
traditional __[get|put]_user() functions, which return zero on success,
or -EFAULT on failure.

That interface is fairly easy to use, but it's actually fairly nasty for
good code generation, since it essentially forces the caller to check
the error value for each access.

In particular, since the error handling is already internally
implemented with an exception handler, and we already use "asm goto" for
various other things, we could fairly easily make the error cases just
jump directly to an error label instead, and avoid the need for explicit
checking after each operation.

So switch the interface to pass in an error label, rather than checking
the error value in the caller.  Best do it now before we start growing
more users (the signal handling code in particular would be a good place
to use the new interface).

So rather than

	if (unsafe_get_user(x, ptr))
		... handle error ..

the interface is now

	unsafe_get_user(x, ptr, label);

where an error during the user mode fetch will now just cause a jump to
'label' in the caller.

Right now the actual _implementation_ of this all still ends up being a
"if (err) goto label", and does not take advantage of any exception
label tricks, but for "unsafe_put_user()" in particular it should be
fairly straightforward to convert to using the exception table model.

Note that "unsafe_get_user()" is much harder to convert to a clever
exception table model, because current versions of gcc do not allow the
use of "asm goto" (for the exception) with output values (for the actual
value to be fetched).  But that is hopefully not a limitation in the
long term.

[ Also note that it might be a good idea to switch unsafe_get_user() to
  actually _return_ the value it fetches from user space, but this
  commit only changes the error handling semantics ]

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/uaccess.h | 16 ++++++++--------
 include/linux/uaccess.h        |  4 ++--
 lib/strncpy_from_user.c        |  8 ++++----
 lib/strnlen_user.c             |  7 +++----
 4 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c03bfb68c503..52f230094c51 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -812,21 +812,21 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
 #define user_access_begin()	__uaccess_begin()
 #define user_access_end()	__uaccess_end()
 
-#define unsafe_put_user(x, ptr)						\
-({										\
+#define unsafe_put_user(x, ptr, err_label)					\
+do {										\
 	int __pu_err;								\
 	__put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT);		\
-	__builtin_expect(__pu_err, 0);						\
-})
+	if (unlikely(__pu_err)) goto err_label;					\
+} while (0)
 
-#define unsafe_get_user(x, ptr)						\
-({										\
+#define unsafe_get_user(x, ptr, err_label)					\
+do {										\
 	int __gu_err;								\
 	unsigned long __gu_val;							\
 	__get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT);	\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;				\
-	__builtin_expect(__gu_err, 0);						\
-})
+	if (unlikely(__gu_err)) goto err_label;					\
+} while (0)
 
 #endif /* _ASM_X86_UACCESS_H */
 
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 349557825428..f30c187ed785 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -114,8 +114,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
 #ifndef user_access_begin
 #define user_access_begin() do { } while (0)
 #define user_access_end() do { } while (0)
-#define unsafe_get_user(x, ptr) __get_user(x, ptr)
-#define unsafe_put_user(x, ptr) __put_user(x, ptr)
+#define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0)
+#define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
 #endif
 
 #endif		/* __LINUX_UACCESS_H__ */
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 33f655ef48cd..9c5fe8110413 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -40,8 +40,8 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, long
 		unsigned long c, data;
 
 		/* Fall back to byte-at-a-time if we get a page fault */
-		if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res))))
-			break;
+		unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time);
+
 		*(unsigned long *)(dst+res) = c;
 		if (has_zero(c, &data, &constants)) {
 			data = prep_zero_mask(c, data, &constants);
@@ -56,8 +56,7 @@ byte_at_a_time:
 	while (max) {
 		char c;
 
-		if (unlikely(unsafe_get_user(c,src+res)))
-			return -EFAULT;
+		unsafe_get_user(c,src+res, efault);
 		dst[res] = c;
 		if (!c)
 			return res;
@@ -76,6 +75,7 @@ byte_at_a_time:
 	 * Nope: we hit the address space limit, and we still had more
 	 * characters the caller would have wanted. That's an EFAULT.
 	 */
+efault:
 	return -EFAULT;
 }
 
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index 2625943625d7..8e105ed4df12 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -45,8 +45,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
 	src -= align;
 	max += align;
 
-	if (unlikely(unsafe_get_user(c,(unsigned long __user *)src)))
-		return 0;
+	unsafe_get_user(c, (unsigned long __user *)src, efault);
 	c |= aligned_byte_mask(align);
 
 	for (;;) {
@@ -61,8 +60,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
 		if (unlikely(max <= sizeof(unsigned long)))
 			break;
 		max -= sizeof(unsigned long);
-		if (unlikely(unsafe_get_user(c,(unsigned long __user *)(src+res))))
-			return 0;
+		unsafe_get_user(c, (unsigned long __user *)(src+res), efault);
 	}
 	res -= align;
 
@@ -77,6 +75,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count,
 	 * Nope: we hit the address space limit, and we still had more
 	 * characters the caller would have wanted. That's 0.
 	 */
+efault:
 	return 0;
 }
 
-- 
cgit v1.2.3


From e4630fdd47637168927905983205d7b7c5c08c09 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 8 Aug 2016 15:31:31 +0200
Subject: x86/power/64: Always create temporary identity mapping correctly

The low-level resume-from-hibernation code on x86-64 uses
kernel_ident_mapping_init() to create the temoprary identity mapping,
but that function assumes that the offset between kernel virtual
addresses and physical addresses is aligned on the PGD level.

However, with a randomized identity mapping base, it may be aligned
on the PUD level and if that happens, the temporary identity mapping
created by set_up_temporary_mappings() will not reflect the actual
kernel identity mapping and the image restoration will fail as a
result (leading to a kernel panic most of the time).

To fix this problem, rework kernel_ident_mapping_init() to support
unaligned offsets between KVA and PA up to the PMD level and make
set_up_temporary_mappings() use it as approprtiate.

Reported-and-tested-by: Thomas Garnier <thgarnie@google.com>
Reported-by: Borislav Petkov <bp@suse.de>
Suggested-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
---
 arch/x86/include/asm/init.h   |  4 ++--
 arch/x86/mm/ident_map.c       | 19 +++++++++++--------
 arch/x86/power/hibernate_64.c |  2 +-
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 223042086f4e..737da62bfeb0 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,10 +5,10 @@ struct x86_mapping_info {
 	void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
 	void *context;			 /* context for alloc_pgt_page */
 	unsigned long pmd_flag;		 /* page flag for PMD entry */
-	bool kernel_mapping;		 /* kernel mapping or ident mapping */
+	unsigned long offset;		 /* ident mapping offset */
 };
 
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
-				unsigned long addr, unsigned long end);
+				unsigned long pstart, unsigned long pend);
 
 #endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index ec21796ac5fd..4473cb4f8b90 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -3,15 +3,17 @@
  * included by both the compressed kernel and the regular kernel.
  */
 
-static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
 			   unsigned long addr, unsigned long end)
 {
 	addr &= PMD_MASK;
 	for (; addr < end; addr += PMD_SIZE) {
 		pmd_t *pmd = pmd_page + pmd_index(addr);
 
-		if (!pmd_present(*pmd))
-			set_pmd(pmd, __pmd(addr | pmd_flag));
+		if (pmd_present(*pmd))
+			continue;
+
+		set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag));
 	}
 }
 
@@ -30,13 +32,13 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 
 		if (pud_present(*pud)) {
 			pmd = pmd_offset(pud, 0);
-			ident_pmd_init(info->pmd_flag, pmd, addr, next);
+			ident_pmd_init(info, pmd, addr, next);
 			continue;
 		}
 		pmd = (pmd_t *)info->alloc_pgt_page(info->context);
 		if (!pmd)
 			return -ENOMEM;
-		ident_pmd_init(info->pmd_flag, pmd, addr, next);
+		ident_pmd_init(info, pmd, addr, next);
 		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 	}
 
@@ -44,14 +46,15 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
 }
 
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
-			      unsigned long addr, unsigned long end)
+			      unsigned long pstart, unsigned long pend)
 {
+	unsigned long addr = pstart + info->offset;
+	unsigned long end = pend + info->offset;
 	unsigned long next;
 	int result;
-	int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
 
 	for (; addr < end; addr = next) {
-		pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+		pgd_t *pgd = pgd_page + pgd_index(addr);
 		pud_t *pud;
 
 		next = (addr & PGDIR_MASK) + PGDIR_SIZE;
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index f0b5f2d402af..a3e3ccc87138 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -87,7 +87,7 @@ static int set_up_temporary_mappings(void)
 	struct x86_mapping_info info = {
 		.alloc_pgt_page	= alloc_pgt_page,
 		.pmd_flag	= __PAGE_KERNEL_LARGE_EXEC,
-		.kernel_mapping = true,
+		.offset		= __PAGE_OFFSET,
 	};
 	unsigned long mstart, mend;
 	pgd_t *pgd;
-- 
cgit v1.2.3


From c8952a707556e04374d7b2fdb3a079d63ddf6f2f Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 8 Aug 2016 16:16:23 -0600
Subject: vfio/pci: Fix NULL pointer oops in error interrupt setup handling

There are multiple cases in vfio_pci_set_ctx_trigger_single() where
we assume we can safely read from our data pointer without actually
checking whether the user has passed any data via the count field.
VFIO_IRQ_SET_DATA_NONE in particular is entirely broken since we
attempt to pull an int32_t file descriptor out before even checking
the data type.  The other data types assume the data pointer contains
one element of their type as well.

In part this is good news because we were previously restricted from
doing much sanitization of parameters because it was missed in the
past and we didn't want to break existing users.  Clearly DATA_NONE
is completely broken, so it must not have any users and we can fix
it up completely.  For DATA_BOOL and DATA_EVENTFD, we'll just
protect ourselves, returning error when count is zero since we
previously would have oopsed.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reported-by: Chris Thompson <the_cartographer@hotmail.com>
Cc: stable@vger.kernel.org
Reviewed-by: Eric Auger <eric.auger@redhat.com>
---
 drivers/vfio/pci/vfio_pci_intrs.c | 85 ++++++++++++++++++++++-----------------
 1 file changed, 49 insertions(+), 36 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 15ecfc9c5f6c..152b43822ef1 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -564,67 +564,80 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
 }
 
 static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
-					   uint32_t flags, void *data)
+					   unsigned int count, uint32_t flags,
+					   void *data)
 {
-	int32_t fd = *(int32_t *)data;
-
-	if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
-		return -EINVAL;
-
 	/* DATA_NONE/DATA_BOOL enables loopback testing */
 	if (flags & VFIO_IRQ_SET_DATA_NONE) {
-		if (*ctx)
-			eventfd_signal(*ctx, 1);
-		return 0;
+		if (*ctx) {
+			if (count) {
+				eventfd_signal(*ctx, 1);
+			} else {
+				eventfd_ctx_put(*ctx);
+				*ctx = NULL;
+			}
+			return 0;
+		}
 	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
-		uint8_t trigger = *(uint8_t *)data;
+		uint8_t trigger;
+
+		if (!count)
+			return -EINVAL;
+
+		trigger = *(uint8_t *)data;
 		if (trigger && *ctx)
 			eventfd_signal(*ctx, 1);
-		return 0;
-	}
 
-	/* Handle SET_DATA_EVENTFD */
-	if (fd == -1) {
-		if (*ctx)
-			eventfd_ctx_put(*ctx);
-		*ctx = NULL;
 		return 0;
-	} else if (fd >= 0) {
-		struct eventfd_ctx *efdctx;
-		efdctx = eventfd_ctx_fdget(fd);
-		if (IS_ERR(efdctx))
-			return PTR_ERR(efdctx);
-		if (*ctx)
-			eventfd_ctx_put(*ctx);
-		*ctx = efdctx;
+	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+		int32_t fd;
+
+		if (!count)
+			return -EINVAL;
+
+		fd = *(int32_t *)data;
+		if (fd == -1) {
+			if (*ctx)
+				eventfd_ctx_put(*ctx);
+			*ctx = NULL;
+		} else if (fd >= 0) {
+			struct eventfd_ctx *efdctx;
+
+			efdctx = eventfd_ctx_fdget(fd);
+			if (IS_ERR(efdctx))
+				return PTR_ERR(efdctx);
+
+			if (*ctx)
+				eventfd_ctx_put(*ctx);
+
+			*ctx = efdctx;
+		}
 		return 0;
-	} else
-		return -EINVAL;
+	}
+
+	return -EINVAL;
 }
 
 static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
 				    unsigned index, unsigned start,
 				    unsigned count, uint32_t flags, void *data)
 {
-	if (index != VFIO_PCI_ERR_IRQ_INDEX)
+	if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1)
 		return -EINVAL;
 
-	/*
-	 * We should sanitize start & count, but that wasn't caught
-	 * originally, so this IRQ index must forever ignore them :-(
-	 */
-
-	return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data);
+	return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger,
+					       count, flags, data);
 }
 
 static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev,
 				    unsigned index, unsigned start,
 				    unsigned count, uint32_t flags, void *data)
 {
-	if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1)
+	if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1)
 		return -EINVAL;
 
-	return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data);
+	return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger,
+					       count, flags, data);
 }
 
 int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
-- 
cgit v1.2.3


From e10aec652f31ec61d6a0b4d00d8ef8d2b66fa0fd Mon Sep 17 00:00:00 2001
From: Mario Kleiner <mario.kleiner.de@gmail.com>
Date: Wed, 6 Jul 2016 12:05:44 +0200
Subject: drm/edid: Add 6 bpc quirk for display AEO model 0.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bugzilla https://bugzilla.kernel.org/show_bug.cgi?id=105331
reports that the "AEO model 0" display is driven with 8 bpc
without dithering by default, which looks bad because that
panel is apparently a 6 bpc DP panel with faulty EDID.

A fix for this was made by commit 013dd9e03872
("drm/i915/dp: fall back to 18 bpp when sink capability is unknown").

That commit triggers new regressions in precision for DP->DVI and
DP->VGA displays. A patch is out to revert that commit, but it will
revert video output for the AEO model 0 panel to 8 bpc without
dithering.

The EDID 1.3 of that panel, as decoded from the xrandr output
attached to that bugzilla bug report, is somewhat faulty, and beyond
other problems also sets the "DFP 1.x compliant TMDS" bit, which
according to DFP spec means to drive the panel with 8 bpc and
no dithering in absence of other colorimetry information.

Try to make the original bug reporter happy despite the
faulty EDID by adding a quirk to mark that panel as 6 bpc,
so 6 bpc output with dithering creates a nice picture.

Tested by injecting the edid from the fdo bug into a DP connector
via drm_kms_helper.edid_firmware and verifying the 6 bpc + dithering
is selected.

This patch should be backported to stable.

Signed-off-by: Mario Kleiner <mario.kleiner.de@gmail.com>
Cc: stable@vger.kernel.org
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/drm_edid.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 7df26d4b7ad8..2cb472b9976a 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -74,6 +74,8 @@
 #define EDID_QUIRK_FORCE_8BPC			(1 << 8)
 /* Force 12bpc */
 #define EDID_QUIRK_FORCE_12BPC			(1 << 9)
+/* Force 6bpc */
+#define EDID_QUIRK_FORCE_6BPC			(1 << 10)
 
 struct detailed_mode_closure {
 	struct drm_connector *connector;
@@ -100,6 +102,9 @@ static struct edid_quirk {
 	/* Unknown Acer */
 	{ "ACR", 2423, EDID_QUIRK_FIRST_DETAILED_PREFERRED },
 
+	/* AEO model 0 reports 8 bpc, but is a 6 bpc panel */
+	{ "AEO", 0, EDID_QUIRK_FORCE_6BPC },
+
 	/* Belinea 10 15 55 */
 	{ "MAX", 1516, EDID_QUIRK_PREFER_LARGE_60 },
 	{ "MAX", 0x77e, EDID_QUIRK_PREFER_LARGE_60 },
@@ -4082,6 +4087,9 @@ int drm_add_edid_modes(struct drm_connector *connector, struct edid *edid)
 
 	drm_add_display_info(edid, &connector->display_info, connector);
 
+	if (quirks & EDID_QUIRK_FORCE_6BPC)
+		connector->display_info.bpc = 6;
+
 	if (quirks & EDID_QUIRK_FORCE_8BPC)
 		connector->display_info.bpc = 8;
 
-- 
cgit v1.2.3


From 196f954e250943df414efd3d632254c29be38e59 Mon Sep 17 00:00:00 2001
From: Mario Kleiner <mario.kleiner.de@gmail.com>
Date: Wed, 6 Jul 2016 12:05:45 +0200
Subject: drm/i915/dp: Revert "drm/i915/dp: fall back to 18 bpp when sink
 capability is unknown"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 013dd9e03872
("drm/i915/dp: fall back to 18 bpp when sink capability is unknown")

This commit introduced a regression into stable kernels,
as it reduces output color depth to 6 bpc for any video
sink connected to a Displayport connector if that sink
doesn't report a specific color depth via EDID, or if
our EDID parser doesn't actually recognize the proper
bpc from EDID.

Affected are active DisplayPort->VGA converters and
active DisplayPort->DVI converters. Both should be
able to handle 8 bpc, but are degraded to 6 bpc with
this patch.

The reverted commit was meant to fix
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=105331

A followup patch implements a fix for that specific bug,
which is caused by a faulty EDID of the affected DP panel
by adding a new EDID quirk for that panel.

DP 18 bpp fallback handling and other improvements to
DP sink bpc detection will be handled for future
kernels in a separate series of patches.

Please backport to stable.

Signed-off-by: Mario Kleiner <mario.kleiner.de@gmail.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Cc: stable@vger.kernel.org
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/i915/intel_display.c | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index baeb75388dbe..dcf93b3d4fb6 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -12106,21 +12106,11 @@ connected_sink_compute_bpp(struct intel_connector *connector,
 		pipe_config->pipe_bpp = connector->base.display_info.bpc*3;
 	}
 
-	/* Clamp bpp to default limit on screens without EDID 1.4 */
-	if (connector->base.display_info.bpc == 0) {
-		int type = connector->base.connector_type;
-		int clamp_bpp = 24;
-
-		/* Fall back to 18 bpp when DP sink capability is unknown. */
-		if (type == DRM_MODE_CONNECTOR_DisplayPort ||
-		    type == DRM_MODE_CONNECTOR_eDP)
-			clamp_bpp = 18;
-
-		if (bpp > clamp_bpp) {
-			DRM_DEBUG_KMS("clamping display bpp (was %d) to default limit of %d\n",
-				      bpp, clamp_bpp);
-			pipe_config->pipe_bpp = clamp_bpp;
-		}
+	/* Clamp bpp to 8 on screens without EDID 1.4 */
+	if (connector->base.display_info.bpc == 0 && bpp > 24) {
+		DRM_DEBUG_KMS("clamping display bpp (was %d) to default limit of 24\n",
+			      bpp);
+		pipe_config->pipe_bpp = 24;
 	}
 }
 
-- 
cgit v1.2.3


From 210a021dab639694600450c14b877bf3e3240adc Mon Sep 17 00:00:00 2001
From: Mario Kleiner <mario.kleiner.de@gmail.com>
Date: Wed, 6 Jul 2016 12:05:48 +0200
Subject: drm/edid: Set 8 bpc color depth for displays with "DFP 1.x compliant
 TMDS".
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to E-EDID spec 1.3, table 3.9, a digital video sink with the
"DFP 1.x compliant TMDS" bit set is "signal compatible with VESA DFP 1.x
TMDS CRGB, 1 pixel / clock, up to 8 bits / color MSB aligned".

For such displays, the DFP spec 1.0, section 3.10 "EDID support" says:

"If the DFP monitor only supports EDID 1.X (1.1, 1.2, etc.)
 without extensions, the host will make the following assumptions:

 1. 24-bit MSB-aligned RGB TFT
 2. DE polarity is active high
 3. H and V syncs are active high
 4. Established CRT timings will be used
 5. Dithering will not be enabled on the host"

So if we don't know the bit depth of the display from additional
colorimetry info we should assume 8 bpc / 24 bpp by default.

This patch adds info->bpc = 8 assignement for that case.

Signed-off-by: Mario Kleiner <mario.kleiner.de@gmail.com>
Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/drm_edid.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 2cb472b9976a..637a0aa4d3a0 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -3867,6 +3867,20 @@ static void drm_add_display_info(struct edid *edid,
 	/* HDMI deep color modes supported? Assign to info, if so */
 	drm_assign_hdmi_deep_color_info(edid, info, connector);
 
+	/*
+	 * Digital sink with "DFP 1.x compliant TMDS" according to EDID 1.3?
+	 *
+	 * For such displays, the DFP spec 1.0, section 3.10 "EDID support"
+	 * tells us to assume 8 bpc color depth if the EDID doesn't have
+	 * extensions which tell otherwise.
+	 */
+	if ((info->bpc == 0) && (edid->revision < 4) &&
+	    (edid->input & DRM_EDID_DIGITAL_TYPE_DVI)) {
+		info->bpc = 8;
+		DRM_DEBUG("%s: Assigning DFP sink color depth as %d bpc.\n",
+			  connector->name, info->bpc);
+	}
+
 	/* Only defined for 1.4 with digital displays */
 	if (edid->revision < 4)
 		return;
-- 
cgit v1.2.3


From d26e94149276f8c3d4911aa2c8395ba99b613c8d Mon Sep 17 00:00:00 2001
From: Emese Revfy <re.emese@gmail.com>
Date: Fri, 17 Jun 2016 23:28:34 -0700
Subject: kbuild: no gcc-plugins during cc-option tests

The gcc-plugins arguments should not be included when performing
cc-option tests.

Steps to reproduce:
1) make mrproper
2) make defconfig
3) enable GCC_PLUGINS, GCC_PLUGIN_CYC_COMPLEXITY
4) enable FUNCTION_TRACER (it will select other options as well)
5) make && make modules

Build errors:
MODPOST 18 modules
ERROR: "__fentry__" [net/netfilter/xt_nat.ko] undefined!
ERROR: "__fentry__" [net/netfilter/xt_mark.ko] undefined!
ERROR: "__fentry__" [net/netfilter/xt_addrtype.ko] undefined!
ERROR: "__fentry__" [net/netfilter/xt_LOG.ko] undefined!
ERROR: "__fentry__" [net/netfilter/nf_nat_sip.ko] undefined!
ERROR: "__fentry__" [net/netfilter/nf_nat_irc.ko] undefined!
ERROR: "__fentry__" [net/netfilter/nf_nat_ftp.ko] undefined!
ERROR: "__fentry__" [net/netfilter/nf_nat.ko] undefined!

Reported-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Emese Revfy <re.emese@gmail.com>
[kees: renamed variable, clarified commit message]
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 scripts/Kbuild.include | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index 15b196fc2f49..179219845dfc 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -108,16 +108,20 @@ as-option = $(call try-run,\
 as-instr = $(call try-run,\
 	printf "%b\n" "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3))
 
+# Do not attempt to build with gcc plugins during cc-option tests.
+# (And this uses delayed resolution so the flags will be up to date.)
+CC_OPTION_CFLAGS = $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS))
+
 # cc-option
 # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586)
 
 cc-option = $(call try-run,\
-	$(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
+	$(CC) $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
 
 # cc-option-yn
 # Usage: flag := $(call cc-option-yn,-march=winchip-c6)
 cc-option-yn = $(call try-run,\
-	$(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n)
+	$(CC) $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n)
 
 # cc-option-align
 # Prefix align with either -falign or -malign
@@ -127,7 +131,7 @@ cc-option-align = $(subst -functions=0,,\
 # cc-disable-warning
 # Usage: cflags-y += $(call cc-disable-warning,unused-but-set-variable)
 cc-disable-warning = $(call try-run,\
-	$(CC) $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
+	$(CC) $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1)))
 
 # cc-name
 # Expands to either gcc or clang
-- 
cgit v1.2.3


From ed58c0e9eefef517aa5a547b78658e2ab4422232 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 17 Jun 2016 23:11:12 -0700
Subject: gcc-plugins: abort builds cleanly when not supported

When the compiler doesn't support gcc plugins (either due to missing
headers or too old a version), report the problem and abort the build
instead of emitting a warning and letting the build founder with arcane
compiler errors.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Makefile                     |  7 -------
 scripts/Makefile.gcc-plugins | 34 +++++++++++++++++++++++++---------
 scripts/gcc-plugin.sh        | 14 ++++++++++++++
 3 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/Makefile b/Makefile
index 70de1448c571..8c504f324154 100644
--- a/Makefile
+++ b/Makefile
@@ -635,13 +635,6 @@ endif
 # Tell gcc to never replace conditional load with a non-conditional one
 KBUILD_CFLAGS	+= $(call cc-option,--param=allow-store-data-races=0)
 
-PHONY += gcc-plugins
-gcc-plugins: scripts_basic
-ifdef CONFIG_GCC_PLUGINS
-	$(Q)$(MAKE) $(build)=scripts/gcc-plugins
-endif
-	@:
-
 include scripts/Makefile.gcc-plugins
 
 ifdef CONFIG_READABLE_ASM
diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
index 5e22b60589c1..bbca1f46f6e4 100644
--- a/scripts/Makefile.gcc-plugins
+++ b/scripts/Makefile.gcc-plugins
@@ -23,21 +23,37 @@ ifdef CONFIG_GCC_PLUGINS
 
   export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN SANCOV_PLUGIN
 
+  ifneq ($(PLUGINCC),)
+    # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication.
+    GCC_PLUGINS_CFLAGS := $(filter-out $(SANCOV_PLUGIN), $(GCC_PLUGINS_CFLAGS))
+  endif
+
+  KBUILD_CFLAGS += $(GCC_PLUGINS_CFLAGS)
+  GCC_PLUGIN := $(gcc-plugin-y)
+endif
+
+# If plugins aren't supported, abort the build before hard-to-read compiler
+# errors start getting spewed by the main build.
+PHONY += gcc-plugins-check
+gcc-plugins-check: FORCE
+ifdef CONFIG_GCC_PLUGINS
   ifeq ($(PLUGINCC),)
     ifneq ($(GCC_PLUGINS_CFLAGS),)
       ifeq ($(call cc-ifversion, -ge, 0405, y), y)
-        PLUGINCC := $(shell $(CONFIG_SHELL) -x $(srctree)/scripts/gcc-plugin.sh "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)")
-        $(warning warning: your gcc installation does not support plugins, perhaps the necessary headers are missing?)
+	$(Q)$(srctree)/scripts/gcc-plugin.sh --show-error "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)" || true
+	@echo "Cannot use CONFIG_GCC_PLUGINS: your gcc installation does not support plugins, perhaps the necessary headers are missing?" >&2 && exit 1
       else
-        $(warning warning: your gcc version does not support plugins, you should upgrade it to gcc 4.5 at least)
+	@echo "Cannot use CONFIG_GCC_PLUGINS: your gcc version does not support plugins, you should upgrade it to at least gcc 4.5" >&2 && exit 1
       endif
     endif
-  else
-    # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication.
-    GCC_PLUGINS_CFLAGS := $(filter-out $(SANCOV_PLUGIN), $(GCC_PLUGINS_CFLAGS))
   endif
+endif
+	@:
 
-  KBUILD_CFLAGS += $(GCC_PLUGINS_CFLAGS)
-  GCC_PLUGIN := $(gcc-plugin-y)
-
+# Actually do the build, if requested.
+PHONY += gcc-plugins
+gcc-plugins: scripts_basic gcc-plugins-check
+ifdef CONFIG_GCC_PLUGINS
+	$(Q)$(MAKE) $(build)=scripts/gcc-plugins
 endif
+	@:
diff --git a/scripts/gcc-plugin.sh b/scripts/gcc-plugin.sh
index fb9207565471..b65224bfb847 100755
--- a/scripts/gcc-plugin.sh
+++ b/scripts/gcc-plugin.sh
@@ -1,5 +1,12 @@
 #!/bin/sh
 srctree=$(dirname "$0")
+
+SHOW_ERROR=
+if [ "$1" = "--show-error" ] ; then
+	SHOW_ERROR=1
+	shift || true
+fi
+
 gccplugins_dir=$($3 -print-file-name=plugin)
 plugincc=$($1 -E -x c++ - -o /dev/null -I"${srctree}"/gcc-plugins -I"${gccplugins_dir}"/include 2>&1 <<EOF
 #include "gcc-common.h"
@@ -13,6 +20,9 @@ EOF
 
 if [ $? -ne 0 ]
 then
+	if [ -n "$SHOW_ERROR" ] ; then
+		echo "${plugincc}" >&2
+	fi
 	exit 1
 fi
 
@@ -48,4 +58,8 @@ then
 	echo "$2"
 	exit 0
 fi
+
+if [ -n "$SHOW_ERROR" ] ; then
+	echo "${plugincc}" >&2
+fi
 exit 1
-- 
cgit v1.2.3


From 65d59ec8ad99e9d932ccca4daabb8e3fdafeb11b Mon Sep 17 00:00:00 2001
From: Emese Revfy <re.emese@gmail.com>
Date: Mon, 20 Jun 2016 20:40:24 +0200
Subject: gcc-plugins: Add support for passing plugin arguments

The latent_entropy plugin needs to pass arguments, so this adds the
support.

Signed-off-by: Emese Revfy <re.emese@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 scripts/Makefile.gcc-plugins | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
index bbca1f46f6e4..ffb928b9c74b 100644
--- a/scripts/Makefile.gcc-plugins
+++ b/scripts/Makefile.gcc-plugins
@@ -19,7 +19,7 @@ ifdef CONFIG_GCC_PLUGINS
     endif
   endif
 
-  GCC_PLUGINS_CFLAGS := $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y))
+  GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y))
 
   export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN SANCOV_PLUGIN
 
-- 
cgit v1.2.3


From 7040c83bfbaf2340d2f336dc7641ce909c8c2b4c Mon Sep 17 00:00:00 2001
From: Emese Revfy <re.emese@gmail.com>
Date: Sun, 26 Jun 2016 17:36:43 +0200
Subject: gcc-plugins: Automate make rule generation

There's no reason to repeat the same names in the Makefile when the .so
files have already been listed. The .o list can be generated from them.

Reported-by: PaX Team <pageexec@freemail.hu>
Signed-off-by: Emese Revfy <re.emese@gmail.com>
[kees: clarified commit message]
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 scripts/gcc-plugins/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile
index 88c8ec47232b..ee0a75d2a1c7 100644
--- a/scripts/gcc-plugins/Makefile
+++ b/scripts/gcc-plugins/Makefile
@@ -21,7 +21,6 @@ endif
 $(HOSTLIBS)-y := $(GCC_PLUGIN)
 always := $($(HOSTLIBS)-y)
 
-cyc_complexity_plugin-objs := cyc_complexity_plugin.o
-sancov_plugin-objs := sancov_plugin.o
+$(foreach p,$($(HOSTLIBS)-y:%.so=%),$(eval $(p)-objs := $(p).o))
 
 clean-files += *.so
-- 
cgit v1.2.3


From caefd8c9a9fb06811e07bf3571e5d4450846b16a Mon Sep 17 00:00:00 2001
From: Emese Revfy <re.emese@gmail.com>
Date: Sun, 26 Jun 2016 17:38:20 +0200
Subject: gcc-plugins: Add support for plugin subdirectories

This adds support for building more complex gcc plugins that live in a
subdirectory instead of just in a single source file.

Reported-by: PaX Team <pageexec@freemail.hu>
Signed-off-by: Emese Revfy <re.emese@gmail.com>
[kees: clarified commit message]
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 scripts/Makefile.gcc-plugins | 3 ++-
 scripts/gcc-plugins/Makefile | 9 ++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
index ffb928b9c74b..61f0e6db909b 100644
--- a/scripts/Makefile.gcc-plugins
+++ b/scripts/Makefile.gcc-plugins
@@ -21,7 +21,7 @@ ifdef CONFIG_GCC_PLUGINS
 
   GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y))
 
-  export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN SANCOV_PLUGIN
+  export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR SANCOV_PLUGIN
 
   ifneq ($(PLUGINCC),)
     # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication.
@@ -30,6 +30,7 @@ ifdef CONFIG_GCC_PLUGINS
 
   KBUILD_CFLAGS += $(GCC_PLUGINS_CFLAGS)
   GCC_PLUGIN := $(gcc-plugin-y)
+  GCC_PLUGIN_SUBDIR := $(gcc-plugin-subdir-y)
 endif
 
 # If plugins aren't supported, abort the build before hard-to-read compiler
diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile
index ee0a75d2a1c7..8b29dc17c73c 100644
--- a/scripts/gcc-plugins/Makefile
+++ b/scripts/gcc-plugins/Makefile
@@ -12,15 +12,18 @@ else
   export HOST_EXTRACXXFLAGS
 endif
 
-export GCCPLUGINS_DIR HOSTLIBS
-
 ifneq ($(CFLAGS_KCOV), $(SANCOV_PLUGIN))
   GCC_PLUGIN := $(filter-out $(SANCOV_PLUGIN), $(GCC_PLUGIN))
 endif
 
-$(HOSTLIBS)-y := $(GCC_PLUGIN)
+export HOSTLIBS
+
+$(HOSTLIBS)-y := $(foreach p,$(GCC_PLUGIN),$(if $(findstring /,$(p)),,$(p)))
 always := $($(HOSTLIBS)-y)
 
 $(foreach p,$($(HOSTLIBS)-y:%.so=%),$(eval $(p)-objs := $(p).o))
 
+subdir-y := $(GCC_PLUGIN_SUBDIR)
+subdir-  += $(GCC_PLUGIN_SUBDIR)
+
 clean-files += *.so
-- 
cgit v1.2.3


From 36e9d08b58f44c3a02974c405ccaaa6ecfaf05b8 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Tue, 9 Aug 2016 02:16:18 +0200
Subject: drm/cirrus: Fix NULL pointer dereference when registering the fbdev

cirrus_modeset_init() is initializing/registering the emulated fbdev
and, since commit c61b93fe51b1 ("drm/atomic: Fix remaining places where
!funcs->best_encoder is valid"), DRM internals can access/test some of
the fields in mode_config->funcs as part of the fbdev registration
process.
Make sure dev->mode_config.funcs is properly set to avoid dereferencing
a NULL pointer.

Reported-by: Mike Marshall <hubcap@omnibond.com>
Reported-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Fixes: c61b93fe51b1 ("drm/atomic: Fix remaining places where !funcs->best_encoder is valid")
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 drivers/gpu/drm/cirrus/cirrus_main.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/cirrus/cirrus_main.c b/drivers/gpu/drm/cirrus/cirrus_main.c
index 80446e2d3ab6..76bcb43e7c06 100644
--- a/drivers/gpu/drm/cirrus/cirrus_main.c
+++ b/drivers/gpu/drm/cirrus/cirrus_main.c
@@ -185,14 +185,23 @@ int cirrus_driver_load(struct drm_device *dev, unsigned long flags)
 		goto out;
 	}
 
+	/*
+	 * cirrus_modeset_init() is initializing/registering the emulated fbdev
+	 * and DRM internals can access/test some of the fields in
+	 * mode_config->funcs as part of the fbdev registration process.
+	 * Make sure dev->mode_config.funcs is properly set to avoid
+	 * dereferencing a NULL pointer.
+	 * FIXME: mode_config.funcs assignment should probably be done in
+	 * cirrus_modeset_init() (that's a common pattern seen in other DRM
+	 * drivers).
+	 */
+	dev->mode_config.funcs = &cirrus_mode_funcs;
 	r = cirrus_modeset_init(cdev);
 	if (r) {
 		dev_err(&dev->pdev->dev, "Fatal error during modeset init: %d\n", r);
 		goto out;
 	}
 
-	dev->mode_config.funcs = (void *)&cirrus_mode_funcs;
-
 	return 0;
 out:
 	cirrus_driver_unload(dev);
-- 
cgit v1.2.3


From d2cf5be07ff7c7cde8bef8551a30e8e86e2037a7 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 4 Aug 2016 16:38:15 +1000
Subject: crypto: crc32c-vpmsum - Convert to CPU feature based module
 autoloading

This patch utilises the GENERIC_CPU_AUTOPROBE infrastructure
to automatically load the crc32c-vpmsum module if the CPU supports
it.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/crypto/crc32c-vpmsum_glue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
index bfe3d37a24ef..9fa046d56eba 100644
--- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c
+++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c
@@ -4,6 +4,7 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/cpufeature.h>
 #include <asm/switch_to.h>
 
 #define CHKSUM_BLOCK_SIZE	1
@@ -157,7 +158,7 @@ static void __exit crc32c_vpmsum_mod_fini(void)
 	crypto_unregister_shash(&alg);
 }
 
-module_init(crc32c_vpmsum_mod_init);
+module_cpu_feature_match(PPC_MODULE_FEATURE_VEC_CRYPTO, crc32c_vpmsum_mod_init);
 module_exit(crc32c_vpmsum_mod_fini);
 
 MODULE_AUTHOR("Anton Blanchard <anton@samba.org>");
-- 
cgit v1.2.3


From 880a3d6afd068682d6386a0528be1217541d3d8e Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Tue, 2 Aug 2016 12:39:43 +1000
Subject: powerpc/xics: Properly set Edge/Level type and enable resend

This sets the type of the interrupt appropriately. We set it as follow:

 - If not mapped from the device-tree, we use edge. This is the case
of the virtual interrupts and PCI MSIs for example.

 - If mapped from the device-tree and #interrupt-cells is 2 (PAPR
compliant), we use the second cell to set the appropriate type

 - If mapped from the device-tree and #interrupt-cells is 1 (current
OPAL on P8 does that), we assume level sensitive since those are
typically going to be the PSI LSIs which are level sensitive.

Additionally, we mark the interrupts requested via the opal_interrupts
property all level. This is a bit fishy but the best we can do until we
fix OPAL to properly expose them with a complete descriptor. It is also
correct for the current HW anyway as OPAL interrupts are currently PCI
error and PSI interrupts which are level.

Finally now that edge interrupts are properly identified, we can enable
CONFIG_HARDIRQS_SW_RESEND which will make the core re-send them if
they occur while masked, which some drivers rely upon.

This fixes issues with lost interrupts on some Mellanox adapters.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/xics.h               |  2 +
 arch/powerpc/platforms/powernv/opal-irqchip.c |  3 +-
 arch/powerpc/sysdev/xics/Kconfig              |  1 +
 arch/powerpc/sysdev/xics/ics-opal.c           |  4 +-
 arch/powerpc/sysdev/xics/ics-rtas.c           |  4 +-
 arch/powerpc/sysdev/xics/xics-common.c        | 59 +++++++++++++++++++++++----
 6 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index f5f729c11578..f0b238516e9b 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -159,6 +159,8 @@ extern void xics_teardown_cpu(void);
 extern void xics_kexec_teardown_cpu(int secondary);
 extern void xics_migrate_irqs_away(void);
 extern void icp_native_eoi(struct irq_data *d);
+extern int xics_set_irq_type(struct irq_data *d, unsigned int flow_type);
+extern int xics_retrigger(struct irq_data *data);
 #ifdef CONFIG_SMP
 extern int xics_get_irq_server(unsigned int virq, const struct cpumask *cpumask,
 			       unsigned int strict_check);
diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
index e505223b4ec5..ed8bba68a162 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -228,7 +228,8 @@ int __init opal_event_init(void)
 		}
 
 		/* Install interrupt handler */
-		rc = request_irq(virq, opal_interrupt, 0, "opal", NULL);
+		rc = request_irq(virq, opal_interrupt, IRQF_TRIGGER_LOW,
+				 "opal", NULL);
 		if (rc) {
 			irq_dispose_mapping(virq);
 			pr_warn("Error %d requesting irq %d (0x%x)\n",
diff --git a/arch/powerpc/sysdev/xics/Kconfig b/arch/powerpc/sysdev/xics/Kconfig
index 0031eda320c3..385e7aa9e273 100644
--- a/arch/powerpc/sysdev/xics/Kconfig
+++ b/arch/powerpc/sysdev/xics/Kconfig
@@ -1,6 +1,7 @@
 config PPC_XICS
        def_bool n
        select PPC_SMP_MUXED_IPI
+       select HARDIRQS_SW_RESEND
 
 config PPC_ICP_NATIVE
        def_bool n
diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c
index 27c936c080a6..1c6bf4b66f56 100644
--- a/arch/powerpc/sysdev/xics/ics-opal.c
+++ b/arch/powerpc/sysdev/xics/ics-opal.c
@@ -156,7 +156,9 @@ static struct irq_chip ics_opal_irq_chip = {
 	.irq_mask = ics_opal_mask_irq,
 	.irq_unmask = ics_opal_unmask_irq,
 	.irq_eoi = NULL, /* Patched at init time */
-	.irq_set_affinity = ics_opal_set_affinity
+	.irq_set_affinity = ics_opal_set_affinity,
+	.irq_set_type = xics_set_irq_type,
+	.irq_retrigger = xics_retrigger,
 };
 
 static int ics_opal_map(struct ics *ics, unsigned int virq);
diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c
index 3854dd41558d..78ee5c778ef8 100644
--- a/arch/powerpc/sysdev/xics/ics-rtas.c
+++ b/arch/powerpc/sysdev/xics/ics-rtas.c
@@ -163,7 +163,9 @@ static struct irq_chip ics_rtas_irq_chip = {
 	.irq_mask = ics_rtas_mask_irq,
 	.irq_unmask = ics_rtas_unmask_irq,
 	.irq_eoi = NULL, /* Patched at init time */
-	.irq_set_affinity = ics_rtas_set_affinity
+	.irq_set_affinity = ics_rtas_set_affinity,
+	.irq_set_type = xics_set_irq_type,
+	.irq_retrigger = xics_retrigger,
 };
 
 static int ics_rtas_map(struct ics *ics, unsigned int virq)
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index a795a5f0301c..9d530f479588 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -328,8 +328,12 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq,
 
 	pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw);
 
-	/* They aren't all level sensitive but we just don't really know */
-	irq_set_status_flags(virq, IRQ_LEVEL);
+	/*
+	 * Mark interrupts as edge sensitive by default so that resend
+	 * actually works. The device-tree parsing will turn the LSIs
+	 * back to level.
+	 */
+	irq_clear_status_flags(virq, IRQ_LEVEL);
 
 	/* Don't call into ICS for IPIs */
 	if (hw == XICS_IPI) {
@@ -351,13 +355,54 @@ static int xics_host_xlate(struct irq_domain *h, struct device_node *ct,
 			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
 
 {
-	/* Current xics implementation translates everything
-	 * to level. It is not technically right for MSIs but this
-	 * is irrelevant at this point. We might get smarter in the future
-	 */
 	*out_hwirq = intspec[0];
-	*out_flags = IRQ_TYPE_LEVEL_LOW;
 
+	/*
+	 * If intsize is at least 2, we look for the type in the second cell,
+	 * we assume the LSB indicates a level interrupt.
+	 */
+	if (intsize > 1) {
+		if (intspec[1] & 1)
+			*out_flags = IRQ_TYPE_LEVEL_LOW;
+		else
+			*out_flags = IRQ_TYPE_EDGE_RISING;
+	} else
+		*out_flags = IRQ_TYPE_LEVEL_LOW;
+
+	return 0;
+}
+
+int xics_set_irq_type(struct irq_data *d, unsigned int flow_type)
+{
+	/*
+	 * We only support these. This has really no effect other than setting
+	 * the corresponding descriptor bits mind you but those will in turn
+	 * affect the resend function when re-enabling an edge interrupt.
+	 *
+	 * Set set the default to edge as explained in map().
+	 */
+	if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE)
+		flow_type = IRQ_TYPE_EDGE_RISING;
+
+	if (flow_type != IRQ_TYPE_EDGE_RISING &&
+	    flow_type != IRQ_TYPE_LEVEL_LOW)
+		return -EINVAL;
+
+	irqd_set_trigger_type(d, flow_type);
+
+	return IRQ_SET_MASK_OK_NOCOPY;
+}
+
+int xics_retrigger(struct irq_data *data)
+{
+	/*
+	 * We need to push a dummy CPPR when retriggering, since the subsequent
+	 * EOI will try to pop it. Passing 0 works, as the function hard codes
+	 * the priority value anyway.
+	 */
+	xics_push_cppr(0);
+
+	/* Tell the core to do a soft retrigger */
 	return 0;
 }
 
-- 
cgit v1.2.3


From 54a94fcf2f00c9deb9491d5ab2b5a4022332f47e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 4 Aug 2016 08:37:03 +0300
Subject: powerpc/cell: Add missing error code in spufs_mkgang()

We should return -ENOMEM if alloc_spu_gang() fails.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/cell/spufs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 5be15cff758d..2975754c65ea 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -496,8 +496,10 @@ spufs_mkgang(struct inode *dir, struct dentry *dentry, umode_t mode)
 	gang = alloc_spu_gang();
 	SPUFS_I(inode)->i_ctx = NULL;
 	SPUFS_I(inode)->i_gang = gang;
-	if (!gang)
+	if (!gang) {
+		ret = -ENOMEM;
 		goto out_iput;
+	}
 
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
-- 
cgit v1.2.3


From 4d9021957b5218310e28767f25551ca8f5eac797 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Wed, 3 Aug 2016 18:40:45 +1000
Subject: powerpc/powernv/ioda: Fix TCE invalidate to work in real mode again

Commit fd141d1a99a3 ("powerpc/powernv/pci: Rework accessing the TCE
invalidate register") broke TCE invalidation on IODA2/PHB3 for real
mode.

This makes invalidate work again.

Fixes: fd141d1a99a3 ("powerpc/powernv/pci: Rework accessing the TCE invalidate register")
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 6b9528307f62..dedbbe9785ba 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1877,7 +1877,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
 					unsigned shift, unsigned long index,
 					unsigned long npages)
 {
-	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
+	__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
 	unsigned long start, end, inc;
 
 	/* We'll invalidate DMA address in PE scope */
-- 
cgit v1.2.3


From e325d76f8bd2d222a1f577aba00dfb43cece4cbc Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Fri, 5 Aug 2016 19:13:12 +0530
Subject: powerpc/powernv: Load correct TOC pointer while waking up from
 winkle.

The function pnv_restore_hyp_resource() loads the TOC into r2 from
the invalid PACA pointer before fixing r13 value. This do not affect
POWER ISA 3.0 but it does have an impact on POWER ISA 2.07 or less
leading CPU to get stuck forever.

	login: [  471.830433] Processor 120 is stuck.

This can be easily reproducible using following steps:
- Turn off SMT
	$ ppc64_cpu --smt=off
- offline/online any online cpu (Thread 0 of any core which is online)
	$ echo 0 > /sys/devices/system/cpu/cpu<num>/online
	$ echo 1 > /sys/devices/system/cpu/cpu<num>/online

For POWER ISA 2.07 or less, the last bit of HSPRG0 is set indicating
that thread is waking up from winkle. Hence, the last bit of HSPRG0(r13)
needs to be clear before accessing it as PACA to avoid loading invalid
values from invalid PACA pointer.

Fix this by loading TOC after r13 register is corrected.

Fixes: bcef83a00dc4 ("powerpc/powernv: Add platform support for stop instruction")
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Acked-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/idle_book3s.S | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index ba79d15f4ddd..07a330ed7022 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -363,8 +363,8 @@ _GLOBAL(power9_idle_stop)
  * cr3 - set to gt if waking up with partial/complete hypervisor state loss
  */
 _GLOBAL(pnv_restore_hyp_resource)
-	ld	r2,PACATOC(r13);
 BEGIN_FTR_SECTION
+	ld	r2,PACATOC(r13);
 	/*
 	 * POWER ISA 3. Use PSSCR to determine if we
 	 * are waking up from deep idle state
@@ -395,6 +395,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	 */
 	clrldi	r5,r13,63
 	clrrdi	r13,r13,1
+
+	/* Now that we are sure r13 is corrected, load TOC */
+	ld	r2,PACATOC(r13);
 	cmpwi	cr4,r5,1
 	mtspr	SPRN_HSPRG0,r13
 
-- 
cgit v1.2.3


From 98d8821a47f3fd7354d3ab87adb50b10c9adb937 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Fri, 5 Aug 2016 17:34:04 +0530
Subject: powerpc/powernv: Move IDLE_STATE_ENTER_SEQ macro to cpuidle.h

Move IDLE_STATE_ENTER_SEQ macro to cpuidle.h so that MCE handler changes
in subsequent patch can use it.

No functionality change.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cpuidle.h | 13 +++++++++++++
 arch/powerpc/kernel/idle_book3s.S  | 12 ------------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 3d7fc06532a1..01b8a13f0224 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -19,4 +19,17 @@ extern u64 pnv_first_deep_stop_state;
 
 #endif
 
+/* Idle state entry routines */
+#ifdef	CONFIG_PPC_P7_NAP
+#define	IDLE_STATE_ENTER_SEQ(IDLE_INST)				\
+	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
+	std	r0,0(r1);					\
+	ptesync;						\
+	ld	r0,0(r1);					\
+1:	cmp	cr0,r0,r0;					\
+	bne	1b;						\
+	IDLE_INST;						\
+	b	.
+#endif /* CONFIG_PPC_P7_NAP */
+
 #endif
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 07a330ed7022..2265c6398a17 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -44,18 +44,6 @@
 				PSSCR_PSLL_MASK | PSSCR_TR_MASK | \
 				PSSCR_MTL_MASK
 
-/* Idle state entry routines */
-
-#define	IDLE_STATE_ENTER_SEQ(IDLE_INST)				\
-	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
-	std	r0,0(r1);					\
-	ptesync;						\
-	ld	r0,0(r1);					\
-1:	cmp	cr0,r0,r0;					\
-	bne	1b;						\
-	IDLE_INST;						\
-	b	.
-
 	.text
 
 /*
-- 
cgit v1.2.3


From 7627e40c66b5547e12b6c5673646ceea84797a74 Mon Sep 17 00:00:00 2001
From: Piotr Karasinski <peter.karasinski@gmail.com>
Date: Sat, 6 Aug 2016 21:23:05 +0200
Subject: ALSA: usb-audio: Add a sample rate quirk for Creative Live! Cam
 Socialize HD (VF0610)

VF0610 does not support reading the sample rate which leads to many
lines of "cannot get freq at ep 0x82". This patch adds the USB ID
(0x041E:4080) to snd_usb_get_sample_rate_quirk() list.

Signed-off-by: Piotr Karasinski <peter.karasinski@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/quirks.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index 6adde457b602..b70d4dc4ef10 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -1128,6 +1128,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip)
 {
 	/* devices which do not support reading the sample rate. */
 	switch (chip->usb_id) {
+	case USB_ID(0x041E, 0x4080): /* Creative Live Cam VF0610 */
 	case USB_ID(0x045E, 0x075D): /* MS Lifecam Cinema  */
 	case USB_ID(0x045E, 0x076D): /* MS Lifecam HD-5000 */
 	case USB_ID(0x045E, 0x076E): /* MS Lifecam HD-5001 */
-- 
cgit v1.2.3


From 41f5e3bdbf706a9e98194bf0c4b62a875c02f170 Mon Sep 17 00:00:00 2001
From: "Vittorio Gambaletta (VittGam)" <linuxbugs@vittgam.net>
Date: Mon, 8 Aug 2016 12:35:40 +0200
Subject: ALSA: usb-audio: Add quirk for ELP HD USB Camera

The ELP HD USB Camera (05a3:9420) needs this quirk for suppressing
the unsupported sample rate inquiry.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=98481
Cc: <stable@vger.kernel.org>
Signed-off-by: Vittorio Gambaletta <linuxbugs@vittgam.net>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/usb/quirks.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c
index b70d4dc4ef10..6cf1f3597455 100644
--- a/sound/usb/quirks.c
+++ b/sound/usb/quirks.c
@@ -1139,6 +1139,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip)
 	case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */
 	case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */
 	case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */
+	case USB_ID(0x05A3, 0x9420): /* ELP HD USB Camera */
 	case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */
 	case USB_ID(0x1de7, 0x0013): /* Phoenix Audio MT202exe */
 	case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */
-- 
cgit v1.2.3


From bc14c49195e49b3231c01e4c44e3e5456c940b94 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Fri, 5 Aug 2016 17:34:13 +0530
Subject: powerpc/powernv: Fix MCE handler to avoid trashing CR0/CR1 registers.

The current implementation of MCE early handling modifies CR0/1 registers
without saving its old values. Fix this by moving early check for
powersaving mode to machine_check_handle_early().

The power architecture 2.06 or later allows the possibility of getting
machine check while in nap/sleep/winkle. The last bit of HSPRG0 is set
to 1, if thread is woken up from winkle. Hence, clear the last bit of
HSPRG0 (r13) before MCE handler starts using it as paca pointer.

Also, the current code always puts the thread into nap state irrespective
of whatever idle state it woke up from. Fix that by looking at
paca->thread_idle_state and put the thread back into same state where it
came from.

Fixes: 1c51089f777b ("powerpc/book3s: Return from interrupt if coming from evil context.")
Reported-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Reviewed-by: Shreyas B. Prabhu <shreyas@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/exceptions-64s.S | 69 +++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 41091fdf9bd8..df6d45eb4115 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -144,29 +144,14 @@ machine_check_pSeries_1:
 	 * vector
 	 */
 	SET_SCRATCH0(r13)		/* save r13 */
-#ifdef CONFIG_PPC_P7_NAP
-BEGIN_FTR_SECTION
-	/* Running native on arch 2.06 or later, check if we are
-	 * waking up from nap. We only handle no state loss and
-	 * supervisor state loss. We do -not- handle hypervisor
-	 * state loss at this time.
+	/*
+	 * Running native on arch 2.06 or later, we may wakeup from winkle
+	 * inside machine check. If yes, then last bit of HSPGR0 would be set
+	 * to 1. Hence clear it unconditionally.
 	 */
-	mfspr	r13,SPRN_SRR1
-	rlwinm.	r13,r13,47-31,30,31
-	OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR)
-	beq	9f
-
-	mfspr	r13,SPRN_SRR1
-	rlwinm.	r13,r13,47-31,30,31
-	/* waking up from powersave (nap) state */
-	cmpwi	cr1,r13,2
-	/* Total loss of HV state is fatal. let's just stay stuck here */
-	OPT_GET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR)
-	bgt	cr1,.
-9:
-	OPT_SET_SPR(r13, SPRN_CFAR, CPU_FTR_CFAR)
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-#endif /* CONFIG_PPC_P7_NAP */
+	GET_PACA(r13)
+	clrrdi	r13,r13,1
+	SET_PACA(r13)
 	EXCEPTION_PROLOG_0(PACA_EXMC)
 BEGIN_FTR_SECTION
 	b	machine_check_powernv_early
@@ -1273,25 +1258,51 @@ machine_check_handle_early:
 	 * Check if thread was in power saving mode. We come here when any
 	 * of the following is true:
 	 * a. thread wasn't in power saving mode
-	 * b. thread was in power saving mode with no state loss or
-	 *    supervisor state loss
+	 * b. thread was in power saving mode with no state loss,
+	 *    supervisor state loss or hypervisor state loss.
 	 *
-	 * Go back to nap again if (b) is true.
+	 * Go back to nap/sleep/winkle mode again if (b) is true.
 	 */
 	rlwinm.	r11,r12,47-31,30,31	/* Was it in power saving mode? */
 	beq	4f			/* No, it wasn;t */
 	/* Thread was in power saving mode. Go back to nap again. */
 	cmpwi	r11,2
-	bne	3f
-	/* Supervisor state loss */
+	blt	3f
+	/* Supervisor/Hypervisor state loss */
 	li	r0,1
 	stb	r0,PACA_NAPSTATELOST(r13)
 3:	bl	machine_check_queue_event
 	MACHINE_CHECK_HANDLER_WINDUP
 	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
-	li	r3,PNV_THREAD_NAP
-	b	pnv_enter_arch207_idle_mode
+	/*
+	 * Check what idle state this CPU was in and go back to same mode
+	 * again.
+	 */
+	lbz	r3,PACA_THREAD_IDLE_STATE(r13)
+	cmpwi	r3,PNV_THREAD_NAP
+	bgt	10f
+	IDLE_STATE_ENTER_SEQ(PPC_NAP)
+	/* No return */
+10:
+	cmpwi	r3,PNV_THREAD_SLEEP
+	bgt	2f
+	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
+	/* No return */
+
+2:
+	/*
+	 * Go back to winkle. Please note that this thread was woken up in
+	 * machine check from winkle and have not restored the per-subcore
+	 * state. Hence before going back to winkle, set last bit of HSPGR0
+	 * to 1. This will make sure that if this thread gets woken up
+	 * again at reset vector 0x100 then it will get chance to restore
+	 * the subcore state.
+	 */
+	ori	r13,r13,1
+	SET_PACA(r13)
+	IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
+	/* No return */
 4:
 #endif
 	/*
-- 
cgit v1.2.3


From 9dc512819e4b723856773f5a018865c20441072f Mon Sep 17 00:00:00 2001
From: Alastair D'Silva <alastair@d-silva.org>
Date: Mon, 1 Aug 2016 16:32:51 +1000
Subject: powerpc: Fix unused function warning 'lmb_to_memblock'

This patch fixes the following warning:
arch/powerpc/platforms/pseries/hotplug-memory.c:323:29: error: 'lmb_to_memblock' defined but not used [-Werror=unused-function]
static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb)
                           ^~~~~~~~~~~~~~~

The only consumer of this function is 'dlpar_remove_lmb', which is
enabled with CONFIG_MEMORY_HOTREMOVE, so move it into the same
ifdef block.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/hotplug-memory.c | 26 ++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 43f7beb2902d..76ec104e88be 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -320,19 +320,6 @@ static int dlpar_remove_device_tree_lmb(struct of_drconf_cell *lmb)
 	return dlpar_update_device_tree_lmb(lmb);
 }
 
-static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb)
-{
-	unsigned long section_nr;
-	struct mem_section *mem_sect;
-	struct memory_block *mem_block;
-
-	section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr));
-	mem_sect = __nr_to_section(section_nr);
-
-	mem_block = find_memory_block(mem_sect);
-	return mem_block;
-}
-
 #ifdef CONFIG_MEMORY_HOTREMOVE
 static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
 {
@@ -420,6 +407,19 @@ static bool lmb_is_removable(struct of_drconf_cell *lmb)
 
 static int dlpar_add_lmb(struct of_drconf_cell *);
 
+static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb)
+{
+	unsigned long section_nr;
+	struct mem_section *mem_sect;
+	struct memory_block *mem_block;
+
+	section_nr = pfn_to_section_nr(PFN_DOWN(lmb->base_addr));
+	mem_sect = __nr_to_section(section_nr);
+
+	mem_block = find_memory_block(mem_sect);
+	return mem_block;
+}
+
 static int dlpar_remove_lmb(struct of_drconf_cell *lmb)
 {
 	struct memory_block *mem_block;
-- 
cgit v1.2.3


From 546c4402c57497a6ffdf5373aff75bc89f0866bc Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Sat, 6 Aug 2016 15:54:12 -0700
Subject: powerpc/vdso: Add missing include file

Some powerpc builds fail with the following buld error.

  In file included from ./arch/powerpc/include/asm/mmu_context.h:11:0,
                   from arch/powerpc/kernel/vdso.c:28:
  arch/powerpc/include/asm/cputhreads.h: In function 'get_tensr':
  arch/powerpc/include/asm/cputhreads.h:101:2: error:
  	implicit declaration of function 'cpu_has_feature'

Fixes: b92a226e5284 ("powerpc: Move cpu_has_feature() to a separate file")
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/vdso.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 6767605ea8da..4111d30badfa 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -22,6 +22,7 @@
 #include <linux/security.h>
 #include <linux/memblock.h>
 
+#include <asm/cpu_has_feature.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
-- 
cgit v1.2.3


From cbd74e1bc8129efb9908f130a8a6e60fd95d2106 Mon Sep 17 00:00:00 2001
From: Philippe Bergheaud <felix@linux.vnet.ibm.com>
Date: Fri, 5 Aug 2016 14:02:00 +0200
Subject: cxl: Use fixed width predefined types in data structure.

This patch fixes a regression introduced by commit b810253bd934 ("cxl:
Add mechanism for delivering AFU driver specific events").

It changes the type u8 to __u8 in the uapi header cxl.h, because the
former is a kernel internal type, and may not be defined in userland
build environments, in particular when cross-compiling libcxl on x86_64
linux machines (RHEL6.7 and Ubuntu 16.04).

This patch also changes the size of the field data_size, and makes it
constant, to support 32-bit userland applications running on big-endian
ppc64 kernels transparently.

mpe: This is an ABI change, however the ABI was only added during the
4.8 merge window so has never been part of a released kernel - therefore
we give ourselves permission to change it.

Fixes: b810253bd934 ("cxl: Add mechanism for delivering AFU driver specific events")
Signed-off-by: Philippe Bergheaud <felix@linux.vnet.ibm.com>
Reviewed-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 include/uapi/misc/cxl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h
index cbae529b7ce0..180d526a55c3 100644
--- a/include/uapi/misc/cxl.h
+++ b/include/uapi/misc/cxl.h
@@ -136,8 +136,8 @@ struct cxl_event_afu_driver_reserved {
 	 *
 	 * Of course the contents will be ABI, but that's up the AFU driver.
 	 */
-	size_t data_size;
-	u8 data[];
+	__u32 data_size;
+	__u8 data[];
 };
 
 struct cxl_event {
-- 
cgit v1.2.3


From 164793379ad3b7ef5fc5a28260c111358892dff3 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Thu, 28 Jul 2016 15:39:41 +1000
Subject: cxl: Fix NULL dereference in cxl_context_init() on PowerVM guests

Commit f67a6722d650 ("cxl: Workaround PE=0 hardware limitation in
Mellanox CX4") added a "min_pe" field to struct cxl_service_layer_ops,
to allow us to work around a Mellanox CX-4 hardware limitation.

When allocating the PE number in cxl_context_init(), we read from
ctx->afu->adapter->native->sl_ops->min_pe to get the minimum PE number.
Unsurprisingly, in a PowerVM guest ctx->afu->adapter->native is NULL,
and guests don't have a cxl_service_layer_ops struct anywhere.

Move min_pe from struct cxl_service_layer_ops to struct cxl so it's
accessible in both native and PowerVM environments. For the Mellanox
CX-4, set the min_pe value in set_sl_ops().

Fixes: f67a6722d650 ("cxl: Workaround PE=0 hardware limitation in Mellanox CX4")
Reported-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Reviewed-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/context.c | 3 +--
 drivers/misc/cxl/cxl.h     | 2 +-
 drivers/misc/cxl/pci.c     | 3 ++-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index bdee9a01ef35..c466ee2b0c97 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -90,8 +90,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
 	 */
 	mutex_lock(&afu->contexts_lock);
 	idr_preload(GFP_KERNEL);
-	i = idr_alloc(&ctx->afu->contexts_idr, ctx,
-		      ctx->afu->adapter->native->sl_ops->min_pe,
+	i = idr_alloc(&ctx->afu->contexts_idr, ctx, ctx->afu->adapter->min_pe,
 		      ctx->afu->num_procs, GFP_NOWAIT);
 	idr_preload_end();
 	mutex_unlock(&afu->contexts_lock);
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index de090533f18c..344a0ff8f8c7 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -561,7 +561,6 @@ struct cxl_service_layer_ops {
 	u64 (*timebase_read)(struct cxl *adapter);
 	int capi_mode;
 	bool needs_reset_before_disable;
-	int min_pe;
 };
 
 struct cxl_native {
@@ -603,6 +602,7 @@ struct cxl {
 	struct bin_attribute cxl_attr;
 	int adapter_num;
 	int user_irqs;
+	int min_pe;
 	u64 ps_size;
 	u16 psl_rev;
 	u16 base_image;
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index d152e2de8c93..1d0347c36e6d 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1521,14 +1521,15 @@ static const struct cxl_service_layer_ops xsl_ops = {
 	.write_timebase_ctrl = write_timebase_ctrl_xsl,
 	.timebase_read = timebase_read_xsl,
 	.capi_mode = OPAL_PHB_CAPI_MODE_DMA,
-	.min_pe = 1, /* Workaround for Mellanox CX4 HW bug */
 };
 
 static void set_sl_ops(struct cxl *adapter, struct pci_dev *dev)
 {
 	if (dev->vendor == PCI_VENDOR_ID_MELLANOX && dev->device == 0x1013) {
+		/* Mellanox CX-4 */
 		dev_info(&adapter->dev, "Device uses an XSL\n");
 		adapter->native->sl_ops = &xsl_ops;
+		adapter->min_pe = 1; /* Workaround for CX-4 hardware bug */
 	} else {
 		dev_info(&adapter->dev, "Device uses a PSL\n");
 		adapter->native->sl_ops = &psl_ops;
-- 
cgit v1.2.3


From 6fd40f192a9dba391b2d84882f1ed3169c52b714 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Fri, 22 Jul 2016 19:01:36 +1000
Subject: cxl: Fix sparse warnings

Make native_irq_wait() static and use NULL rather than 0 to initialise
phb->cfg_data in cxl_pci_vphb_add() to remove sparse warnings.

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Reviewed-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/native.c | 2 +-
 drivers/misc/cxl/vphb.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 3bcdaee11ba1..e606fdc4bc9c 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -924,7 +924,7 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
 	return fail_psl_irq(afu, &irq_info);
 }
 
-void native_irq_wait(struct cxl_context *ctx)
+static void native_irq_wait(struct cxl_context *ctx)
 {
 	u64 dsisr;
 	int timeout = 1000;
diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
index dee8def1c193..7ada5f1b7bb6 100644
--- a/drivers/misc/cxl/vphb.c
+++ b/drivers/misc/cxl/vphb.c
@@ -221,7 +221,7 @@ int cxl_pci_vphb_add(struct cxl_afu *afu)
 	/* Setup the PHB using arch provided callback */
 	phb->ops = &cxl_pcie_pci_ops;
 	phb->cfg_addr = NULL;
-	phb->cfg_data = 0;
+	phb->cfg_data = NULL;
 	phb->private_data = afu;
 	phb->controller_ops = cxl_pci_controller_ops;
 
-- 
cgit v1.2.3


From 10560b9afc8abf349843dff88c45dd43223e803e Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com>
Date: Fri, 22 Jul 2016 14:05:29 -0300
Subject: powerpc/eeh: Switch to conventional PCI address output in EEH log

This is a very minor/trivial fix for the output of PCI address on EEH
logs. The PCI address on "OF node" field currently is using ":" as a
separator for the function, but the usual separator is ".". This patch
changes the separator to dot, so the PCI address is printed as usual.

Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/eeh.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index c9bc78e9c610..7429556eb8df 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -168,10 +168,10 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
 	int n = 0, l = 0;
 	char buffer[128];
 
-	n += scnprintf(buf+n, len-n, "%04x:%02x:%02x:%01x\n",
+	n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
 		       edev->phb->global_number, pdn->busno,
 		       PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
-	pr_warn("EEH: of node=%04x:%02x:%02x:%01x\n",
+	pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
 		edev->phb->global_number, pdn->busno,
 		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
 
-- 
cgit v1.2.3


From 61e8a0d5a0270b91581f6c715036844b2ea98da1 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 5 Aug 2016 16:40:56 +1000
Subject: powerpc/pci: Fix endian bug in fixed PHB numbering

The recent commit 63a72284b159 ("powerpc/pci: Assign fixed PHB number
based on device-tree properties"), added code to read a 64-bit property
from the device tree, and if not found read a 32-bit property (reg).

There was a bug in the 32-bit case, on big endian machines, due to the
use of the 64-bit value to read the 32-bit property. The cast of &prop
means we end up writing to the high 32-bit of prop, leaving the low
32-bits containing whatever junk was on the stack.

If that junk value was non-zero, and < MAX_PHBS, we would end up using
it as the PHB id. This results in users seeing what appear to be random
PHB ids.

Fix it by reading into a u32 property and then assigning that to the
u64 value, letting the CPU do the correct conversions for us.

Fixes: 63a72284b159 ("powerpc/pci: Assign fixed PHB number based on device-tree properties")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/pci-common.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index a5c0153ede37..7fdf324d5b51 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(get_pci_dma_ops);
 static int get_phb_number(struct device_node *dn)
 {
 	int ret, phb_id = -1;
+	u32 prop_32;
 	u64 prop;
 
 	/*
@@ -86,8 +87,10 @@ static int get_phb_number(struct device_node *dn)
 	 * reading "ibm,opal-phbid", only present in OPAL environment.
 	 */
 	ret = of_property_read_u64(dn, "ibm,opal-phbid", &prop);
-	if (ret)
-		ret = of_property_read_u32_index(dn, "reg", 1, (u32 *)&prop);
+	if (ret) {
+		ret = of_property_read_u32_index(dn, "reg", 1, &prop_32);
+		prop = prop_32;
+	}
 
 	if (!ret)
 		phb_id = (int)(prop & (MAX_PHBS - 1));
-- 
cgit v1.2.3


From f3b0946d629c8bfbd3e5f038e30cb9c711a35f10 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Wed, 13 Jul 2016 17:18:33 +0100
Subject: genirq/msi: Make sure PCI MSIs are activated early

Bharat Kumar Gogada reported issues with the generic MSI code, where the
end-point ended up with garbage in its MSI configuration (both for the vector
and the message).

It turns out that the two MSI paths in the kernel are doing slightly different
things:

generic MSI: disable MSI -> allocate MSI -> enable MSI -> setup EP
PCI MSI: disable MSI -> allocate MSI -> setup EP -> enable MSI

And it turns out that end-points are allowed to latch the content of the MSI
configuration registers as soon as MSIs are enabled.  In Bharat's case, the
end-point ends up using whatever was there already, which is not what you
want.

In order to make things converge, we introduce a new MSI domain flag
(MSI_FLAG_ACTIVATE_EARLY) that is unconditionally set for PCI/MSI. When set,
this flag forces the programming of the end-point as soon as the MSIs are
allocated.

A consequence of this is that we have an extra activate in irq_startup, but
that should be without much consequence.

tglx:

 - Several people reported a VMWare regression with PCI/MSI-X passthrough. It
   turns out that the patch also cures that issue.

 - We need to have a look at the MSI disable interrupt path, where we write
   the msg to all zeros without disabling MSI in the PCI device. Is that
   correct?

Fixes: 52f518a3a7c2 "x86/MSI: Use hierarchical irqdomains to manage MSI interrupts"
Reported-and-tested-by: Bharat Kumar Gogada <bharat.kumar.gogada@xilinx.com>
Reported-and-tested-by: Foster Snowhill <forst@forstwoof.ru>
Reported-by: Matthias Prager <linux@matthiasprager.de>
Reported-by: Jason Taylor <jason.taylor@simplivity.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: linux-pci@vger.kernel.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1468426713-31431-1-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi.c   |  2 ++
 include/linux/msi.h |  2 ++
 kernel/irq/msi.c    | 11 +++++++++++
 3 files changed, 15 insertions(+)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index a02981efdad5..eafa6138a6b8 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1411,6 +1411,8 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode,
 	if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
 		pci_msi_domain_update_chip_ops(info);
 
+	info->flags |= MSI_FLAG_ACTIVATE_EARLY;
+
 	domain = msi_create_irq_domain(fwnode, info, parent);
 	if (!domain)
 		return NULL;
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 4f0bfe5912b2..e8c81fbd5f9c 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -270,6 +270,8 @@ enum {
 	MSI_FLAG_MULTI_PCI_MSI		= (1 << 2),
 	/* Support PCI MSIX interrupts */
 	MSI_FLAG_PCI_MSIX		= (1 << 3),
+	/* Needs early activate, required for PCI */
+	MSI_FLAG_ACTIVATE_EARLY		= (1 << 4),
 };
 
 int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 54999350162c..19e9dfbe97fa 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 		else
 			dev_dbg(dev, "irq [%d-%d] for MSI\n",
 				virq, virq + desc->nvec_used - 1);
+		/*
+		 * This flag is set by the PCI layer as we need to activate
+		 * the MSI entries before the PCI layer enables MSI in the
+		 * card. Otherwise the card latches a random msi message.
+		 */
+		if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+			struct irq_data *irq_data;
+
+			irq_data = irq_domain_get_irq_data(domain, desc->irq);
+			irq_domain_activate_irq(irq_data);
+		}
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 46c8f0b077a838eb1f6169bb370aab8ed98f7630 Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@mellanox.com>
Date: Mon, 8 Aug 2016 16:29:07 -0400
Subject: timers: Fix get_next_timer_interrupt() computation

The tick_nohz_stop_sched_tick() routine is not properly
canceling the sched timer when nothing is pending, because
get_next_timer_interrupt() is no longer returning KTIME_MAX in
that case.  This causes periodic interrupts when none are needed.

When determining the next interrupt time, we first use
__next_timer_interrupt() to get the first expiring timer in the
timer wheel.  If no timer is found, we return the base clock value
plus NEXT_TIMER_MAX_DELTA to indicate there is no timer in the
timer wheel.

Back in get_next_timer_interrupt(), we set the "expires" value
by converting the timer wheel expiry (in ticks) to a nsec value.
But we don't want to do this if the timer wheel expiry value
indicates no timer; we want to return KTIME_MAX.

Prior to commit 500462a9de65 ("timers: Switch to a non-cascading
wheel") we checked base->active_timers to see if any timers
were active, and if not, we didn't touch the expiry value and so
properly returned KTIME_MAX.  Now we don't have active_timers.

To fix this, we now just check the timer wheel expiry value to
see if it is "now + NEXT_TIMER_MAX_DELTA", and if it is, we don't
try to compute a new value based on it, but instead simply let the
KTIME_MAX value in expires remain.

Fixes: 500462a9de65 "timers: Switch to a non-cascading wheel"
Signed-off-by: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/1470688147-22287-1-git-send-email-cmetcalf@mellanox.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 555670a5143c..32bf6f75a8fe 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
+	bool is_max_delta;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
+	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
 	/*
 	 * We have a fresh next event. Check whether we can forward the base:
@@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		expires = basem;
 		base->is_idle = false;
 	} else {
-		expires = basem + (nextevt - basej) * TICK_NSEC;
+		if (!is_max_delta)
+			expires = basem + (nextevt - basej) * TICK_NSEC;
 		/*
 		 * If we expect to sleep more than a tick, mark the base idle:
 		 */
-- 
cgit v1.2.3


From c74dd88e77d3ecbc9e55c78796d82c9aa21cabad Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Tue, 9 Aug 2016 10:39:13 +0530
Subject: powerpc/book3s: Fix MCE console messages for unrecoverable MCE.

When machine check occurs with MSR(RI=0), it means MC interrupt is
unrecoverable and kernel goes down to panic path. But the console
message still shows it as recovered. This patch fixes the MCE console
messages.

Fixes: 36df96f8acaf ("powerpc/book3s: Decode and save machine check event.")
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce.c             | 3 ++-
 arch/powerpc/platforms/powernv/opal.c | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index ef267fd9dd22..5e7ece0fda9f 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -92,7 +92,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
 	mce->in_use = 1;
 
 	mce->initiator = MCE_INITIATOR_CPU;
-	if (handled)
+	/* Mark it recovered if we have handled it and MSR(RI=1). */
+	if (handled && (regs->msr & MSR_RI))
 		mce->disposition = MCE_DISPOSITION_RECOVERED;
 	else
 		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 8b4fc68cebcb..6c9a65b52e63 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -399,6 +399,7 @@ static int opal_recover_mce(struct pt_regs *regs,
 
 	if (!(regs->msr & MSR_RI)) {
 		/* If MSR_RI isn't set, we cannot recover */
+		pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
 		recovered = 0;
 	} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
 		/* Platform corrected itself */
-- 
cgit v1.2.3


From 5958d19a143eb229e9ece20bd4c781ad41cb7d24 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 8 Jul 2016 15:55:43 +1000
Subject: powerpc/pnv/pci: Fix incorrect PE reservation attempt on some 64-bit
 BARs

The generic allocation code may sometimes decide to assign a prefetchable
64-bit BAR to the M32 window. In fact it may also decide to allocate
a 64-bit non-prefetchable BAR to the M64 one ! So using the resource
flags as a test to decide which window was used for PE allocation is
just wrong and leads to insane PE numbers.

Instead, compare the addresses to figure it out.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Rename the function as agreed by Ben & Gavin]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index dedbbe9785ba..fd9444f9fb0c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -111,10 +111,17 @@ static int __init iommu_setup(char *str)
 }
 early_param("iommu", iommu_setup);
 
-static inline bool pnv_pci_is_mem_pref_64(unsigned long flags)
+static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
 {
-	return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) ==
-		(IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
+	/*
+	 * WARNING: We cannot rely on the resource flags. The Linux PCI
+	 * allocation code sometimes decides to put a 64-bit prefetchable
+	 * BAR in the 32-bit window, so we have to compare the addresses.
+	 *
+	 * For simplicity we only test resource start.
+	 */
+	return (r->start >= phb->ioda.m64_base &&
+		r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
 }
 
 static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
@@ -229,7 +236,7 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
 	sgsz = phb->ioda.m64_segsize;
 	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
 		r = &pdev->resource[i];
-		if (!r->parent || !pnv_pci_is_mem_pref_64(r->flags))
+		if (!r->parent || !pnv_pci_is_m64(phb, r))
 			continue;
 
 		start = _ALIGN_DOWN(r->start - base, sgsz);
@@ -2863,7 +2870,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || res->parent)
 			continue;
-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
+		if (!pnv_pci_is_m64(phb, res)) {
 			dev_warn(&pdev->dev, "Don't support SR-IOV with"
 					" non M64 VF BAR%d: %pR. \n",
 				 i, res);
@@ -2958,7 +2965,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
 			index++;
 		}
 	} else if ((res->flags & IORESOURCE_MEM) &&
-		   !pnv_pci_is_mem_pref_64(res->flags)) {
+		   !pnv_pci_is_m64(phb, res)) {
 		region.start = res->start -
 			       phb->hose->mem_offset[0] -
 			       phb->ioda.m32_pci_base;
@@ -3083,9 +3090,12 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
 		bridge = bridge->bus->self;
 	}
 
-	/* We fail back to M32 if M64 isn't supported */
-	if (phb->ioda.m64_segsize &&
-	    pnv_pci_is_mem_pref_64(type))
+	/*
+	 * We fall back to M32 if M64 isn't supported. We enforce the M64
+	 * alignment for any 64-bit resource, PCIe doesn't care and
+	 * bridges only do 64-bit prefetchable anyway.
+	 */
+	if (phb->ioda.m64_segsize && (type & IORESOURCE_MEM_64))
 		return phb->ioda.m64_segsize;
 	if (type & IORESOURCE_MEM)
 		return phb->ioda.m32_segsize;
@@ -3125,7 +3135,7 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
 		w = NULL;
 		if (r->flags & type & IORESOURCE_IO)
 			w = &hose->io_resource;
-		else if (pnv_pci_is_mem_pref_64(r->flags) &&
+		else if (pnv_pci_is_m64(phb, r) &&
 			 (type & IORESOURCE_PREFETCH) &&
 			 phb->ioda.m64_segsize)
 			w = &hose->mem_resources[1];
-- 
cgit v1.2.3


From 50ee91bdef41c15b671dcd9446ee007a1d2f5ab7 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Tue, 9 Aug 2016 10:30:49 +0800
Subject: arm64: Support hard limit of cpu count by nr_cpus

Enable the hard limit of cpu count by set boot options nr_cpus=x
on arm64, and make a minor change about message when total number
of cpu exceeds the limit.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reported-by: Shiyuan Hu <hushiyuan@huawei.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/kernel/smp.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 76a6d9263908..d93d43352504 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -661,9 +661,9 @@ void __init smp_init_cpus(void)
 		acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
 				      acpi_parse_gic_cpu_interface, 0);
 
-	if (cpu_count > NR_CPUS)
-		pr_warn("no. of cores (%d) greater than configured maximum of %d - clipping\n",
-			cpu_count, NR_CPUS);
+	if (cpu_count > nr_cpu_ids)
+		pr_warn("Number of cores (%d) exceeds configured maximum of %d - clipping\n",
+			cpu_count, nr_cpu_ids);
 
 	if (!bootcpu_valid) {
 		pr_err("missing boot CPU MPIDR, not enabling secondaries\n");
@@ -677,7 +677,7 @@ void __init smp_init_cpus(void)
 	 * with entries in cpu_logical_map while initializing the cpus.
 	 * If the cpu set-up fails, invalidate the cpu_logical_map entry.
 	 */
-	for (i = 1; i < NR_CPUS; i++) {
+	for (i = 1; i < nr_cpu_ids; i++) {
 		if (cpu_logical_map(i) != INVALID_HWID) {
 			if (smp_cpu_setup(i))
 				cpu_logical_map(i) = INVALID_HWID;
-- 
cgit v1.2.3


From 58625edf9e2515ed41dac2a24fa8004030a87b87 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyj.lk@gmail.com>
Date: Tue, 2 Aug 2016 14:16:31 +0000
Subject: virtio: fix memory leak in virtqueue_add()

When using the indirect buffers feature, 'desc' is allocated in
virtqueue_add() but isn't freed before leaving on a ring full error,
causing a memory leak.

For example, it seems rather clear that this can trigger
with virtio net if mergeable buffers are not used.

Cc: stable@vger.kernel.org
Signed-off-by: Wei Yongjun <weiyj.lk@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 114a0c88afb8..5ed228ddadba 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -327,6 +327,8 @@ static inline int virtqueue_add(struct virtqueue *_vq,
 		 * host should service the ring ASAP. */
 		if (out_sgs)
 			vq->notify(&vq->vq);
+		if (indirect)
+			kfree(desc);
 		END_USE(vq);
 		return -ENOSPC;
 	}
-- 
cgit v1.2.3


From 3cc36f6e34bd2d92d23be7b598ba5e639c47b01a Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 3 Aug 2016 07:18:51 +0300
Subject: virtio: fix error handling for debug builds

On error, virtqueue_add calls START_USE but not
END_USE. Thankfully that's normally empty anyway,
but might not be when debugging. Fix it up.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_ring.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5ed228ddadba..e383ecdaca59 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -428,6 +428,7 @@ unmap_release:
 	if (indirect)
 		kfree(desc);
 
+	END_USE(vq);
 	return -EIO;
 }
 
-- 
cgit v1.2.3


From 1b8553c04bf95180eb91be94f089a1e8b38cfd62 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Wed, 3 Aug 2016 19:59:47 +0300
Subject: 9p/trans_virtio: use kvfree() for iov_iter_get_pages_alloc()

The memory allocated by iov_iter_get_pages_alloc() can be allocated with
vmalloc() if kmalloc() failed -- see get_pages_array().

In that case we need to free it with vfree(), so let's use kvfree().

The bug manifests like this:

BUG: unable to handle kernel paging request at ffffeb0400072da0
IP: [<ffffffff8139c67b>] kfree+0x4b/0x140
PGD 0
Oops: 0000 [#1] PREEMPT SMP KASAN
CPU: 2 PID: 675 Comm: trinity-c2 Not tainted 4.7.0-rc7+ #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
task: ffff8800badef2c0 ti: ffff880069208000 task.ti: ffff880069208000
RIP: 0010:[<ffffffff8139c67b>]  [<ffffffff8139c67b>] kfree+0x4b/0x140
RSP: 0000:ffff88006920f3f0  EFLAGS: 00010282
RAX: ffffea0000000000 RBX: ffffc90001cb6000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: 0000000000000246 RDI: ffffc90001cb6000
RBP: ffff88006920f410 R08: 0000000000000000 R09: dffffc0000000000
R10: ffff8800badefa30 R11: 0000056a3d3b0d9f R12: ffff88006920f620
R13: ffffeb0400072d80 R14: ffff8800baa94078 R15: 0000000000000000
FS:  00007fbd2b437700(0000) GS:ffff88011af00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffeb0400072da0 CR3: 000000006926d000 CR4: 00000000000006e0
Stack:
 0000000000000001 ffff88006920f620 ffffed001755280f ffff8800baa94078
 ffff88006920f6a8 ffffffff8310442b dffffc0000000000 ffff8800badefa30
 ffff8800badefa28 ffff88011af1fba0 1ffff1000d241e98 ffff8800ba892150
Call Trace:
 [<ffffffff8310442b>] p9_virtio_zc_request+0x72b/0xdb0
 [<ffffffff830f2116>] p9_client_zc_rpc.constprop.8+0x246/0xb10
 [<ffffffff830f5d79>] p9_client_read+0x4c9/0x750
 [<ffffffff8175ceac>] v9fs_fid_readpage+0x14c/0x320
 [<ffffffff8175d0b6>] v9fs_vfs_readpage+0x36/0x50
 [<ffffffff812c6f13>] filemap_fault+0x9a3/0xe60
 [<ffffffff81331878>] __do_fault+0x158/0x300
 [<ffffffff81339e01>] handle_mm_fault+0x1cf1/0x3c80
 [<ffffffff810c0aaa>] __do_page_fault+0x30a/0x8e0
 [<ffffffff810c10df>] do_page_fault+0x2f/0x80
 [<ffffffff810b5b07>] do_async_page_fault+0x27/0xa0
 [<ffffffff83296c48>] async_page_fault+0x28/0x30
Code: 00 80 41 54 53 49 01 fd 48 0f 42 05 b0 39 67 02 48 89 fb 49 01 c5 48 b8 00 00 00 00 00 ea ff ff 49 c1 ed 0c 49 c1 e5 06 49 01 c5 <49> 8b 45 20 48 8d 50 ff a8 01 4c 0f 45 ea 49 8b 55 20 48 8d 42
RIP  [<ffffffff8139c67b>] kfree+0x4b/0x140
 RSP <ffff88006920f3f0>
CR2: ffffeb0400072da0
---[ end trace f3d59a04bafec038 ]---

Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 net/9p/trans_virtio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 4acb1d5417aa..f24b25c25106 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -507,8 +507,8 @@ err_out:
 		/* wakeup anybody waiting for slots to pin pages */
 		wake_up(&vp_wq);
 	}
-	kfree(in_pages);
-	kfree(out_pages);
+	kvfree(in_pages);
+	kvfree(out_pages);
 	return err;
 }
 
-- 
cgit v1.2.3


From 3fda5d6e580193fa005014355b3a61498f1b3ae0 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Thu, 4 Aug 2016 14:52:53 +0100
Subject: vhost/vsock: fix vhost virtio_vsock_pkt use-after-free

Stash the packet length in a local variable before handing over
ownership of the packet to virtio_transport_recv_pkt() or
virtio_transport_free_pkt().

This patch solves the use-after-free since pkt is no longer guaranteed
to be alive.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vsock.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 0ddf3a2dbfc4..e3b30ea9ece5 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -307,6 +307,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 
 	vhost_disable_notify(&vsock->dev, vq);
 	for (;;) {
+		u32 len;
+
 		if (!vhost_vsock_more_replies(vsock)) {
 			/* Stop tx until the device processes already
 			 * pending replies.  Leave tx virtqueue
@@ -334,13 +336,15 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
 			continue;
 		}
 
+		len = pkt->len;
+
 		/* Only accept correctly addressed packets */
 		if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid)
 			virtio_transport_recv_pkt(pkt);
 		else
 			virtio_transport_free_pkt(pkt);
 
-		vhost_add_used(vq, head, sizeof(pkt->hdr) + pkt->len);
+		vhost_add_used(vq, head, sizeof(pkt->hdr) + len);
 		added = true;
 	}
 
-- 
cgit v1.2.3


From 28ad55578b8a76390d966b09da8c7fa3644f5140 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Fri, 5 Aug 2016 13:52:09 +0100
Subject: virtio-vsock: fix include guard typo

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_vsock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h
index 6b011c19b50f..1d57ed3d84d2 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -32,7 +32,7 @@
  */
 
 #ifndef _UAPI_LINUX_VIRTIO_VSOCK_H
-#define _UAPI_LINUX_VIRTIO_VOSCK_H
+#define _UAPI_LINUX_VIRTIO_VSOCK_H
 
 #include <linux/types.h>
 #include <linux/virtio_ids.h>
-- 
cgit v1.2.3


From 347a529398e8e723338cca5d8a8ae2d9e7e93448 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnghuan@gmail.com>
Date: Tue, 9 Aug 2016 16:39:20 +0800
Subject: virtio_blk: Fix a slient kernel panic

We do a lot of memory allocation in function init_vq, and don't handle
the allocation failure properly. Then this function will return 0,
although initialization fails due to lacking memory. At that moment,
kernel will panic in guest machine, if virtio is used to drive disk.

To fix this bug, we should take care of allocation failure, and return
correct value to let caller know what happen.

Tested-by: Chao Fan <fanc.fnst@cn.fujitsu.com>
Signed-off-by: Minfei Huang <mnghuan@gmail.com>
Signed-off-by: Minfei Huang <minfei.hmf@alibaba-inc.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/virtio_blk.c | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1523e05c46fc..93b1aaa5ba3b 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -391,22 +391,16 @@ static int init_vq(struct virtio_blk *vblk)
 		num_vqs = 1;
 
 	vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL);
-	if (!vblk->vqs) {
-		err = -ENOMEM;
-		goto out;
-	}
+	if (!vblk->vqs)
+		return -ENOMEM;
 
 	names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
-	if (!names)
-		goto err_names;
-
 	callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL);
-	if (!callbacks)
-		goto err_callbacks;
-
 	vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL);
-	if (!vqs)
-		goto err_vqs;
+	if (!names || !callbacks || !vqs) {
+		err = -ENOMEM;
+		goto out;
+	}
 
 	for (i = 0; i < num_vqs; i++) {
 		callbacks[i] = virtblk_done;
@@ -417,7 +411,7 @@ static int init_vq(struct virtio_blk *vblk)
 	/* Discover virtqueues and write information to configuration.  */
 	err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
 	if (err)
-		goto err_find_vqs;
+		goto out;
 
 	for (i = 0; i < num_vqs; i++) {
 		spin_lock_init(&vblk->vqs[i].lock);
@@ -425,16 +419,12 @@ static int init_vq(struct virtio_blk *vblk)
 	}
 	vblk->num_vqs = num_vqs;
 
- err_find_vqs:
+out:
 	kfree(vqs);
- err_vqs:
 	kfree(callbacks);
- err_callbacks:
 	kfree(names);
- err_names:
 	if (err)
 		kfree(vblk->vqs);
- out:
 	return err;
 }
 
-- 
cgit v1.2.3


From 2ab0d56aadbcd120b8fa524b4a1142e8b06e13c8 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 7 Jul 2016 17:07:56 +0200
Subject: virtio/s390: keep early_put_chars

In case the registration of the hvc tty never happens AND the kernel
thinks that hvc0 is the preferred console we should keep the early
printk function to avoid a kernel panic due to code being removed.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Jing Liu <liujbjl@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/s390/virtio/kvm_virtio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/virtio/kvm_virtio.c b/drivers/s390/virtio/kvm_virtio.c
index 1d060fd293a3..b0a849f02df3 100644
--- a/drivers/s390/virtio/kvm_virtio.c
+++ b/drivers/s390/virtio/kvm_virtio.c
@@ -482,7 +482,7 @@ static int __init kvm_devices_init(void)
 }
 
 /* code for early console output with virtio_console */
-static __init int early_put_chars(u32 vtermno, const char *buf, int count)
+static int early_put_chars(u32 vtermno, const char *buf, int count)
 {
 	char scratch[17];
 	unsigned int len = count;
-- 
cgit v1.2.3


From 3b2fbb3f06efe5bd2dfdce2a1db703e23c1a78af Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Thu, 7 Jul 2016 17:07:57 +0200
Subject: virtio/s390: deprecate old transport

There only ever have been two host implementations of the old
s390-virtio (pre-ccw) transport: the experimental kuli userspace,
and qemu. As qemu switched its default to ccw with 2.4 (with most
users having used ccw well before that) and removed the old transport
entirely in 2.6, s390-virtio probably hasn't been in active use for
quite some time and is therefore likely to bitrot.

Let's start the slow march towards removing the code by deprecating
it.

Note that this also deprecates the early virtio console code, which
has been causing trouble in the guest without being wired up in any
relevant hypervisor code.

Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com>
Reviewed-by: Sascha Silbe <silbe@linux.vnet.ibm.com>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/s390/Kconfig                | 13 +++++++++++++
 drivers/s390/virtio/Makefile     |  6 +++++-
 drivers/s390/virtio/kvm_virtio.c |  2 ++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 9e607bf2d640..5a2907ee8c93 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -871,4 +871,17 @@ config S390_GUEST
 	  Select this option if you want to run the kernel as a guest under
 	  the KVM hypervisor.
 
+config S390_GUEST_OLD_TRANSPORT
+	def_bool y
+	prompt "Guest support for old s390 virtio transport (DEPRECATED)"
+	depends on S390_GUEST
+	help
+	  Enable this option to add support for the old s390-virtio
+	  transport (i.e. virtio devices NOT based on virtio-ccw). This
+	  type of virtio devices is only available on the experimental
+	  kuli userspace or with old (< 2.6) qemu. If you are running
+	  with a modern version of qemu (which supports virtio-ccw since
+	  1.4 and uses it by default since version 2.4), you probably won't
+	  need this.
+
 endmenu
diff --git a/drivers/s390/virtio/Makefile b/drivers/s390/virtio/Makefile
index 241891a57caf..df40692a9011 100644
--- a/drivers/s390/virtio/Makefile
+++ b/drivers/s390/virtio/Makefile
@@ -6,4 +6,8 @@
 # it under the terms of the GNU General Public License (version 2 only)
 # as published by the Free Software Foundation.
 
-obj-$(CONFIG_S390_GUEST) += kvm_virtio.o virtio_ccw.o
+s390-virtio-objs := virtio_ccw.o
+ifdef CONFIG_S390_GUEST_OLD_TRANSPORT
+s390-virtio-objs += kvm_virtio.o
+endif
+obj-$(CONFIG_S390_GUEST) += $(s390-virtio-objs)
diff --git a/drivers/s390/virtio/kvm_virtio.c b/drivers/s390/virtio/kvm_virtio.c
index b0a849f02df3..5e5c11f37b24 100644
--- a/drivers/s390/virtio/kvm_virtio.c
+++ b/drivers/s390/virtio/kvm_virtio.c
@@ -458,6 +458,8 @@ static int __init kvm_devices_init(void)
 	if (test_devices_support(total_memory_size) < 0)
 		return -ENODEV;
 
+	pr_warn("The s390-virtio transport is deprecated. Please switch to a modern host providing virtio-ccw.\n");
+
 	rc = vmem_add_mapping(total_memory_size, PAGE_SIZE);
 	if (rc)
 		return rc;
-- 
cgit v1.2.3


From 97b1d23f7bcbee00a5b85bf2c022c612ea67429d Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 5 Aug 2016 12:08:36 +0100
Subject: metag: Drop show_mem() from mem_init()

The recent commit 599d0c954f91 ("mm, vmscan: move LRU lists to node"),
changed memory management code so that show_mem() is no longer safe to
call prior to setup_per_cpu_pageset(), as pgdat->per_cpu_nodestats will
still be NULL. This causes an oops on metag due to the call to
show_mem() from mem_init():

  node_page_state_snapshot(...) + 0x48
  pgdat_reclaimable(struct pglist_data * pgdat = 0x402517a0)
  show_free_areas(unsigned int filter = 0) + 0x2cc
  show_mem(unsigned int filter = 0) + 0x18
  mem_init()
  mm_init()
  start_kernel() + 0x204

This wasn't a problem before with zone_reclaimable() as zone_pcp_init()
was already setting zone->pageset to &boot_pageset, via setup_arch() and
paging_init(), which happens before mm_init():

  zone_pcp_init(...)
  free_area_init_core(...) + 0x138
  free_area_init_node(int nid = 0, ...) + 0x1a0
  free_area_init_nodes(...) + 0x440
  paging_init(unsigned long mem_end = 0x4fe00000) + 0x378
  setup_arch(char ** cmdline_p = 0x4024e038) + 0x2b8
  start_kernel() + 0x54

No other arches appear to call show_mem() during boot, and it doesn't
really add much value to the log, so lets just drop it from mem_init().

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: linux-metag@vger.kernel.org
---
 arch/metag/mm/init.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index 11fa51c89617..c0ec116b3993 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -390,7 +390,6 @@ void __init mem_init(void)
 
 	free_all_bootmem();
 	mem_init_print_info(NULL);
-	show_mem(0);
 }
 
 void free_initmem(void)
-- 
cgit v1.2.3


From bcdc09af3ef30ef071677544ce23a1c8873a2dda Mon Sep 17 00:00:00 2001
From: Brendan Gregg <bgregg@netflix.com>
Date: Wed, 3 Aug 2016 02:47:49 +0000
Subject: perf script: Add 'bpf-output' field to usage message

This adds the 'bpf-output' field to the perf script usage message, and docs.

Signed-off-by: Brendan Gregg <bgregg@netflix.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1470192469-11910-4-git-send-email-bgregg@netflix.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-script.txt | 4 ++--
 tools/perf/builtin-script.c              | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 1f6c70594f0f..053bbbd84ece 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,8 +116,8 @@ OPTIONS
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-	srcline, period, iregs, brstack, brstacksym, flags.
-        Field list can be prepended with the type, trace, sw or hw,
+        srcline, period, iregs, brstack, brstacksym, flags, bpf-output,
+        callindent. Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -F sw:comm,tid,time,ip,sym  and -F trace:time,cpu,trace
 
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 971ff91b16cb..9c640a8081c7 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -2116,7 +2116,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
 		     "Valid types: hw,sw,trace,raw. "
 		     "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
 		     "addr,symoff,period,iregs,brstack,brstacksym,flags,"
-		     "callindent", parse_output_fields),
+		     "bpf-output,callindent", parse_output_fields),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
-- 
cgit v1.2.3


From 887fa86d6fd7a45cee2d0f9d5f75026786d61df2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 5 Aug 2016 12:37:21 -0300
Subject: perf hists: Trim libtraceevent trace_seq buffers

When we use libtraceevent to format trace event fields into printable
strings to use in hist entries it is important to trim it from the
default 4 KiB it starts with to what is really used, to reduce the
memory footprint, so use realloc(seq.buffer, seq.len + 1) when returning
the seq.buffer formatted with the fields contents.

Reported-and-Tested-by: Wang Nan <wangnan0@huawei.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: http://lkml.kernel.org/n/tip-t3hl7uxmilrkigzmc90rlhk2@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/sort.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 947d21f38398..3d3cb8392c86 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -588,7 +588,11 @@ static char *get_trace_output(struct hist_entry *he)
 	} else {
 		pevent_event_info(&seq, evsel->tp_format, &rec);
 	}
-	return seq.buffer;
+	/*
+	 * Trim the buffer, it starts at 4KB and we're not going to
+	 * add anything more to this buffer.
+	 */
+	return realloc(seq.buffer, seq.len + 1);
 }
 
 static int64_t
-- 
cgit v1.2.3


From 8e34189b347d76acf48ce05831176582201b664d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <masami.hiramatsu@linaro.org>
Date: Sat, 6 Aug 2016 19:29:48 +0900
Subject: perf probe: Adjust map->reloc offset when finding kernel symbol from
 map

Adjust map->reloc offset for the unmapped address when finding
alternative symbol address from map, because KASLR can relocate the
kernel symbol address.

The same adjustment has been done when finding appropriate kernel symbol
address from map which was introduced by commit f90acac75713 ("perf
probe: Find given address from offline dwarf")

Reported-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu@linaro.org>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/20160806192948.e366f3fbc4b194de600f8326@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 953dc1ab2ed7..d5ccb656fd81 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -385,7 +385,7 @@ static int find_alternative_probe_point(struct debuginfo *dinfo,
 		if (uprobes)
 			address = sym->start;
 		else
-			address = map->unmap_ip(map, sym->start);
+			address = map->unmap_ip(map, sym->start) - map->reloc;
 		break;
 	}
 	if (!address) {
-- 
cgit v1.2.3


From cb3f3378cd09aa3fe975b4ad5ee0229dc76315bb Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Fri, 5 Aug 2016 15:22:36 +0300
Subject: perf probe: Fix module name matching

If module is "module" then dso->short_name is "[module]".  Substring
comparing is't enough: "raid10" matches to "[raid1]".  This patch also
checks terminating zero in module name.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Link: http://lkml.kernel.org/r/147039975648.715620.12985971832789032159.stgit@buzz
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index d5ccb656fd81..1201f73ca723 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -170,8 +170,10 @@ static struct map *kernel_get_module_map(const char *module)
 		module = "kernel";
 
 	for (pos = maps__first(maps); pos; pos = map__next(pos)) {
+		/* short_name is "[module]" */
 		if (strncmp(pos->dso->short_name + 1, module,
-			    pos->dso->short_name_len - 2) == 0) {
+			    pos->dso->short_name_len - 2) == 0 &&
+		    module[pos->dso->short_name_len - 2] == '\0') {
 			return pos;
 		}
 	}
-- 
cgit v1.2.3


From 3df33eff2ba96be4f1535db4f672013d756dc9b1 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 9 Aug 2016 14:04:29 +0100
Subject: perf stat: Avoid skew when reading events

When we don't have a tracee (i.e. we're attaching to a task or CPU),
counters can still be running after our workload finishes, and can still
be running as we read their values. As we read events one-by-one, there
can be arbitrary skew between values of events, even within a group.
This means that ratios within an event group are not reliable.

This skew can be seen if measuring a group of identical events, e.g:

  # perf stat -a -C0 -e '{cycles,cycles}' sleep 1

To avoid this, we must stop groups from counting before we read the
values of any constituent events. This patch adds and makes use of a new
disable_counters() helper, which disables group leaders (and thus each
group as a whole). This mirrors the use of enable_counters() for
starting event groups in the absence of a tracee.

Closing a group leader splits the group, and without a disabled group
leader the newly split events will begin counting. Thus to ensure counts
are reliable we must defer closing group leaders until all counts have
been read. To do so this patch removes the event closing logic from the
read_counters() helper, explicitly closes the events using
perf_evlist__close(), which also aids legibility.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1470747869-3567-1-git-send-email-mark.rutland@arm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-stat.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 0c16d20d7e32..3c7452b39f57 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -331,7 +331,7 @@ static int read_counter(struct perf_evsel *counter)
 	return 0;
 }
 
-static void read_counters(bool close_counters)
+static void read_counters(void)
 {
 	struct perf_evsel *counter;
 
@@ -341,11 +341,6 @@ static void read_counters(bool close_counters)
 
 		if (perf_stat_process_counter(&stat_config, counter))
 			pr_warning("failed to process counter %s\n", counter->name);
-
-		if (close_counters) {
-			perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter),
-					     thread_map__nr(evsel_list->threads));
-		}
 	}
 }
 
@@ -353,7 +348,7 @@ static void process_interval(void)
 {
 	struct timespec ts, rs;
 
-	read_counters(false);
+	read_counters();
 
 	clock_gettime(CLOCK_MONOTONIC, &ts);
 	diff_timespec(&rs, &ts, &ref_time);
@@ -380,6 +375,17 @@ static void enable_counters(void)
 		perf_evlist__enable(evsel_list);
 }
 
+static void disable_counters(void)
+{
+	/*
+	 * If we don't have tracee (attaching to task or cpu), counters may
+	 * still be running. To get accurate group ratios, we must stop groups
+	 * from counting before reading their constituent counters.
+	 */
+	if (!target__none(&target))
+		perf_evlist__disable(evsel_list);
+}
+
 static volatile int workload_exec_errno;
 
 /*
@@ -657,11 +663,20 @@ try_again:
 		}
 	}
 
+	disable_counters();
+
 	t1 = rdclock();
 
 	update_stats(&walltime_nsecs_stats, t1 - t0);
 
-	read_counters(true);
+	/*
+	 * Closing a group leader splits the group, and as we only disable
+	 * group leaders, results in remaining events becoming enabled. To
+	 * avoid arbitrary skew, we must read all counters before closing any
+	 * group leaders.
+	 */
+	read_counters();
+	perf_evlist__close(evsel_list);
 
 	return WEXITSTATUS(status);
 }
-- 
cgit v1.2.3


From c87edb36118664f1fa275107c1138b6f47793240 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 5 Aug 2016 12:41:52 -0400
Subject: tracing: Fix tick_stop tracepoint symbols for user export

The symbols used in the tick_stop tracepoint were not being converted
properly into integers in the trace_stop format file. Instead we had this:

print fmt: "success=%d dependency=%s", REC->success,
    __print_symbolic(REC->dependency, { 0, "NONE" },
     { (1 << TICK_DEP_BIT_POSIX_TIMER), "POSIX_TIMER" },
     { (1 << TICK_DEP_BIT_PERF_EVENTS), "PERF_EVENTS" },
     { (1 << TICK_DEP_BIT_SCHED), "SCHED" },
     { (1 << TICK_DEP_BIT_CLOCK_UNSTABLE), "CLOCK_UNSTABLE" })

User space tools have no idea how to parse "TICK_DEP_BIT_SCHED" or the other
symbols used to do the bit shifting. The reason is that the conversion was
done with using the TICK_DEP_MASK_* symbols which are just macros that
convert to the BIT shift itself (with the exception of NONE, which was
converted properly, because it doesn't use bits, and is defined as zero).

The TICK_DEP_BIT_* needs to be denoted by TRACE_DEFINE_ENUM() in order to
have this properly converted for user space tools to parse this event.

Cc: stable@vger.kernel.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Fixes: e6e6cc22e067 ("nohz: Use enum code for tick stop failure tracing message")
Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/timer.h | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 51440131d337..28c5da6fdfac 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -330,24 +330,32 @@ TRACE_EVENT(itimer_expire,
 #ifdef CONFIG_NO_HZ_COMMON
 
 #define TICK_DEP_NAMES					\
-		tick_dep_name(NONE)			\
+		tick_dep_mask_name(NONE)		\
 		tick_dep_name(POSIX_TIMER)		\
 		tick_dep_name(PERF_EVENTS)		\
 		tick_dep_name(SCHED)			\
 		tick_dep_name_end(CLOCK_UNSTABLE)
 
 #undef tick_dep_name
+#undef tick_dep_mask_name
 #undef tick_dep_name_end
 
-#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
-#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
+/* The MASK will convert to their bits and they need to be processed too */
+#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
+	TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
+#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
+	TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
+/* NONE only has a mask defined for it */
+#define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
 
 TICK_DEP_NAMES
 
 #undef tick_dep_name
+#undef tick_dep_mask_name
 #undef tick_dep_name_end
 
 #define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
+#define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
 #define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }
 
 #define show_tick_dep_name(val)				\
-- 
cgit v1.2.3


From 19f00b011729417f69e4df53cc3fe5ecc25134a4 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota@hgst.com>
Date: Tue, 9 Aug 2016 11:40:08 +0900
Subject: perf probe: Support signedness casting

The 'perf probe' tool detects a variable's type and use the detected
type to add a new probe. Then, kprobes prints its variable in
hexadecimal format if the variable is unsigned and prints in decimal if
it is signed.

We sometimes want to see unsigned variable in decimal format (i.e.
sector_t or size_t). In that case, we need to investigate the variable's
size manually to specify just signedness.

This patch add signedness casting support. By specifying "s" or "u" as a
type, perf-probe will investigate variable size as usual and use the
specified signedness.

E.g. without this:

  $ perf probe -a 'submit_bio bio->bi_iter.bi_sector'
  Added new event:
    probe:submit_bio     (on submit_bio with bi_sector=bio->bi_iter.bi_sector)
  You can now use it in all perf tools, such as:
          perf record -e probe:submit_bio -aR sleep 1
  $ cat trace_pipe|head
          dbench-9692  [003] d..1   971.096633: submit_bio: (submit_bio+0x0/0x140) bi_sector=0x3a3d00
          dbench-9692  [003] d..1   971.096685: submit_bio: (submit_bio+0x0/0x140) bi_sector=0x1a3d80
          dbench-9692  [003] d..1   971.096687: submit_bio: (submit_bio+0x0/0x140) bi_sector=0x3a3d80
...
  // need to investigate the variable size
  $ perf probe -a 'submit_bio bio->bi_iter.bi_sector:s64'
  Added new event:
    probe:submit_bio     (on submit_bio with bi_sector=bio->bi_iter.bi_sector:s64)
  You can now use it in all perf tools, such as:
        perf record -e probe:submit_bio -aR sleep 1

  With this:

  // just use "s" to cast its signedness
  $ perf probe -v -a 'submit_bio bio->bi_iter.bi_sector:s'
  Added new event:
    probe:submit_bio     (on submit_bio with bi_sector=bio->bi_iter.bi_sector:s)
  You can now use it in all perf tools, such as:
          perf record -e probe:submit_bio -aR sleep 1
  $ cat trace_pipe|head
          dbench-9689  [001] d..1  1212.391237: submit_bio: (submit_bio+0x0/0x140) bi_sector=128
          dbench-9689  [001] d..1  1212.391252: submit_bio: (submit_bio+0x0/0x140) bi_sector=131072
          dbench-9697  [006] d..1  1212.398611: submit_bio: (submit_bio+0x0/0x140) bi_sector=30208

  This commit also update perf-probe.txt to describe "types". Most parts
  are based on existing documentation: Documentation/trace/kprobetrace.txt

Committer note:

Testing using 'perf trace':

  # perf probe -a 'submit_bio bio->bi_iter.bi_sector'
  Added new event:
    probe:submit_bio     (on submit_bio with bi_sector=bio->bi_iter.bi_sector)

  You can now use it in all perf tools, such as:

	perf record -e probe:submit_bio -aR sleep 1

  # trace --no-syscalls --ev probe:submit_bio
      0.000 probe:submit_bio:(ffffffffac3aee00) bi_sector=0xc133c0)
   3181.861 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x6cffb8)
   3181.881 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x6cffc0)
   3184.488 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x6cffc8)
<SNIP>
   4717.927 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x4dc7a88)
   4717.970 probe:submit_bio:(ffffffffac3aee00) bi_sector=0x4dc7880)
  ^C[root@jouet ~]#

Now, using this new feature:

[root@jouet ~]# perf probe -a 'submit_bio bio->bi_iter.bi_sector:s'
Added new event:
  probe:submit_bio     (on submit_bio with bi_sector=bio->bi_iter.bi_sector:s)

You can now use it in all perf tools, such as:

	perf record -e probe:submit_bio -aR sleep 1

  [root@jouet ~]# trace --no-syscalls --ev probe:submit_bio
     0.000 probe:submit_bio:(ffffffffac3aee00) bi_sector=7145704)
     0.017 probe:submit_bio:(ffffffffac3aee00) bi_sector=7145712)
     0.019 probe:submit_bio:(ffffffffac3aee00) bi_sector=7145720)
     2.567 probe:submit_bio:(ffffffffac3aee00) bi_sector=7145728)
  5631.919 probe:submit_bio:(ffffffffac3aee00) bi_sector=0)
  5631.941 probe:submit_bio:(ffffffffac3aee00) bi_sector=8)
  5631.945 probe:submit_bio:(ffffffffac3aee00) bi_sector=16)
  5631.948 probe:submit_bio:(ffffffffac3aee00) bi_sector=24)
  ^C#

With callchains:

  # trace --no-syscalls --ev probe:submit_bio/max-stack=10/
     0.000 probe:submit_bio:(ffffffffac3aee00) bi_sector=50662544)
                                       submit_bio+0xa8200001 ([kernel.kallsyms])
                                       submit_bh+0xa8200013 ([kernel.kallsyms])
                                       jbd2_journal_commit_transaction+0xa8200691 ([kernel.kallsyms])
                                       kjournald2+0xa82000ca ([kernel.kallsyms])
                                       kthread+0xa82000d8 ([kernel.kallsyms])
                                       ret_from_fork+0xa820001f ([kernel.kallsyms])
     0.023 probe:submit_bio:(ffffffffac3aee00) bi_sector=50662552)
                                       submit_bio+0xa8200001 ([kernel.kallsyms])
                                       submit_bh+0xa8200013 ([kernel.kallsyms])
                                       jbd2_journal_commit_transaction+0xa8200691 ([kernel.kallsyms])
                                       kjournald2+0xa82000ca ([kernel.kallsyms])
                                       kthread+0xa82000d8 ([kernel.kallsyms])
                                       ret_from_fork+0xa820001f ([kernel.kallsyms])
     0.027 probe:submit_bio:(ffffffffac3aee00) bi_sector=50662560)
                                       submit_bio+0xa8200001 ([kernel.kallsyms])
                                       submit_bh+0xa8200013 ([kernel.kallsyms])
                                       jbd2_journal_commit_transaction+0xa8200691 ([kernel.kallsyms])
                                       kjournald2+0xa82000ca ([kernel.kallsyms])
                                       kthread+0xa82000d8 ([kernel.kallsyms])
                                       ret_from_fork+0xa820001f ([kernel.kallsyms])
     2.593 probe:submit_bio:(ffffffffac3aee00) bi_sector=50662568)
                                       submit_bio+0xa8200001 ([kernel.kallsyms])
                                       submit_bh+0xa8200013 ([kernel.kallsyms])
                                       journal_submit_commit_record+0xa82001ac ([kernel.kallsyms])
                                       jbd2_journal_commit_transaction+0xa82012e8 ([kernel.kallsyms])
                                       kjournald2+0xa82000ca ([kernel.kallsyms])
                                       kthread+0xa82000d8 ([kernel.kallsyms])
                                       ret_from_fork+0xa820001f ([kernel.kallsyms])
  ^C#

Signed-off-by: Naohiro Aota <naohiro.aota@hgst.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Hemant Kumar <hemant@linux.vnet.ibm.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1470710408-23515-1-git-send-email-naohiro.aota@hgst.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-probe.txt | 10 +++++++++-
 tools/perf/util/probe-finder.c          | 15 ++++++++++++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 736da44596e4..b303bcdd8ed1 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -176,10 +176,18 @@ Each probe argument follows below syntax.
 
 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
 '$vars' and '$params' special arguments are also available for NAME, '$vars' is expanded to the local variables (including function parameters) which can access at given probe point. '$params' is expanded to only the function parameters.
-'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type.
+'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), signedness casting (u/s), "string" and bitfield are supported. (see TYPES for detail)
 
 On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid.
 
+TYPES
+-----
+Basic types (u8/u16/u32/u64/s8/s16/s32/s64) are integer types. Prefix 's' and 'u' means those types are signed and unsigned respectively. Traced arguments are shown in decimal (signed) or hex (unsigned). You can also use 's' or 'u' to specify only signedness and leave its size auto-detected by perf probe.
+String type is a special type, which fetches a "null-terminated" string from kernel space. This means it will fail and store NULL if the string container has been paged out. You can specify 'string' type only for the local variable or structure member which is an array of or a pointer to 'char' or 'unsigned char' type.
+Bitfield is another special type, which takes 3 parameters, bit-width, bit-offset, and container-size (usually 32). The syntax is;
+
+ b<bit-width>@<bit-offset>/<container-size>
+
 LINE SYNTAX
 -----------
 Line range is described by following syntax.
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index f2d9ff064e2d..5c290c682afe 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -297,10 +297,13 @@ static int convert_variable_type(Dwarf_Die *vr_die,
 	char sbuf[STRERR_BUFSIZE];
 	int bsize, boffs, total;
 	int ret;
+	char sign;
 
 	/* TODO: check all types */
-	if (cast && strcmp(cast, "string") != 0) {
+	if (cast && strcmp(cast, "string") != 0 &&
+	    strcmp(cast, "s") != 0 && strcmp(cast, "u") != 0) {
 		/* Non string type is OK */
+		/* and respect signedness cast */
 		tvar->type = strdup(cast);
 		return (tvar->type == NULL) ? -ENOMEM : 0;
 	}
@@ -361,6 +364,13 @@ static int convert_variable_type(Dwarf_Die *vr_die,
 		return (tvar->type == NULL) ? -ENOMEM : 0;
 	}
 
+	if (cast && (strcmp(cast, "u") == 0))
+		sign = 'u';
+	else if (cast && (strcmp(cast, "s") == 0))
+		sign = 's';
+	else
+		sign = die_is_signed_type(&type) ? 's' : 'u';
+
 	ret = dwarf_bytesize(&type);
 	if (ret <= 0)
 		/* No size ... try to use default type */
@@ -373,8 +383,7 @@ static int convert_variable_type(Dwarf_Die *vr_die,
 			dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
 		ret = MAX_BASIC_TYPE_BITS;
 	}
-	ret = snprintf(buf, 16, "%c%d",
-		       die_is_signed_type(&type) ? 's' : 'u', ret);
+	ret = snprintf(buf, 16, "%c%d", sign, ret);
 
 formatted:
 	if (ret < 0 || ret >= 16) {
-- 
cgit v1.2.3


From bebfb730125863aac74607dff24f1bdb9fb02a90 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 9 Aug 2016 11:21:57 -0300
Subject: tools: Sync cpufeatures.h and vmx.h with the kernel

There were changes related to the deprecation of the "pcommit"
instruction:

  fd1d961dd681 ("x86/insn: remove pcommit")
  dfa169bbee00 ("Revert "KVM: x86: add pcommit support"")

No need to update anything in the tools, as "pcommit" wasn't being
listed on the VMX_EXIT_REASONS in the tools/perf/arch/x86/util/kvm-stat.c
file.

Just grab fresh copies of these files to silence the file cache
coherency detector:

  $ make -C tools/perf O=/tmp/build/perf install-bin
  make: Entering directory '/home/acme/git/linux/tools/perf'
    BUILD:   Doing 'make -j4' parallel build
  Warning: tools/arch/x86/include/asm/cpufeatures.h differs from kernel
  Warning: tools/arch/x86/include/uapi/asm/vmx.h differs from kernel
    INSTALL  GTK UI
  <SNIP>
  #

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-07pmcc1ysydhyyxbmp1vt0l4@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/asm/cpufeatures.h | 9 +++------
 tools/arch/x86/include/uapi/asm/vmx.h    | 4 +---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 4a413485f9eb..92a8308b96f6 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -225,7 +225,6 @@
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX		( 9*32+19) /* The ADCX and ADOX instructions */
 #define X86_FEATURE_SMAP	( 9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_PCOMMIT	( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT	( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_CLWB	( 9*32+24) /* CLWB instruction */
 #define X86_FEATURE_AVX512PF	( 9*32+26) /* AVX-512 Prefetch */
@@ -301,10 +300,6 @@
 #define X86_BUG_FXSAVE_LEAK	X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
 #define X86_BUG_CLFLUSH_MONITOR	X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
 #define X86_BUG_SYSRET_SS_ATTRS	X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
-#define X86_BUG_NULL_SEG	X86_BUG(9) /* Nulling a selector preserves the base */
-#define X86_BUG_SWAPGS_FENCE	X86_BUG(10) /* SWAPGS without input dep on GS */
-
-
 #ifdef CONFIG_X86_32
 /*
  * 64-bit kernels don't use X86_BUG_ESPFIX.  Make the define conditional
@@ -312,5 +307,7 @@
  */
 #define X86_BUG_ESPFIX		X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
 #endif
-
+#define X86_BUG_NULL_SEG	X86_BUG(10) /* Nulling a selector preserves the base */
+#define X86_BUG_SWAPGS_FENCE	X86_BUG(11) /* SWAPGS without input dep on GS */
+#define X86_BUG_MONITOR		X86_BUG(12) /* IPI required to wake up remote CPU */
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h
index 5b15d94a33f8..37fee272618f 100644
--- a/tools/arch/x86/include/uapi/asm/vmx.h
+++ b/tools/arch/x86/include/uapi/asm/vmx.h
@@ -78,7 +78,6 @@
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
-#define EXIT_REASON_PCOMMIT             65
 
 #define VMX_EXIT_REASONS \
 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
@@ -127,8 +126,7 @@
 	{ EXIT_REASON_INVVPID,               "INVVPID" }, \
 	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
 	{ EXIT_REASON_XSAVES,                "XSAVES" }, \
-	{ EXIT_REASON_XRSTORS,               "XRSTORS" }, \
-	{ EXIT_REASON_PCOMMIT,               "PCOMMIT" }
+	{ EXIT_REASON_XRSTORS,               "XRSTORS" }
 
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
-- 
cgit v1.2.3


From 791cceb89f7987c0375ff2e8971928a47f62ccae Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 9 Aug 2016 11:48:07 -0300
Subject: toops: Sync tools/include/uapi/linux/bpf.h with the kernel

The way we're using kernel headers in tools/ now, with a copy that is
made to the same path prefixed by "tools/" plus checking if that copy
got stale, i.e. if the kernel counterpart changed, helps in keeping
track with new features that may be useful for tools to exploit.

For instance, looking at all the changes to bpf.h since it was last
copied to tools/include brings this to toolers' attention:

Need to investigate this one to check how to run a program via perf, setting up
a BPF event, that will take advantage of the way perf already calls clang/LLVM,
sets up the event and runs the workload in a single command line, helping in
debugging such semi cooperative programs:

  96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers")

This one needs further investigation about using the feature it improves
in 'perf trace' to do some tcpdumpin' mixed with syscalls, tracepoints,
probe points, callgraphs, etc:

  555c8a8623a3 ("bpf: avoid stack copy and use skb ctx for event output")

Add tracing just packets that are related to some container to that mix:

  4a482f34afcc ("cgroup: bpf: Add bpf_skb_in_cgroup_proto")
  4ed8ec521ed5 ("cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY")

Definetely needs to have example programs accessing task_struct from a bpf proggie
started from 'perf trace':

  606274c5abd8 ("bpf: introduce bpf_get_current_task() helper")

Core networking related, XDP:

  6ce96ca348a9 ("bpf: add XDP_TX xdp_action for direct forwarding")
  6a773a15a1e8 ("bpf: add XDP prog type for early driver filter")
  13c5c240f789 ("bpf: add bpf_get_hash_recalc helper")
  d2485c4242a8 ("bpf: add bpf_skb_change_type helper")
  6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")

Changes detected by the tools build system:

  $ make -C tools/perf O=/tmp/build/perf install-bin
  make: Entering directory '/home/acme/git/linux/tools/perf'
    BUILD:   Doing 'make -j4' parallel build
  Warning: tools/include/uapi/linux/bpf.h differs from kernel
    INSTALL  GTK UI
    CC       /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o
  <SNIP>
  $

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Brenden Blanco <bblanco@plumgrid.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David Ahern <dsahern@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-difq4ts1xvww6eyfs9e7zlft@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/include/uapi/linux/bpf.h | 86 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 85 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 406459b935a2..da218fec6056 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -84,6 +84,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERCPU_HASH,
 	BPF_MAP_TYPE_PERCPU_ARRAY,
 	BPF_MAP_TYPE_STACK_TRACE,
+	BPF_MAP_TYPE_CGROUP_ARRAY,
 };
 
 enum bpf_prog_type {
@@ -93,6 +94,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
 	BPF_PROG_TYPE_TRACEPOINT,
+	BPF_PROG_TYPE_XDP,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
@@ -313,6 +315,66 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_skb_get_tunnel_opt,
 	BPF_FUNC_skb_set_tunnel_opt,
+
+	/**
+	 * bpf_skb_change_proto(skb, proto, flags)
+	 * Change protocol of the skb. Currently supported is
+	 * v4 -> v6, v6 -> v4 transitions. The helper will also
+	 * resize the skb. eBPF program is expected to fill the
+	 * new headers via skb_store_bytes and lX_csum_replace.
+	 * @skb: pointer to skb
+	 * @proto: new skb->protocol type
+	 * @flags: reserved
+	 * Return: 0 on success or negative error
+	 */
+	BPF_FUNC_skb_change_proto,
+
+	/**
+	 * bpf_skb_change_type(skb, type)
+	 * Change packet type of skb.
+	 * @skb: pointer to skb
+	 * @type: new skb->pkt_type type
+	 * Return: 0 on success or negative error
+	 */
+	BPF_FUNC_skb_change_type,
+
+	/**
+	 * bpf_skb_in_cgroup(skb, map, index) - Check cgroup2 membership of skb
+	 * @skb: pointer to skb
+	 * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+	 * @index: index of the cgroup in the bpf_map
+	 * Return:
+	 *   == 0 skb failed the cgroup2 descendant test
+	 *   == 1 skb succeeded the cgroup2 descendant test
+	 *    < 0 error
+	 */
+	BPF_FUNC_skb_in_cgroup,
+
+	/**
+	 * bpf_get_hash_recalc(skb)
+	 * Retrieve and possibly recalculate skb->hash.
+	 * @skb: pointer to skb
+	 * Return: hash
+	 */
+	BPF_FUNC_get_hash_recalc,
+
+	/**
+	 * u64 bpf_get_current_task(void)
+	 * Returns current task_struct
+	 * Return: current
+	 */
+	BPF_FUNC_get_current_task,
+
+	/**
+	 * bpf_probe_write_user(void *dst, void *src, int len)
+	 * safely attempt to write to a location
+	 * @dst: destination address in userspace
+	 * @src: source address on stack
+	 * @len: number of bytes to copy
+	 * Return: 0 on success or negative error
+	 */
+	BPF_FUNC_probe_write_user,
+
 	__BPF_FUNC_MAX_ID,
 };
 
@@ -347,9 +409,11 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
-/* BPF_FUNC_perf_event_output flags. */
+/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
 #define BPF_F_INDEX_MASK		0xffffffffULL
 #define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
+/* BPF_FUNC_perf_event_output for sk_buff input context. */
+#define BPF_F_CTXLEN_MASK		(0xfffffULL << 32)
 
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
@@ -386,4 +450,24 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* User return codes for XDP prog type.
+ * A valid XDP program must return one of these defined values. All other
+ * return codes are reserved for future use. Unknown return codes will result
+ * in packet drop.
+ */
+enum xdp_action {
+	XDP_ABORTED = 0,
+	XDP_DROP,
+	XDP_PASS,
+	XDP_TX,
+};
+
+/* user accessible metadata for XDP packet hook
+ * new fields must be added to the end of this structure
+ */
+struct xdp_md {
+	__u32 data;
+	__u32 data_end;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 840b49ba554b40fa8301ad2716abd2fe3d9e382a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 9 Aug 2016 11:56:33 -0300
Subject: tools: Sync cpufeatures headers with the kernel

Due to:

  1e61f78baf89 ("x86/cpufeature: Make sure DISABLED/REQUIRED macros are updated")

No changes to tools using those headers (tools/arch/x86/lib/mem{set,cpu}_64.S)
seems necessary.

Detected by the tools build header drift checker:

  $ make -C tools/perf O=/tmp/build/perf
  make: Entering directory '/home/acme/git/linux/tools/perf'
    BUILD:   Doing 'make -j4' parallel build
    GEN      /tmp/build/perf/common-cmds.h
  Warning: tools/arch/x86/include/asm/disabled-features.h differs from kernel
  Warning: tools/arch/x86/include/asm/required-features.h differs from kernel
  Warning: tools/arch/x86/include/asm/cpufeatures.h differs from kernel
    CC       /tmp/build/perf/util/probe-finder.o
    CC       /tmp/build/perf/builtin-help.o
  <SNIP>
  ^C$

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/n/tip-ja75m7zk8j0jkzmrv16i5ehw@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/arch/x86/include/asm/disabled-features.h | 2 ++
 tools/arch/x86/include/asm/required-features.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h
index 911e9358ceb1..85599ad4d024 100644
--- a/tools/arch/x86/include/asm/disabled-features.h
+++ b/tools/arch/x86/include/asm/disabled-features.h
@@ -56,5 +56,7 @@
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
 #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE)
+#define DISABLED_MASK17	0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
 
 #endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h
index 4916144e3c42..fac9a5c0abe9 100644
--- a/tools/arch/x86/include/asm/required-features.h
+++ b/tools/arch/x86/include/asm/required-features.h
@@ -99,5 +99,7 @@
 #define REQUIRED_MASK14	0
 #define REQUIRED_MASK15	0
 #define REQUIRED_MASK16	0
+#define REQUIRED_MASK17	0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
 
 #endif /* _ASM_X86_REQUIRED_FEATURES_H */
-- 
cgit v1.2.3


From d820456dc70b231d62171ba46b43db0045e9bd57 Mon Sep 17 00:00:00 2001
From: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Date: Tue, 9 Aug 2016 01:23:24 -0500
Subject: perf probe: Add function to post process kernel trace events

Instead of inline code, introduce function to post process kernel
probe trace events.

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1470723805-5081-1-git-send-email-ravi.bangoria@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 1201f73ca723..234fbfb5c2ed 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -666,22 +666,14 @@ static int add_module_to_probe_trace_events(struct probe_trace_event *tevs,
 	return ret;
 }
 
-/* Post processing the probe events */
-static int post_process_probe_trace_events(struct probe_trace_event *tevs,
-					   int ntevs, const char *module,
-					   bool uprobe)
+static int
+post_process_kernel_probe_trace_events(struct probe_trace_event *tevs,
+				       int ntevs)
 {
 	struct ref_reloc_sym *reloc_sym;
 	char *tmp;
 	int i, skipped = 0;
 
-	if (uprobe)
-		return add_exec_to_probe_trace_events(tevs, ntevs, module);
-
-	/* Note that currently ref_reloc_sym based probe is not for drivers */
-	if (module)
-		return add_module_to_probe_trace_events(tevs, ntevs, module);
-
 	reloc_sym = kernel_get_ref_reloc_sym();
 	if (!reloc_sym) {
 		pr_warning("Relocated base symbol is not found!\n");
@@ -713,6 +705,21 @@ static int post_process_probe_trace_events(struct probe_trace_event *tevs,
 	return skipped;
 }
 
+/* Post processing the probe events */
+static int post_process_probe_trace_events(struct probe_trace_event *tevs,
+					   int ntevs, const char *module,
+					   bool uprobe)
+{
+	if (uprobe)
+		return add_exec_to_probe_trace_events(tevs, ntevs, module);
+
+	if (module)
+		/* Currently ref_reloc_sym based probe is not for drivers */
+		return add_module_to_probe_trace_events(tevs, ntevs, module);
+
+	return post_process_kernel_probe_trace_events(tevs, ntevs);
+}
+
 /* Try to find perf_probe_event with debuginfo */
 static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
 					  struct probe_trace_event **tevs)
-- 
cgit v1.2.3


From 99e608b5954c9e1ebadbf9660b74697d9dfd9f20 Mon Sep 17 00:00:00 2001
From: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Date: Tue, 9 Aug 2016 01:23:25 -0500
Subject: perf probe ppc64le: Fix probe location when using DWARF

Powerpc has Global Entry Point and Local Entry Point for functions.  LEP
catches call from both the GEP and the LEP. Symbol table of ELF contains
GEP and Offset from which we can calculate LEP, but debuginfo does not
have LEP info.

Currently, perf prioritize symbol table over dwarf to probe on LEP for
ppc64le. But when user tries to probe with function parameter, we fall
back to using dwarf(i.e. GEP) and when function called via LEP, probe
will never hit.

For example:

  $ objdump -d vmlinux
    ...
    do_sys_open():
    c0000000002eb4a0:       e8 00 4c 3c     addis   r2,r12,232
    c0000000002eb4a4:       60 00 42 38     addi    r2,r2,96
    c0000000002eb4a8:       a6 02 08 7c     mflr    r0
    c0000000002eb4ac:       d0 ff 41 fb     std     r26,-48(r1)

  $ sudo ./perf probe do_sys_open
  $ sudo cat /sys/kernel/debug/tracing/kprobe_events
    p:probe/do_sys_open _text+3060904

  $ sudo ./perf probe 'do_sys_open filename:string'
  $ sudo cat /sys/kernel/debug/tracing/kprobe_events
    p:probe/do_sys_open _text+3060896 filename_string=+0(%gpr4):string

For second case, perf probed on GEP. So when function will be called via
LEP, probe won't hit.

  $ sudo ./perf record -a -e probe:do_sys_open ls
    [ perf record: Woken up 1 times to write data ]
    [ perf record: Captured and wrote 0.195 MB perf.data ]

To resolve this issue, let's not prioritize symbol table, let perf
decide what it wants to use. Perf is already converting GEP to LEP when
it uses symbol table. When perf uses debuginfo, let it find LEP offset
form symbol table. This way we fall back to probe on LEP for all cases.

After patch:

  $ sudo ./perf probe 'do_sys_open filename:string'
  $ sudo cat /sys/kernel/debug/tracing/kprobe_events
    p:probe/do_sys_open _text+3060904 filename_string=+0(%gpr4):string

  $ sudo ./perf record -a -e probe:do_sys_open ls
    [ perf record: Woken up 1 times to write data ]
    [ perf record: Captured and wrote 0.197 MB perf.data (11 samples) ]

Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Wang Nan <wangnan0@huawei.com>
Link: http://lkml.kernel.org/r/1470723805-5081-2-git-send-email-ravi.bangoria@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/arch/powerpc/util/sym-handling.c | 27 +++++++++++++++++----
 tools/perf/util/probe-event.c               | 37 ++++++++++++++++-------------
 tools/perf/util/probe-event.h               |  6 ++++-
 3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
index c6d0f91731a1..8d4dc97d80ba 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -54,10 +54,6 @@ int arch__compare_symbol_names(const char *namea, const char *nameb)
 #endif
 
 #if defined(_CALL_ELF) && _CALL_ELF == 2
-bool arch__prefers_symtab(void)
-{
-	return true;
-}
 
 #ifdef HAVE_LIBELF_SUPPORT
 void arch__sym_update(struct symbol *s, GElf_Sym *sym)
@@ -100,4 +96,27 @@ void arch__fix_tev_from_maps(struct perf_probe_event *pev,
 			tev->point.offset += lep_offset;
 	}
 }
+
+void arch__post_process_probe_trace_events(struct perf_probe_event *pev,
+					   int ntevs)
+{
+	struct probe_trace_event *tev;
+	struct map *map;
+	struct symbol *sym = NULL;
+	struct rb_node *tmp;
+	int i = 0;
+
+	map = get_target_map(pev->target, pev->uprobes);
+	if (!map || map__load(map, NULL) < 0)
+		return;
+
+	for (i = 0; i < ntevs; i++) {
+		tev = &pev->tevs[i];
+		map__for_each_symbol(map, sym, tmp) {
+			if (map->unmap_ip(map, sym->start) == tev->point.address)
+				arch__fix_tev_from_maps(pev, tev, map, sym);
+		}
+	}
+}
+
 #endif
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 234fbfb5c2ed..28733962cd80 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -180,7 +180,7 @@ static struct map *kernel_get_module_map(const char *module)
 	return NULL;
 }
 
-static struct map *get_target_map(const char *target, bool user)
+struct map *get_target_map(const char *target, bool user)
 {
 	/* Init maps of given executable or kernel */
 	if (user)
@@ -705,19 +705,32 @@ post_process_kernel_probe_trace_events(struct probe_trace_event *tevs,
 	return skipped;
 }
 
+void __weak
+arch__post_process_probe_trace_events(struct perf_probe_event *pev __maybe_unused,
+				      int ntevs __maybe_unused)
+{
+}
+
 /* Post processing the probe events */
-static int post_process_probe_trace_events(struct probe_trace_event *tevs,
+static int post_process_probe_trace_events(struct perf_probe_event *pev,
+					   struct probe_trace_event *tevs,
 					   int ntevs, const char *module,
 					   bool uprobe)
 {
-	if (uprobe)
-		return add_exec_to_probe_trace_events(tevs, ntevs, module);
+	int ret;
 
-	if (module)
+	if (uprobe)
+		ret = add_exec_to_probe_trace_events(tevs, ntevs, module);
+	else if (module)
 		/* Currently ref_reloc_sym based probe is not for drivers */
-		return add_module_to_probe_trace_events(tevs, ntevs, module);
+		ret = add_module_to_probe_trace_events(tevs, ntevs, module);
+	else
+		ret = post_process_kernel_probe_trace_events(tevs, ntevs);
 
-	return post_process_kernel_probe_trace_events(tevs, ntevs);
+	if (ret >= 0)
+		arch__post_process_probe_trace_events(pev, ntevs);
+
+	return ret;
 }
 
 /* Try to find perf_probe_event with debuginfo */
@@ -758,7 +771,7 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
 
 	if (ntevs > 0) {	/* Succeeded to find trace events */
 		pr_debug("Found %d probe_trace_events.\n", ntevs);
-		ret = post_process_probe_trace_events(*tevs, ntevs,
+		ret = post_process_probe_trace_events(pev, *tevs, ntevs,
 						pev->target, pev->uprobes);
 		if (ret < 0 || ret == ntevs) {
 			clear_probe_trace_events(*tevs, ntevs);
@@ -2945,8 +2958,6 @@ errout:
 	return err;
 }
 
-bool __weak arch__prefers_symtab(void) { return false; }
-
 /* Concatinate two arrays */
 static void *memcat(void *a, size_t sz_a, void *b, size_t sz_b)
 {
@@ -3167,12 +3178,6 @@ static int convert_to_probe_trace_events(struct perf_probe_event *pev,
 	if (ret > 0 || pev->sdt)	/* SDT can be found only in the cache */
 		return ret == 0 ? -ENOENT : ret; /* Found in probe cache */
 
-	if (arch__prefers_symtab() && !perf_probe_event_need_dwarf(pev)) {
-		ret = find_probe_trace_events_from_map(pev, tevs);
-		if (ret > 0)
-			return ret; /* Found in symbol table */
-	}
-
 	/* Convert perf_probe_event with debuginfo */
 	ret = try_to_find_probe_trace_events(pev, tevs);
 	if (ret != 0)
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index e18ea9fe6385..f4f45db77c1c 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -158,7 +158,6 @@ int show_line_range(struct line_range *lr, const char *module, bool user);
 int show_available_vars(struct perf_probe_event *pevs, int npevs,
 			struct strfilter *filter);
 int show_available_funcs(const char *module, struct strfilter *filter, bool user);
-bool arch__prefers_symtab(void);
 void arch__fix_tev_from_maps(struct perf_probe_event *pev,
 			     struct probe_trace_event *tev, struct map *map,
 			     struct symbol *sym);
@@ -173,4 +172,9 @@ int e_snprintf(char *str, size_t size, const char *format, ...)
 int copy_to_probe_trace_arg(struct probe_trace_arg *tvar,
 			    struct perf_probe_arg *pvar);
 
+struct map *get_target_map(const char *target, bool user);
+
+void arch__post_process_probe_trace_events(struct perf_probe_event *pev,
+					   int ntevs);
+
 #endif /*_PROBE_EVENT_H */
-- 
cgit v1.2.3


From d8734849d8007dacaa40b31ba7319ed28077141d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 8 Aug 2016 15:24:02 +0200
Subject: rbd: nuke the 32-bit pool id check

ceph_file_layout::pool_id is now s64.  rbd_add_get_pool_id() and
ceph_pg_poolid_by_name() both return an int, so it's bogus anyway.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 07668a6f0607..6c6519f6492a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -5337,15 +5337,6 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 	}
 	spec->pool_id = (u64)rc;
 
-	/* The ceph file layout needs to fit pool id in 32 bits */
-
-	if (spec->pool_id > (u64)U32_MAX) {
-		rbd_warn(NULL, "pool id too large (%llu > %u)",
-				(unsigned long long)spec->pool_id, U32_MAX);
-		rc = -EIO;
-		goto err_out_client;
-	}
-
 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
 	if (!rbd_dev) {
 		rc = -ENOMEM;
-- 
cgit v1.2.3


From 4eacd4cb3a7c4794688ef4fad5d01f3a532a58e9 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 9 Aug 2016 16:12:09 +0200
Subject: ceph: initialize pathbase in the !dentry case in encode_caps_cb()

pathbase is the base inode; set it to 0 if we've got no path.

Coverity-id: 146348
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 fs/ceph/mds_client.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fa59a85226b2..f72d4ae303b2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2759,6 +2759,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	} else {
 		path = NULL;
 		pathlen = 0;
+		pathbase = 0;
 	}
 
 	spin_lock(&ci->i_ceph_lock);
-- 
cgit v1.2.3


From a026bb12cc57d758e045126a252e12e868076cb4 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Wed, 3 Aug 2016 18:08:55 +0100
Subject: drivers/perf: arm-pmu: convert arm_pmu_mutex to spinlock

arm_pmu_mutex is never held long and we don't want to sleep while the
lock is being held as it's executed in the context of hotplug notifiers.
So it can be converted to a simple spinlock instead.

Without this patch we get the following warning:

BUG: sleeping function called from invalid context at kernel/locking/mutex.c:620
in_atomic(): 1, irqs_disabled(): 128, pid: 0, name: swapper/2
no locks held by swapper/2/0.
irq event stamp: 381314
hardirqs last  enabled at (381313): _raw_spin_unlock_irqrestore+0x7c/0x88
hardirqs last disabled at (381314): cpu_die+0x28/0x48
softirqs last  enabled at (381294): _local_bh_enable+0x28/0x50
softirqs last disabled at (381293): irq_enter+0x58/0x78
CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.7.0 #12
Call trace:
 dump_backtrace+0x0/0x220
 show_stack+0x24/0x30
 dump_stack+0xb4/0xf0
 ___might_sleep+0x1d8/0x1f0
 __might_sleep+0x5c/0x98
 mutex_lock_nested+0x54/0x400
 arm_perf_starting_cpu+0x34/0xb0
 cpuhp_invoke_callback+0x88/0x3d8
 notify_cpu_starting+0x78/0x98
 secondary_start_kernel+0x108/0x1a8

This patch converts the mutex to spinlock to eliminate the above
warnings. This constraints pmu->reset to be non-blocking call which is
the case with all the ARM PMU backends.

Cc: Stephen Boyd <sboyd@codeaurora.org>
Fixes: 37b502f121ad ("arm/perf: Fix hotplug state machine conversion")
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_pmu.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 6ccb994bdfcb..4c9a537a1265 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -688,7 +688,7 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
 	return 0;
 }
 
-static DEFINE_MUTEX(arm_pmu_mutex);
+static DEFINE_SPINLOCK(arm_pmu_lock);
 static LIST_HEAD(arm_pmu_list);
 
 /*
@@ -701,7 +701,7 @@ static int arm_perf_starting_cpu(unsigned int cpu)
 {
 	struct arm_pmu *pmu;
 
-	mutex_lock(&arm_pmu_mutex);
+	spin_lock(&arm_pmu_lock);
 	list_for_each_entry(pmu, &arm_pmu_list, entry) {
 
 		if (!cpumask_test_cpu(cpu, &pmu->supported_cpus))
@@ -709,7 +709,7 @@ static int arm_perf_starting_cpu(unsigned int cpu)
 		if (pmu->reset)
 			pmu->reset(pmu);
 	}
-	mutex_unlock(&arm_pmu_mutex);
+	spin_unlock(&arm_pmu_lock);
 	return 0;
 }
 
@@ -821,9 +821,9 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
 	if (!cpu_hw_events)
 		return -ENOMEM;
 
-	mutex_lock(&arm_pmu_mutex);
+	spin_lock(&arm_pmu_lock);
 	list_add_tail(&cpu_pmu->entry, &arm_pmu_list);
-	mutex_unlock(&arm_pmu_mutex);
+	spin_unlock(&arm_pmu_lock);
 
 	err = cpu_pm_pmu_register(cpu_pmu);
 	if (err)
@@ -859,9 +859,9 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
 	return 0;
 
 out_unregister:
-	mutex_lock(&arm_pmu_mutex);
+	spin_lock(&arm_pmu_lock);
 	list_del(&cpu_pmu->entry);
-	mutex_unlock(&arm_pmu_mutex);
+	spin_unlock(&arm_pmu_lock);
 	free_percpu(cpu_hw_events);
 	return err;
 }
@@ -869,9 +869,9 @@ out_unregister:
 static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
 {
 	cpu_pm_pmu_unregister(cpu_pmu);
-	mutex_lock(&arm_pmu_mutex);
+	spin_lock(&arm_pmu_lock);
 	list_del(&cpu_pmu->entry);
-	mutex_unlock(&arm_pmu_mutex);
+	spin_unlock(&arm_pmu_lock);
 	free_percpu(cpu_pmu->hw_events);
 }
 
-- 
cgit v1.2.3


From 7f1d642fbb5c356519617c24757a0cbed7f800a8 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Tue, 19 Jul 2016 15:39:02 +0100
Subject: drivers/perf: arm-pmu: Fix handling of SPI lacking
 "interrupt-affinity" property

Patch 19a469a58720 ("drivers/perf: arm-pmu: Handle per-interrupt
affinity mask") added support for partitionned PPI setups, but
inadvertently broke setups using SPIs without the "interrupt-affinity"
property (which is the case for UP platforms).

This patch restore the broken functionnality by testing whether the
interrupt is percpu or not instead of relying on the using_spi flag
that really means "SPI *and* interrupt-affinity property".

Acked-by: Mark Rutland <mark.rutland@arm.com>
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Fixes: 19a469a58720 ("drivers/perf: arm-pmu: Handle per-interrupt affinity mask")
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 drivers/perf/arm_pmu.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 4c9a537a1265..c494613c1909 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -967,11 +967,12 @@ static int of_pmu_irq_cfg(struct arm_pmu *pmu)
 
 	/* If we didn't manage to parse anything, try the interrupt affinity */
 	if (cpumask_weight(&pmu->supported_cpus) == 0) {
-		if (!using_spi) {
+		int irq = platform_get_irq(pdev, 0);
+
+		if (irq_is_percpu(irq)) {
 			/* If using PPIs, check the affinity of the partition */
-			int ret, irq;
+			int ret;
 
-			irq = platform_get_irq(pdev, 0);
 			ret = irq_get_percpu_devid_partition(irq, &pmu->supported_cpus);
 			if (ret) {
 				kfree(irqs);
-- 
cgit v1.2.3


From c4159a75b64c0e67caededf4d7372c1b58a5f42a Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@virtuozzo.com>
Date: Mon, 8 Aug 2016 23:03:12 +0300
Subject: mm: memcontrol: only mark charged pages with PageKmemcg

To distinguish non-slab pages charged to kmemcg we mark them PageKmemcg,
which sets page->_mapcount to -512.  Currently, we set/clear PageKmemcg
in __alloc_pages_nodemask()/free_pages_prepare() for any page allocated
with __GFP_ACCOUNT, including those that aren't actually charged to any
cgroup, i.e. allocated from the root cgroup context.  To avoid overhead
in case cgroups are not used, we only do that if memcg_kmem_enabled() is
true.  The latter is set iff there are kmem-enabled memory cgroups
(online or offline).  The root cgroup is not considered kmem-enabled.

As a result, if a page is allocated with __GFP_ACCOUNT for the root
cgroup when there are kmem-enabled memory cgroups and is freed after all
kmem-enabled memory cgroups were removed, e.g.

  # no memory cgroups has been created yet, create one
  mkdir /sys/fs/cgroup/memory/test
  # run something allocating pages with __GFP_ACCOUNT, e.g.
  # a program using pipe
  dmesg | tail
  # remove the memory cgroup
  rmdir /sys/fs/cgroup/memory/test

we'll get bad page state bug complaining about page->_mapcount != -1:

  BUG: Bad page state in process swapper/0  pfn:1fd945c
  page:ffffea007f651700 count:0 mapcount:-511 mapping:          (null) index:0x0
  flags: 0x1000000000000000()

To avoid that, let's mark with PageKmemcg only those pages that are
actually charged to and hence pin a non-root memory cgroup.

Fixes: 4949148ad433 ("mm: charge/uncharge kmemcg from generic page allocator paths")
Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c       |  4 +---
 mm/memcontrol.c | 14 ++++++++++++--
 mm/page_alloc.c | 14 +++++---------
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 4b32928f5426..4ebe6b2e5217 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -144,10 +144,8 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
 	struct page *page = buf->page;
 
 	if (page_count(page) == 1) {
-		if (memcg_kmem_enabled()) {
+		if (memcg_kmem_enabled())
 			memcg_kmem_uncharge(page, 0);
-			__ClearPageKmemcg(page);
-		}
 		__SetPageLocked(page);
 		return 0;
 	}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 66beca1ad92f..e74d7080ec9e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2337,8 +2337,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 		return 0;
 
 	memcg = get_mem_cgroup_from_mm(current->mm);
-	if (!mem_cgroup_is_root(memcg))
+	if (!mem_cgroup_is_root(memcg)) {
 		ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
+		if (!ret)
+			__SetPageKmemcg(page);
+	}
 	css_put(&memcg->css);
 	return ret;
 }
@@ -2365,6 +2368,11 @@ void memcg_kmem_uncharge(struct page *page, int order)
 		page_counter_uncharge(&memcg->memsw, nr_pages);
 
 	page->mem_cgroup = NULL;
+
+	/* slab pages do not have PageKmemcg flag set */
+	if (PageKmemcg(page))
+		__ClearPageKmemcg(page);
+
 	css_put_many(&memcg->css, nr_pages);
 }
 #endif /* !CONFIG_SLOB */
@@ -5537,8 +5545,10 @@ static void uncharge_list(struct list_head *page_list)
 			else
 				nr_file += nr_pages;
 			pgpgout++;
-		} else
+		} else {
 			nr_kmem += 1 << compound_order(page);
+			__ClearPageKmemcg(page);
+		}
 
 		page->mem_cgroup = NULL;
 	} while (next != page_list);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fb975cec3518..ee744fa3b93d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1008,10 +1008,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	}
 	if (PageMappingFlags(page))
 		page->mapping = NULL;
-	if (memcg_kmem_enabled() && PageKmemcg(page)) {
+	if (memcg_kmem_enabled() && PageKmemcg(page))
 		memcg_kmem_uncharge(page, order);
-		__ClearPageKmemcg(page);
-	}
 	if (check_free)
 		bad += free_pages_check(page);
 	if (bad)
@@ -3756,12 +3754,10 @@ no_zone:
 	}
 
 out:
-	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) {
-		if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) {
-			__free_pages(page, order);
-			page = NULL;
-		} else
-			__SetPageKmemcg(page);
+	if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
+	    unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+		__free_pages(page, order);
+		page = NULL;
 	}
 
 	if (kmemcheck_enabled && page)
-- 
cgit v1.2.3


From a3d1ddd932bc86f0f7027079754ab0aefd259109 Mon Sep 17 00:00:00 2001
From: Brian King <brking@linux.vnet.ibm.com>
Date: Mon, 8 Aug 2016 17:53:12 -0500
Subject: ipr: Fix sync scsi scan

Commit b195d5e2bffd ("ipr: Wait to do async scan until scsi host is
initialized") fixed async scan for ipr, but broke sync scan for ipr.

This fixes sync scan back up.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
Reported-and-tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/scsi/ipr.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index bf85974be862..17d04c702e1b 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -10410,8 +10410,11 @@ static int ipr_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id)
 		__ipr_remove(pdev);
 		return rc;
 	}
+	spin_lock_irqsave(ioa_cfg->host->host_lock, flags);
+	ioa_cfg->scan_enabled = 1;
+	schedule_work(&ioa_cfg->work_q);
+	spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags);
 
-	scsi_scan_host(ioa_cfg->host);
 	ioa_cfg->iopoll_weight = ioa_cfg->chip_cfg->iopoll_weight;
 
 	if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) {
@@ -10421,10 +10424,8 @@ static int ipr_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id)
 		}
 	}
 
-	spin_lock_irqsave(ioa_cfg->host->host_lock, flags);
-	ioa_cfg->scan_enabled = 1;
-	schedule_work(&ioa_cfg->work_q);
-	spin_unlock_irqrestore(ioa_cfg->host->host_lock, flags);
+	scsi_scan_host(ioa_cfg->host);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From a0cba2179ea4c1820fce2ee046b6ed90ecc56196 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 9 Aug 2016 10:48:18 -0700
Subject: Revert "printk: create pr_<level> functions"

This reverts commit 874f9c7da9a4acbc1b9e12ca722579fb50e4d142.

Geert Uytterhoeven reports:
 "This change seems to have an (unintendent?) side-effect.

  Before, pr_*() calls without a trailing newline characters would be
  printed with a newline character appended, both on the console and in
  the output of the dmesg command.

  After this commit, no new line character is appended, and the output
  of the next pr_*() call of the same type may be appended, like in:

    - Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000
    - Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)
    + Truncating RAM at 0x0000000040000000-0x00000000c0000000 to -0x0000000070000000Ignoring RAM at 0x0000000200000000-0x0000000240000000 (!CONFIG_HIGHMEM)"

Joe Perches says:
 "No, that is not intentional.

  The newline handling code inside vprintk_emit is a bit involved and
  for now I suggest a revert until this has all the same behavior as
  earlier"

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Requested-by: Joe Perches <joe@perches.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h   | 48 +++++++++++++++---------------------------------
 kernel/printk/internal.h | 16 ++++++----------
 kernel/printk/nmi.c      | 13 ++-----------
 kernel/printk/printk.c   | 25 +++----------------------
 4 files changed, 26 insertions(+), 76 deletions(-)

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 8dc155dab3ed..696a56be7d3e 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -266,39 +266,21 @@ extern asmlinkage void dump_stack(void) __cold;
  * and other debug macros are compiled out unless either DEBUG is defined
  * or CONFIG_DYNAMIC_DEBUG is set.
  */
-
-#ifdef CONFIG_PRINTK
-
-asmlinkage __printf(1, 2) __cold void __pr_emerg(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_alert(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_crit(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_err(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_warn(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_notice(const char *fmt, ...);
-asmlinkage __printf(1, 2) __cold void __pr_info(const char *fmt, ...);
-
-#define pr_emerg(fmt, ...)	__pr_emerg(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_alert(fmt, ...)	__pr_alert(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_crit(fmt, ...)	__pr_crit(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_err(fmt, ...)	__pr_err(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_warn(fmt, ...)	__pr_warn(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_notice(fmt, ...)	__pr_notice(pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_info(fmt, ...)	__pr_info(pr_fmt(fmt), ##__VA_ARGS__)
-
-#else
-
-#define pr_emerg(fmt, ...)	printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_alert(fmt, ...)	printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_crit(fmt, ...)	printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_err(fmt, ...)	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_warn(fmt, ...)	printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_notice(fmt, ...)	printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
-#define pr_info(fmt, ...)	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
-
-#endif
-
-#define pr_warning pr_warn
-
+#define pr_emerg(fmt, ...) \
+	printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_alert(fmt, ...) \
+	printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit(fmt, ...) \
+	printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err(fmt, ...) \
+	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warning(fmt, ...) \
+	printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_warn pr_warning
+#define pr_notice(fmt, ...) \
+	printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info(fmt, ...) \
+	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 /*
  * Like KERN_CONT, pr_cont() should only be used when continuing
  * a line with no newline ('\n') enclosed. Otherwise it defaults
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 5d4505f30083..7fd2838fa417 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -16,11 +16,9 @@
  */
 #include <linux/percpu.h>
 
-typedef __printf(2, 0) int (*printk_func_t)(int level, const char *fmt,
-					    va_list args);
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
 
-__printf(2, 0)
-int vprintk_default(int level, const char *fmt, va_list args);
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK_NMI
 
@@ -33,10 +31,9 @@ extern raw_spinlock_t logbuf_lock;
  * via per-CPU variable.
  */
 DECLARE_PER_CPU(printk_func_t, printk_func);
-__printf(2, 0)
-static inline int vprintk_func(int level, const char *fmt, va_list args)
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
-	return this_cpu_read(printk_func)(level, fmt, args);
+	return this_cpu_read(printk_func)(fmt, args);
 }
 
 extern atomic_t nmi_message_lost;
@@ -47,10 +44,9 @@ static inline int get_nmi_message_lost(void)
 
 #else /* CONFIG_PRINTK_NMI */
 
-__printf(2, 0)
-static inline int vprintk_func(int level, const char *fmt, va_list args)
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 {
-	return vprintk_default(level, fmt, args);
+	return vprintk_default(fmt, args);
 }
 
 static inline int get_nmi_message_lost(void)
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index bc3eeb1ae6da..b69eb8a2876f 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
  * one writer running. But the buffer might get flushed from another
  * CPU, so we need to be careful.
  */
-static int vprintk_nmi(int level, const char *fmt, va_list args)
+static int vprintk_nmi(const char *fmt, va_list args)
 {
 	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
 	int add = 0;
@@ -79,16 +79,7 @@ again:
 	if (!len)
 		smp_rmb();
 
-	if (level != LOGLEVEL_DEFAULT) {
-		add = snprintf(s->buffer + len, sizeof(s->buffer) - len,
-				KERN_SOH "%c", '0' + level);
-		add += vsnprintf(s->buffer + len + add,
-				 sizeof(s->buffer) - len - add,
-				 fmt, args);
-	} else {
-		add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len,
-				fmt, args);
-	}
+	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
 
 	/*
 	 * Do it once again if the buffer has been flushed in the meantime.
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a37fc8cf8e84..eea6dbc2d8cf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1930,26 +1930,7 @@ asmlinkage int printk_emit(int facility, int level,
 }
 EXPORT_SYMBOL(printk_emit);
 
-#define define_pr_level(func, loglevel)				\
-asmlinkage __visible void func(const char *fmt, ...)		\
-{								\
-	va_list args;						\
-								\
-	va_start(args, fmt);					\
-	vprintk_default(loglevel, fmt, args);			\
-	va_end(args);						\
-}								\
-EXPORT_SYMBOL(func)
-
-define_pr_level(__pr_emerg, LOGLEVEL_EMERG);
-define_pr_level(__pr_alert, LOGLEVEL_ALERT);
-define_pr_level(__pr_crit, LOGLEVEL_CRIT);
-define_pr_level(__pr_err, LOGLEVEL_ERR);
-define_pr_level(__pr_warn, LOGLEVEL_WARNING);
-define_pr_level(__pr_notice, LOGLEVEL_NOTICE);
-define_pr_level(__pr_info, LOGLEVEL_INFO);
-
-int vprintk_default(int level, const char *fmt, va_list args)
+int vprintk_default(const char *fmt, va_list args)
 {
 	int r;
 
@@ -1959,7 +1940,7 @@ int vprintk_default(int level, const char *fmt, va_list args)
 		return r;
 	}
 #endif
-	r = vprintk_emit(0, level, NULL, 0, fmt, args);
+	r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
 
 	return r;
 }
@@ -1992,7 +1973,7 @@ asmlinkage __visible int printk(const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	r = vprintk_func(LOGLEVEL_DEFAULT, fmt, args);
+	r = vprintk_func(fmt, args);
 	va_end(args);
 
 	return r;
-- 
cgit v1.2.3


From 51350ea0d7f355dfc03deb343a665802d3d5cbba Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Thu, 4 Aug 2016 21:36:05 +0300
Subject: mm, writeback: flush plugged IO in wakeup_flusher_threads()

I've found funny live-lock between raid10 barriers during resync and
memory controller hard limits. Inside mpage_readpages() task holds on to
its plug bio which blocks the barrier in raid10. Its memory cgroup have
no free memory thus the task goes into reclaimer but all reclaimable
pages are dirty and cannot be written because raid10 is rebuilding and
stuck on the barrier.

Common flush of such IO in schedule() never happens, because the caller
doesn't go to sleep.

Lock is 'live' because changing memory limit or killing tasks which
holds that stuck bio unblock whole progress.

That was what happened in 3.18.x but I see no difference in upstream
logic.  Theoretically this might happen even without memory cgroup.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4d09d4441e3e..05713a5da083 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1949,6 +1949,12 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 {
 	struct backing_dev_info *bdi;
 
+	/*
+	 * If we are expecting writeback progress we must submit plugged IO.
+	 */
+	if (blk_needs_flush_plug(current))
+		blk_schedule_flush_plug(current);
+
 	if (!nr_pages)
 		nr_pages = get_nr_dirty_pages();
 
-- 
cgit v1.2.3


From c6d2ee09c2fffd3efdd31be2b2811d081a45bb99 Mon Sep 17 00:00:00 2001
From: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Date: Mon, 8 Aug 2016 11:57:48 +0200
Subject: cxl: Set psl_fir_cntl to production environment value

Switch the setting of psl_fir_cntl from debug to production
environment recommended value. It mostly affects the PSL behavior when
an error is raised in psl_fir1/2.

Tested with cxlflash.

Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Reviewed-by: Uma Krishnan <ukrishn@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 drivers/misc/cxl/pci.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 1d0347c36e6d..6f0c4ac4b649 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -379,7 +379,7 @@ static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id
 
 static int init_implementation_adapter_psl_regs(struct cxl *adapter, struct pci_dev *dev)
 {
-	u64 psl_dsnctl;
+	u64 psl_dsnctl, psl_fircntl;
 	u64 chipid;
 	u64 capp_unit_id;
 	int rc;
@@ -398,8 +398,11 @@ static int init_implementation_adapter_psl_regs(struct cxl *adapter, struct pci_
 	cxl_p1_write(adapter, CXL_PSL_RESLCKTO, 0x20000000200ULL);
 	/* snoop write mask */
 	cxl_p1_write(adapter, CXL_PSL_SNWRALLOC, 0x00000000FFFFFFFFULL);
-	/* set fir_accum */
-	cxl_p1_write(adapter, CXL_PSL_FIR_CNTL, 0x0800000000000000ULL);
+	/* set fir_cntl to recommended value for production env */
+	psl_fircntl = (0x2ULL << (63-3)); /* ce_report */
+	psl_fircntl |= (0x1ULL << (63-6)); /* FIR_report */
+	psl_fircntl |= 0x1ULL; /* ce_thresh */
+	cxl_p1_write(adapter, CXL_PSL_FIR_CNTL, psl_fircntl);
 	/* for debugging with trace arrays */
 	cxl_p1_write(adapter, CXL_PSL_TRACE, 0x0000FF7C00000000ULL);
 
-- 
cgit v1.2.3


From 1bc8b816cb8058c31f61fe78442f10a43209e582 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 2 Aug 2016 10:07:05 +0200
Subject: powerpc/32: Fix csum_partial_copy_generic()

Commit 7aef4136566b0 ("powerpc32: rewrite csum_partial_copy_generic()
based on copy_tofrom_user()") introduced a bug when destination
address is odd and initial csum is not null

In that (rare) case the initial csum value has to be rotated one byte
as well as the resulting value is

This patch also fixes related comments

Fixes: 7aef4136566b0 ("powerpc32: rewrite csum_partial_copy_generic() based on copy_tofrom_user()")
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/lib/checksum_32.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index d90870a66b60..0a57fe6d49cc 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -127,8 +127,9 @@ _GLOBAL(csum_partial_copy_generic)
 	stw	r7,12(r1)
 	stw	r8,8(r1)
 
-	andi.	r0,r4,1			/* is destination address even ? */
-	cmplwi	cr7,r0,0
+	rlwinm	r0,r4,3,0x8
+	rlwnm	r6,r6,r0,0,31	/* odd destination address: rotate one byte */
+	cmplwi	cr7,r0,0	/* is destination address even ? */
 	addic	r12,r6,0
 	addi	r6,r4,-4
 	neg	r0,r4
@@ -237,7 +238,7 @@ _GLOBAL(csum_partial_copy_generic)
 66:	addze	r3,r12
 	addi	r1,r1,16
 	beqlr+	cr7
-	rlwinm	r3,r3,8,0,31	/* swap bytes for odd destination */
+	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
 	blr
 
 /* read fault */
-- 
cgit v1.2.3


From c7a318ba868c61fc9be710a4970172d8c2eeb8b9 Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Wed, 10 Aug 2016 15:44:46 +1000
Subject: powerpc/ptrace: Fix coredump since ptrace TM changes

Commit 8d460f6156cd ("powerpc/process: Add the function
flush_tmregs_to_thread") added flush_tmregs_to_thread() and included
the assumption that it would only be called for a task which is not
current.

Although this is correct for ptrace, when generating a core dump, some
of the routines which call flush_tmregs_to_thread() are called. This
leads to a WARNing such as:

  Not expecting ptrace on self: TM regs may be incorrect
  ------------[ cut here ]------------
  WARNING: CPU: 123 PID: 7727 at arch/powerpc/kernel/process.c:1088 flush_tmregs_to_thread+0x78/0x80
  CPU: 123 PID: 7727 Comm: libvirtd Not tainted 4.8.0-rc1-gcc6x-g61e8a0d #1
  task: c000000fe631b600 task.stack: c000000fe63b0000
  NIP: c00000000001a1a8 LR: c00000000001a1a4 CTR: c000000000717780
  REGS: c000000fe63b3420 TRAP: 0700   Not tainted  (4.8.0-rc1-gcc6x-g61e8a0d)
  MSR: 900000010282b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE,TM[E]>  CR: 28004222  XER: 20000000
  ...
  NIP [c00000000001a1a8] flush_tmregs_to_thread+0x78/0x80
  LR [c00000000001a1a4] flush_tmregs_to_thread+0x74/0x80
  Call Trace:
   flush_tmregs_to_thread+0x74/0x80 (unreliable)
   vsr_get+0x64/0x1a0
   elf_core_dump+0x604/0x1430
   do_coredump+0x5fc/0x1200
   get_signal+0x398/0x740
   do_signal+0x54/0x2b0
   do_notify_resume+0x98/0xb0
   ret_from_except_lite+0x70/0x74

So fix flush_tmregs_to_thread() to detect the case where it is called on
current, and a transaction is active, and in that case flush the TM regs
to the thread_struct.

This patch also moves flush_tmregs_to_thread() into ptrace.c as it is
only called from that file.

Fixes: 8d460f6156cd ("powerpc/process: Add the function flush_tmregs_to_thread")
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
[mpe: Flesh out change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/switch_to.h |  8 --------
 arch/powerpc/kernel/process.c        | 20 --------------------
 arch/powerpc/kernel/ptrace.c         | 19 +++++++++++++++++++
 3 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 0a74ebe934e1..17c8380673a6 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -75,14 +75,6 @@ static inline void disable_kernel_spe(void)
 static inline void __giveup_spe(struct task_struct *t) { }
 #endif
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-extern void flush_tmregs_to_thread(struct task_struct *);
-#else
-static inline void flush_tmregs_to_thread(struct task_struct *t)
-{
-}
-#endif
-
 static inline void clear_task_ebb(struct task_struct *t)
 {
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 58ccf86415b4..9ee2623e0f67 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1074,26 +1074,6 @@ static inline void restore_sprs(struct thread_struct *old_thread,
 #endif
 }
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-void flush_tmregs_to_thread(struct task_struct *tsk)
-{
-	/*
-	 * Process self tracing is not yet supported through
-	 * ptrace interface. Ptrace generic code should have
-	 * prevented this from happening in the first place.
-	 * Warn once here with the message, if some how it
-	 * is attempted.
-	 */
-	WARN_ONCE(tsk == current,
-		"Not expecting ptrace on self: TM regs may be incorrect\n");
-
-	/*
-	 * If task is not current, it should have been flushed
-	 * already to it's thread_struct during __switch_to().
-	 */
-}
-#endif
-
 struct task_struct *__switch_to(struct task_struct *prev,
 	struct task_struct *new)
 {
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 4f3c5756cc09..bf91658a8a40 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -38,6 +38,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/switch_to.h>
+#include <asm/tm.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -118,6 +119,24 @@ static const struct pt_regs_offset regoffset_table[] = {
 	REG_OFFSET_END,
 };
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static void flush_tmregs_to_thread(struct task_struct *tsk)
+{
+	/*
+	 * If task is not current, it will have been flushed already to
+	 * it's thread_struct during __switch_to().
+	 *
+	 * A reclaim flushes ALL the state.
+	 */
+
+	if (tsk == current && MSR_TM_SUSPENDED(mfmsr()))
+		tm_reclaim_current(TM_CAUSE_SIGNAL);
+
+}
+#else
+static inline void flush_tmregs_to_thread(struct task_struct *tsk) { }
+#endif
+
 /**
  * regs_query_register_offset() - query register offset from its name
  * @name:	the name of a register
-- 
cgit v1.2.3


From 7d70c63c7132eb95e428e945242360233ab23d82 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 10 Aug 2016 17:29:29 +1000
Subject: powerpc: Print the kernel load address at the end of prom_init()

This makes it easier to debug crashes that happen very early before
the kernel takes over Open Firmware by allowing us to relate the OF
reported crashing addresses to offsets within the kernel.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/prom_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 6ee4b72cda42..4e74fc588a3f 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -2940,7 +2940,7 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4,
 
 	/* Don't print anything after quiesce under OPAL, it crashes OFW */
 	if (of_platform != PLATFORM_OPAL) {
-		prom_printf("Booting Linux via __start() ...\n");
+		prom_printf("Booting Linux via __start() @ 0x%lx ...\n", kbase);
 		prom_debug("->dt_header_start=0x%x\n", hdr);
 	}
 
-- 
cgit v1.2.3


From f9cc1d1f808dbdfd56978259d262b879504efaed Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 10 Aug 2016 17:32:38 +1000
Subject: powerpc: Update obsolete comment in setup_32.c about early_init()

We don't identify the machine type anymore...

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/setup_32.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index c3e861df4b20..f10a975baa5e 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -93,10 +93,8 @@ notrace unsigned long __init early_init(unsigned long dt_ptr)
  * and we are running with enough of the MMU enabled to have our
  * proper kernel virtual addresses
  *
- * Find out what kind of machine we're on and save any data we need
- * from the early boot process (devtree is copied on pmac by prom_init()).
- * This is called very early on the boot process, after a minimal
- * MMU environment has been set up but before MMU_init is called.
+ * We do the initial parsing of the flat device-tree and prepares
+ * for the MMU to be fully initialized.
  */
 extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */
 
-- 
cgit v1.2.3


From 97f6e0cc35026a2a09147a6da636d901525e1969 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 10 Aug 2016 17:27:34 +1000
Subject: powerpc/32: Fix crash during static key init

We cannot do those initializations from apply_feature_fixups() as
this function runs in a very restricted environment on 32-bit where
the kernel isn't running at its linked address and the PTRRELOC()
macro must be used for any global accesss.

Instead, split them into a separtate steup_feature_keys() function
which is called in a more suitable spot on ppc32.

Fixes: 309b315b6ec6 ("powerpc: Call jump_label_init() in apply_feature_fixups()")
Reported-and-tested-by: Christian Kujau <lists@nerdbynature.de>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/feature-fixups.h | 1 +
 arch/powerpc/kernel/setup_32.c            | 3 +++
 arch/powerpc/kernel/setup_64.c            | 1 +
 arch/powerpc/lib/feature-fixups.c         | 3 +++
 4 files changed, 8 insertions(+)

diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index 57fec8ac7b92..ddf54f5bbdd1 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -186,6 +186,7 @@ label##3:					       	\
 
 #ifndef __ASSEMBLY__
 void apply_feature_fixups(void);
+void setup_feature_keys(void);
 #endif
 
 #endif /* __ASM_POWERPC_FEATURE_FIXUPS_H */
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index f10a975baa5e..24ec3ea4b3a2 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -100,6 +100,9 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */
 
 notrace void __init machine_init(u64 dt_ptr)
 {
+	/* Configure static keys first, now that we're relocated. */
+	setup_feature_keys();
+
 	/* Enable early debugging if any specified (see udbg.h) */
 	udbg_early_init();
 
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index eafb9a79e011..7ac8e6eaab5b 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -300,6 +300,7 @@ void __init early_setup(unsigned long dt_ptr)
 
 	/* Apply all the dynamic patching */
 	apply_feature_fixups();
+	setup_feature_keys();
 
 	/* Initialize the hash table or TLB handling */
 	early_init_mmu();
diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
index 74145f02ad41..043415f0bdb1 100644
--- a/arch/powerpc/lib/feature-fixups.c
+++ b/arch/powerpc/lib/feature-fixups.c
@@ -188,7 +188,10 @@ void __init apply_feature_fixups(void)
 			  &__start___fw_ftr_fixup, &__stop___fw_ftr_fixup);
 #endif
 	do_final_fixups();
+}
 
+void __init setup_feature_keys(void)
+{
 	/*
 	 * Initialise jump label. This causes all the cpu/mmu_has_feature()
 	 * checks to take on their correct polarity based on the current set of
-- 
cgit v1.2.3


From 1a9e4c564ab174e53ed86def922804a5ddc63e7d Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Thu, 14 Jul 2016 17:22:54 +0200
Subject: x86/timers/apic: Fix imprecise timer interrupts by eliminating TSC
 clockevents frequency roundoff error

I noticed the following bug/misbehavior on certain Intel systems: with a
single task running on a NOHZ CPU on an Intel Haswell, I recognized
that I did not only get the one expected local_timer APIC interrupt, but
two per second at minimum. (!)

Further tracing showed that the first one precedes the programmed deadline
by up to ~50us and hence, it did nothing except for reprogramming the TSC
deadline clockevent device to trigger shortly thereafter again.

The reason for this is imprecise calibration, the timeout we program into
the APIC results in 'too short' timer interrupts. The core (hr)timer code
notices this (because it has a precise ktime source and sees the short
interrupt) and fixes it up by programming an additional very short
interrupt period.

This is obviously suboptimal.

The reason for the imprecise calibration is twofold, and this patch
fixes the first reason:

In setup_APIC_timer(), the registered clockevent device's frequency
is calculated by first dividing tsc_khz by TSC_DIVISOR and multiplying
it with 1000 afterwards:

  (tsc_khz / TSC_DIVISOR) * 1000

The multiplication with 1000 is done for converting from kHz to Hz and the
division by TSC_DIVISOR is carried out in order to make sure that the final
result fits into an u32.

However, with the order given in this calculation, the roundoff error
introduced by the division gets magnified by a factor of 1000 by the
following multiplication.

To fix it, reversing the order of the division and the multiplication a la:

  (tsc_khz * 1000) / TSC_DIVISOR

... reduces the roundoff error already.

Furthermore, if TSC_DIVISOR divides 1000, associativity holds:

  (tsc_khz * 1000) / TSC_DIVISOR = tsc_khz * (1000 / TSC_DIVISOR)

and thus, the roundoff error even vanishes and the whole operation can be
carried out within 32 bits.

The powers of two that divide 1000 are 2, 4 and 8. A value of 8 for
TSC_DIVISOR still allows for TSC frequencies up to
2^32 / 10^9ns * 8 = 34.4GHz which is way larger than anything to expect
in the next years.

Thus we also replace the current TSC_DIVISOR value of 32 by 8. Reverse
the order of the divison and the multiplication in the calculation of
the registered clockevent device's frequency.

Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Christopher S. Hall <christopher.s.hall@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20160714152255.18295-2-nicstange@gmail.com
[ Improved changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/apic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ac8d8ad8b009..a315dc404756 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -313,7 +313,7 @@ int lapic_get_maxlvt(void)
 
 /* Clock divisor */
 #define APIC_DIVISOR 16
-#define TSC_DIVISOR  32
+#define TSC_DIVISOR  8
 
 /*
  * This function sets up the local APIC timer, with a timeout of
@@ -565,7 +565,7 @@ static void setup_APIC_timer(void)
 				    CLOCK_EVT_FEAT_DUMMY);
 		levt->set_next_event = lapic_next_deadline;
 		clockevents_config_and_register(levt,
-						(tsc_khz / TSC_DIVISOR) * 1000,
+						tsc_khz * (1000 / TSC_DIVISOR),
 						0xF, ~0UL);
 	} else
 		clockevents_register_device(levt);
-- 
cgit v1.2.3


From 6731b0d611a1274f9e785fa0189ac2aeeabd0591 Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Thu, 14 Jul 2016 17:22:55 +0200
Subject: x86/timers/apic: Inform TSC deadline clockevent device about
 recalibration

This patch eliminates a source of imprecise APIC timer interrupts,
which imprecision may result in double interrupts or even late
interrupts.

The TSC deadline clockevent devices' configuration and registration
happens before the TSC frequency calibration is refined in
tsc_refine_calibration_work().

This results in the TSC clocksource and the TSC deadline clockevent
devices being configured with slightly different frequencies: the former
gets the refined one and the latter are configured with the inaccurate
frequency detected earlier by means of the "Fast TSC calibration using PIT".

Within the APIC code, introduce the notifier function
lapic_update_tsc_freq() which reconfigures all per-CPU TSC deadline
clockevent devices with the current tsc_khz.

Call it from the TSC code after TSC calibration refinement has happened.

Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Christopher S. Hall <christopher.s.hall@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hidehiro Kawai <hidehiro.kawai.ez@hitachi.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Link: http://lkml.kernel.org/r/20160714152255.18295-3-nicstange@gmail.com
[ Pushed #ifdef CONFIG_X86_LOCAL_APIC into header, improved changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/apic.h |  2 ++
 arch/x86/kernel/apic/apic.c | 24 ++++++++++++++++++++++++
 arch/x86/kernel/tsc.c       |  4 ++++
 3 files changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f5befd4945f2..124357773ffa 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -135,6 +135,7 @@ extern void init_apic_mappings(void);
 void register_lapic_address(unsigned long address);
 extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
+extern void lapic_update_tsc_freq(void);
 extern int APIC_init_uniprocessor(void);
 
 #ifdef CONFIG_X86_64
@@ -170,6 +171,7 @@ static inline void init_apic_mappings(void) { }
 static inline void disable_local_APIC(void) { }
 # define setup_boot_APIC_clock x86_init_noop
 # define setup_secondary_APIC_clock x86_init_noop
+static inline void lapic_update_tsc_freq(void) { }
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_X2APIC
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a315dc404756..0fd3d659f13c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -571,6 +571,30 @@ static void setup_APIC_timer(void)
 		clockevents_register_device(levt);
 }
 
+/*
+ * Install the updated TSC frequency from recalibration at the TSC
+ * deadline clockevent devices.
+ */
+static void __lapic_update_tsc_freq(void *info)
+{
+	struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
+
+	if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+		return;
+
+	clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR));
+}
+
+void lapic_update_tsc_freq(void)
+{
+	/*
+	 * The clockevent device's ->mult and ->shift can both be
+	 * changed. In order to avoid races, schedule the frequency
+	 * update code on each CPU.
+	 */
+	on_each_cpu(__lapic_update_tsc_freq, NULL, 0);
+}
+
 /*
  * In this functions we calibrate APIC bus clocks to the external timer.
  *
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a804b5ab32d0..8fb4b6abac0e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -22,6 +22,7 @@
 #include <asm/nmi.h>
 #include <asm/x86_init.h>
 #include <asm/geode.h>
+#include <asm/apic.h>
 
 unsigned int __read_mostly cpu_khz;	/* TSC clocks / usec, not used here */
 EXPORT_SYMBOL(cpu_khz);
@@ -1249,6 +1250,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
 		(unsigned long)tsc_khz / 1000,
 		(unsigned long)tsc_khz % 1000);
 
+	/* Inform the TSC deadline clockevent devices about the recalibration */
+	lapic_update_tsc_freq();
+
 out:
 	if (boot_cpu_has(X86_FEATURE_ART))
 		art_related_clocksource = &clocksource_tsc;
-- 
cgit v1.2.3


From 0b8f1e2e26bfc6b9abe3f0f3faba2cb0eecb9fb9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 4 Aug 2016 14:37:24 +0200
Subject: perf/core: Fix sideband list-iteration vs. event ordering NULL
 pointer deference crash

Vegard Nossum reported that perf fuzzing generates a NULL
pointer dereference crash:

> Digging a bit deeper into this, it seems the event itself is getting
> created by perf_event_open() and it gets added to the pmu_event_list
> through:
>
> perf_event_open()
>  - perf_event_alloc()
>     - account_event()
>        - account_pmu_sb_event()
>           - attach_sb_event()
>
> so at this point the event is being attached but its ->ctx is still
> NULL. It seems like ->ctx is set just a bit later in
> perf_event_open(), though.
>
> But before that, __schedule() comes along and creates a stack trace
> similar to the one above:
>
> __schedule()
>  - __perf_event_task_sched_out()
>    - perf_iterate_sb()
>      - perf_iterate_sb_cpu()
>         - event_filter_match()
>           - perf_cgroup_match()
>             - __get_cpu_context()
>               - (dereference ctx which is NULL)
>
> So I guess the question is... should the event be attached (= put on
> the list) before ->ctx gets set? Or should the cgroup code check for a
> NULL ->ctx?

The latter seems like the simplest solution. Moving the list-add later
creates a bit of a mess.

Reported-by: Vegard Nossum <vegard.nossum@gmail.com>
Tested-by: Vegard Nossum <vegard.nossum@gmail.com>
Tested-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Carrillo-Cisneros <davidcc@google.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: f2fb6bef9251 ("perf/core: Optimize side-band event delivery")
Link: http://lkml.kernel.org/r/20160804123724.GN6862@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index a19550d80ab1..87d02b8cb87e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1716,8 +1716,8 @@ static inline int pmu_filter_match(struct perf_event *event)
 static inline int
 event_filter_match(struct perf_event *event)
 {
-	return (event->cpu == -1 || event->cpu == smp_processor_id())
-	    && perf_cgroup_match(event) && pmu_filter_match(event);
+	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+	       perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
@@ -1737,8 +1737,8 @@ event_sched_out(struct perf_event *event,
 	 * maintained, otherwise bogus information is return
 	 * via read() for time_enabled, time_running:
 	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE
-	    && !event_filter_match(event)) {
+	if (event->state == PERF_EVENT_STATE_INACTIVE &&
+	    !event_filter_match(event)) {
 		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
@@ -2236,10 +2236,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 
 	lockdep_assert_held(&ctx->mutex);
 
-	event->ctx = ctx;
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
+	/*
+	 * Ensures that if we can observe event->ctx, both the event and ctx
+	 * will be 'complete'. See perf_iterate_sb_cpu().
+	 */
+	smp_store_release(&event->ctx, ctx);
+
 	if (!task) {
 		cpu_function_call(cpu, __perf_install_in_context, event);
 		return;
@@ -5969,6 +5974,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 	struct perf_event *event;
 
 	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		/*
+		 * Skip events that are not fully formed yet; ensure that
+		 * if we observe event->ctx, both event and ctx will be
+		 * complete enough. See perf_install_in_context().
+		 */
+		if (!smp_load_acquire(&event->ctx))
+			continue;
+
 		if (event->state < PERF_EVENT_STATE_INACTIVE)
 			continue;
 		if (!event_filter_match(event))
-- 
cgit v1.2.3


From db4a835601b73cf8d6cd8986381d966b8e13d2d9 Mon Sep 17 00:00:00 2001
From: David Carrillo-Cisneros <davidcc@google.com>
Date: Tue, 2 Aug 2016 00:48:12 -0700
Subject: perf/core: Set cgroup in CPU contexts for new cgroup events

There's a perf stat bug easy to observer on a machine with only one cgroup:

  $ perf stat -e cycles -I 1000 -C 0 -G /
  #          time             counts unit events
      1.000161699      <not counted>      cycles                    /
      2.000355591      <not counted>      cycles                    /
      3.000565154      <not counted>      cycles                    /
      4.000951350      <not counted>      cycles                    /

We'd expect some output there.

The underlying problem is that there is an optimization in
perf_cgroup_sched_{in,out}() that skips the switch of cgroup events
if the old and new cgroups in a task switch are the same.

This optimization interacts with the current code in two ways
that cause a CPU context's cgroup (cpuctx->cgrp) to be NULL even if a
cgroup event matches the current task. These are:

  1. On creation of the first cgroup event in a CPU: In current code,
  cpuctx->cpu is only set in perf_cgroup_sched_in, but due to the
  aforesaid optimization, perf_cgroup_sched_in will run until the next
  cgroup switches in that CPU. This may happen late or never happen,
  depending on system's number of cgroups, CPU load, etc.

  2. On deletion of the last cgroup event in a cpuctx: In list_del_event,
  cpuctx->cgrp is set NULL. Any new cgroup event will not be sched in
  because cpuctx->cgrp == NULL until a cgroup switch occurs and
  perf_cgroup_sched_in is executed (updating cpuctx->cgrp).

This patch fixes both problems by setting cpuctx->cgrp in list_add_event,
mirroring what list_del_event does when removing a cgroup event from CPU
context, as introduced in:

  commit 68cacd29167b ("perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx()")

With this patch, cpuctx->cgrp is always set/clear when installing/removing
the first/last cgroup event in/from the CPU context. With cpuctx->cgrp
correctly set, event_filter_match works as intended when events are
sched in/out.

After the fix, the output is as expected:

  $ perf stat -e cycles -I 1000 -a -G /
  #         time             counts unit events
     1.004699159          627342882      cycles                    /
     2.007397156          615272690      cycles                    /
     3.010019057          616726074      cycles                    /

Signed-off-by: David Carrillo-Cisneros <davidcc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1470124092-113192-1-git-send-email-davidcc@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  4 ++++
 kernel/events/core.c       | 54 ++++++++++++++++++++++++++++++----------------
 2 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8ed4326164cc..2b6b43cc0dd5 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -743,7 +743,9 @@ struct perf_event_context {
 	u64				parent_gen;
 	u64				generation;
 	int				pin_count;
+#ifdef CONFIG_CGROUP_PERF
 	int				nr_cgroups;	 /* cgroup evts */
+#endif
 	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
 };
@@ -769,7 +771,9 @@ struct perf_cpu_context {
 	unsigned int			hrtimer_active;
 
 	struct pmu			*unique_pmu;
+#ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp;
+#endif
 };
 
 struct perf_output_handle {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 87d02b8cb87e..1903b8f3a705 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -843,6 +843,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 		}
 	}
 }
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (!is_cgroup_event(event))
+		return;
+
+	if (add && ctx->nr_cgroups++)
+		return;
+	else if (!add && --ctx->nr_cgroups)
+		return;
+	/*
+	 * Because cgroup events are always per-cpu events,
+	 * this will always be called from the right CPU.
+	 */
+	cpuctx = __get_cpu_context(ctx);
+	cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
 #else /* !CONFIG_CGROUP_PERF */
 
 static inline bool
@@ -920,6 +946,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 			 struct perf_event_context *ctx)
 {
 }
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+}
+
 #endif
 
 /*
@@ -1392,6 +1425,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+
 	lockdep_assert_held(&ctx->lock);
 
 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1446,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
-	if (is_cgroup_event(event))
-		ctx->nr_cgroups++;
+	list_update_cgroup_event(event, ctx, true);
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
@@ -1581,8 +1614,6 @@ static void perf_group_attach(struct perf_event *event)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_cpu_context *cpuctx;
-
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
 
@@ -1594,20 +1625,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-	if (is_cgroup_event(event)) {
-		ctx->nr_cgroups--;
-		/*
-		 * Because cgroup events are always per-cpu events, this will
-		 * always be called from the right CPU.
-		 */
-		cpuctx = __get_cpu_context(ctx);
-		/*
-		 * If there are no more cgroup events then clear cgrp to avoid
-		 * stale pointer in update_cgrp_time_from_cpuctx().
-		 */
-		if (!ctx->nr_cgroups)
-			cpuctx->cgrp = NULL;
-	}
+	list_update_cgroup_event(event, ctx, false);
 
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
-- 
cgit v1.2.3


From a23eadfae2fd45536a355b785d5a1533e1955c22 Mon Sep 17 00:00:00 2001
From: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Date: Tue, 19 Jul 2016 11:44:50 +0200
Subject: sched/deadline: Fix wrap-around in DL heap

Current code in cpudeadline.c has a bug in re-heapifying when adding a
new element at the end of the heap, because a deadline value of 0 is
temporarily set in the new elem, then cpudl_change_key() is called
with the actual elem deadline as param.

However, the function compares the new deadline to set with the one
previously in the elem, which is 0.  So, if current absolute deadlines
grew so much to have negative values as s64, the comparison in
cpudl_change_key() makes the wrong decision.  Instead, as from
dl_time_before(), the kernel should handle correctly abs deadlines
wrap-arounds.

This patch fixes the problem with a minimally invasive change that
forces cpudl_change_key() to heapify up in this case.

Signed-off-by: Tommaso Cucinotta <tommaso.cucinotta@sssup.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Luca Abeni <luca.abeni@unitn.it>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Juri Lelli <juri.lelli@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1468921493-10054-2-git-send-email-tommaso.cucinotta@sssup.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpudeadline.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5be58820465c..d4184498c9f5 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
 
 	if (old_idx == IDX_INVALID) {
 		cp->size++;
-		cp->elements[cp->size - 1].dl = 0;
+		cp->elements[cp->size - 1].dl = dl;
 		cp->elements[cp->size - 1].cpu = cpu;
 		cp->elements[cpu].idx = cp->size - 1;
 		cpudl_change_key(cp, cp->size - 1, dl);
-- 
cgit v1.2.3


From b8922125e4790fa237a8a4204562ecf457ef54bb Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Sat, 9 Jul 2016 15:54:22 +0800
Subject: sched/fair: Fix typo in sync_throttle()

We should update cfs_rq->throttled_clock_task, not
pcfs_rq->throttle_clock_task.

The effects of this bug was probably occasionally erratic
group scheduling, particularly in cgroups-intense workloads.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
[ Added changelog. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 55e16d30bd99 ("sched/fair: Rework throttle_count sync")
Link: http://lkml.kernel.org/r/1468050862-18864-1-git-send-email-xlpang@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4088eedea763..039de34f1521 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
 	pcfs_rq = tg->parent->cfs_rq[cpu];
 
 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
-	pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
-- 
cgit v1.2.3


From 6075620b0590eaf22f10ce88833eb20a57f760d6 Mon Sep 17 00:00:00 2001
From: Giovanni Gherdovich <ggherdovich@suse.cz>
Date: Fri, 5 Aug 2016 10:21:56 +0200
Subject: sched/cputime: Mitigate performance regression in
 times()/clock_gettime()

Commit:

  6e998916dfe3 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency")

fixed a problem whereby clock_nanosleep() followed by clock_gettime() could
allow a task to wake early. It addressed the problem by calling the scheduling
classes update_curr() when the cputimer starts.

Said change induced a considerable performance regression on the syscalls
times() and clock_gettimes(CLOCK_PROCESS_CPUTIME_ID). There are some
debuggers and applications that monitor their own performance that
accidentally depend on the performance of these specific calls.

This patch mitigates the performace loss by prefetching data in the CPU
cache, as stalls due to cache misses appear to be where most time is spent
in our benchmarks.

Here are the performance gain of this patch over v4.7-rc7 on a Sandy Bridge
box with 32 logical cores and 2 NUMA nodes. The test is repeated with a
variable number of threads, from 2 to 4*num_cpus; the results are in
seconds and correspond to the average of 10 runs; the percentage gain is
computed with (before-after)/before so a positive value is an improvement
(it's faster). The improvement varies between a few percents for 5-20
threads and more than 10% for 2 or >20 threads.

pound_clock_gettime:

    threads       4.7-rc7     patched 4.7-rc7
    [num]         [secs]      [secs (percent)]
      2           3.48        3.06 ( 11.83%)
      5           3.33        3.25 (  2.40%)
      8           3.37        3.26 (  3.30%)
     12           3.32        3.37 ( -1.60%)
     21           4.01        3.90 (  2.74%)
     30           3.63        3.36 (  7.41%)
     48           3.71        3.11 ( 16.27%)
     79           3.75        3.16 ( 15.74%)
    110           3.81        3.25 ( 14.80%)
    128           3.88        3.31 ( 14.76%)

pound_times:

    threads       4.7-rc7     patched 4.7-rc7
    [num]         [secs]      [secs (percent)]
      2           3.65        3.25 ( 11.03%)
      5           3.45        3.17 (  7.92%)
      8           3.52        3.22 (  8.69%)
     12           3.29        3.36 ( -2.04%)
     21           4.07        3.92 (  3.78%)
     30           3.87        3.40 ( 12.17%)
     48           3.79        3.16 ( 16.61%)
     79           3.88        3.28 ( 15.42%)
    110           3.90        3.38 ( 13.35%)
    128           4.00        3.38 ( 15.45%)

pound_clock_gettime and pound_clock_gettime are two benchmarks included in
the MMTests framework. They launch a given number of threads which
repeatedly call times() or clock_gettimes(). The results above can be
reproduced with cloning MMTests from github.com and running the "poundtime"
workload:

  $ git clone https://github.com/gormanm/mmtests.git
  $ cd mmtests
  $ cp configs/config-global-dhp__workload_poundtime config
  $ ./run-mmtests.sh --run-monitor $(uname -r)

The above will run "poundtime" measuring the kernel currently running on
the machine; Once a new kernel is installed and the machine rebooted,
running again

  $ cd mmtests
  $ ./run-mmtests.sh --run-monitor $(uname -r)

will produce results to compare with. A comparison table will be output
with:

  $ cd mmtests/work/log
  $ ../../compare-kernels.sh

the table will contain a lot of entries; grepping for "Amean" (as in
"arithmetic mean") will give the tables presented above. The source code
for the two benchmarks is reported at the end of this changelog for
clairity.

The cache misses addressed by this patch were found using a combination of
`perf top`, `perf record` and `perf annotate`. The incriminated lines were
found to be

    struct sched_entity *curr = cfs_rq->curr;

and

    delta_exec = now - curr->exec_start;

in the function update_curr() from kernel/sched/fair.c. This patch
prefetches the data from memory just before update_curr is called in the
interested execution path.

A comparison of the total number of cycles before and after the patch
follows; the data is obtained using `perf stat -r 10 -ddd <program>`
running over the same sequence of number of threads used above (a positive
gain is an improvement):

  threads   cycles before                 cycles after                gain

    2      19,699,563,964  +-1.19%      17,358,917,517  +-1.85%      11.88%
    5      47,401,089,566  +-2.96%      45,103,730,829  +-0.97%       4.85%
    8      80,923,501,004  +-3.01%      71,419,385,977  +-0.77%      11.74%
   12     112,326,485,473  +-0.47%     110,371,524,403  +-0.47%       1.74%
   21     193,455,574,299  +-0.72%     180,120,667,904  +-0.36%       6.89%
   30     315,073,519,013  +-1.64%     271,222,225,950  +-1.29%      13.92%
   48     321,969,515,332  +-1.48%     273,353,977,321  +-1.16%      15.10%
   79     337,866,003,422  +-0.97%     289,462,481,538  +-1.05%      14.33%
  110     338,712,691,920  +-0.78%     290,574,233,170  +-0.77%      14.21%
  128     348,384,794,006  +-0.50%     292,691,648,206  +-0.66%      15.99%

A comparison of cache miss vs total cache loads ratios, before and after
the patch (again from the `perf stat -r 10 -ddd <program>` tables):

  threads   L1 misses/total*100     L1 misses/total*100            gain
		         before                   after
      2           7.43  +-4.90%           7.36  +-4.70%           0.94%
      5          13.09  +-4.74%          13.52  +-3.73%          -3.28%
      8          13.79  +-5.61%          12.90  +-3.27%           6.45%
     12          11.57  +-2.44%           8.71  +-1.40%          24.72%
     21          12.39  +-3.92%           9.97  +-1.84%          19.53%
     30          13.91  +-2.53%          11.73  +-2.28%          15.67%
     48          13.71  +-1.59%          12.32  +-1.97%          10.14%
     79          14.44  +-0.66%          13.40  +-1.06%           7.20%
    110          15.86  +-0.50%          14.46  +-0.59%           8.83%
    128          16.51  +-0.32%          15.06  +-0.78%           8.78%

As a final note, the following shows the evolution of performance figures
in the "poundtime" benchmark and pinpoints commit 6e998916dfe3
("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") as a
major source of degradation, mostly unaddressed to this day (figures
expressed in seconds).

pound_clock_gettime:

  threads   parent of         6e998916dfe3        4.7-rc7
	    6e998916dfe3            itself
    2        2.23          3.68 ( -64.56%)        3.48 (-55.48%)
    5        2.83          3.78 ( -33.42%)        3.33 (-17.43%)
    8        2.84          4.31 ( -52.12%)        3.37 (-18.76%)
    12       3.09          3.61 ( -16.74%)        3.32 ( -7.17%)
    21       3.14          4.63 ( -47.36%)        4.01 (-27.71%)
    30       3.28          5.75 ( -75.37%)        3.63 (-10.80%)
    48       3.02          6.05 (-100.56%)        3.71 (-22.99%)
    79       2.88          6.30 (-118.90%)        3.75 (-30.26%)
    110      2.95          6.46 (-119.00%)        3.81 (-29.24%)
    128      3.05          6.42 (-110.08%)        3.88 (-27.04%)

pound_times:

  threads   parent of         6e998916dfe3        4.7-rc7
	    6e998916dfe3            itself
    2        2.27          3.73 ( -64.71%)        3.65 (-61.14%)
    5        2.78          3.77 ( -35.56%)        3.45 (-23.98%)
    8        2.79          4.41 ( -57.71%)        3.52 (-26.05%)
    12       3.02          3.56 ( -17.94%)        3.29 ( -9.08%)
    21       3.10          4.61 ( -48.74%)        4.07 (-31.34%)
    30       3.33          5.75 ( -72.53%)        3.87 (-16.01%)
    48       2.96          6.06 (-105.04%)        3.79 (-28.10%)
    79       2.88          6.24 (-116.83%)        3.88 (-34.81%)
    110      2.98          6.37 (-114.08%)        3.90 (-31.12%)
    128      3.10          6.35 (-104.61%)        4.00 (-28.87%)

The source code of the two benchmarks follows. To compile the two:

  NR_THREADS=42
  for FILE in pound_times pound_clock_gettime; do
      gcc -lrt -O2 -lpthread -DNUM_THREADS=$NR_THREADS $FILE.c -o $FILE
  done

==== BEGIN pound_times.c ====

struct tms start;

void *pound (void *threadid)
{
  struct tms end;
  int oldutime = 0;
  int utime;
  int i;
  for (i = 0; i < 5000000 / NUM_THREADS; i++) {
          times(&end);
          utime = ((int)end.tms_utime - (int)start.tms_utime);
          if (oldutime > utime) {
            printf("utime decreased, was %d, now %d!\n", oldutime, utime);
          }
          oldutime = utime;
  }
  pthread_exit(NULL);
}

int main()
{
  pthread_t th[NUM_THREADS];
  long i;
  times(&start);
  for (i = 0; i < NUM_THREADS; i++) {
    pthread_create (&th[i], NULL, pound, (void *)i);
  }
  pthread_exit(NULL);
  return 0;
}
==== END pound_times.c ====

==== BEGIN pound_clock_gettime.c ====

void *pound (void *threadid)
{
	struct timespec ts;
	int rc, i;
	unsigned long prev = 0, this = 0;

	for (i = 0; i < 5000000 / NUM_THREADS; i++) {
		rc = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
		if (rc < 0)
			perror("clock_gettime");
		this = (ts.tv_sec * 1000000000) + ts.tv_nsec;
		if (0 && this < prev)
			printf("%lu ns timewarp at iteration %d\n", prev - this, i);
		prev = this;
	}
	pthread_exit(NULL);
}

int main()
{
	pthread_t th[NUM_THREADS];
	long rc, i;
	pid_t pgid;

	for (i = 0; i < NUM_THREADS; i++) {
		rc = pthread_create(&th[i], NULL, pound, (void *)i);
		if (rc < 0)
			perror("pthread_create");
	}

	pthread_exit(NULL);
	return 0;
}
==== END pound_clock_gettime.c ====

Suggested-by: Mike Galbraith <mgalbraith@suse.de>
Signed-off-by: Giovanni Gherdovich <ggherdovich@suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1470385316-15027-2-git-send-email-ggherdovich@suse.cz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5c883fe8e440..2a906f20fba7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 #include <linux/frame.h>
+#include <linux/prefetch.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -2971,6 +2972,23 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 
+/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+	struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+	prefetch(curr);
+	prefetch(&curr->exec_start);
+}
+
 /*
  * Return accounted runtime for the task.
  * In case the task is currently running, return the runtime plus current's
@@ -3005,6 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	 * thread, breaking clock_gettime().
 	 */
 	if (task_current(rq, p) && task_on_rq_queued(p)) {
+		prefetch_curr_exec_start(p);
 		update_rq_clock(rq);
 		p->sched_class->update_curr(rq);
 	}
-- 
cgit v1.2.3


From c0c8c9fa210c9a042060435f17e40ba4a76d6d6f Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 4 Aug 2016 09:42:20 +0800
Subject: sched/deadline: Fix lock pinning warning during CPU hotplug

The following warning can be triggered by hot-unplugging the CPU
on which an active SCHED_DEADLINE task is running on:

  WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:3531 lock_release+0x690/0x6a0
  releasing a pinned lock
  Call Trace:
   dump_stack+0x99/0xd0
   __warn+0xd1/0xf0
   ? dl_task_timer+0x1a1/0x2b0
   warn_slowpath_fmt+0x4f/0x60
   ? sched_clock+0x13/0x20
   lock_release+0x690/0x6a0
   ? enqueue_pushable_dl_task+0x9b/0xa0
   ? enqueue_task_dl+0x1ca/0x480
   _raw_spin_unlock+0x1f/0x40
   dl_task_timer+0x1a1/0x2b0
   ? push_dl_task.part.31+0x190/0x190
  WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:3649 lock_unpin_lock+0x181/0x1a0
  unpinning an unpinned lock
  Call Trace:
   dump_stack+0x99/0xd0
   __warn+0xd1/0xf0
   warn_slowpath_fmt+0x4f/0x60
   lock_unpin_lock+0x181/0x1a0
   dl_task_timer+0x127/0x2b0
   ? push_dl_task.part.31+0x190/0x190

As per the comment before this code, its safe to drop the RQ lock
here, and since we (potentially) change rq, unpin and repin to avoid
the splat.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
[ Rewrote changelog. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luca Abeni <luca.abeni@unitn.it>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1470274940-17976-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217ff4..1ce8867283dc 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	 *
 	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
 	 */
-	if (unlikely(!rq->online))
+	if (unlikely(!rq->online)) {
+		lockdep_unpin_lock(&rq->lock, rf.cookie);
 		rq = dl_task_offline_migration(rq, p);
+		rf.cookie = lockdep_pin_lock(&rq->lock);
+	}
 
 	/*
 	 * Queueing this task back might have overloaded rq, check if we need
-- 
cgit v1.2.3


From 2db34e8bf9a22f4e38b29deccee57457bc0e7d74 Mon Sep 17 00:00:00 2001
From: pan xinhui <xinhui.pan@linux.vnet.ibm.com>
Date: Mon, 18 Jul 2016 17:47:39 +0800
Subject: locking/qrwlock: Fix write unlock bug on big endian systems

This patch aims to get rid of endianness in queued_write_unlock(). We
want to set  __qrwlock->wmode to NULL, however the address is not
&lock->cnts in big endian machine. That causes queued_write_unlock()
write NULL to the wrong field of __qrwlock.

So implement __qrwlock_write_byte() which returns the correct
__qrwlock->wmode address.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman.Long@hpe.com
Cc: arnd@arndb.de
Cc: boqun.feng@gmail.com
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1468835259-4486-1-git-send-email-xinhui.pan@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/qrwlock.h | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 54a8e65e18b6..7d026bf27713 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -25,7 +25,20 @@
 #include <asm-generic/qrwlock_types.h>
 
 /*
- * Writer states & reader shift and bias
+ * Writer states & reader shift and bias.
+ *
+ *       | +0 | +1 | +2 | +3 |
+ *   ----+----+----+----+----+
+ *    LE | 78 | 56 | 34 | 12 | 0x12345678
+ *   ----+----+----+----+----+
+ *       | wr |      rd      |
+ *       +----+----+----+----+
+ *
+ *   ----+----+----+----+----+
+ *    BE | 12 | 34 | 56 | 78 | 0x12345678
+ *   ----+----+----+----+----+
+ *       |      rd      | wr |
+ *       +----+----+----+----+
  */
 #define	_QW_WAITING	1		/* A writer is waiting	   */
 #define	_QW_LOCKED	0xff		/* A writer holds the lock */
@@ -133,13 +146,23 @@ static inline void queued_read_unlock(struct qrwlock *lock)
 	(void)atomic_sub_return_release(_QR_BIAS, &lock->cnts);
 }
 
+/**
+ * __qrwlock_write_byte - retrieve the write byte address of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ * Return: the write byte address of a queue rwlock
+ */
+static inline u8 *__qrwlock_write_byte(struct qrwlock *lock)
+{
+	return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN);
+}
+
 /**
  * queued_write_unlock - release write lock of a queue rwlock
  * @lock : Pointer to queue rwlock structure
  */
 static inline void queued_write_unlock(struct qrwlock *lock)
 {
-	smp_store_release((u8 *)&lock->cnts, 0);
+	smp_store_release(__qrwlock_write_byte(lock), 0);
 }
 
 /*
-- 
cgit v1.2.3


From 229ce631574761870a2ac938845fadbd07f35caa Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 14 Jul 2016 16:15:56 +0800
Subject: locking/pvqspinlock: Fix double hash race

When the lock holder vCPU is racing with the queue head:

   CPU 0 (lock holder)    CPU1 (queue head)
   ===================    =================
   spin_lock();           spin_lock();
    pv_kick_node():        pv_wait_head_or_lock():
                            if (!lp) {
                             lp = pv_hash(lock, pn);
                             xchg(&l->locked, _Q_SLOW_VAL);
                            }
                            WRITE_ONCE(pn->state, vcpu_halted);
     cmpxchg(&pn->state,
      vcpu_halted, vcpu_hashed);
     WRITE_ONCE(l->locked, _Q_SLOW_VAL);
     (void)pv_hash(lock, pn);

In this case, lock holder inserts the pv_node of queue head into the
hash table and set _Q_SLOW_VAL unnecessary. This patch avoids it by
restoring/setting vcpu_hashed state after failing adaptive locking
spinning.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <Waiman.Long@hpe.com>
Link: http://lkml.kernel.org/r/1468484156-4521-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_paravirt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 37649e69056c..8a99abf58080 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -450,7 +450,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 				goto gotlock;
 			}
 		}
-		WRITE_ONCE(pn->state, vcpu_halted);
+		WRITE_ONCE(pn->state, vcpu_hashed);
 		qstat_inc(qstat_pv_wait_head, true);
 		qstat_inc(qstat_pv_wait_again, waitcnt);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
-- 
cgit v1.2.3


From c2ace36b884de9330c4149064ae8d212d2e0d9ee Mon Sep 17 00:00:00 2001
From: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Date: Wed, 13 Jul 2016 18:23:34 +0800
Subject: locking/pvqspinlock: Fix a bug in qstat_read()

It's obviously wrong to set stat to NULL. So lets remove it.
Otherwise it is always zero when we check the latency of kick/wake.

Signed-off-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Waiman Long <Waiman.Long@hpe.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1468405414-3700-1-git-send-email-xinhui.pan@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_stat.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 22e025309845..b9d031516254 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -153,7 +153,6 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
 		 */
 		if ((counter == qstat_pv_latency_kick) ||
 		    (counter == qstat_pv_latency_wake)) {
-			stat = 0;
 			if (kicks)
 				stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
 		}
-- 
cgit v1.2.3


From 469f00231278da68062a809306df0bac95a27507 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Fri, 15 Jul 2016 11:42:43 +0200
Subject: x86, kasan, ftrace: Put APIC interrupt handlers into .irqentry.text

Dmitry Vyukov has reported unexpected KASAN stackdepot growth:

  https://github.com/google/kasan/issues/36

... which is caused by the APIC handlers not being present in .irqentry.text:

When building with CONFIG_FUNCTION_GRAPH_TRACER=y or CONFIG_KASAN=y, put the
APIC interrupt handlers into the .irqentry.text section. This is needed
because both KASAN and function graph tracer use __irqentry_text_start and
__irqentry_text_end to determine whether a function is an IRQ entry point.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: aryabinin@virtuozzo.com
Cc: kasan-dev@googlegroups.com
Cc: kcc@google.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/1468575763-144889-1-git-send-email-glider@google.com
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index b846875aeea6..9f85827db24e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -601,9 +601,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
 .endm
 #endif
 
+/* Make sure APIC interrupt handlers end up in the irqentry section: */
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+# define PUSH_SECTION_IRQENTRY	.pushsection .irqentry.text, "ax"
+# define POP_SECTION_IRQENTRY	.popsection
+#else
+# define PUSH_SECTION_IRQENTRY
+# define POP_SECTION_IRQENTRY
+#endif
+
 .macro apicinterrupt num sym do_sym
+PUSH_SECTION_IRQENTRY
 apicinterrupt3 \num \sym \do_sym
 trace_apicinterrupt \num \sym
+POP_SECTION_IRQENTRY
 .endm
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 22cc1ca3c5469cf17e149be232817b9223afa5e4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 9 Aug 2016 21:54:53 +0200
Subject: x86/hpet: Fix /dev/rtc breakage caused by RTC cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ville Syrjälä reports "The first time I run hwclock after rebooting
I get this:

 open("/dev/rtc", O_RDONLY)              = 3
 ioctl(3, PHN_SET_REGS or RTC_UIE_ON, 0) = 0
 select(4, [3], NULL, NULL, {10, 0})     = 0 (Timeout)
 ioctl(3, PHN_NOT_OH or RTC_UIE_OFF, 0)  = 0
 close(3)                                = 0

On all subsequent runs I get this:

 open("/dev/rtc", O_RDONLY)              = 3
 ioctl(3, PHN_SET_REGS or RTC_UIE_ON, 0) = -1 EINVAL (Invalid argument)
 ioctl(3, RTC_RD_TIME, 0x7ffd76b3ae70)   = -1 EINVAL (Invalid argument)
 close(3)                                = 0"

This was caused by a stupid typo in a patch that should have been
a simple rename to move around contents of a header file, but
accidentally wrote zeroes into the rtc rather than reading from
it:

  463a86304cae ("char/genrtc: x86: remove remnants of asm/rtc.h")

Reported-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Tested-by: Jarkko Nikula <jarkko.nikula@linux.intel.com>
Tested-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: rtc-linux@googlegroups.com
Fixes: 463a86304cae ("char/genrtc: x86: remove remnants of asm/rtc.h")
Link: http://lkml.kernel.org/r/20160809195528.1604312-1-arnd@arndb.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/hpet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ed16e58658a4..c6dfd801df97 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1242,7 +1242,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
 	memset(&curr_time, 0, sizeof(struct rtc_time));
 
 	if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
-		mc146818_set_time(&curr_time);
+		mc146818_get_time(&curr_time);
 
 	if (hpet_rtc_flags & RTC_UIE &&
 	    curr_time.tm_sec != hpet_prev_update_sec) {
-- 
cgit v1.2.3


From c7d2361f7524f365c1ae42f47880e3fa9efb2c2a Mon Sep 17 00:00:00 2001
From: Thomas Garnier <thgarnie@google.com>
Date: Tue, 9 Aug 2016 10:11:04 -0700
Subject: x86/mm/KASLR: Fix physical memory calculation on KASLR memory
 randomization

Initialize KASLR memory randomization after max_pfn is initialized. Also
ensure the size is rounded up. It could create problems on machines
with more than 1Tb of memory on certain random addresses.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
Cc: Aleksey Makarov <aleksey.makarov@linaro.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lv Zheng <lv.zheng@intel.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: kernel-hardening@lists.openwall.com
Fixes: 021182e52fe0 ("Enable KASLR for physical mapping memory regions")
Link: http://lkml.kernel.org/r/1470762665-88032-1-git-send-email-thgarnie@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/setup.c | 8 ++++++--
 arch/x86/mm/kaslr.c     | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 991b77986d57..95cf31c9f4ec 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -936,8 +936,6 @@ void __init setup_arch(char **cmdline_p)
 
 	x86_init.oem.arch_setup();
 
-	kernel_randomize_memory();
-
 	iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
 	setup_memory_map();
 	parse_setup_data();
@@ -1055,6 +1053,12 @@ void __init setup_arch(char **cmdline_p)
 
 	max_possible_pfn = max_pfn;
 
+	/*
+	 * Define random base addresses for memory sections after max_pfn is
+	 * defined and before each memory section base is used.
+	 */
+	kernel_randomize_memory();
+
 #ifdef CONFIG_X86_32
 	/* max_low_pfn get updated here */
 	find_low_pfn_range();
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 26dccd6c0df1..ec8654f117d8 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -97,7 +97,7 @@ void __init kernel_randomize_memory(void)
 	 * add padding if needed (especially for memory hotplug support).
 	 */
 	BUG_ON(kaslr_regions[0].base != &page_offset_base);
-	memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) +
+	memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
 		CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 
 	/* Adapt phyiscal memory region size based on available memory */
-- 
cgit v1.2.3


From fb754f958f8e46202c1efd7f66d5b3db1208117d Mon Sep 17 00:00:00 2001
From: Thomas Garnier <thgarnie@google.com>
Date: Tue, 9 Aug 2016 10:11:05 -0700
Subject: x86/mm/KASLR: Increase BRK pages for KASLR memory randomization

Default implementation expects 6 pages maximum are needed for low page
allocations. If KASLR memory randomization is enabled, the worse case
of e820 layout would require 12 pages (no large pages). It is due to the
PUD level randomization and the variable e820 memory layout.

This bug was found while doing extensive testing of KASLR memory
randomization on different type of hardware.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
Cc: Aleksey Makarov <aleksey.makarov@linaro.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lv Zheng <lv.zheng@intel.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: kernel-hardening@lists.openwall.com
Fixes: 021182e52fe0 ("Enable KASLR for physical mapping memory regions")
Link: http://lkml.kernel.org/r/1470762665-88032-2-git-send-email-thgarnie@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 620928903be3..d28a2d741f9e 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -122,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num)
 	return __va(pfn << PAGE_SHIFT);
 }
 
-/* need 3 4k for initial PMD_SIZE,  3 4k for 0-ISA_END_ADDRESS */
-#define INIT_PGT_BUF_SIZE	(6 * PAGE_SIZE)
+/*
+ * By default need 3 4k for initial PMD_SIZE,  3 4k for 0-ISA_END_ADDRESS.
+ * With KASLR memory randomization, depending on the machine e820 memory
+ * and the PUD alignment. We may need twice more pages when KASLR memory
+ * randomization is enabled.
+ */
+#ifndef CONFIG_RANDOMIZE_MEMORY
+#define INIT_PGD_PAGE_COUNT      6
+#else
+#define INIT_PGD_PAGE_COUNT      12
+#endif
+#define INIT_PGT_BUF_SIZE	(INIT_PGD_PAGE_COUNT * PAGE_SIZE)
 RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
 void  __init early_alloc_pgt_buf(void)
 {
-- 
cgit v1.2.3


From 164af597ce945751e2dcd53d0a86e84203a6d117 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 9 Aug 2016 22:43:46 +1000
Subject: powerpc/Makefile: Use cflags-y/aflags-y for setting endian options

When we introduced the little endian support, we added the endian flags
to CC directly using override. I don't know the history of why we did
that, I suspect no one does.

Although this mostly works, it has one bug, which is that CROSS32CC
doesn't get -mbig-endian. That means when the compiler is little endian
by default and the user is building big endian, vdso32 is incorrectly
compiled as little endian and the kernel fails to build.

Instead we can add the endian flags to cflags-y/aflags-y, and then
append those to KBUILD_CFLAGS/KBUILD_AFLAGS.

This has the advantage of being 1) less ugly, 2) the documented way of
adding flags in the arch Makefile and 3) it fixes building vdso32 with a
LE toolchain.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Makefile | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index ca254546cd05..1934707bf321 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -66,29 +66,28 @@ endif
 UTS_MACHINE := $(OLDARCH)
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
-override CC	+= -mlittle-endian
-ifneq ($(cc-name),clang)
-override CC	+= -mno-strict-align
-endif
-override AS	+= -mlittle-endian
 override LD	+= -EL
-override CROSS32CC += -mlittle-endian
 override CROSS32AS += -mlittle-endian
 LDEMULATION	:= lppc
 GNUTARGET	:= powerpcle
 MULTIPLEWORD	:= -mno-multiple
 KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-save-toc-indirect)
 else
-ifeq ($(call cc-option-yn,-mbig-endian),y)
-override CC	+= -mbig-endian
-override AS	+= -mbig-endian
-endif
 override LD	+= -EB
 LDEMULATION	:= ppc
 GNUTARGET	:= powerpc
 MULTIPLEWORD	:= -mmultiple
 endif
 
+cflags-$(CONFIG_CPU_BIG_ENDIAN)		+= $(call cc-option,-mbig-endian)
+cflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mlittle-endian
+ifneq ($(cc-name),clang)
+  cflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mno-strict-align
+endif
+
+aflags-$(CONFIG_CPU_BIG_ENDIAN)		+= $(call cc-option,-mbig-endian)
+aflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mlittle-endian
+
 ifeq ($(HAS_BIARCH),y)
 override AS	+= -a$(CONFIG_WORD_SIZE)
 override LD	+= -m elf$(CONFIG_WORD_SIZE)$(LDEMULATION)
@@ -232,6 +231,9 @@ cpu-as-$(CONFIG_E200)		+= -Wa,-me200
 KBUILD_AFLAGS += $(cpu-as-y)
 KBUILD_CFLAGS += $(cpu-as-y)
 
+KBUILD_AFLAGS += $(aflags-y)
+KBUILD_CFLAGS += $(cflags-y)
+
 head-y				:= arch/powerpc/kernel/head_$(CONFIG_WORD_SIZE).o
 head-$(CONFIG_8xx)		:= arch/powerpc/kernel/head_8xx.o
 head-$(CONFIG_40x)		:= arch/powerpc/kernel/head_40x.o
-- 
cgit v1.2.3


From b9a4a0d02c5b8d9a1397c11d741d2a1a56381178 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 9 Aug 2016 22:17:29 +1000
Subject: powerpc/vdso: Fix build rules to rebuild vdsos correctly

When using if_changed, we need to add FORCE as a dependency (see
Documentation/kbuild/makefiles.txt) otherwise we don't get command line
change checking amongst other things. This has resulted in vdsos not
being rebuilt when switching between big and little endian.

The vdso64/32ld commands have to be changed around to avoid pulling
FORCE into the linker command line (code copied from x86).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/vdso32/Makefile | 6 +++---
 arch/powerpc/kernel/vdso64/Makefile | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index cbabd143acae..78a7449bf489 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -30,7 +30,7 @@ CPPFLAGS_vdso32.lds += -P -C -Upowerpc
 $(obj)/vdso32_wrapper.o : $(obj)/vdso32.so
 
 # link rule for the .so file, .lds has to be first
-$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32)
+$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE
 	$(call if_changed,vdso32ld)
 
 # strip rule for the .so file
@@ -39,12 +39,12 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
 	$(call if_changed,objcopy)
 
 # assembly rules for the .S files
-$(obj-vdso32): %.o: %.S
+$(obj-vdso32): %.o: %.S FORCE
 	$(call if_changed_dep,vdso32as)
 
 # actual build commands
 quiet_cmd_vdso32ld = VDSO32L $@
-      cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@
+      cmd_vdso32ld = $(CROSS32CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
 quiet_cmd_vdso32as = VDSO32A $@
       cmd_vdso32as = $(CROSS32CC) $(a_flags) -c -o $@ $<
 
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
index c710802b8fb6..366ae09b14c1 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -23,7 +23,7 @@ CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
 $(obj)/vdso64_wrapper.o : $(obj)/vdso64.so
 
 # link rule for the .so file, .lds has to be first
-$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64)
+$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) FORCE
 	$(call if_changed,vdso64ld)
 
 # strip rule for the .so file
@@ -32,12 +32,12 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
 	$(call if_changed,objcopy)
 
 # assembly rules for the .S files
-$(obj-vdso64): %.o: %.S
+$(obj-vdso64): %.o: %.S FORCE
 	$(call if_changed_dep,vdso64as)
 
 # actual build commands
 quiet_cmd_vdso64ld = VDSO64L $@
-      cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@
+      cmd_vdso64ld = $(CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
 quiet_cmd_vdso64as = VDSO64A $@
       cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $<
 
-- 
cgit v1.2.3


From a52ff34e5ec61749c62c6618b76a9d6dbecee450 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Thu, 4 Aug 2016 22:38:36 +0200
Subject: ALSA: hda - Manage power well properly for resume

For SKL and later Intel chips, we control the power well per codec
basis via link_power callback since the commit [03b135cebc47: ALSA:
hda - remove dependency on i915 power well for SKL].
However, there are a few exceptional cases where the gfx registers are
accessed from the audio driver: namely the wakeup override bit
toggling at (both system and runtime) resume.  This seems causing a
kernel warning when accessed during the power well down (and likely
resulting in the bogus register accesses).

This patch puts the proper power up / down sequence around the resume
code so that the wakeup bit is fiddled properly while the power is
up.  (The other callback, sync_audio_rate, is used only in the PCM
callback, so it's guaranteed in the power-on.)

Also, by this proper power up/down, the instantaneous flip of wakeup
bit in the resume callback that was introduced by the commit
[033ea349a7cd: ALSA: hda - Fix Skylake codec timeout] becomes
superfluous, as snd_hdac_display_power() already does it.  So we can
clean it up together.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96214
Fixes: 03b135cebc47 ('ALSA: hda - remove dependency on i915 power well for SKL')
Cc: <stable@vger.kernel.org> # v4.2+
Tested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/hda_intel.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 89dacf9b4e6c..160c7f713722 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -906,20 +906,23 @@ static int azx_resume(struct device *dev)
 	struct snd_card *card = dev_get_drvdata(dev);
 	struct azx *chip;
 	struct hda_intel *hda;
+	struct hdac_bus *bus;
 
 	if (!card)
 		return 0;
 
 	chip = card->private_data;
 	hda = container_of(chip, struct hda_intel, chip);
+	bus = azx_bus(chip);
 	if (chip->disabled || hda->init_failed || !chip->running)
 		return 0;
 
-	if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL
-		&& hda->need_i915_power) {
-		snd_hdac_display_power(azx_bus(chip), true);
-		snd_hdac_i915_set_bclk(azx_bus(chip));
+	if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) {
+		snd_hdac_display_power(bus, true);
+		if (hda->need_i915_power)
+			snd_hdac_i915_set_bclk(bus);
 	}
+
 	if (chip->msi)
 		if (pci_enable_msi(pci) < 0)
 			chip->msi = 0;
@@ -929,6 +932,11 @@ static int azx_resume(struct device *dev)
 
 	hda_intel_init_chip(chip, true);
 
+	/* power down again for link-controlled chips */
+	if ((chip->driver_caps & AZX_DCAPS_I915_POWERWELL) &&
+	    !hda->need_i915_power)
+		snd_hdac_display_power(bus, false);
+
 	snd_power_change_state(card, SNDRV_CTL_POWER_D0);
 
 	trace_azx_resume(chip);
@@ -1008,6 +1016,7 @@ static int azx_runtime_resume(struct device *dev)
 
 	chip = card->private_data;
 	hda = container_of(chip, struct hda_intel, chip);
+	bus = azx_bus(chip);
 	if (chip->disabled || hda->init_failed)
 		return 0;
 
@@ -1015,15 +1024,9 @@ static int azx_runtime_resume(struct device *dev)
 		return 0;
 
 	if (chip->driver_caps & AZX_DCAPS_I915_POWERWELL) {
-		bus = azx_bus(chip);
-		if (hda->need_i915_power) {
-			snd_hdac_display_power(bus, true);
+		snd_hdac_display_power(bus, true);
+		if (hda->need_i915_power)
 			snd_hdac_i915_set_bclk(bus);
-		} else {
-			/* toggle codec wakeup bit for STATESTS read */
-			snd_hdac_set_codec_wakeup(bus, true);
-			snd_hdac_set_codec_wakeup(bus, false);
-		}
 	}
 
 	/* Read STATESTS before controller reset */
@@ -1043,6 +1046,11 @@ static int azx_runtime_resume(struct device *dev)
 	azx_writew(chip, WAKEEN, azx_readw(chip, WAKEEN) &
 			~STATESTS_INT_MASK);
 
+	/* power down again for link-controlled chips */
+	if ((chip->driver_caps & AZX_DCAPS_I915_POWERWELL) &&
+	    !hda->need_i915_power)
+		snd_hdac_display_power(bus, false);
+
 	trace_azx_runtime_resume(chip);
 	return 0;
 }
-- 
cgit v1.2.3


From ca49e64f0cb1368fc666a53b16b45d4505763d9c Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 29 Jul 2016 20:48:09 +1000
Subject: selftests/powerpc: Specify we expect to build with std=gnu99

We have some tests that assume we're using std=gnu99, which is fine on
most compilers, but some old compilers use a different default.

So make it explicit that we want to use std=gnu99.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 tools/testing/selftests/powerpc/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index 3c40c9d0e6c7..1cc6d64c39b7 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -8,7 +8,7 @@ ifeq ($(ARCH),powerpc)
 
 GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown")
 
-CFLAGS := -Wall -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR) $(CFLAGS)
+CFLAGS := -std=gnu99 -Wall -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR) $(CFLAGS)
 
 export CFLAGS
 
-- 
cgit v1.2.3


From 5cf0791da5c162ebc14b01eb01631cfa7ed4fa6e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 5 Aug 2016 15:37:39 +0200
Subject: x86/mm: Disable preemption during CR3 read+write
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's a subtle preemption race on UP kernels:

Usually current->mm (and therefore mm->pgd) stays the same during the
lifetime of a task so it does not matter if a task gets preempted during
the read and write of the CR3.

But then, there is this scenario on x86-UP:

TaskA is in do_exit() and exit_mm() sets current->mm = NULL followed by:

 -> mmput()
 -> exit_mmap()
 -> tlb_finish_mmu()
 -> tlb_flush_mmu()
 -> tlb_flush_mmu_tlbonly()
 -> tlb_flush()
 -> flush_tlb_mm_range()
 -> __flush_tlb_up()
 -> __flush_tlb()
 ->  __native_flush_tlb()

At this point current->mm is NULL but current->active_mm still points to
the "old" mm.

Let's preempt taskA _after_ native_read_cr3() by taskB. TaskB has its
own mm so CR3 has changed.

Now preempt back to taskA. TaskA has no ->mm set so it borrows taskB's
mm and so CR3 remains unchanged. Once taskA gets active it continues
where it was interrupted and that means it writes its old CR3 value
back. Everything is fine because userland won't need its memory
anymore.

Now the fun part:

Let's preempt taskA one more time and get back to taskB. This
time switch_mm() won't do a thing because oldmm (->active_mm)
is the same as mm (as per context_switch()). So we remain
with a bad CR3 / PGD and return to userland.

The next thing that happens is handle_mm_fault() with an address for
the execution of its code in userland. handle_mm_fault() realizes that
it has a PTE with proper rights so it returns doing nothing. But the
CPU looks at the wrong PGD and insists that something is wrong and
faults again. And again. And one more time…

This pagefault circle continues until the scheduler gets tired of it and
puts another task on the CPU. It gets little difficult if the task is a
RT task with a high priority. The system will either freeze or it gets
fixed by the software watchdog thread which usually runs at RT-max prio.
But waiting for the watchdog will increase the latency of the RT task
which is no good.

Fix this by disabling preemption across the critical code section.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1470404259-26290-1-git-send-email-bigeasy@linutronix.de
[ Prettified the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/tlbflush.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4e5be94e079a..6fa85944af83 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -135,7 +135,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 
 static inline void __native_flush_tlb(void)
 {
+	/*
+	 * If current->mm == NULL then we borrow a mm which may change during a
+	 * task switch and therefore we must not be preempted while we write CR3
+	 * back:
+	 */
+	preempt_disable();
 	native_write_cr3(native_read_cr3());
+	preempt_enable();
 }
 
 static inline void __native_flush_tlb_global_irq_disabled(void)
-- 
cgit v1.2.3


From 3e035305875cfa8a58c1ca573d0cfa6a7f201f27 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Wed, 3 Aug 2016 19:14:29 +0200
Subject: x86/entry: Clarify the RF saving/restoring situation with
 SYSCALL/SYSRET

Clarify why exactly RF cannot be restored properly by SYSRET to avoid
confusion.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160803171429.GA2590@nazgul.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/entry_64.S | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9f85827db24e..d172c619c449 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -288,11 +288,15 @@ return_from_SYSCALL_64:
 	jne	opportunistic_sysret_failed
 
 	/*
-	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
-	 * restoring TF results in a trap from userspace immediately after
-	 * SYSRET.  This would cause an infinite loop whenever #DB happens
-	 * with register state that satisfies the opportunistic SYSRET
-	 * conditions.  For example, single-stepping this user code:
+	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+	 * restore RF properly. If the slowpath sets it for whatever reason, we
+	 * need to restore it correctly.
+	 *
+	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
+	 * trap from userspace immediately after SYSRET.  This would cause an
+	 * infinite loop whenever #DB happens with register state that satisfies
+	 * the opportunistic SYSRET conditions.  For example, single-stepping
+	 * this user code:
 	 *
 	 *           movq	$stuck_here, %rcx
 	 *           pushfq
-- 
cgit v1.2.3


From 054f621fd5b1c7245710f5d3935c94ce6ae4b3b7 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 1 Aug 2016 13:40:50 -0500
Subject: x86/platform/UV: Fix problem with UV4 Socket IDs not being contiguous

The UV4 Socket IDs are not guaranteed to equate to Node values which
can cause the GAM (Global Addressable Memory) table lookups to fail.
Fix this by using an independent index into the GAM table instead of
the Socket ID to reference the base address.

Tested-by: Frank Ramsay <framsay@sgi.com>
Tested-by: John Estabrook <estabrook@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Dimitri Sivanich <sivanich@sgi.com>
Reviewed-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160801184050.048755337@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 09b59adaea3f..d9187336aa2e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -325,7 +325,7 @@ static __init void build_uv_gr_table(void)
 	struct uv_gam_range_entry *gre = uv_gre_table;
 	struct uv_gam_range_s *grt;
 	unsigned long last_limit = 0, ram_limit = 0;
-	int bytes, i, sid, lsid = -1;
+	int bytes, i, sid, lsid = -1, indx = 0, lindx = -1;
 
 	if (!gre)
 		return;
@@ -356,11 +356,12 @@ static __init void build_uv_gr_table(void)
 		}
 		sid = gre->sockid - _min_socket;
 		if (lsid < sid) {		/* new range */
-			grt = &_gr_table[sid];
-			grt->base = lsid;
+			grt = &_gr_table[indx];
+			grt->base = lindx;
 			grt->nasid = gre->nasid;
 			grt->limit = last_limit = gre->limit;
 			lsid = sid;
+			lindx = indx++;
 			continue;
 		}
 		if (lsid == sid && !ram_limit) {	/* update range */
@@ -371,7 +372,7 @@ static __init void build_uv_gr_table(void)
 		}
 		if (!ram_limit) {		/* non-contiguous ram range */
 			grt++;
-			grt->base = sid - 1;
+			grt->base = lindx;
 			grt->nasid = gre->nasid;
 			grt->limit = last_limit = gre->limit;
 			continue;
-- 
cgit v1.2.3


From e363d24c2b997c421476c6aa00547edadf678efe Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 1 Aug 2016 13:40:51 -0500
Subject: x86/platform/UV: Fix bug with iounmap() of the UV4 EFI System Table
 causing a crash

Save the uv_systab::size field before doing the iounmap()
of the struct pointer, to avoid a NULL dereference crash.

Tested-by: Frank Ramsay <framsay@sgi.com>
Tested-by: John Estabrook <estabrook@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Dimitri Sivanich <sivanich@sgi.com>
Reviewed-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160801184050.250424783@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/uv/bios_uv.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 66b2166ea4a1..4e9fd1378aec 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -199,12 +199,14 @@ void uv_bios_init(void)
 		return;
 	}
 
+	/* Starting with UV4 the UV systab size is variable */
 	if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) {
+		int size = uv_systab->size;
+
 		iounmap(uv_systab);
-		uv_systab = ioremap(efi.uv_systab, uv_systab->size);
+		uv_systab = ioremap(efi.uv_systab, size);
 		if (!uv_systab) {
-			pr_err("UV: UVsystab: ioremap(%d) failed!\n",
-				uv_systab->size);
+			pr_err("UV: UVsystab: ioremap(%d) failed!\n", size);
 			return;
 		}
 	}
-- 
cgit v1.2.3


From 22ac2bca92f2d92c6495248d65ff648182df428d Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 1 Aug 2016 13:40:52 -0500
Subject: x86/platform/UV: Fix problem with UV4 BIOS providing incorrect PXM
 values

There are some circumstances where the UV4 BIOS cannot provide the
correct Proximity Node values to associate with specific Sockets and
Physical Nodes.  The decision was made to remove these values from BIOS
and for the kernel to get these values from the standard ACPI tables.

Tested-by: Frank Ramsay <framsay@sgi.com>
Tested-by: John Estabrook <estabrook@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Dimitri Sivanich <sivanich@sgi.com>
Reviewed-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160801184050.414210079@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/uv/bios.h     |  5 +++--
 arch/x86/kernel/apic/x2apic_uv_x.c | 28 ++++++++++------------------
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index c852590254d5..e652a7cc6186 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -79,7 +79,7 @@ struct uv_gam_range_entry {
 	u16	nasid;		/* HNasid */
 	u16	sockid;		/* Socket ID, high bits of APIC ID */
 	u16	pnode;		/* Index to MMR and GRU spaces */
-	u32	pxm;		/* ACPI proximity domain number */
+	u32	unused2;
 	u32	limit;		/* PA bits 56:26 (UV_GAM_RANGE_SHFT) */
 };
 
@@ -88,7 +88,8 @@ struct uv_gam_range_entry {
 #define	UV_SYSTAB_VERSION_UV4		0x400	/* UV4 BIOS base version */
 #define	UV_SYSTAB_VERSION_UV4_1		0x401	/* + gpa_shift */
 #define	UV_SYSTAB_VERSION_UV4_2		0x402	/* + TYPE_NVRAM/WINDOW/MBOX */
-#define	UV_SYSTAB_VERSION_UV4_LATEST	UV_SYSTAB_VERSION_UV4_2
+#define	UV_SYSTAB_VERSION_UV4_3		0x403	/* - GAM Range PXM Value */
+#define	UV_SYSTAB_VERSION_UV4_LATEST	UV_SYSTAB_VERSION_UV4_3
 
 #define	UV_SYSTAB_TYPE_UNUSED		0	/* End of table (offset == 0) */
 #define	UV_SYSTAB_TYPE_GAM_PARAMS	1	/* GAM PARAM conversions */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d9187336aa2e..6aa0545879bb 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -1156,19 +1156,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
 	for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
 		if (!index) {
 			pr_info("UV: GAM Range Table...\n");
-			pr_info("UV:  # %20s %14s %5s %4s %5s %3s %2s %3s\n",
+			pr_info("UV:  # %20s %14s %5s %4s %5s %3s %2s\n",
 				"Range", "", "Size", "Type", "NASID",
-				"SID", "PN", "PXM");
+				"SID", "PN");
 		}
 		pr_info(
-		"UV: %2d: 0x%014lx-0x%014lx %5luG %3d   %04x  %02x %02x %3d\n",
+		"UV: %2d: 0x%014lx-0x%014lx %5luG %3d   %04x  %02x %02x\n",
 			index++,
 			(unsigned long)lgre << UV_GAM_RANGE_SHFT,
 			(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
 			((unsigned long)(gre->limit - lgre)) >>
 				(30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */
-			gre->type, gre->nasid, gre->sockid,
-			gre->pnode, gre->pxm);
+			gre->type, gre->nasid, gre->sockid, gre->pnode);
 
 		lgre = gre->limit;
 		if (sock_min > gre->sockid)
@@ -1287,7 +1286,7 @@ static void __init build_socket_tables(void)
 		_pnode_to_socket[i] = SOCK_EMPTY;
 
 	/* fill in pnode/node/addr conversion list values */
-	pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n");
+	pr_info("UV: GAM Building socket/pnode conversion tables\n");
 	for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
 		if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
 			continue;
@@ -1295,20 +1294,18 @@ static void __init build_socket_tables(void)
 		if (_socket_to_pnode[i] != SOCK_EMPTY)
 			continue;	/* duplicate */
 		_socket_to_pnode[i] = gre->pnode;
-		_socket_to_node[i] = gre->pxm;
 
 		i = gre->pnode - minpnode;
 		_pnode_to_socket[i] = gre->sockid;
 
 		pr_info(
-		"UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n",
+		"UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
 			gre->sockid, gre->type, gre->nasid,
 			_socket_to_pnode[gre->sockid - minsock],
-			_socket_to_node[gre->sockid - minsock],
 			_pnode_to_socket[gre->pnode - minpnode]);
 	}
 
-	/* check socket -> node values */
+	/* Set socket -> node values */
 	lnid = -1;
 	for_each_present_cpu(cpu) {
 		int nid = cpu_to_node(cpu);
@@ -1319,14 +1316,9 @@ static void __init build_socket_tables(void)
 		lnid = nid;
 		apicid = per_cpu(x86_cpu_to_apicid, cpu);
 		sockid = apicid >> uv_cpuid.socketid_shift;
-		i = sockid - minsock;
-
-		if (nid != _socket_to_node[i]) {
-			pr_warn(
-			"UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n",
-				i, sockid, gre->type, _socket_to_node[i], nid);
-			_socket_to_node[i] = nid;
-		}
+		_socket_to_node[sockid - minsock] = nid;
+		pr_info("UV: sid:%02x: apicid:%04x node:%2d\n",
+			sockid, apicid, nid);
 	}
 
 	/* Setup physical blade to pnode translation from GAM Range Table */
-- 
cgit v1.2.3


From 5a52e8f822374bebc702bb2688ed8b5515bbb55b Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 1 Aug 2016 13:40:53 -0500
Subject: x86/platform/UV: Fix kernel panic running RHEL kdump kernel on UV
 systems

The latest UV kernel support panics when RHEL7 kexec's the kdump kernel
to make a dumpfile.  This patch fixes the problem by turning off all UV
support if NUMA is off.

Tested-by: Frank Ramsay <framsay@sgi.com>
Tested-by: John Estabrook <estabrook@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Dimitri Sivanich <sivanich@sgi.com>
Reviewed-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160801184050.577755634@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 6aa0545879bb..cb0673c1e940 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	if (strncmp(oem_id, "SGI", 3) != 0)
 		return 0;
 
+	if (numa_off) {
+		pr_err("UV: NUMA is off, disabling UV support\n");
+		return 0;
+	}
+
 	/* Setup early hub type field in uv_hub_info for Node 0 */
 	uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
 
-- 
cgit v1.2.3


From 5e44258d168b2bdee51d9e2e1f1f4726ff9775cd Mon Sep 17 00:00:00 2001
From: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Date: Sun, 31 Jul 2016 23:24:50 -0400
Subject: x86/build: Reduce the W=1 warnings noise when compiling x86 syscall
 tables

Building an X86_64 kernel with W=1 throws a total of 9,948 lines of warnings of
this form for both 32-bit and 64-bit syscall tables. Given that the entire rest
of the build for my config only generates 8,375 lines of output, this is a big
reduction in the warnings generated.

The warnings follow this pattern:

  ./arch/x86/include/generated/asm/syscalls_32.h:885:21: warning: initialized field overwritten [-Woverride-init]
   __SYSCALL_I386(379, compat_sys_pwritev2, )
                     ^
  arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386'
   #define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
                                              ^~~
  ./arch/x86/include/generated/asm/syscalls_32.h:885:21: note: (near initialization for 'ia32_sys_call_table[379]')
   __SYSCALL_I386(379, compat_sys_pwritev2, )
                     ^
  arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386'
   #define __SYSCALL_I386(nr, sym, qual) [nr] = sym,

Since we intentionally build the syscall tables this way, ignore that one
warning in the two files.

Signed-off-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/7464.1470021890@turing-police.cc.vt.edu
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/entry/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index fe91c25092da..77f28ce9c646 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -5,6 +5,8 @@
 OBJECT_FILES_NON_STANDARD_entry_$(BITS).o   := y
 OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
 
+CFLAGS_syscall_64.o		+= -Wno-override-init
+CFLAGS_syscall_32.o		+= -Wno-override-init
 obj-y				:= entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
 obj-y				+= common.o
 
-- 
cgit v1.2.3


From b79daf85899215d5ede3641806db2e2a77b776b4 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 27 Jul 2016 16:20:40 -0700
Subject: x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE
 buffer manipulation

The Memory Protection Keys "rights register" (PKRU) is
XSAVE-managed, and is saved/restored along with the FPU state.

When kernel code accesses FPU regsisters, it does a delicate
dance with preempt.  Otherwise, the context switching code can
get confused as to whether the most up-to-date state is in the
registers themselves or in the XSAVE buffer.

But, PKRU is not a normal FPU register.  Using it does not
generate the normal device-not-available (#NM) exceptions which
means we can not manage it lazily, and the kernel completley
disallows using lazy mode when it is enabled.

The dance with preempt *only* occurs when managing the FPU
lazily.  Since we never manage PKRU lazily, we do not have to do
the dance with preempt; we can access it directly.  Doing it
this way saves a ton of complicated code (and is faster too).

Further, the XSAVES reenabling failed to patch a bit of code
in fpu__xfeature_set_state() the checked for compacted buffers.
That check caused fpu__xfeature_set_state() to silently refuse to
work when the kernel is using compacted XSAVE buffers.  This
broke execute-only and future pkey_mprotect() support when using
compact XSAVE buffers.

But, removing fpu__xfeature_set_state() gets rid of this issue,
in addition to the nice cleanup and speedup.

This fixes the same thing as a fix that Sai posted:

  https://lkml.org/lkml/2016/7/25/637

The fix that he posted is a much more obviously correct, but I
think we should just do this instead.

Reported-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-Cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20160727232040.7D060DAD@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/fpu/xstate.c | 138 ++++++-------------------------------------
 1 file changed, 17 insertions(+), 121 deletions(-)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 680049aa4593..01567aa87503 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -866,105 +866,17 @@ const void *get_xsave_field_ptr(int xsave_state)
 	return get_xsave_addr(&fpu->state.xsave, xsave_state);
 }
 
-
-/*
- * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
- * to take out of its "init state".  This will ensure that an
- * XRSTOR actually restores the state.
- */
-static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
-		int xstate_feature_mask)
-{
-	xsave->header.xfeatures |= xstate_feature_mask;
-}
-
-/*
- * This function is safe to call whether the FPU is in use or not.
- *
- * Note that this only works on the current task.
- *
- * Inputs:
- *	@xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
- *	XFEATURE_MASK_SSE, etc...)
- *	@xsave_state_ptr: a pointer to a copy of the state that you would
- *	like written in to the current task's FPU xsave state.  This pointer
- *	must not be located in the current tasks's xsave area.
- * Output:
- *	address of the state in the xsave area or NULL if the state
- *	is not present or is in its 'init state'.
- */
-static void fpu__xfeature_set_state(int xstate_feature_mask,
-		void *xstate_feature_src, size_t len)
-{
-	struct xregs_state *xsave = &current->thread.fpu.state.xsave;
-	struct fpu *fpu = &current->thread.fpu;
-	void *dst;
-
-	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
-		WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
-		return;
-	}
-
-	/*
-	 * Tell the FPU code that we need the FPU state to be in
-	 * 'fpu' (not in the registers), and that we need it to
-	 * be stable while we write to it.
-	 */
-	fpu__current_fpstate_write_begin();
-
-	/*
-	 * This method *WILL* *NOT* work for compact-format
-	 * buffers.  If the 'xstate_feature_mask' is unset in
-	 * xcomp_bv then we may need to move other feature state
-	 * "up" in the buffer.
-	 */
-	if (xsave->header.xcomp_bv & xstate_feature_mask) {
-		WARN_ON_ONCE(1);
-		goto out;
-	}
-
-	/* find the location in the xsave buffer of the desired state */
-	dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);
-
-	/*
-	 * Make sure that the pointer being passed in did not
-	 * come from the xsave buffer itself.
-	 */
-	WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");
-
-	/* put the caller-provided data in the location */
-	memcpy(dst, xstate_feature_src, len);
-
-	/*
-	 * Mark the xfeature so that the CPU knows there is state
-	 * in the buffer now.
-	 */
-	fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
-out:
-	/*
-	 * We are done writing to the 'fpu'.  Reenable preeption
-	 * and (possibly) move the fpstate back in to the fpregs.
-	 */
-	fpu__current_fpstate_write_end();
-}
-
 #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
 #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
 
 /*
- * This will go out and modify the XSAVE buffer so that PKRU is
- * set to a particular state for access to 'pkey'.
- *
- * PKRU state does affect kernel access to user memory.  We do
- * not modfiy PKRU *itself* here, only the XSAVE state that will
- * be restored in to PKRU when we return back to userspace.
+ * This will go out and modify PKRU register to set the access
+ * rights for @pkey to @init_val.
  */
 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val)
 {
-	struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
-	struct pkru_state *old_pkru_state;
-	struct pkru_state new_pkru_state;
+	u32 old_pkru;
 	int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
 	u32 new_pkru_bits = 0;
 
@@ -974,6 +886,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 	 */
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
 		return -EINVAL;
+	/*
+	 * For most XSAVE components, this would be an arduous task:
+	 * brining fpstate up to date with fpregs, updating fpstate,
+	 * then re-populating fpregs.  But, for components that are
+	 * never lazily managed, we can just access the fpregs
+	 * directly.  PKRU is never managed lazily, so we can just
+	 * manipulate it directly.  Make sure it stays that way.
+	 */
+	WARN_ON_ONCE(!use_eager_fpu());
 
 	/* Set the bits we need in PKRU:  */
 	if (init_val & PKEY_DISABLE_ACCESS)
@@ -984,37 +905,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 	/* Shift the bits in to the correct place in PKRU for pkey: */
 	new_pkru_bits <<= pkey_shift;
 
-	/* Locate old copy of the state in the xsave buffer: */
-	old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
-
-	/*
-	 * When state is not in the buffer, it is in the init
-	 * state, set it manually.  Otherwise, copy out the old
-	 * state.
-	 */
-	if (!old_pkru_state)
-		new_pkru_state.pkru = 0;
-	else
-		new_pkru_state.pkru = old_pkru_state->pkru;
-
-	/* Mask off any old bits in place: */
-	new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
-
-	/* Set the newly-requested bits: */
-	new_pkru_state.pkru |= new_pkru_bits;
-
-	/*
-	 * We could theoretically live without zeroing pkru.pad.
-	 * The current XSAVE feature state definition says that
-	 * only bytes 0->3 are used.  But we do not want to
-	 * chance leaking kernel stack out to userspace in case a
-	 * memcpy() of the whole xsave buffer was done.
-	 *
-	 * They're in the same cacheline anyway.
-	 */
-	new_pkru_state.pad = 0;
+	/* Get old PKRU and mask off any old bits in place: */
+	old_pkru = read_pkru();
+	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
 
-	fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state));
+	/* Write old part along with new part: */
+	write_pkru(old_pkru | new_pkru_bits);
 
 	return 0;
 }
-- 
cgit v1.2.3


From aec742e8e1cf05114530bdef8051238b6dbb4052 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 10 Aug 2016 08:45:11 -0700
Subject: get_maintainer: Don't check if STDIN exists in a VCS repository

If get_maintainer is not given any filename arguments on the command line,
the standard input is read for a patch.

But checking if a VCS has a file named &STDIN is not a good idea and fails.

Verify the nominal input file is not &STDIN before checking the VCS.

Fixes: 4cad35a7ca69 ("get_maintainer.pl: reduce need for command-line option -f")
Reported-by: Christopher Covington <cov@codeaurora.org>
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/get_maintainer.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index 122fcdaf42c8..49a00d54b835 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -432,7 +432,7 @@ foreach my $file (@ARGV) {
 	    die "$P: file '${file}' not found\n";
 	}
     }
-    if ($from_filename || vcs_file_exists($file)) {
+    if ($from_filename || ($file ne "&STDIN" && vcs_file_exists($file))) {
 	$file =~ s/^\Q${cur_path}\E//;	#strip any absolute path
 	$file =~ s/^\Q${lk_path}\E//;	#or the path to the lk tree
 	push(@files, $file);
-- 
cgit v1.2.3


From 7de249964f5578e67b99699c5f0b405738d820a2 Mon Sep 17 00:00:00 2001
From: Dave Weinstein <olorin@google.com>
Date: Thu, 28 Jul 2016 11:55:41 -0700
Subject: arm: oabi compat: add missing access checks

Add access checks to sys_oabi_epoll_wait() and sys_oabi_semtimedop().
This fixes CVE-2016-3857, a local privilege escalation under
CONFIG_OABI_COMPAT.

Cc: stable@vger.kernel.org
Reported-by: Chiachih Wu <wuchiachih@gmail.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Dave Weinstein <olorin@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/sys_oabi-compat.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index 087acb569b63..5f221acd21ae 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -279,8 +279,12 @@ asmlinkage long sys_oabi_epoll_wait(int epfd,
 	mm_segment_t fs;
 	long ret, err, i;
 
-	if (maxevents <= 0 || maxevents > (INT_MAX/sizeof(struct epoll_event)))
+	if (maxevents <= 0 ||
+			maxevents > (INT_MAX/sizeof(*kbuf)) ||
+			maxevents > (INT_MAX/sizeof(*events)))
 		return -EINVAL;
+	if (!access_ok(VERIFY_WRITE, events, sizeof(*events) * maxevents))
+		return -EFAULT;
 	kbuf = kmalloc(sizeof(*kbuf) * maxevents, GFP_KERNEL);
 	if (!kbuf)
 		return -ENOMEM;
@@ -317,6 +321,8 @@ asmlinkage long sys_oabi_semtimedop(int semid,
 
 	if (nsops < 1 || nsops > SEMOPM)
 		return -EINVAL;
+	if (!access_ok(VERIFY_READ, tsops, sizeof(*tsops) * nsops))
+		return -EFAULT;
 	sops = kmalloc(sizeof(*sops) * nsops, GFP_KERNEL);
 	if (!sops)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 549fba3a615a5dc6219bb41b3ee3951173fcf1e6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 8 Jun 2016 16:21:19 +0200
Subject: ARM: don't include removed directories

Three platforms used to have header files in include/mach that
are now all gone, but the removed directories are still being
included, which leads to -Wmissing-include-dirs warnings.

This removes the extra -I flags.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/mach-mvebu/Makefile    | 3 +--
 arch/arm/mach-realview/Makefile | 3 +--
 arch/arm/mach-s5pv210/Makefile  | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/arm/mach-mvebu/Makefile b/arch/arm/mach-mvebu/Makefile
index e53c6cfcab51..6c6497e80a7b 100644
--- a/arch/arm/mach-mvebu/Makefile
+++ b/arch/arm/mach-mvebu/Makefile
@@ -1,5 +1,4 @@
-ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \
-	-I$(srctree)/arch/arm/plat-orion/include
+ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/arch/arm/plat-orion/include
 
 AFLAGS_coherency_ll.o		:= -Wa,-march=armv7-a
 CFLAGS_pmsu.o			:= -march=armv7-a
diff --git a/arch/arm/mach-realview/Makefile b/arch/arm/mach-realview/Makefile
index dae8d86ef4cc..404882130956 100644
--- a/arch/arm/mach-realview/Makefile
+++ b/arch/arm/mach-realview/Makefile
@@ -1,8 +1,7 @@
 #
 # Makefile for the linux kernel.
 #
-ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \
-	-I$(srctree)/arch/arm/plat-versatile/include
+ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/arch/arm/plat-versatile/include
 
 obj-y					:= core.o
 obj-$(CONFIG_REALVIEW_DT)		+= realview-dt.o
diff --git a/arch/arm/mach-s5pv210/Makefile b/arch/arm/mach-s5pv210/Makefile
index 72b9e9671507..fa7fb716e388 100644
--- a/arch/arm/mach-s5pv210/Makefile
+++ b/arch/arm/mach-s5pv210/Makefile
@@ -5,7 +5,7 @@
 #
 # Licensed under GPLv2
 
-ccflags-$(CONFIG_ARCH_MULTIPLATFORM) += -I$(srctree)/$(src)/include -I$(srctree)/arch/arm/plat-samsung/include
+ccflags-$(CONFIG_ARCH_MULTIPLATFORM) += -I$(srctree)/arch/arm/plat-samsung/include
 
 # Core
 
-- 
cgit v1.2.3


From de8a06f674a6f9d781e5087e9217319c7b831821 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 9 Jun 2016 09:50:28 +0200
Subject: ARM: hide mach-*/ include for ARM_SINGLE_ARMV7M

The machine specific header files are exported for traditional
platforms, but not for the ones that use ARCH_MULTIPLATFORM, as
they could conflict with one another.

In case of ARM_SINGLE_ARMV7M, we end up also exporting them,
but that appears to be a mistake, and we should treat it the
same way as ARCH_MULTIPLATFORM here.

'make W=1' warns about this because it passes -Wmissing-includes
to gcc and the directories are not actually present.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 56ea5c60b318..61f6ccc19cfa 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -260,12 +260,14 @@ machdirs := $(patsubst %,arch/arm/mach-%/,$(machine-y))
 platdirs := $(patsubst %,arch/arm/plat-%/,$(sort $(plat-y)))
 
 ifneq ($(CONFIG_ARCH_MULTIPLATFORM),y)
+ifneq ($(CONFIG_ARM_SINGLE_ARMV7M),y)
 ifeq ($(KBUILD_SRC),)
 KBUILD_CPPFLAGS += $(patsubst %,-I%include,$(machdirs) $(platdirs))
 else
 KBUILD_CPPFLAGS += $(patsubst %,-I$(srctree)/%include,$(machdirs) $(platdirs))
 endif
 endif
+endif
 
 export	TEXT_OFFSET GZFLAGS MMUEXT
 
-- 
cgit v1.2.3


From af9d238c6a1d5c131104bb011cc4c55a6207b9a2 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 6 Jul 2016 14:16:24 +0200
Subject: ARM: oxnas: select reset controller framework

For unknown reasons, we have to enable three symbols for a platform
to use a reset controller driver, otherwise we get a Kconfig
warning:

warning: (MACH_OX810SE) selects RESET_OXNAS which has unmet direct dependencies (RESET_CONTROLLER)

This selects the other two symbols for oxnas.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Neil Armstrong <narmstrong@baylibre.com>
---
 arch/arm/mach-oxnas/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/mach-oxnas/Kconfig b/arch/arm/mach-oxnas/Kconfig
index 567496bd250a..29100beb2e7f 100644
--- a/arch/arm/mach-oxnas/Kconfig
+++ b/arch/arm/mach-oxnas/Kconfig
@@ -11,11 +11,13 @@ if ARCH_OXNAS
 
 config MACH_OX810SE
 	bool "Support OX810SE Based Products"
+	select ARCH_HAS_RESET_CONTROLLER
 	select COMMON_CLK_OXNAS
 	select CPU_ARM926T
 	select MFD_SYSCON
 	select OXNAS_RPS_TIMER
 	select PINCTRL_OXNAS
+	select RESET_CONTROLLER
 	select RESET_OXNAS
 	select VERSATILE_FPGA_IRQ
 	help
-- 
cgit v1.2.3


From 83e484fcbe44e91fe8efb41b33ed3434806eeaac Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 10 Aug 2016 11:38:24 +0200
Subject: ARM: dts: add syscon compatible string for CP syscon

This syscon needs to be looked up by flash protection, CLCD
display output settings and other consumers.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/integratorcp.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/integratorcp.dts b/arch/arm/boot/dts/integratorcp.dts
index d43f15b4f79a..79430fbfec3b 100644
--- a/arch/arm/boot/dts/integratorcp.dts
+++ b/arch/arm/boot/dts/integratorcp.dts
@@ -94,7 +94,7 @@
 	};
 
 	syscon {
-		compatible = "arm,integrator-cp-syscon";
+		compatible = "arm,integrator-cp-syscon", "syscon";
 		reg = <0xcb000000 0x100>;
 	};
 
-- 
cgit v1.2.3


From f2b54191f7bef2925db39eceb84e7e19ff6d8f56 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 10 Aug 2016 11:38:12 +0200
Subject: ARM: dts: add syscon compatible string for AP syscon

This syscon needs to be looked up by clocks, flash protection
and other consumers.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/integratorap.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/boot/dts/integratorap.dts b/arch/arm/boot/dts/integratorap.dts
index cf06e32ee108..4b34b54e09a1 100644
--- a/arch/arm/boot/dts/integratorap.dts
+++ b/arch/arm/boot/dts/integratorap.dts
@@ -42,7 +42,7 @@
 	};
 
 	syscon {
-		compatible = "arm,integrator-ap-syscon";
+		compatible = "arm,integrator-ap-syscon", "syscon";
 		reg = <0x11000000 0x100>;
 		interrupt-parent = <&pic>;
 		/* These are the logical module IRQs */
-- 
cgit v1.2.3


From b5c86b7496d74f6e454bcab5166efa023e1f0459 Mon Sep 17 00:00:00 2001
From: Ralf Ramsauer <ralf@ramses-pyramidenbau.de>
Date: Mon, 18 Jul 2016 11:46:48 +0200
Subject: ARM: tegra: fix erroneous address in dts

c90bb7b enabled the high speed UARTs of the Jetson TK1. Due to a merge
quirk, wrong addresses were introduced. Fix it and use the correct
addresses.

Thierry let me know, that there is another patch (b5896f67ab3c in
linux-next) in preparation which removes all the '0,' prefixes of unit
addresses on Tegra124 and is planned to go upstream in 4.8, so
this patch will get reverted then.

But for the moment, this patch is necessary to fix current misbehaviour.

Fixes: c90bb7b9b9 ("ARM: tegra: Add high speed UARTs to Jetson TK1 device tree")
Signed-off-by: Ralf Ramsauer <ralf@ramses-pyramidenbau.de>
Acked-by: Thierry Reding <thierry.reding@gmail.com>
Cc: stable@vger.kernel.org # v4.7
Cc: linux-tegra@vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/tegra124-jetson-tk1.dts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/tegra124-jetson-tk1.dts b/arch/arm/boot/dts/tegra124-jetson-tk1.dts
index e52b82449a79..6403e0de540e 100644
--- a/arch/arm/boot/dts/tegra124-jetson-tk1.dts
+++ b/arch/arm/boot/dts/tegra124-jetson-tk1.dts
@@ -1382,7 +1382,7 @@
 	 *   Pin 41: BR_UART1_TXD
 	 *   Pin 44: BR_UART1_RXD
 	 */
-	serial@70006000 {
+	serial@0,70006000 {
 		compatible = "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart";
 		status = "okay";
 	};
@@ -1394,7 +1394,7 @@
 	 *   Pin 71: UART2_CTS_L
 	 *   Pin 74: UART2_RTS_L
 	 */
-	serial@70006040 {
+	serial@0,70006040 {
 		compatible = "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart";
 		status = "okay";
 	};
-- 
cgit v1.2.3


From a20303725ec31ea0fcf498f1885b1d4245a4ee56 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 10 Aug 2016 14:02:17 +0200
Subject: ARM: dts: realview: Fix PBX-A9 cache description

Clearly QEMU is very permissive in how its PL310 model may be set up,
but the real hardware turns out to be far more particular about things
actually being correct. Fix up the DT description so that the real
thing actually boots:

- The arm,data-latency and arm,tag-latency properties need 3 cells to
  be valid, otherwise we end up retaining the default 8-cycle latencies
  which leads pretty quickly to lockup.
- The arm,dirty-latency property is only relevant to L210/L220, so get
  rid of it.
- The cache geometry override also leads to lockup and/or general
  misbehaviour. Irritatingly, the manual doesn't state the actual PL310
  configuration, but based on the boardfile code and poking registers
  from the Boot Monitor, it would seem to be 8 sets of 16KB ways.

With that, we can successfully boot to enjoy the fun of mismatched FPUs...

Cc: stable@vger.kernel.org
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/boot/dts/arm-realview-pbx-a9.dts | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/arm/boot/dts/arm-realview-pbx-a9.dts b/arch/arm/boot/dts/arm-realview-pbx-a9.dts
index db808f92dd79..90d00b407f85 100644
--- a/arch/arm/boot/dts/arm-realview-pbx-a9.dts
+++ b/arch/arm/boot/dts/arm-realview-pbx-a9.dts
@@ -70,13 +70,12 @@
 		 * associativity as these may be erroneously set
 		 * up by boot loader(s).
 		 */
-		cache-size = <1048576>; // 1MB
-		cache-sets = <4096>;
+		cache-size = <131072>; // 128KB
+		cache-sets = <512>;
 		cache-line-size = <32>;
 		arm,parity-disable;
-		arm,tag-latency = <1>;
-		arm,data-latency = <1 1>;
-		arm,dirty-latency = <1>;
+		arm,tag-latency = <1 1 1>;
+		arm,data-latency = <1 1 1>;
 	};
 
 	scu: scu@1f000000 {
-- 
cgit v1.2.3


From 5a3f75a4b2dab54b6b0d645b500583abfc27b260 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Wed, 3 Aug 2016 15:29:33 +0100
Subject: arm64: Kconfig: select ALPINE_MSI only if PCI is selected

Even when PCI is disabled, ARCH_ALPINE selects ALPINE_MSI triggerring
the following config warning:

warning: (ARCH_ALPINE) selects ALPINE_MSI which has unmet direct
		dependencies (PCI)

This patch makes selection of ALPINE_MSI conditional on PCI.

Cc: Arnd Bergmann <arnd@arndb.de>
Acked-by: Antoine Tenart <antoine.tenart@free-electrons.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm64/Kconfig.platforms | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index bb2616b16157..fd3ee3a392be 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -8,7 +8,7 @@ config ARCH_SUNXI
 
 config ARCH_ALPINE
 	bool "Annapurna Labs Alpine platform"
-	select ALPINE_MSI
+	select ALPINE_MSI if PCI
 	help
 	  This enables support for the Annapurna Labs Alpine
 	  Soc family.
-- 
cgit v1.2.3


From f9db43bc296833451cbe4b03eb5e1f633ad1f787 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Wed, 3 Aug 2016 15:29:34 +0100
Subject: arm64: Kconfig: select HISILICON_IRQ_MBIGEN only if PCI is selected

Even when PCI is disabled, ARCH_HISI selects HISILICON_IRQ_MBIGEN
triggerring the following config warning:

warning: (ARM64 && HISILICON_IRQ_MBIGEN) selects ARM_GIC_V3_ITS which
	has unmet direct dependencies (PCI && PCI_MSI)

This patch makes selection of HISILICON_IRQ_MBIGEN conditional on PCI.

Cc: Ma Jun <majun258@huawei.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm64/Kconfig.platforms | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index fd3ee3a392be..be5d824ebdba 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -66,7 +66,7 @@ config ARCH_LG1K
 config ARCH_HISI
 	bool "Hisilicon SoC Family"
 	select ARM_TIMER_SP804
-	select HISILICON_IRQ_MBIGEN
+	select HISILICON_IRQ_MBIGEN if PCI
 	help
 	  This enables support for Hisilicon ARMv8 SoC family
 
-- 
cgit v1.2.3


From d8d378fa1a0c98ecb50ca52c9bf3bc14e25aa2d2 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 10 Aug 2016 15:59:09 -0700
Subject: tools/testing/nvdimm: fix SIGTERM vs hotplug crash

The unit tests crash when hotplug races the previous probe. This race
requires that the loading of the nfit_test module be terminated with
SIGTERM, and the module to be unloaded while the ars scan is still
running.

In contrast to the normal nfit driver, the unit test calls
acpi_nfit_init() twice to simulate hotplug, whereas the nominal case
goes through the acpi_nfit_notify() event handler.  The
acpi_nfit_notify() path is careful to flush the previous region
registration before servicing the hotplug event. The unit test was
missing this guarantee.

 BUG: unable to handle kernel NULL pointer dereference at           (null)
 IP: [<ffffffff810cdce7>] pwq_activate_delayed_work+0x47/0x170
 [..]
 Call Trace:
  [<ffffffff810ce186>] pwq_dec_nr_in_flight+0x66/0xa0
  [<ffffffff810ce490>] process_one_work+0x2d0/0x680
  [<ffffffff810ce331>] ? process_one_work+0x171/0x680
  [<ffffffff810ce88e>] worker_thread+0x4e/0x480
  [<ffffffff810ce840>] ? process_one_work+0x680/0x680
  [<ffffffff810ce840>] ? process_one_work+0x680/0x680
  [<ffffffff810d5343>] kthread+0xf3/0x110
  [<ffffffff8199846f>] ret_from_fork+0x1f/0x40
  [<ffffffff810d5250>] ? kthread_create_on_node+0x230/0x230

Cc: <stable@vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 tools/testing/nvdimm/test/nfit.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 5404efa578a3..dd48f421844c 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -13,6 +13,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/platform_device.h>
 #include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
 #include <linux/libnvdimm.h>
 #include <linux/vmalloc.h>
 #include <linux/device.h>
@@ -1474,6 +1475,7 @@ static int nfit_test_probe(struct platform_device *pdev)
 	if (nfit_test->setup != nfit_test0_setup)
 		return 0;
 
+	flush_work(&acpi_desc->work);
 	nfit_test->setup_hotplug = 1;
 	nfit_test->setup(nfit_test);
 
-- 
cgit v1.2.3


From 73984137d32bb6e48646daea60ebe1457a01a061 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 10 Aug 2016 16:27:38 -0700
Subject: rapidio: dereferencing an error pointer

Original patch: https://lkml.org/lkml/2016/8/4/32

If riocm_ch_alloc() fails then we end up dereferencing the error
pointer.

The problem is that we're not unwinding in the reverse order from how we
allocate things so it gets confusing.  I've changed this around so now
"ch" is NULL when we are done with it after we call riocm_put_channel().
That way we can check if it's NULL and avoid calling riocm_put_channel()
on it twice.

I renamed err_nodev to err_put_new_ch so that it better reflects what
the goto does.

Then because we had flipping things around, it means we don't neeed to
initialize the pointers to NULL and we can remove an if statement and
pull things in an indent level.

Link: http://lkml.kernel.org/r/20160805152406.20713-1-alexandre.bounine@idt.com
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Andre van Herk <andre.van.herk@prodrive-technologies.com>
Cc: Barry Wood <barry.wood@idt.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rapidio/rio_cm.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c
index cecc15a880de..3fa17ac8df54 100644
--- a/drivers/rapidio/rio_cm.c
+++ b/drivers/rapidio/rio_cm.c
@@ -1080,8 +1080,8 @@ static int riocm_send_ack(struct rio_channel *ch)
 static struct rio_channel *riocm_ch_accept(u16 ch_id, u16 *new_ch_id,
 					   long timeout)
 {
-	struct rio_channel *ch = NULL;
-	struct rio_channel *new_ch = NULL;
+	struct rio_channel *ch;
+	struct rio_channel *new_ch;
 	struct conn_req *req;
 	struct cm_peer *peer;
 	int found = 0;
@@ -1155,6 +1155,7 @@ static struct rio_channel *riocm_ch_accept(u16 ch_id, u16 *new_ch_id,
 
 	spin_unlock_bh(&ch->lock);
 	riocm_put_channel(ch);
+	ch = NULL;
 	kfree(req);
 
 	down_read(&rdev_sem);
@@ -1172,7 +1173,7 @@ static struct rio_channel *riocm_ch_accept(u16 ch_id, u16 *new_ch_id,
 	if (!found) {
 		/* If peer device object not found, simply ignore the request */
 		err = -ENODEV;
-		goto err_nodev;
+		goto err_put_new_ch;
 	}
 
 	new_ch->rdev = peer->rdev;
@@ -1184,15 +1185,16 @@ static struct rio_channel *riocm_ch_accept(u16 ch_id, u16 *new_ch_id,
 
 	*new_ch_id = new_ch->id;
 	return new_ch;
+
+err_put_new_ch:
+	spin_lock_bh(&idr_lock);
+	idr_remove(&ch_idr, new_ch->id);
+	spin_unlock_bh(&idr_lock);
+	riocm_put_channel(new_ch);
+
 err_put:
-	riocm_put_channel(ch);
-err_nodev:
-	if (new_ch) {
-		spin_lock_bh(&idr_lock);
-		idr_remove(&ch_idr, new_ch->id);
-		spin_unlock_bh(&idr_lock);
-		riocm_put_channel(new_ch);
-	}
+	if (ch)
+		riocm_put_channel(ch);
 	*new_ch_id = 0;
 	return ERR_PTR(err);
 }
-- 
cgit v1.2.3


From a545de5ce2ef3abc4db0b9331840acf59c8f9efa Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 10 Aug 2016 16:27:41 -0700
Subject: revert "ARM: keystone: dts: add psci command definition"

Revert commit 51d5d12b8f3d ("ARM: keystone: dts: add psci command
definition"), which was inadvertently added twice.

Cc: Russell King - ARM Linux <linux@armlinux.org.uk>
Cc: Vitaly Andrianov <vitalya@ti.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/boot/dts/keystone.dtsi | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/arch/arm/boot/dts/keystone.dtsi b/arch/arm/boot/dts/keystone.dtsi
index 00cb314d5e4d..e23f46d15c80 100644
--- a/arch/arm/boot/dts/keystone.dtsi
+++ b/arch/arm/boot/dts/keystone.dtsi
@@ -70,14 +70,6 @@
 		cpu_on		= <0x84000003>;
 	};
 
-	psci {
-		compatible	= "arm,psci";
-		method		= "smc";
-		cpu_suspend	= <0x84000001>;
-		cpu_off		= <0x84000002>;
-		cpu_on		= <0x84000003>;
-	};
-
 	soc {
 		#address-cells = <1>;
 		#size-cells = <1>;
-- 
cgit v1.2.3


From 3b33719c9b741066f7d2bc6036409752f8e0478d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 10 Aug 2016 16:27:44 -0700
Subject: thp: move shmem_huge_enabled() outside of SYSFS ifdef

The newly introduced shmem_huge_enabled() function has two definitions,
but neither of them is visible if CONFIG_SYSFS is disabled, leading to a
build error:

  mm/khugepaged.o: In function `khugepaged':
  khugepaged.c:(.text.khugepaged+0x3ca): undefined reference to `shmem_huge_enabled'

This changes the #ifdef guards around the definition to match those that
are used in the header file.

Fixes: e496cf3d7821 ("thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE")
Link: http://lkml.kernel.org/r/20160809123638.1357593-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 7f7748a0f9e1..fd8b2b5741b1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3975,7 +3975,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
 
 struct kobj_attribute shmem_enabled_attr =
 	__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
+#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
 
+#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
 bool shmem_huge_enabled(struct vm_area_struct *vma)
 {
 	struct inode *inode = file_inode(vma->vm_file);
@@ -4006,7 +4008,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
 			return false;
 	}
 }
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
+#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
 
 #else /* !CONFIG_SHMEM */
 
-- 
cgit v1.2.3


From 81cbcbc2d810c0ce49fba81f864302e1afe5ff27 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Wed, 10 Aug 2016 16:27:46 -0700
Subject: mm/page_alloc.c: fix wrong initialization when
 sysctl_min_unmapped_ratio changes

Before resetting min_unmapped_pages, we need to initialize
min_unmapped_pages rather than min_slab_pages.

Fixes: a5f5f91da6 (mm: convert zone_reclaim to node_reclaim)
Link: http://lkml.kernel.org/r/1470724248-26780-1-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ee744fa3b93d..9a92718b1103 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6854,7 +6854,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 		return rc;
 
 	for_each_online_pgdat(pgdat)
-		pgdat->min_slab_pages = 0;
+		pgdat->min_unmapped_pages = 0;
 
 	for_each_zone(zone)
 		zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
-- 
cgit v1.2.3


From 6423aa8192c596848e1b23bd4193dc0924e7274d Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Wed, 10 Aug 2016 16:27:49 -0700
Subject: mm/page_alloc.c: recalculate some of node threshold when on/offline
 memory

Some of node threshold depends on number of managed pages in the node.
When memory is going on/offline, it can be changed and we need to adjust
them.

Add recalculation to appropriate places and clean-up related functions
for better maintenance.

Link: http://lkml.kernel.org/r/1470724248-26780-2-git-send-email-iamjoonsoo.kim@lge.com
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 50 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9a92718b1103..ab2c0ff8c2e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4757,6 +4757,8 @@ int local_memory_node(int node)
 }
 #endif
 
+static void setup_min_unmapped_ratio(void);
+static void setup_min_slab_ratio(void);
 #else	/* CONFIG_NUMA */
 
 static void set_zonelist_order(void)
@@ -5878,9 +5880,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 		zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
 #ifdef CONFIG_NUMA
 		zone->node = nid;
-		pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
-						/ 100;
-		pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		zone->zone_pgdat = pgdat;
@@ -6801,6 +6800,12 @@ int __meminit init_per_zone_wmark_min(void)
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();
+
+#ifdef CONFIG_NUMA
+	setup_min_unmapped_ratio();
+	setup_min_slab_ratio();
+#endif
+
 	return 0;
 }
 core_initcall(init_per_zone_wmark_min)
@@ -6842,16 +6847,10 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
 }
 
 #ifdef CONFIG_NUMA
-int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
-	void __user *buffer, size_t *length, loff_t *ppos)
+static void setup_min_unmapped_ratio(void)
 {
-	struct pglist_data *pgdat;
+	pg_data_t *pgdat;
 	struct zone *zone;
-	int rc;
-
-	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
-	if (rc)
-		return rc;
 
 	for_each_online_pgdat(pgdat)
 		pgdat->min_unmapped_pages = 0;
@@ -6859,26 +6858,47 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 	for_each_zone(zone)
 		zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
 				sysctl_min_unmapped_ratio) / 100;
-	return 0;
 }
 
-int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+
+int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
-	struct pglist_data *pgdat;
-	struct zone *zone;
 	int rc;
 
 	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
 	if (rc)
 		return rc;
 
+	setup_min_unmapped_ratio();
+
+	return 0;
+}
+
+static void setup_min_slab_ratio(void)
+{
+	pg_data_t *pgdat;
+	struct zone *zone;
+
 	for_each_online_pgdat(pgdat)
 		pgdat->min_slab_pages = 0;
 
 	for_each_zone(zone)
 		zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
 				sysctl_min_slab_ratio) / 100;
+}
+
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	setup_min_slab_ratio();
+
 	return 0;
 }
 #endif
-- 
cgit v1.2.3


From c8efc390c1e0eca195ae59a2f7cec46773620e0c Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 10 Aug 2016 16:27:52 -0700
Subject: mm, rmap: fix false positive VM_BUG() in page_add_file_rmap()

PageTransCompound() doesn't distinguish THP from from any other type of
compound pages.  This can lead to false-positive VM_BUG_ON() in
page_add_file_rmap() if called on compound page from a driver[1].

I think we can exclude such cases by checking if the page belong to a
mapping.

The VM_BUG_ON_PAGE() is downgraded to VM_WARN_ON_ONCE().  This path
should not cause any harm to non-THP page, but good to know if we step
on anything else.

[1] http://lkml.kernel.org/r/c711e067-0bff-a6cb-3c37-04dfe77d2db1@redhat.com

Link: http://lkml.kernel.org/r/20160810161345.GA67522@black.fi.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Laura Abbott <labbott@redhat.com>
Tested-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 709bc83703b1..d4f56060ba3f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1284,8 +1284,9 @@ void page_add_file_rmap(struct page *page, bool compound)
 		VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 		__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
 	} else {
-		if (PageTransCompound(page)) {
-			VM_BUG_ON_PAGE(!PageLocked(page), page);
+		if (PageTransCompound(page) && page_mapping(page)) {
+			VM_WARN_ON_ONCE(!PageLocked(page));
+
 			SetPageDoubleMap(compound_head(page));
 			if (PageMlocked(page))
 				clear_page_mlock(compound_head(page));
-- 
cgit v1.2.3


From 57dea93ac42d341e1fe902528b348ef6763fa485 Mon Sep 17 00:00:00 2001
From: Steve Capper <steve.capper@arm.com>
Date: Wed, 10 Aug 2016 16:27:55 -0700
Subject: rmap: fix compound check logic in page_remove_file_rmap

In page_remove_file_rmap(.) we have the following check:

  VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);

This is meant to check for either HugeTLB pages or THP when a compound
page is passed in.

Unfortunately, if one disables CONFIG_TRANSPARENT_HUGEPAGE, then
PageTransHuge(.) will always return false, provoking BUGs when one runs
the libhugetlbfs test suite.

This patch replaces PageTransHuge(), with PageHead() which will work for
both HugeTLB and THP.

Fixes: dd78fedde4b9 ("rmap: support file thp")
Link: http://lkml.kernel.org/r/1470838217-5889-1-git-send-email-steve.capper@arm.com
Signed-off-by: Steve Capper <steve.capper@arm.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Huang Shijie <shijie.huang@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index d4f56060ba3f..1ef36404e7b2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1304,7 +1304,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
 {
 	int i, nr = 1;
 
-	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
+	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
 	lock_page_memcg(page);
 
 	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
-- 
cgit v1.2.3


From 6039892396d845b18228935561960441900cffca Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 10 Aug 2016 16:27:58 -0700
Subject: mm/slub.c: run free_partial() outside of the
 kmem_cache_node->list_lock

With debugobjects enabled and using SLAB_DESTROY_BY_RCU, when a
kmem_cache_node is destroyed the call_rcu() may trigger a slab
allocation to fill the debug object pool (__debug_object_init:fill_pool).

Everywhere but during kmem_cache_destroy(), discard_slab() is performed
outside of the kmem_cache_node->list_lock and avoids a lockdep warning
about potential recursion:

  =============================================
  [ INFO: possible recursive locking detected ]
  4.8.0-rc1-gfxbench+ #1 Tainted: G     U
  ---------------------------------------------
  rmmod/8895 is trying to acquire lock:
   (&(&n->list_lock)->rlock){-.-...}, at: [<ffffffff811c80d7>] get_partial_node.isra.63+0x47/0x430

  but task is already holding lock:
   (&(&n->list_lock)->rlock){-.-...}, at: [<ffffffff811cbda4>] __kmem_cache_shutdown+0x54/0x320

  other info that might help us debug this:
  Possible unsafe locking scenario:
        CPU0
        ----
   lock(&(&n->list_lock)->rlock);
   lock(&(&n->list_lock)->rlock);

   *** DEADLOCK ***
   May be due to missing lock nesting notation
   5 locks held by rmmod/8895:
   #0:  (&dev->mutex){......}, at: driver_detach+0x42/0xc0
   #1:  (&dev->mutex){......}, at: driver_detach+0x50/0xc0
   #2:  (cpu_hotplug.dep_map){++++++}, at: get_online_cpus+0x2d/0x80
   #3:  (slab_mutex){+.+.+.}, at: kmem_cache_destroy+0x3c/0x220
   #4:  (&(&n->list_lock)->rlock){-.-...}, at: __kmem_cache_shutdown+0x54/0x320

  stack backtrace:
  CPU: 6 PID: 8895 Comm: rmmod Tainted: G     U          4.8.0-rc1-gfxbench+ #1
  Hardware name: Gigabyte Technology Co., Ltd. H87M-D3H/H87M-D3H, BIOS F11 08/18/2015
  Call Trace:
    __lock_acquire+0x1646/0x1ad0
    lock_acquire+0xb2/0x200
    _raw_spin_lock+0x36/0x50
    get_partial_node.isra.63+0x47/0x430
    ___slab_alloc.constprop.67+0x1a7/0x3b0
    __slab_alloc.isra.64.constprop.66+0x43/0x80
    kmem_cache_alloc+0x236/0x2d0
    __debug_object_init+0x2de/0x400
    debug_object_activate+0x109/0x1e0
    __call_rcu.constprop.63+0x32/0x2f0
    call_rcu+0x12/0x20
    discard_slab+0x3d/0x40
    __kmem_cache_shutdown+0xdb/0x320
    shutdown_cache+0x19/0x60
    kmem_cache_destroy+0x1ae/0x220
    i915_gem_load_cleanup+0x14/0x40 [i915]
    i915_driver_unload+0x151/0x180 [i915]
    i915_pci_remove+0x14/0x20 [i915]
    pci_device_remove+0x34/0xb0
    __device_release_driver+0x95/0x140
    driver_detach+0xb6/0xc0
    bus_remove_driver+0x53/0xd0
    driver_unregister+0x27/0x50
    pci_unregister_driver+0x25/0x70
    i915_exit+0x1a/0x1e2 [i915]
    SyS_delete_module+0x193/0x1f0
    entry_SYSCALL_64_fastpath+0x1c/0xac

Fixes: 52b4b950b507 ("mm: slab: free kmem_cache_node after destroy sysfs file")
Link: http://lkml.kernel.org/r/1470759070-18743-1-git-send-email-chris@chris-wilson.co.uk
Reported-by: Dave Gordon <david.s.gordon@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dmitry Safonov <dsafonov@virtuozzo.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index cead06394e9e..9adae58462f8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3629,6 +3629,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
  */
 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 {
+	LIST_HEAD(discard);
 	struct page *page, *h;
 
 	BUG_ON(irqs_disabled());
@@ -3636,13 +3637,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 	list_for_each_entry_safe(page, h, &n->partial, lru) {
 		if (!page->inuse) {
 			remove_partial(n, page);
-			discard_slab(s, page);
+			list_add(&page->lru, &discard);
 		} else {
 			list_slab_objects(s, page,
 			"Objects remaining in %s on __kmem_cache_shutdown()");
 		}
 	}
 	spin_unlock_irq(&n->list_lock);
+
+	list_for_each_entry_safe(page, h, &discard, lru)
+		discard_slab(s, page);
 }
 
 /*
-- 
cgit v1.2.3


From 12beb346710b766b8e74a7a3ac8165835bd68def Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 10 Aug 2016 22:47:59 +0200
Subject: Merge tag 'pxa-fixes-v4.8' of https://github.com/rjarzmik/linux into
 randconfig-4.8

This is the pxa changes for v4.8 cycle.

This is a tiny fix couple to enable changes in includes in
gpio API without breaking pxa boards.

* tag 'pxa-fixes-v4.8' of https://github.com/rjarzmik/linux:
  ARM: pxa: add module.h for corgi symbol_get/symbol_put usage
  ARM: pxa: add module.h for spitz symbol_get/symbol_put usage
---
 arch/arm/mach-pxa/corgi.c | 1 +
 arch/arm/mach-pxa/spitz.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c
index dc109dc3a622..10bfdb169366 100644
--- a/arch/arm/mach-pxa/corgi.c
+++ b/arch/arm/mach-pxa/corgi.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/module.h>	/* symbol_get ; symbol_put */
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/major.h>
diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index 1080580b1343..2c150bfc0cd5 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/module.h>	/* symbol_get ; symbol_put */
 #include <linux/platform_device.h>
 #include <linux/delay.h>
 #include <linux/gpio_keys.h>
-- 
cgit v1.2.3


From 62d16b5a3fca4d186e13215e0d7d2f6d36191796 Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Date: Sat, 6 Aug 2016 12:20:39 +0200
Subject: x86/mm/kaslr: Fix -Wformat-security warning

debug_putstr() is used to output strings without using printf-like
formatting but debug_putstr(v) is defined as early_printk(v) in
arch/x86/lib/kaslr.c.

This makes clang reports the following warning when building
with -Wformat-security:

    arch/x86/lib/kaslr.c:57:15: warning: format string is not a string
    literal (potentially insecure) [-Wformat-security]
            debug_putstr(purpose);
                         ^~~~~~~

Fix this by using "%s" in early_printk().

Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160806102039.27221-1-nicolas.iooss_linux@m4x.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/lib/kaslr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index f7dfeda83e5c..121f59c6ee54 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -19,7 +19,7 @@
 #include <asm/cpufeature.h>
 #include <asm/setup.h>
 
-#define debug_putstr(v) early_printk(v)
+#define debug_putstr(v) early_printk("%s", v)
 #define has_cpuflag(f) boot_cpu_has(f)
 #define get_boot_seed() kaslr_offset()
 #endif
-- 
cgit v1.2.3


From f9bcf1e0e0145323ba2cf72ecad5264ff3883eb1 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Thu, 11 Aug 2016 13:36:35 +0800
Subject: sched/cputime: Fix steal time accounting

Commit:

  57430218317 ("sched/cputime: Count actually elapsed irq & softirq time")

... didn't take steal time into consideration with passing the noirqtime
kernel parameter.

As Paolo pointed out before:

| Why not? If idle=poll, for example, any time the guest is suspended (and
| thus cannot poll) does count as stolen time.

This patch fixes it by reducing steal time from idle time accounting when
the noirqtime parameter is true. The average idle time drops from 56.8%
to 54.75% for nohz idle kvm guest(noirqtime, idle=poll, four vCPUs running
on one pCPU).

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1470893795-3527-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 1934f658c036..8b9bcc5a58fa 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -508,13 +508,20 @@ void account_process_tick(struct task_struct *p, int user_tick)
  */
 void account_idle_ticks(unsigned long ticks)
 {
-
+	cputime_t cputime, steal;
 	if (sched_clock_irqtime) {
 		irqtime_account_idle_ticks(ticks);
 		return;
 	}
 
-	account_idle_time(jiffies_to_cputime(ticks));
+	cputime = cputime_one_jiffy;
+	steal = steal_account_process_time(cputime);
+
+	if (steal >= cputime)
+		return;
+
+	cputime -= steal;
+	account_idle_time(cputime);
 }
 
 /*
-- 
cgit v1.2.3


From ace7fab7a6cdd363a615ec537f2aa94dbc761ee2 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 10 Aug 2016 10:23:25 -0700
Subject: x86/mm: Fix swap entry comment and macro

A recent patch changed the format of a swap PTE.

The comment explaining the format of the swap PTE is wrong about
the bits used for the swap type field.  Amusingly, the ASCII art
and the patch description are correct, but the comment itself
is wrong.

As I was looking at this, I also noticed that the
SWP_OFFSET_FIRST_BIT has an off-by-one error.  This does not
really hurt anything.  It just wasted a bit of space in the PTE,
giving us 2^59 bytes of addressable space in our swapfiles
instead of 2^60.  But, it doesn't match with the comments, and it
wastes a bit of space, so fix it.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Fixes: 00839ee3b299 ("x86/mm: Move swap offset/type up in PTE to work around erratum")
Link: http://lkml.kernel.org/r/20160810172325.E56AD7DA@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/pgtable_64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 7e8ec7ae10fa..1cc82ece9ac1 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -145,7 +145,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
  *
  * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2|1|0| <- bit number
  * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
- * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry
+ * | OFFSET (14->63) | TYPE (9-13)  |0|X|X|X| X| X|X|X|0| <- swp entry
  *
  * G (8) is aliased and used as a PROT_NONE indicator for
  * !present ptes.  We need to start storing swap entries above
@@ -156,7 +156,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
 #define SWP_TYPE_BITS 5
 /* Place the offset above the type: */
-#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1)
+#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
 
-- 
cgit v1.2.3


From 82ba4faca1bffad429f15c90c980ffd010366c25 Mon Sep 17 00:00:00 2001
From: Aaron Lu <aaron.lu@intel.com>
Date: Thu, 11 Aug 2016 15:44:30 +0800
Subject: x86/irq: Do not substract irq_tlb_count from irq_call_count

Since commit:

  52aec3308db8 ("x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR")

the TLB remote shootdown is done through call function vector. That
commit didn't take care of irq_tlb_count, which a later commit:

  fd0f5869724f ("x86: Distinguish TLB shootdown interrupts from other functions call interrupts")

... tried to fix.

The fix assumes every increase of irq_tlb_count has a corresponding
increase of irq_call_count. So the irq_call_count is always bigger than
irq_tlb_count and we could substract irq_tlb_count from irq_call_count.

Unfortunately this is not true for the smp_call_function_single() case.
The IPI is only sent if the target CPU's call_single_queue is empty when
adding a csd into it in generic_exec_single. That means if two threads
are both adding flush tlb csds to the same CPU's call_single_queue, only
one IPI is sent. In other words, the irq_call_count is incremented by 1
but irq_tlb_count is incremented by 2. Over time, irq_tlb_count will be
bigger than irq_call_count and the substract will produce a very large
irq_call_count value due to overflow.

Considering that:

  1) it's not worth to send more IPIs for the sake of accurate counting of
     irq_call_count in generic_exec_single();

  2) it's not easy to tell if the call function interrupt is for TLB
     shootdown in __smp_call_function_single_interrupt().

Not to exclude TLB shootdown from call function count seems to be the
simplest fix and this patch just does that.

This bug was found by LKP's cyclic performance regression tracking recently
with the vm-scalability test suite. I have bisected to commit:

  3dec0ba0be6a ("mm/rmap: share the i_mmap_rwsem")

This commit didn't do anything wrong but revealed the irq_call_count
problem. IIUC, the commit makes rwc->remap_one in rmap_walk_file
concurrent with multiple threads.  When remap_one is try_to_unmap_one(),
then multiple threads could queue flush TLB to the same CPU but only
one IPI will be sent.

Since the commit was added in Linux v3.19, the counting problem only
shows up from v3.19 onwards.

Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Cc: Alex Shi <alex.shi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomoki Sekiyama <tomoki.sekiyama.qu@hitachi.com>
Link: http://lkml.kernel.org/r/20160811074430.GA18163@aaronlu.sh.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/hardirq.h | 4 ----
 arch/x86/kernel/irq.c          | 3 +--
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 7178043b0e1d..59405a248fc2 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,10 +22,6 @@ typedef struct {
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
-	/*
-	 * irq_tlb_count is double-counted in irq_call_count, so it must be
-	 * subtracted from irq_call_count when displaying irq_call_count
-	 */
 	unsigned int irq_tlb_count;
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 61521dc19c10..9f669fdd2010 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 	seq_puts(p, "  Rescheduling interrupts\n");
 	seq_printf(p, "%*s: ", prec, "CAL");
 	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
-					irq_stats(j)->irq_tlb_count);
+		seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
 	seq_puts(p, "  Function call interrupts\n");
 	seq_printf(p, "%*s: ", prec, "TLB");
 	for_each_online_cpu(j)
-- 
cgit v1.2.3


From 007b756053386af079ba963a8f5817ac651c7c59 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:13 -0700
Subject: x86/boot: Run reserve_bios_regions() after we initialize the memory
 map

reserve_bios_regions() is a quirk that reserves memory that we might
otherwise think is available.  There's no need to run it so early,
and running it before we have the memory map initialized with its
non-quirky inputs makes it hard to make reserve_bios_regions() more
intelligent.

Move it right after we populate the memblock state.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/59f58618911005c799c6c9979ce6ae4881d907c2.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/head32.c | 2 --
 arch/x86/kernel/head64.c | 1 -
 arch/x86/kernel/setup.c  | 2 ++
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 2dda0bc4576e..f16c55bfc090 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void)
 	/* Initialize 32bit specific setup functions */
 	x86_init.resources.reserve_resources = i386_reserve_resources;
 	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
-
-	reserve_bios_regions();
 }
 
 asmlinkage __visible void __init i386_start_kernel(void)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 99d48e7d2974..54a2372f5dbb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
 		copy_bootdata(__va(real_mode_data));
 
 	x86_early_init_platform_quirks();
-	reserve_bios_regions();
 
 	switch (boot_params.hdr.hardware_subarch) {
 	case X86_SUBARCH_INTEL_MID:
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 95cf31c9f4ec..bf780e0f76ef 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1101,6 +1101,8 @@ void __init setup_arch(char **cmdline_p)
 		efi_find_mirror();
 	}
 
+	reserve_bios_regions();
+
 	/*
 	 * The EFI specification says that boot service code won't be called
 	 * after ExitBootServices(). This is, in fact, a lie.
-- 
cgit v1.2.3


From 18bc7bd523e0fc5be8d76bf84bde733a97a8c375 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:14 -0700
Subject: x86/boot: Synchronize trampoline_cr4_features and mmu_cr4_features
 directly

The initialization process for trampoline_cr4_features and
mmu_cr4_features was confusing.  The intent is for mmu_cr4_features
and *trampoline_cr4_features to stay in sync, but
trampoline_cr4_features is NULL until setup_real_mode() runs.  The
old code synchronized *trampoline_cr4_features *twice*, once in
setup_real_mode() and once in setup_arch().  It also initialized
mmu_cr4_features in setup_real_mode(), which causes the actual value
of mmu_cr4_features to potentially depend on when setup_real_mode()
is called.

With this patch, mmu_cr4_features is initialized directly in
setup_arch(), and *trampoline_cr4_features is synchronized to
mmu_cr4_features when the trampoline is set up.

After this patch, it should be safe to defer setup_real_mode().

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d48a263f9912389b957dd495a7127b009259ffe0.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/setup.c  | 17 ++++++++++-------
 arch/x86/realmode/init.c |  3 ++-
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bf780e0f76ef..95c8c9c7dd6d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1131,6 +1131,16 @@ void __init setup_arch(char **cmdline_p)
 
 	early_trap_pf_init();
 
+	/*
+	 * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
+	 * with the current CR4 value.  This may not be necessary, but
+	 * auditing all the early-boot CR4 manipulation would be needed to
+	 * rule it out.
+	 */
+	if (boot_cpu_data.cpuid_level >= 0)
+		/* A CPU has %cr4 if and only if it has CPUID. */
+		mmu_cr4_features = __read_cr4();
+
 	setup_real_mode();
 
 	memblock_set_current_limit(get_max_mapped());
@@ -1180,13 +1190,6 @@ void __init setup_arch(char **cmdline_p)
 
 	kasan_init();
 
-	if (boot_cpu_data.cpuid_level >= 0) {
-		/* A CPU has %cr4 if and only if it has CPUID */
-		mmu_cr4_features = __read_cr4();
-		if (trampoline_cr4_features)
-			*trampoline_cr4_features = mmu_cr4_features;
-	}
-
 #ifdef CONFIG_X86_32
 	/* sync back kernel address range */
 	clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 705e3fffb4a1..c5bdc4e473e7 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -4,6 +4,7 @@
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 #include <asm/realmode.h>
+#include <asm/tlbflush.h>
 
 struct real_mode_header *real_mode_header;
 u32 *trampoline_cr4_features;
@@ -84,7 +85,7 @@ void __init setup_real_mode(void)
 
 	trampoline_header->start = (u64) secondary_startup_64;
 	trampoline_cr4_features = &trampoline_header->cr4;
-	*trampoline_cr4_features = __read_cr4();
+	*trampoline_cr4_features = mmu_cr4_features;
 
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
-- 
cgit v1.2.3


From d0de0f685db7faf2ae4597a39a59996dd84e18c7 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:15 -0700
Subject: x86/boot: Defer setup_real_mode() to early_initcall time

There's no need to run setup_real_mode() as early as we run it.
Defer it to the same early_initcall that sets up the page
permissions for the real mode code.

This should be a code size reduction.  More importantly, it give us
a longer window in which we can allocate the real mode trampoline.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/fd62f0da4f79357695e9bf3e365623736b05f119.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/realmode.h |  1 -
 arch/x86/kernel/setup.c         |  2 --
 arch/x86/realmode/init.c        | 15 ++++++++++++---
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 9c6b890d5e7a..8d6777724ba4 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -59,6 +59,5 @@ extern unsigned char secondary_startup_64[];
 #endif
 
 void reserve_real_mode(void);
-void setup_real_mode(void);
 
 #endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 95c8c9c7dd6d..0fa60f5f5a16 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1141,8 +1141,6 @@ void __init setup_arch(char **cmdline_p)
 		/* A CPU has %cr4 if and only if it has CPUID. */
 		mmu_cr4_features = __read_cr4();
 
-	setup_real_mode();
-
 	memblock_set_current_limit(get_max_mapped());
 
 	/*
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index c5bdc4e473e7..747b71e8f547 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -30,7 +30,7 @@ void __init reserve_real_mode(void)
 	       base, (unsigned long long)mem, size);
 }
 
-void __init setup_real_mode(void)
+static void __init setup_real_mode(void)
 {
 	u16 real_mode_seg;
 	const u32 *rel;
@@ -101,7 +101,7 @@ void __init setup_real_mode(void)
  * need to mark it executable at do_pre_smp_initcalls() at least,
  * thus run it as a early_initcall().
  */
-static int __init set_real_mode_permissions(void)
+static void __init set_real_mode_permissions(void)
 {
 	unsigned char *base = (unsigned char *) real_mode_header;
 	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
@@ -120,7 +120,16 @@ static int __init set_real_mode_permissions(void)
 	set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
 	set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
 	set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
+}
+
+static int __init init_real_mode(void)
+{
+	if (!real_mode_header)
+		panic("Real mode trampoline was not allocated");
+
+	setup_real_mode();
+	set_real_mode_permissions();
 
 	return 0;
 }
-early_initcall(set_real_mode_permissions);
+early_initcall(init_real_mode);
-- 
cgit v1.2.3


From 5ff3e2c3c3eebe13967d81ad1f23b9468fefea81 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:16 -0700
Subject: x86/boot: Rework reserve_real_mode() to allow multiple tries

If reserve_real_mode() fails, panicing immediately means we're
doomed.  Make it safe to try more than once to allocate the
trampoline:

 - Degrade a failure from panic() to pr_info().  (If we make it to
   setup_real_mode() without reserving the trampoline, we'll panic
   them.)

 - Factor out helpers so that platform code can supply a specific
   address to try.

 - Warn if reserve_real_mode() is called after we're done with the
   memblock allocator.  If that were to happen, we would behave
   unpredictably.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/876e383038f3e9971aa72fd20a4f5da05f9d193d.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/realmode.h |  9 +++++++++
 arch/x86/realmode/init.c        | 29 +++++++++++++++++++++--------
 2 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 8d6777724ba4..b2988c0ed829 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -58,6 +58,15 @@ extern unsigned char boot_gdt[];
 extern unsigned char secondary_startup_64[];
 #endif
 
+static inline size_t real_mode_size_needed(void)
+{
+	if (real_mode_header)
+		return 0;	/* already allocated. */
+
+	return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE);
+}
+
+void set_real_mode_mem(phys_addr_t mem, size_t size);
 void reserve_real_mode(void);
 
 #endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 747b71e8f547..5db706f14111 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -1,4 +1,5 @@
 #include <linux/io.h>
+#include <linux/slab.h>
 #include <linux/memblock.h>
 
 #include <asm/cacheflush.h>
@@ -12,22 +13,34 @@ u32 *trampoline_cr4_features;
 /* Hold the pgd entry used on booting additional CPUs */
 pgd_t trampoline_pgd_entry;
 
+void __init set_real_mode_mem(phys_addr_t mem, size_t size)
+{
+	void *base = __va(mem);
+
+	real_mode_header = (struct real_mode_header *) base;
+	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+	       base, (unsigned long long)mem, size);
+}
+
 void __init reserve_real_mode(void)
 {
 	phys_addr_t mem;
-	unsigned char *base;
-	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+	size_t size = real_mode_size_needed();
+
+	if (!size)
+		return;
+
+	WARN_ON(slab_is_available());
 
 	/* Has to be under 1M so we can execute real-mode AP code. */
 	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
-	if (!mem)
-		panic("Cannot allocate trampoline\n");
+	if (!mem) {
+		pr_info("No sub-1M memory is available for the trampoline\n");
+		return;
+	}
 
-	base = __va(mem);
 	memblock_reserve(mem, size);
-	real_mode_header = (struct real_mode_header *) base;
-	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
-	       base, (unsigned long long)mem, size);
+	set_real_mode_mem(mem, size);
 }
 
 static void __init setup_real_mode(void)
-- 
cgit v1.2.3


From 5bc653b7318217c54244a14f248f1f07abe0a865 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:17 -0700
Subject: x86/efi: Allocate a trampoline if needed in efi_free_boot_services()

On my Dell XPS 13 9350 with firmware 1.4.4 and SGX on, if I boot
Fedora 24's grub2-efi off a hard disk, my first 1MB of RAM looks
like:

 efi: mem00: [Runtime Data       |RUN|  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000000000-0x0000000000000fff] (0MB)
 efi: mem01: [Boot Data          |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000001000-0x0000000000027fff] (0MB)
 efi: mem02: [Loader Data        |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000028000-0x0000000000029fff] (0MB)
 efi: mem03: [Reserved           |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x000000000002a000-0x000000000002bfff] (0MB)
 efi: mem04: [Runtime Data       |RUN|  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x000000000002c000-0x000000000002cfff] (0MB)
 efi: mem05: [Loader Data        |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x000000000002d000-0x000000000002dfff] (0MB)
 efi: mem06: [Conventional Memory|   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x000000000002e000-0x0000000000057fff] (0MB)
 efi: mem07: [Reserved           |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000058000-0x0000000000058fff] (0MB)
 efi: mem08: [Conventional Memory|   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000059000-0x000000000009ffff] (0MB)

My EBDA is at 0x2c000, which blocks off everything from 0x2c000 and
up, and my trampoline is 0x6000 bytes (6 pages), so it doesn't fit
in the loader data range at 0x28000.

Without this patch, it panics due to a failure to allocate the
trampoline.  With this patch, it works:

 [  +0.001744] Base memory trampoline at [ffff880000001000] 1000 size 24576

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/998c77b3bf709f3dfed85cb30701ed1a5d8a438b.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/efi/quirks.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 4480c06cade7..89d1146f5a6f 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -254,6 +254,7 @@ void __init efi_free_boot_services(void)
 	for_each_efi_memory_desc(md) {
 		unsigned long long start = md->phys_addr;
 		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+		size_t rm_size;
 
 		if (md->type != EFI_BOOT_SERVICES_CODE &&
 		    md->type != EFI_BOOT_SERVICES_DATA)
@@ -263,6 +264,26 @@ void __init efi_free_boot_services(void)
 		if (md->attribute & EFI_MEMORY_RUNTIME)
 			continue;
 
+		/*
+		 * Nasty quirk: if all sub-1MB memory is used for boot
+		 * services, we can get here without having allocated the
+		 * real mode trampoline.  It's too late to hand boot services
+		 * memory back to the memblock allocator, so instead
+		 * try to manually allocate the trampoline if needed.
+		 *
+		 * I've seen this on a Dell XPS 13 9350 with firmware
+		 * 1.4.4 with SGX enabled booting Linux via Fedora 24's
+		 * grub2-efi on a hard disk.  (And no, I don't know why
+		 * this happened, but Linux should still try to boot rather
+		 * panicing early.)
+		 */
+		rm_size = real_mode_size_needed();
+		if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) {
+			set_real_mode_mem(start, rm_size);
+			start += rm_size;
+			size -= rm_size;
+		}
+
 		free_bootmem_late(start, size);
 	}
 
-- 
cgit v1.2.3


From f72075c9eda8a43aeea2f9dbb8d187afd4a76f0b Mon Sep 17 00:00:00 2001
From: Alex Thorlton <athorlton@sgi.com>
Date: Thu, 11 Aug 2016 11:41:59 +0100
Subject: x86/platform/uv: Skip UV runtime services mapping in the
 efi_runtime_disabled case

This problem has actually been in the UV code for a while, but we didn't
catch it until recently, because we had been relying on EFI_OLD_MEMMAP
to allow our systems to boot for a period of time.  We noticed the issue
when trying to kexec a recent community kernel, where we hit this NULL
pointer dereference in efi_sync_low_kernel_mappings():

 [    0.337515] BUG: unable to handle kernel NULL pointer dereference at 0000000000000880
 [    0.346276] IP: [<ffffffff8105df8d>] efi_sync_low_kernel_mappings+0x5d/0x1b0

The problem doesn't show up with EFI_OLD_MEMMAP because we skip the
chunk of setup_efi_state() that sets the efi_loader_signature for the
kexec'd kernel.  When the kexec'd kernel boots, it won't set EFI_BOOT in
setup_arch, so we completely avoid the bug.

We always kexec with noefi on the command line, so this shouldn't be an
issue, but since we're not actually checking for efi_runtime_disabled in
uv_bios_init(), we end up trying to do EFI runtime callbacks when we
shouldn't be. This patch just adds a check for efi_runtime_disabled in
uv_bios_init() so that we don't map in uv_systab when runtime_disabled ==
true.

Signed-off-by: Alex Thorlton <athorlton@sgi.com>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: <stable@vger.kernel.org> # v4.7
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Travis <travis@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1470912120-22831-2-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/uv/bios_uv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 66b2166ea4a1..0df8a0370d32 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -187,7 +187,8 @@ EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
 void uv_bios_init(void)
 {
 	uv_systab = NULL;
-	if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab) {
+	if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
+	    !efi.uv_systab || efi_runtime_disabled()) {
 		pr_crit("UV: UVsystab: missing\n");
 		return;
 	}
-- 
cgit v1.2.3


From 6862e6ad95e984991a6ceec592cf67831658f928 Mon Sep 17 00:00:00 2001
From: Austin Christ <austinwc@codeaurora.org>
Date: Thu, 11 Aug 2016 11:42:00 +0100
Subject: efi/capsule: Allocate whole capsule into virtual memory

According to UEFI 2.6 section 7.5.3, the capsule should be in contiguous
virtual memory and firmware may consume the capsule immediately. To
correctly implement this functionality, the kernel driver needs to vmap
the entire capsule at the time it is made available to firmware.

The virtual allocation of the capsule update has been changed from kmap,
which was only allocating the first page of the update, to vmap, and
allocates the entire data payload.

Signed-off-by: Austin Christ <austinwc@codeaurora.org>
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Reviewed-by: Lee, Chun-Yi <jlee@suse.com>
Cc: <stable@vger.kernel.org> # v4.7
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kweh Hock Leong <hock.leong.kweh@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/1470912120-22831-3-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/firmware/efi/capsule-loader.c | 8 +++++---
 drivers/firmware/efi/capsule.c        | 6 +++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c
index c99c24bc79b0..9ae6c116c474 100644
--- a/drivers/firmware/efi/capsule-loader.c
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/efi.h>
+#include <linux/vmalloc.h>
 
 #define NO_FURTHER_WRITE_ACTION -1
 
@@ -108,14 +109,15 @@ static ssize_t efi_capsule_submit_update(struct capsule_info *cap_info)
 	int ret;
 	void *cap_hdr_temp;
 
-	cap_hdr_temp = kmap(cap_info->pages[0]);
+	cap_hdr_temp = vmap(cap_info->pages, cap_info->index,
+			VM_MAP, PAGE_KERNEL);
 	if (!cap_hdr_temp) {
-		pr_debug("%s: kmap() failed\n", __func__);
+		pr_debug("%s: vmap() failed\n", __func__);
 		return -EFAULT;
 	}
 
 	ret = efi_capsule_update(cap_hdr_temp, cap_info->pages);
-	kunmap(cap_info->pages[0]);
+	vunmap(cap_hdr_temp);
 	if (ret) {
 		pr_err("%s: efi_capsule_update() failed\n", __func__);
 		return ret;
diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c
index 53b9fd2293ee..6eedff45e6d7 100644
--- a/drivers/firmware/efi/capsule.c
+++ b/drivers/firmware/efi/capsule.c
@@ -190,9 +190,9 @@ efi_capsule_update_locked(efi_capsule_header_t *capsule,
  * map the capsule described by @capsule with its data in @pages and
  * send it to the firmware via the UpdateCapsule() runtime service.
  *
- * @capsule must be a virtual mapping of the first page in @pages
- * (@pages[0]) in the kernel address space. That is, a
- * capsule_header_t that describes the entire contents of the capsule
+ * @capsule must be a virtual mapping of the complete capsule update in the
+ * kernel address space, as the capsule can be consumed immediately.
+ * A capsule_header_t that describes the entire contents of the capsule
  * must be at the start of the first data page.
  *
  * Even though this function will validate that the firmware supports
-- 
cgit v1.2.3


From 26f2c75cd2cf10a6120ef02ca9a94db77cc9c8e0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 11 Aug 2016 14:58:24 +0200
Subject: sched/cputime: Fix omitted ticks passed in parameter

Commit:

  f9bcf1e0e014 ("sched/cputime: Fix steal time accounting")

... fixes a leak on steal time accounting but forgets to account
the ticks passed in parameters, assuming there is only one to
take into account.

Let's consider that parameter back.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Wanpeng Li <kernellwp@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: linux-tip-commits@vger.kernel.org
Link: http://lkml.kernel.org/r/20160811125822.GB4214@lerouge
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cputime.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8b9bcc5a58fa..9858266fb0b3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -509,12 +509,13 @@ void account_process_tick(struct task_struct *p, int user_tick)
 void account_idle_ticks(unsigned long ticks)
 {
 	cputime_t cputime, steal;
+
 	if (sched_clock_irqtime) {
 		irqtime_account_idle_ticks(ticks);
 		return;
 	}
 
-	cputime = cputime_one_jiffy;
+	cputime = jiffies_to_cputime(ticks);
 	steal = steal_account_process_time(cputime);
 
 	if (steal >= cputime)
-- 
cgit v1.2.3


From d52c0569bab4edc888832df44dc7ac28517134f6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 11 Aug 2016 16:08:35 +0200
Subject: x86/apic/x2apic, smp/hotplug: Don't use before alloc in
 x2apic_cluster_probe()

I made a mistake while converting the driver to the hotplug state
machine and as a result x2apic_cluster_probe() was accessing
cpus_in_cluster before allocating it.

This patch fixes it by setting the cpumask after the allocation the
memory succeeded.

While at it, I marked two functions static which are only used within
this file.

Reported-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 6b2c28471de5 ("x86/x2apic: Convert to CPU hotplug state machine")
Link: http://lkml.kernel.org/r/1470924515-9444-1-git-send-email-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 6368fa69d2af..54f35d988025 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -155,7 +155,7 @@ static void init_x2apic_ldr(void)
 /*
  * At CPU state changes, update the x2apic cluster sibling info.
  */
-int x2apic_prepare_cpu(unsigned int cpu)
+static int x2apic_prepare_cpu(unsigned int cpu)
 {
 	if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL))
 		return -ENOMEM;
@@ -168,7 +168,7 @@ int x2apic_prepare_cpu(unsigned int cpu)
 	return 0;
 }
 
-int x2apic_dead_cpu(unsigned int this_cpu)
+static int x2apic_dead_cpu(unsigned int this_cpu)
 {
 	int cpu;
 
@@ -186,13 +186,18 @@ int x2apic_dead_cpu(unsigned int this_cpu)
 static int x2apic_cluster_probe(void)
 {
 	int cpu = smp_processor_id();
+	int ret;
 
 	if (!x2apic_mode)
 		return 0;
 
+	ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
+				x2apic_prepare_cpu, x2apic_dead_cpu);
+	if (ret < 0) {
+		pr_err("Failed to register X2APIC_PREPARE\n");
+		return 0;
+	}
 	cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
-	cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
-			  x2apic_prepare_cpu, x2apic_dead_cpu);
 	return 1;
 }
 
-- 
cgit v1.2.3


From c21377f8366c95440d533edbe47d070f662c62ef Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Date: Thu, 11 Aug 2016 09:35:57 -0600
Subject: nvme: Suspend all queues before deletion

When nvme_delete_queue fails in the first pass of the
nvme_disable_io_queues() loop, we return early, failing to suspend all
of the IO queues.  Later, on the nvme_pci_disable path, this causes us
to disable MSI without actually having freed all the IRQs, which
triggers the BUG_ON in free_msi_irqs(), as show below.

This patch refactors nvme_disable_io_queues to suspend all queues before
start submitting delete queue commands.  This way, we ensure that we
have at least returned every IRQ before continuing with the removal
path.

[  487.529200] kernel BUG at ../drivers/pci/msi.c:368!
cpu 0x46: Vector: 700 (Program Check) at [c0000078c5b83650]
    pc: c000000000627a50: free_msi_irqs+0x90/0x200
    lr: c000000000627a40: free_msi_irqs+0x80/0x200
    sp: c0000078c5b838d0
   msr: 9000000100029033
  current = 0xc0000078c5b40000
  paca    = 0xc000000002bd7600   softe: 0        irq_happened: 0x01
    pid   = 1376, comm = kworker/70:1H
kernel BUG at ../drivers/pci/msi.c:368!
Linux version 4.7.0.mainline+ (root@iod76) (gcc version 5.3.1 20160413
(Ubuntu/IBM 5.3.1-14ubuntu2.1) ) #104 SMP Fri Jul 29 09:20:17 CDT 2016
enter ? for help
[c0000078c5b83920] d0000000363b0cd8 nvme_dev_disable+0x208/0x4f0 [nvme]
[c0000078c5b83a10] d0000000363b12a4 nvme_timeout+0xe4/0x250 [nvme]
[c0000078c5b83ad0] c0000000005690e4 blk_mq_rq_timed_out+0x64/0x110
[c0000078c5b83b40] c00000000056c930 bt_for_each+0x160/0x170
[c0000078c5b83bb0] c00000000056d928 blk_mq_queue_tag_busy_iter+0x78/0x110
[c0000078c5b83c00] c0000000005675d8 blk_mq_timeout_work+0xd8/0x1b0
[c0000078c5b83c50] c0000000000e8cf0 process_one_work+0x1e0/0x590
[c0000078c5b83ce0] c0000000000e9148 worker_thread+0xa8/0x660
[c0000078c5b83d80] c0000000000f2090 kthread+0x110/0x130
[c0000078c5b83e30] c0000000000095f0 ret_from_kernel_thread+0x5c/0x6c

Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Cc: Brian King <brking@linux.vnet.ibm.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: linux-nvme@lists.infradead.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/pci.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d7c33f9361aa..8dcf5a960951 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1543,15 +1543,10 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
 		reinit_completion(&dev->ioq_wait);
  retry:
 		timeout = ADMIN_TIMEOUT;
-		for (; i > 0; i--) {
-			struct nvme_queue *nvmeq = dev->queues[i];
-
-			if (!pass)
-				nvme_suspend_queue(nvmeq);
-			if (nvme_delete_queue(nvmeq, opcode))
+		for (; i > 0; i--, sent++)
+			if (nvme_delete_queue(dev->queues[i], opcode))
 				break;
-			++sent;
-		}
+
 		while (sent--) {
 			timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
 			if (timeout == 0)
@@ -1693,11 +1688,12 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 		nvme_stop_queues(&dev->ctrl);
 		csts = readl(dev->bar + NVME_REG_CSTS);
 	}
+
+	for (i = dev->queue_count - 1; i > 0; i--)
+		nvme_suspend_queue(dev->queues[i]);
+
 	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
-		for (i = dev->queue_count - 1; i >= 0; i--) {
-			struct nvme_queue *nvmeq = dev->queues[i];
-			nvme_suspend_queue(nvmeq);
-		}
+		nvme_suspend_queue(dev->queues[0]);
 	} else {
 		nvme_disable_io_queues(dev);
 		nvme_disable_admin_queue(dev, shutdown);
-- 
cgit v1.2.3


From 005411ea7ee776a56b1e0120a31c65efdee5cab1 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Tue, 9 Aug 2016 14:01:30 -0400
Subject: doc: update block/queue-sysfs.txt entries

Add descriptions for dax, io_poll, and write_same_max_bytes files.

Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/queue-sysfs.txt | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index d515d58962b9..2a3904030dea 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -14,6 +14,12 @@ add_random (RW)
 This file allows to turn off the disk entropy contribution. Default
 value of this file is '1'(on).
 
+dax (RO)
+--------
+This file indicates whether the device supports Direct Access (DAX),
+used by CPU-addressable storage to bypass the pagecache.  It shows '1'
+if true, '0' if not.
+
 discard_granularity (RO)
 -----------------------
 This shows the size of internal allocation of the device in bytes, if
@@ -46,6 +52,12 @@ hw_sector_size (RO)
 -------------------
 This is the hardware sector size of the device, in bytes.
 
+io_poll (RW)
+------------
+When read, this file shows the total number of block IO polls and how
+many returned success.  Writing '0' to this file will disable polling
+for this device.  Writing any non-zero value will enable this feature.
+
 iostats (RW)
 -------------
 This file is used to control (on/off) the iostats accounting of the
@@ -151,5 +163,11 @@ device state. This means that it might not be safe to toggle the
 setting from "write back" to "write through", since that will also
 eliminate cache flushes issued by the kernel.
 
+write_same_max_bytes (RO)
+-------------------------
+This is the number of bytes the device can write in a single write-same
+command.  A value of '0' means write-same is not supported by this
+device.
+
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
-- 
cgit v1.2.3


From 1ea049b2de5d803374fdbf43add23c8d1c518e7b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 11 Aug 2016 10:15:56 +0200
Subject: bvec: avoid variable shadowing warning

Due to the (indirect) nesting of min(..., min(...)), sparse will
show a variable shadowing warning whenever bvec.h is included.

Avoid that by assigning the inner min() to a temporary variable first.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/bvec.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 701b64a3b7c5..89b65b82d98f 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -74,7 +74,8 @@ static inline void bvec_iter_advance(const struct bio_vec *bv,
 		  "Attempted to advance past end of bvec iter\n");
 
 	while (bytes) {
-		unsigned len = min(bytes, bvec_iter_len(bv, *iter));
+		unsigned iter_len = bvec_iter_len(bv, *iter);
+		unsigned len = min(bytes, iter_len);
 
 		bytes -= len;
 		iter->bi_size -= len;
-- 
cgit v1.2.3


From 502aa0a5be633e6558574ebcf63b65afdfbfcd7a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 10 Aug 2016 14:46:27 -0400
Subject: nfsd: fix dentry refcounting on create

b44061d0b9 introduced a dentry ref counting bug.  Previously we were
grabbing one ref to dchild in nfsd_create(), but with the creation of
nfsd_create_locked() we have a ref for dchild from the lookup in
nfsd_create(), and then another ref in nfsd_create_locked().  The ref
from the lookup in nfsd_create() is never dropped and results in
dentries still in use at unmount.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Fixes: b44061d0b9 "nfsd: reorganize nfsd_create"
Reported-by: kernel test robot <xiaolong.ye@intel.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ba944123167b..ff476e654b8f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1252,10 +1252,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (IS_ERR(dchild))
 		return nfserrno(host_err);
 	err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
-	if (err) {
-		dput(dchild);
+	/*
+	 * We unconditionally drop our ref to dchild as fh_compose will have
+	 * already grabbed its own ref for it.
+	 */
+	dput(dchild);
+	if (err)
 		return err;
-	}
 	return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
 					rdev, resfhp);
 }
-- 
cgit v1.2.3


From ad05711cec12131e1277ce749a99d08ecf233aa7 Mon Sep 17 00:00:00 2001
From: "David A. Long" <dave.long@linaro.org>
Date: Wed, 10 Aug 2016 16:44:51 -0400
Subject: arm64: Remove stack duplicating code from jprobes

Because the arm64 calling standard allows stacked function arguments to be
anywhere in the stack frame, do not attempt to duplicate the stack frame for
jprobes handler functions.

Documentation changes to describe this issue have been broken out into a
separate patch in order to simultaneously address them in other
architecture(s).

Signed-off-by: David A. Long <dave.long@linaro.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kprobes.h   |  2 --
 arch/arm64/kernel/probes/kprobes.c | 31 +++++--------------------------
 2 files changed, 5 insertions(+), 28 deletions(-)

diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h
index 61b49150dfa3..1737aecfcc5e 100644
--- a/arch/arm64/include/asm/kprobes.h
+++ b/arch/arm64/include/asm/kprobes.h
@@ -22,7 +22,6 @@
 
 #define __ARCH_WANT_KPROBES_INSN_SLOT
 #define MAX_INSN_SIZE			1
-#define MAX_STACK_SIZE			128
 
 #define flush_insn_slot(p)		do { } while (0)
 #define kretprobe_blacklist_size	0
@@ -47,7 +46,6 @@ struct kprobe_ctlblk {
 	struct prev_kprobe prev_kprobe;
 	struct kprobe_step_ctx ss_ctx;
 	struct pt_regs jprobe_saved_regs;
-	char jprobes_stack[MAX_STACK_SIZE];
 };
 
 void arch_remove_kprobe(struct kprobe *);
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index bf9768588288..c6b0f40620d8 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -41,18 +41,6 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 static void __kprobes
 post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *);
 
-static inline unsigned long min_stack_size(unsigned long addr)
-{
-	unsigned long size;
-
-	if (on_irq_stack(addr, raw_smp_processor_id()))
-		size = IRQ_STACK_PTR(raw_smp_processor_id()) - addr;
-	else
-		size = (unsigned long)current_thread_info() + THREAD_START_SP - addr;
-
-	return min(size, FIELD_SIZEOF(struct kprobe_ctlblk, jprobes_stack));
-}
-
 static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
 {
 	/* prepare insn slot */
@@ -489,20 +477,15 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-	long stack_ptr = kernel_stack_pointer(regs);
 
 	kcb->jprobe_saved_regs = *regs;
 	/*
-	 * As Linus pointed out, gcc assumes that the callee
-	 * owns the argument space and could overwrite it, e.g.
-	 * tailcall optimization. So, to be absolutely safe
-	 * we also save and restore enough stack bytes to cover
-	 * the argument area.
+	 * Since we can't be sure where in the stack frame "stacked"
+	 * pass-by-value arguments are stored we just don't try to
+	 * duplicate any of the stack. Do not use jprobes on functions that
+	 * use more than 64 bytes (after padding each to an 8 byte boundary)
+	 * of arguments, or pass individual arguments larger than 16 bytes.
 	 */
-	kasan_disable_current();
-	memcpy(kcb->jprobes_stack, (void *)stack_ptr,
-	       min_stack_size(stack_ptr));
-	kasan_enable_current();
 
 	instruction_pointer_set(regs, (unsigned long) jp->entry);
 	preempt_disable();
@@ -554,10 +537,6 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	}
 	unpause_graph_tracing();
 	*regs = kcb->jprobe_saved_regs;
-	kasan_disable_current();
-	memcpy((void *)stack_addr, kcb->jprobes_stack,
-	       min_stack_size(stack_addr));
-	kasan_enable_current();
 	preempt_enable_no_resched();
 	return 1;
 }
-- 
cgit v1.2.3


From 42691398be08bd1fe99326911a0aa31f2c041d53 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Aug 2016 10:37:30 -0400
Subject: nfsd: Fix race between FREE_STATEID and LOCK

When running LTP's nfslock01 test, the Linux client can send a LOCK
and a FREE_STATEID request at the same time. The outcome is:

Frame 324    R OPEN stateid [2,O]

Frame 115004 C LOCK lockowner_is_new stateid [2,O] offset 672000 len 64
Frame 115008 R LOCK stateid [1,L]
Frame 115012 C WRITE stateid [0,L] offset 672000 len 64
Frame 115016 R WRITE NFS4_OK
Frame 115019 C LOCKU stateid [1,L] offset 672000 len 64
Frame 115022 R LOCKU NFS4_OK
Frame 115025 C FREE_STATEID stateid [2,L]
Frame 115026 C LOCK lockowner_is_new stateid [2,O] offset 672128 len 64
Frame 115029 R FREE_STATEID NFS4_OK
Frame 115030 R LOCK stateid [3,L]
Frame 115034 C WRITE stateid [0,L] offset 672128 len 64
Frame 115038 R WRITE NFS4ERR_BAD_STATEID

In other words, the server returns stateid L in a successful LOCK
reply, but it has already released it. Subsequent uses of stateid L
fail.

To address this, protect the generation check in nfsd4_free_stateid
with the st_mutex. This should guarantee that only one of two
outcomes occurs: either LOCK returns a fresh valid stateid, or
FREE_STATEID returns NFS4ERR_LOCKS_HELD.

Reported-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Fix-suggested-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 40 ++++++++++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8410ca275db1..0edc1822f144 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4903,6 +4903,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return nfs_ok;
 }
 
+static __be32
+nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s)
+{
+	struct nfs4_ol_stateid *stp = openlockstateid(s);
+	__be32 ret;
+
+	mutex_lock(&stp->st_mutex);
+
+	ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+	if (ret)
+		goto out;
+
+	ret = nfserr_locks_held;
+	if (check_for_locks(stp->st_stid.sc_file,
+			    lockowner(stp->st_stateowner)))
+		goto out;
+
+	release_lock_stateid(stp);
+	ret = nfs_ok;
+
+out:
+	mutex_unlock(&stp->st_mutex);
+	nfs4_put_stid(s);
+	return ret;
+}
+
 __be32
 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		   struct nfsd4_free_stateid *free_stateid)
@@ -4910,7 +4936,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	stateid_t *stateid = &free_stateid->fr_stateid;
 	struct nfs4_stid *s;
 	struct nfs4_delegation *dp;
-	struct nfs4_ol_stateid *stp;
 	struct nfs4_client *cl = cstate->session->se_client;
 	__be32 ret = nfserr_bad_stateid;
 
@@ -4929,18 +4954,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		ret = nfserr_locks_held;
 		break;
 	case NFS4_LOCK_STID:
-		ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
-		if (ret)
-			break;
-		stp = openlockstateid(s);
-		ret = nfserr_locks_held;
-		if (check_for_locks(stp->st_stid.sc_file,
-				    lockowner(stp->st_stateowner)))
-			break;
-		WARN_ON(!unhash_lock_stateid(stp));
+		atomic_inc(&s->sc_count);
 		spin_unlock(&cl->cl_lock);
-		nfs4_put_stid(s);
-		ret = nfs_ok;
+		ret = nfsd4_free_lock_stateid(stateid, s);
 		goto out;
 	case NFS4_REVOKED_DELEG_STID:
 		dp = delegstateid(s);
-- 
cgit v1.2.3


From c1470b33bb6e18cddd361fef339ef225b8339fe7 Mon Sep 17 00:00:00 2001
From: zhong jiang <zhongjiang@huawei.com>
Date: Thu, 11 Aug 2016 15:32:55 -0700
Subject: mm/hugetlb: fix incorrect hugepages count during mem hotplug

When memory hotplug operates, free hugepages will be freed if the
movable node is offline.  Therefore, /proc/sys/vm/nr_hugepages will be
incorrect.

Fix it by reducing max_huge_pages when the node is offlined.

n-horiguchi@ah.jp.nec.com said:

: dissolve_free_huge_page intends to break a hugepage into buddy, and the
: destination hugepage is supposed to be allocated from the pool of the
: destination node, so the system-wide pool size is reduced.  So adding
: h->max_huge_pages-- makes sense to me.

Link: http://lkml.kernel.org/r/1470624546-902-1-git-send-email-zhongjiang@huawei.com
Signed-off-by: zhong jiang <zhongjiang@huawei.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b9aa1b0b38b0..87e11d8ad536 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1448,6 +1448,7 @@ static void dissolve_free_huge_page(struct page *page)
 		list_del(&page->lru);
 		h->free_huge_pages--;
 		h->free_huge_pages_node[nid]--;
+		h->max_huge_pages--;
 		update_and_free_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-- 
cgit v1.2.3


From 2f95ff90b99600f53df4a0aa652322d349d67957 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 11 Aug 2016 15:32:57 -0700
Subject: proc, meminfo: use correct helpers for calculating LRU sizes in
 meminfo

meminfo_proc_show() and si_mem_available() are using the wrong helpers
for calculating the size of the LRUs.  The user-visible impact is that
there appears to be an abnormally high number of unevictable pages.

Link: http://lkml.kernel.org/r/20160805105805.GR2799@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c | 2 +-
 mm/page_alloc.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 09e18fdf61e5..b9a8c813e5e6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -46,7 +46,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		cached = 0;
 
 	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
-		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
 
 	available = si_mem_available();
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ab2c0ff8c2e6..3fbe73a6fe4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4060,7 +4060,7 @@ long si_mem_available(void)
 	int lru;
 
 	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
-		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
 
 	for_each_zone(zone)
 		wmark_low += zone->watermark[WMARK_LOW];
-- 
cgit v1.2.3


From 1f47b61fb4077936465dcde872a4e5cc4fe708da Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@virtuozzo.com>
Date: Thu, 11 Aug 2016 15:33:00 -0700
Subject: mm: memcontrol: fix swap counter leak on swapout from offline cgroup

An offline memory cgroup might have anonymous memory or shmem left
charged to it and no swap.  Since only swap entries pin the id of an
offline cgroup, such a cgroup will have no id and so an attempt to
swapout its anon/shmem will not store memory cgroup info in the swap
cgroup map.  As a result, memcg->swap or memcg->memsw will never get
uncharged from it and any of its ascendants.

Fix this by always charging swapout to the first ancestor cgroup that
hasn't released its id yet.

[hannes@cmpxchg.org: add comment to mem_cgroup_swapout]
[vdavydov@virtuozzo.com: use WARN_ON_ONCE() in mem_cgroup_id_get_online()]
  Link: http://lkml.kernel.org/r/20160803123445.GJ13263@esperanza
Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs")
Link: http://lkml.kernel.org/r/5336daa5c9a32e776067773d9da655d2dc126491.1470219853.git.vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>	[3.19+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 44 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e74d7080ec9e..791b00ca4e8b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4082,6 +4082,24 @@ static void mem_cgroup_id_get(struct mem_cgroup *memcg)
 	atomic_inc(&memcg->id.ref);
 }
 
+static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
+{
+	while (!atomic_inc_not_zero(&memcg->id.ref)) {
+		/*
+		 * The root cgroup cannot be destroyed, so it's refcount must
+		 * always be >= 1.
+		 */
+		if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
+			VM_BUG_ON(1);
+			break;
+		}
+		memcg = parent_mem_cgroup(memcg);
+		if (!memcg)
+			memcg = root_mem_cgroup;
+	}
+	return memcg;
+}
+
 static void mem_cgroup_id_put(struct mem_cgroup *memcg)
 {
 	if (atomic_dec_and_test(&memcg->id.ref)) {
@@ -5800,7 +5818,7 @@ subsys_initcall(mem_cgroup_init);
  */
 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 {
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg, *swap_memcg;
 	unsigned short oldid;
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5815,16 +5833,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	if (!memcg)
 		return;
 
-	mem_cgroup_id_get(memcg);
-	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+	/*
+	 * In case the memcg owning these pages has been offlined and doesn't
+	 * have an ID allocated to it anymore, charge the closest online
+	 * ancestor for the swap instead and transfer the memory+swap charge.
+	 */
+	swap_memcg = mem_cgroup_id_get_online(memcg);
+	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg));
 	VM_BUG_ON_PAGE(oldid, page);
-	mem_cgroup_swap_statistics(memcg, true);
+	mem_cgroup_swap_statistics(swap_memcg, true);
 
 	page->mem_cgroup = NULL;
 
 	if (!mem_cgroup_is_root(memcg))
 		page_counter_uncharge(&memcg->memory, 1);
 
+	if (memcg != swap_memcg) {
+		if (!mem_cgroup_is_root(swap_memcg))
+			page_counter_charge(&swap_memcg->memsw, 1);
+		page_counter_uncharge(&memcg->memsw, 1);
+	}
+
 	/*
 	 * Interrupts should be disabled here because the caller holds the
 	 * mapping->tree_lock lock which is taken with interrupts-off. It is
@@ -5863,11 +5892,14 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 	if (!memcg)
 		return 0;
 
+	memcg = mem_cgroup_id_get_online(memcg);
+
 	if (!mem_cgroup_is_root(memcg) &&
-	    !page_counter_try_charge(&memcg->swap, 1, &counter))
+	    !page_counter_try_charge(&memcg->swap, 1, &counter)) {
+		mem_cgroup_id_put(memcg);
 		return -ENOMEM;
+	}
 
-	mem_cgroup_id_get(memcg);
 	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
 	VM_BUG_ON_PAGE(oldid, page);
 	mem_cgroup_swap_statistics(memcg, true);
-- 
cgit v1.2.3


From 615d66c37c755c49ce022c9e5ac0875d27d2603d Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@virtuozzo.com>
Date: Thu, 11 Aug 2016 15:33:03 -0700
Subject: mm: memcontrol: fix memcg id ref counter on swap charge move

Since commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure
after many small jobs") swap entries do not pin memcg->css.refcnt
directly.  Instead, they pin memcg->id.ref.  So we should adjust the
reference counters accordingly when moving swap charges between cgroups.

Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs")
Link: http://lkml.kernel.org/r/9ce297c64954a42dc90b543bc76106c4a94f07e8.1470219853.git.vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@vger.kernel.org>	[3.19+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 791b00ca4e8b..2ff0289ad061 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4077,9 +4077,9 @@ static struct cftype mem_cgroup_legacy_files[] = {
 
 static DEFINE_IDR(mem_cgroup_idr);
 
-static void mem_cgroup_id_get(struct mem_cgroup *memcg)
+static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
 {
-	atomic_inc(&memcg->id.ref);
+	atomic_add(n, &memcg->id.ref);
 }
 
 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
@@ -4100,9 +4100,9 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
 	return memcg;
 }
 
-static void mem_cgroup_id_put(struct mem_cgroup *memcg)
+static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
 {
-	if (atomic_dec_and_test(&memcg->id.ref)) {
+	if (atomic_sub_and_test(n, &memcg->id.ref)) {
 		idr_remove(&mem_cgroup_idr, memcg->id.id);
 		memcg->id.id = 0;
 
@@ -4111,6 +4111,16 @@ static void mem_cgroup_id_put(struct mem_cgroup *memcg)
 	}
 }
 
+static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
+{
+	mem_cgroup_id_get_many(memcg, 1);
+}
+
+static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
+{
+	mem_cgroup_id_put_many(memcg, 1);
+}
+
 /**
  * mem_cgroup_from_id - look up a memcg from a memcg id
  * @id: the memcg id to look up
@@ -4745,6 +4755,8 @@ static void __mem_cgroup_clear_mc(void)
 		if (!mem_cgroup_is_root(mc.from))
 			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
 
+		mem_cgroup_id_put_many(mc.from, mc.moved_swap);
+
 		/*
 		 * we charged both to->memory and to->memsw, so we
 		 * should uncharge to->memory.
@@ -4752,9 +4764,9 @@ static void __mem_cgroup_clear_mc(void)
 		if (!mem_cgroup_is_root(mc.to))
 			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
 
-		css_put_many(&mc.from->css, mc.moved_swap);
+		mem_cgroup_id_get_many(mc.to, mc.moved_swap);
+		css_put_many(&mc.to->css, mc.moved_swap);
 
-		/* we've already done css_get(mc.to) */
 		mc.moved_swap = 0;
 	}
 	memcg_oom_recover(from);
-- 
cgit v1.2.3


From bcbf0d566b6e59a6e873bfe415cc415111a819e2 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Thu, 11 Aug 2016 15:33:06 -0700
Subject: kasan: remove the unnecessary WARN_ONCE from quarantine.c

It's quite unlikely that the user will so little memory that the per-CPU
quarantines won't fit into the given fraction of the available memory.
Even in that case he won't be able to do anything with the information
given in the warning.

Link: http://lkml.kernel.org/r/1470929182-101413-1-git-send-email-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Konovalov <adech.fo@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kuthonuzo Luruo <kuthonuzo.luruo@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kasan/quarantine.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index b6728a33a4ac..baabaad4a4aa 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -217,11 +217,8 @@ void quarantine_reduce(void)
 	new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
 		QUARANTINE_FRACTION;
 	percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
-	if (WARN_ONCE(new_quarantine_size < percpu_quarantines,
-		"Too little memory, disabling global KASAN quarantine.\n"))
-		new_quarantine_size = 0;
-	else
-		new_quarantine_size -= percpu_quarantines;
+	new_quarantine_size = (new_quarantine_size < percpu_quarantines) ?
+		0 : new_quarantine_size - percpu_quarantines;
 	WRITE_ONCE(quarantine_size, new_quarantine_size);
 
 	last = global_quarantine.head;
-- 
cgit v1.2.3


From f33e6f0671b3ba81acef4d7c078af86afcc855c4 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Thu, 11 Aug 2016 15:33:09 -0700
Subject: mm, oom: fix uninitialized ret in task_will_free_mem()

    mm/oom_kill.c: In function `task_will_free_mem':
    mm/oom_kill.c:767: warning: `ret' may be used uninitialized in this function

If __task_will_free_mem() is never called inside the for_each_process()
loop, ret will not be initialized.

Fixes: 1af8bb43269563e4 ("mm, oom: fortify task_will_free_mem()")
Link: http://lkml.kernel.org/r/1470255599-24841-1-git-send-email-geert@linux-m68k.org
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7d0a275df822..d53a9aa00977 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -764,7 +764,7 @@ bool task_will_free_mem(struct task_struct *task)
 {
 	struct mm_struct *mm = task->mm;
 	struct task_struct *p;
-	bool ret;
+	bool ret = true;
 
 	/*
 	 * Skip tasks without mm because it might have passed its exit_mm and
-- 
cgit v1.2.3


From 5830169f47269f78f6624bd70165eb571270da82 Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.vnet.ibm.com>
Date: Thu, 11 Aug 2016 15:33:12 -0700
Subject: mm/memory_hotplug.c: initialize per_cpu_nodestats for hotadded pgdats

The following oops occurs after a pgdat is hotadded:

  Unable to handle kernel paging request for data at address 0x00c30001
  Faulting instruction address: 0xc00000000022f8f4
  Oops: Kernel access of bad area, sig: 11 [#1]
  SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in: ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter nls_utf8 isofs sg virtio_balloon uio_pdrv_genirq uio ip_tables xfs libcrc32c sr_mod cdrom sd_mod virtio_net ibmvscsi scsi_transport_srp virtio_pci virtio_ring virtio dm_mirror dm_region_hash dm_log dm_mod
  CPU: 0 PID: 0 Comm: swapper/0 Tainted: G        W 4.8.0-rc1-device #110
  task: c000000000ef3080 task.stack: c000000000f6c000
  NIP: c00000000022f8f4 LR: c00000000022f948 CTR: 0000000000000000
  REGS: c000000000f6fa50 TRAP: 0300   Tainted: G        W (4.8.0-rc1-device)
  MSR: 800000010280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE,TM[E]>  CR: 84002028  XER: 20000000
  CFAR: d000000001d2013c DAR: 0000000000c30001 DSISR: 40000000 SOFTE: 0
  NIP refresh_cpu_vm_stats+0x1a4/0x2f0
  LR refresh_cpu_vm_stats+0x1f8/0x2f0
  Call Trace:
    refresh_cpu_vm_stats+0x1f8/0x2f0 (unreliable)

Add per_cpu_nodestats initialization to the hotplug codepath.

Link: http://lkml.kernel.org/r/1470931473-7090-1-git-send-email-arbab@linux.vnet.ibm.com
Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3894b65b1555..41266dc29f33 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1219,6 +1219,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 
 	/* init node's zones as empty zones, we don't have any present pages.*/
 	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
 
 	/*
 	 * The node we allocated has no zone fallback lists. For avoiding
@@ -1249,6 +1250,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 {
 	arch_refresh_nodedata(nid, NULL);
+	free_percpu(pgdat->per_cpu_nodestats);
 	arch_free_nodedata(pgdat);
 	return;
 }
-- 
cgit v1.2.3


From 68187872c76a96ed4db7bfb064272591f02e208b Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Thu, 11 Aug 2016 17:45:21 +0200
Subject: uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions

Since instruction decoder now supports EVEX-encoded instructions, two fixes
are needed to correctly handle them in uprobes.

Extended bits for MODRM.rm field need to be sanitized just like we do it
for VEX3, to avoid encoding wrong register for register-relative access.

EVEX has _two_ extended bits: b and x. Theoretically, EVEX.x should be
ignored by the CPU (since GPRs go only up to 15, not 31), but let's be
paranoid here: proper encoding for register-relative access
should have EVEX.x = 1.

Secondly, we should fetch vex.vvvv for EVEX too.
This is now super easy because instruction decoder populates
vex_prefix.bytes[2] for all flavors of (e)vex encodings, even for VEX2.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-kernel@vger.kernel.org
Cc: <stable@vger.kernel.org> # v4.1+
Fixes: 8a764a875fe3 ("x86/asm/decoder: Create artificial 3rd byte for 2-byte VEX")
Link: http://lkml.kernel.org/r/20160811154521.20469-1-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/uprobes.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c1ff31d99ff..495c776de4b4 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
 		*cursor &= 0xfe;
 	}
 	/*
-	 * Similar treatment for VEX3 prefix.
-	 * TODO: add XOP/EVEX treatment when insn decoder supports them
+	 * Similar treatment for VEX3/EVEX prefix.
+	 * TODO: add XOP treatment when insn decoder supports them
 	 */
-	if (insn->vex_prefix.nbytes == 3) {
+	if (insn->vex_prefix.nbytes >= 3) {
 		/*
 		 * vex2:     c5    rvvvvLpp   (has no b bit)
 		 * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
 		 * evex:     62    rxbR00mm wvvvv1pp zllBVaaa
-		 *   (evex will need setting of both b and x since
-		 *   in non-sib encoding evex.x is 4th bit of MODRM.rm)
-		 * Setting VEX3.b (setting because it has inverted meaning):
+		 * Setting VEX3.b (setting because it has inverted meaning).
+		 * Setting EVEX.x since (in non-SIB encoding) EVEX.x
+		 * is the 4th bit of MODRM.rm, and needs the same treatment.
+		 * For VEX3-encoded insns, VEX3.x value has no effect in
+		 * non-SIB encoding, the change is superfluous but harmless.
 		 */
 		cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
-		*cursor |= 0x20;
+		*cursor |= 0x60;
 	}
 
 	/*
@@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
 
 	reg = MODRM_REG(insn);	/* Fetch modrm.reg */
 	reg2 = 0xff;		/* Fetch vex.vvvv */
-	if (insn->vex_prefix.nbytes == 2)
-		reg2 = insn->vex_prefix.bytes[1];
-	else if (insn->vex_prefix.nbytes == 3)
+	if (insn->vex_prefix.nbytes)
 		reg2 = insn->vex_prefix.bytes[2];
 	/*
-	 * TODO: add XOP, EXEV vvvv reading.
+	 * TODO: add XOP vvvv reading.
 	 *
 	 * vex.vvvv field is in bits 6-3, bits are inverted.
 	 * But in 32-bit mode, high-order bit may be ignored.
-- 
cgit v1.2.3


From 10e9e7bd598f9a66a11a22514c68c13c41fc821b Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 11 Aug 2016 07:30:20 -0700
Subject: perf/x86/intel/uncore: Fix uncore num_counters

Some uncore boxes' num_counters value for Haswell server and
Broadwell server are not correct (too large, off by one).

This issue was found by comparing the code with the document. Although
there is no bug report from users yet, accessing non-existent counters
is dangerous and the behavior is undefined: it may cause miscounting or
even crashes.

This patch makes them consistent with the uncore document.

Reported-by: Lukasz Odzioba <lukasz.odzioba@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/1470925820-59847-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/uncore_snbep.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 824e54086e07..8aee83bcf71f 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -2626,7 +2626,7 @@ void hswep_uncore_cpu_init(void)
 
 static struct intel_uncore_type hswep_uncore_ha = {
 	.name		= "ha",
-	.num_counters   = 5,
+	.num_counters   = 4,
 	.num_boxes	= 2,
 	.perf_ctr_bits	= 48,
 	SNBEP_UNCORE_PCI_COMMON_INIT(),
@@ -2645,7 +2645,7 @@ static struct uncore_event_desc hswep_uncore_imc_events[] = {
 
 static struct intel_uncore_type hswep_uncore_imc = {
 	.name		= "imc",
-	.num_counters   = 5,
+	.num_counters   = 4,
 	.num_boxes	= 8,
 	.perf_ctr_bits	= 48,
 	.fixed_ctr_bits	= 48,
@@ -2691,7 +2691,7 @@ static struct intel_uncore_type hswep_uncore_irp = {
 
 static struct intel_uncore_type hswep_uncore_qpi = {
 	.name			= "qpi",
-	.num_counters		= 5,
+	.num_counters		= 4,
 	.num_boxes		= 3,
 	.perf_ctr_bits		= 48,
 	.perf_ctr		= SNBEP_PCI_PMON_CTR0,
@@ -2773,7 +2773,7 @@ static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
 
 static struct intel_uncore_type hswep_uncore_r3qpi = {
 	.name		= "r3qpi",
-	.num_counters   = 4,
+	.num_counters   = 3,
 	.num_boxes	= 3,
 	.perf_ctr_bits	= 44,
 	.constraints	= hswep_uncore_r3qpi_constraints,
@@ -2972,7 +2972,7 @@ static struct intel_uncore_type bdx_uncore_ha = {
 
 static struct intel_uncore_type bdx_uncore_imc = {
 	.name		= "imc",
-	.num_counters   = 5,
+	.num_counters   = 4,
 	.num_boxes	= 8,
 	.perf_ctr_bits	= 48,
 	.fixed_ctr_bits	= 48,
-- 
cgit v1.2.3


From 95f3be798472f63b495ca4712af005ea5ac7aa47 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Thu, 11 Aug 2016 07:31:14 -0700
Subject: perf/x86/intel/uncore: Add enable_box for client MSR uncore

There are bug reports about miscounting uncore counters on some
client machines like Sandybridge, Broadwell and Skylake. It is
very likely to be observed on idle systems.

This issue is caused by a hardware issue. PERF_GLOBAL_CTL could be
cleared after Package C7, and nothing will be count.
The related errata (HSD 158) could be found in:

  www.intel.com/content/dam/www/public/us/en/documents/specification-updates/4th-gen-core-family-desktop-specification-update.pdf

This patch tries to work around this issue by re-enabling PERF_GLOBAL_CTL
in ->enable_box(). The workaround does not cover all cases. It helps for new
events after returning from C7. But it cannot prevent C7, it will still
miscount if a counter is already active.

There is no drawback in leaving it enabled, so it does not need
disable_box() here.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1470925874-59943-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/uncore_snb.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 97a69dbba649..9d35ec0cb8fc 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -100,6 +100,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
 	}
 }
 
+static void snb_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+		SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+}
+
 static void snb_uncore_msr_exit_box(struct intel_uncore_box *box)
 {
 	if (box->pmu->pmu_idx == 0)
@@ -127,6 +133,7 @@ static struct attribute_group snb_uncore_format_group = {
 
 static struct intel_uncore_ops snb_uncore_msr_ops = {
 	.init_box	= snb_uncore_msr_init_box,
+	.enable_box	= snb_uncore_msr_enable_box,
 	.exit_box	= snb_uncore_msr_exit_box,
 	.disable_event	= snb_uncore_msr_disable_event,
 	.enable_event	= snb_uncore_msr_enable_event,
@@ -192,6 +199,12 @@ static void skl_uncore_msr_init_box(struct intel_uncore_box *box)
 	}
 }
 
+static void skl_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	wrmsrl(SKL_UNC_PERF_GLOBAL_CTL,
+		SNB_UNC_GLOBAL_CTL_EN | SKL_UNC_GLOBAL_CTL_CORE_ALL);
+}
+
 static void skl_uncore_msr_exit_box(struct intel_uncore_box *box)
 {
 	if (box->pmu->pmu_idx == 0)
@@ -200,6 +213,7 @@ static void skl_uncore_msr_exit_box(struct intel_uncore_box *box)
 
 static struct intel_uncore_ops skl_uncore_msr_ops = {
 	.init_box	= skl_uncore_msr_init_box,
+	.enable_box	= skl_uncore_msr_enable_box,
 	.exit_box	= skl_uncore_msr_exit_box,
 	.disable_event	= snb_uncore_msr_disable_event,
 	.enable_event	= snb_uncore_msr_enable_event,
-- 
cgit v1.2.3


From 75a4615c95a2de4fa592c1312fb8446e74b4e5eb Mon Sep 17 00:00:00 2001
From: Julius Niedworok <jniedwor@linux.vnet.ibm.com>
Date: Wed, 3 Aug 2016 16:39:54 +0200
Subject: KVM: s390: set the prefix initially properly

When KVM_RUN is triggered on a VCPU without an initial reset, a
validity intercept occurs.
Setting the prefix will set the KVM_REQ_MMU_RELOAD bit initially,
thus preventing the bug.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Julius Niedworok <jniedwor@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3f3ae4865d57..e63f6ed0a936 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1672,6 +1672,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 				    KVM_SYNC_CRS |
 				    KVM_SYNC_ARCH0 |
 				    KVM_SYNC_PFAULT;
+	kvm_s390_set_prefix(vcpu, 0);
 	if (test_kvm_facility(vcpu->kvm, 64))
 		vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
 	/* fprs can be synchronized via vrs, even if the guest has no vx. With
-- 
cgit v1.2.3


From aca411a4b17a4aebe14ecdf253373db5b7ee6058 Mon Sep 17 00:00:00 2001
From: Julius Niedworok <jniedwor@linux.vnet.ibm.com>
Date: Wed, 3 Aug 2016 16:39:55 +0200
Subject: KVM: s390: reset KVM_REQ_MMU_RELOAD if mapping the prefix failed

When triggering KVM_RUN without a user memory region being mapped
(KVM_SET_USER_MEMORY_REGION) a validity intercept occurs. This could
happen, if the user memory region was not mapped initially or if it
was unmapped after the vcpu is initialized. The function
kvm_s390_handle_requests checks for the KVM_REQ_MMU_RELOAD bit. The
check function always clears this bit. If gmap_mprotect_notify
returns an error code, the mapping failed, but the KVM_REQ_MMU_RELOAD
was not set anymore. So the next time kvm_s390_handle_requests is
called, the execution would fall trough the check for
KVM_REQ_MMU_RELOAD. The bit needs to be resetted, if
gmap_mprotect_notify returns an error code. Resetting the bit with
kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu) fixes the bug.

Reviewed-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Julius Niedworok <jniedwor@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/kvm-s390.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e63f6ed0a936..f142215ed30d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -2362,8 +2362,10 @@ retry:
 		rc = gmap_mprotect_notify(vcpu->arch.gmap,
 					  kvm_s390_get_prefix(vcpu),
 					  PAGE_SIZE * 2, PROT_WRITE);
-		if (rc)
+		if (rc) {
+			kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
 			return rc;
+		}
 		goto retry;
 	}
 
-- 
cgit v1.2.3


From 023e9fddc3616b005c3753fc1bb6526388cd7a30 Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Tue, 9 Aug 2016 19:13:00 +0200
Subject: KVM: PPC: Move xics_debugfs_init out of create
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As we are about to hold the kvm->lock during the create operation on KVM
devices, we should move the call to xics_debugfs_init into its own
function, since holding a mutex over extended amounts of time might not
be a good idea.

Introduce an init operation on the kvm_device_ops struct which cannot
fail and call this, if configured, after the device has been created.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/powerpc/kvm/book3s_xics.c | 10 ++++++++--
 include/linux/kvm_host.h       |  6 ++++++
 virt/kvm/kvm_main.c            |  3 +++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index a75ba38a2d81..f2def8e45fef 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1341,8 +1341,6 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 		return ret;
 	}
 
-	xics_debugfs_init(xics);
-
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
 		/* Enable real mode support */
@@ -1354,9 +1352,17 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 	return 0;
 }
 
+static void kvmppc_xics_init(struct kvm_device *dev)
+{
+	struct kvmppc_xics *xics = (struct kvmppc_xics *)dev->private;
+
+	xics_debugfs_init(xics);
+}
+
 struct kvm_device_ops kvm_xics_ops = {
 	.name = "kvm-xics",
 	.create = kvmppc_xics_create,
+	.init = kvmppc_xics_init,
 	.destroy = kvmppc_xics_free,
 	.set_attr = xics_set_attr,
 	.get_attr = xics_get_attr,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 01e908ac4a39..d3c9b82812c3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1115,6 +1115,12 @@ struct kvm_device_ops {
 	const char *name;
 	int (*create)(struct kvm_device *dev, u32 type);
 
+	/*
+	 * init is called after create if create is successful and is called
+	 * outside of holding kvm->lock.
+	 */
+	void (*init)(struct kvm_device *dev);
+
 	/*
 	 * Destroy is responsible for freeing dev.
 	 *
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cc081ccfcaa3..ae642452e91a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2838,6 +2838,9 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 		return ret;
 	}
 
+	if (ops->init)
+		ops->init(dev);
+
 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
 		ops->destroy(dev);
-- 
cgit v1.2.3


From a28ebea2adc4a2bef5989a5a181ec238f59fbcad Mon Sep 17 00:00:00 2001
From: Christoffer Dall <christoffer.dall@linaro.org>
Date: Tue, 9 Aug 2016 19:13:01 +0200
Subject: KVM: Protect device ops->create and list_add with kvm->lock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

KVM devices were manipulating list data structures without any form of
synchronization, and some implementations of the create operations also
suffered from a lack of synchronization.

Now when we've split the xics create operation into create and init, we
can hold the kvm->lock mutex while calling the create operation and when
manipulating the devices list.

The error path in the generic code gets slightly ugly because we have to
take the mutex again and delete the device from the list, but holding
the mutex during anon_inode_getfd or releasing/locking the mutex in the
common non-error path seemed wrong.

Signed-off-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/arm/kvm/arm.c             |  6 +++++-
 arch/powerpc/kvm/book3s_xics.c |  2 --
 include/linux/kvm_host.h       |  6 ++++++
 virt/kvm/arm/vgic/vgic-init.c  | 17 ++++-------------
 virt/kvm/kvm_main.c            | 13 ++++++++++++-
 5 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index d94bb9093ead..75f130ef6504 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -1009,9 +1009,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 	switch (ioctl) {
 	case KVM_CREATE_IRQCHIP: {
+		int ret;
 		if (!vgic_present)
 			return -ENXIO;
-		return kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+		mutex_lock(&kvm->lock);
+		ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
+		mutex_unlock(&kvm->lock);
+		return ret;
 	}
 	case KVM_ARM_SET_DEVICE_ADDR: {
 		struct kvm_arm_device_addr dev_addr;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index f2def8e45fef..05aa11399a78 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1329,12 +1329,10 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 	xics->kvm = kvm;
 
 	/* Already there ? */
-	mutex_lock(&kvm->lock);
 	if (kvm->arch.xics)
 		ret = -EEXIST;
 	else
 		kvm->arch.xics = xics;
-	mutex_unlock(&kvm->lock);
 
 	if (ret) {
 		kfree(xics);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d3c9b82812c3..9c28b4d4c90b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1113,6 +1113,12 @@ struct kvm_device {
 /* create, destroy, and name are mandatory */
 struct kvm_device_ops {
 	const char *name;
+
+	/*
+	 * create is called holding kvm->lock and any operations not suitable
+	 * to do while holding the lock should be deferred to init (see
+	 * below).
+	 */
 	int (*create)(struct kvm_device *dev, u32 type);
 
 	/*
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index fb4b0a79a950..83777c1cbae0 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -73,12 +73,8 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	int i, vcpu_lock_idx = -1, ret;
 	struct kvm_vcpu *vcpu;
 
-	mutex_lock(&kvm->lock);
-
-	if (irqchip_in_kernel(kvm)) {
-		ret = -EEXIST;
-		goto out;
-	}
+	if (irqchip_in_kernel(kvm))
+		return -EEXIST;
 
 	/*
 	 * This function is also called by the KVM_CREATE_IRQCHIP handler,
@@ -87,10 +83,8 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	 * the proper checks already.
 	 */
 	if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
-		!kvm_vgic_global_state.can_emulate_gicv2) {
-		ret = -ENODEV;
-		goto out;
-	}
+		!kvm_vgic_global_state.can_emulate_gicv2)
+		return -ENODEV;
 
 	/*
 	 * Any time a vcpu is run, vcpu_load is called which tries to grab the
@@ -138,9 +132,6 @@ out_unlock:
 		vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
 		mutex_unlock(&vcpu->mutex);
 	}
-
-out:
-	mutex_unlock(&kvm->lock);
 	return ret;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ae642452e91a..195078225aa5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -696,6 +696,11 @@ static void kvm_destroy_devices(struct kvm *kvm)
 {
 	struct kvm_device *dev, *tmp;
 
+	/*
+	 * We do not need to take the kvm->lock here, because nobody else
+	 * has a reference to the struct kvm at this point and therefore
+	 * cannot access the devices list anyhow.
+	 */
 	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
 		list_del(&dev->vm_node);
 		dev->ops->destroy(dev);
@@ -2832,11 +2837,15 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	dev->ops = ops;
 	dev->kvm = kvm;
 
+	mutex_lock(&kvm->lock);
 	ret = ops->create(dev, cd->type);
 	if (ret < 0) {
+		mutex_unlock(&kvm->lock);
 		kfree(dev);
 		return ret;
 	}
+	list_add(&dev->vm_node, &kvm->devices);
+	mutex_unlock(&kvm->lock);
 
 	if (ops->init)
 		ops->init(dev);
@@ -2844,10 +2853,12 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
 	ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
 	if (ret < 0) {
 		ops->destroy(dev);
+		mutex_lock(&kvm->lock);
+		list_del(&dev->vm_node);
+		mutex_unlock(&kvm->lock);
 		return ret;
 	}
 
-	list_add(&dev->vm_node, &kvm->devices);
 	kvm_get_kvm(kvm);
 	cd->fd = ret;
 	return 0;
-- 
cgit v1.2.3


From c604cffa93478f8888bec62b23d6073dad03d43a Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 11 Aug 2016 11:58:12 +0100
Subject: MIPS: KVM: Fix mapped fault broken commpage handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kvm_mips_handle_mapped_seg_tlb_fault() appears to map the guest page at
virtual address 0 to PFN 0 if the guest has created its own mapping
there. The intention is unclear, but it may have been an attempt to
protect the zero page from being mapped to anything but the comm page in
code paths you wouldn't expect from genuine commpage accesses (guest
kernel mode cache instructions on that address, hitting trapping
instructions when executing from that address with a coincidental TLB
eviction during the KVM handling, and guest user mode accesses to that
address).

Fix this to check for mappings exactly at KVM_GUEST_COMMPAGE_ADDR (it
may not be at address 0 since commit 42aa12e74e91 ("MIPS: KVM: Move
commpage so 0x0 is unmapped")), and set the corresponding EntryLo to be
interpreted as 0 (invalid).

Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: <stable@vger.kernel.org> # 3.10.x-
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/mips/kvm/mmu.c | 49 ++++++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 57319ee57c4f..96e0b24cfe5c 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -138,35 +138,42 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
 	struct kvm *kvm = vcpu->kvm;
 	kvm_pfn_t pfn0, pfn1;
+	long tlb_lo[2];
 	int ret;
 
-	if ((tlb->tlb_hi & VPN2_MASK) == 0) {
-		pfn0 = 0;
-		pfn1 = 0;
-	} else {
-		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0])
-					   >> PAGE_SHIFT) < 0)
-			return -1;
-
-		if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1])
-					   >> PAGE_SHIFT) < 0)
-			return -1;
-
-		pfn0 = kvm->arch.guest_pmap[
-			mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT];
-		pfn1 = kvm->arch.guest_pmap[
-			mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT];
-	}
+	tlb_lo[0] = tlb->tlb_lo[0];
+	tlb_lo[1] = tlb->tlb_lo[1];
+
+	/*
+	 * The commpage address must not be mapped to anything else if the guest
+	 * TLB contains entries nearby, or commpage accesses will break.
+	 */
+	if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) &
+			VPN2_MASK & (PAGE_MASK << 1)))
+		tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0;
+
+	if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[0])
+				   >> PAGE_SHIFT) < 0)
+		return -1;
+
+	if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[1])
+				   >> PAGE_SHIFT) < 0)
+		return -1;
+
+	pfn0 = kvm->arch.guest_pmap[
+		mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT];
+	pfn1 = kvm->arch.guest_pmap[
+		mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT];
 
 	/* Get attributes from the Guest TLB */
 	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
 		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
-		(tlb->tlb_lo[0] & ENTRYLO_D) |
-		(tlb->tlb_lo[0] & ENTRYLO_V);
+		(tlb_lo[0] & ENTRYLO_D) |
+		(tlb_lo[0] & ENTRYLO_V);
 	entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
 		((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
-		(tlb->tlb_lo[1] & ENTRYLO_D) |
-		(tlb->tlb_lo[1] & ENTRYLO_V);
+		(tlb_lo[1] & ENTRYLO_D) |
+		(tlb_lo[1] & ENTRYLO_V);
 
 	kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
 		  tlb->tlb_lo[0], tlb->tlb_lo[1]);
-- 
cgit v1.2.3


From 8985d50382359e5bf118fdbefc859d0dbf6cebc7 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 11 Aug 2016 11:58:13 +0100
Subject: MIPS: KVM: Add missing gfn range check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

kvm_mips_handle_mapped_seg_tlb_fault() calculates the guest frame number
based on the guest TLB EntryLo values, however it is not range checked
to ensure it lies within the guest_pmap. If the physical memory the
guest refers to is out of range then dump the guest TLB and emit an
internal error.

Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: <stable@vger.kernel.org> # 3.10.x-
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/mips/kvm/mmu.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 96e0b24cfe5c..701c1b64cef8 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -138,6 +138,7 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
 	struct kvm *kvm = vcpu->kvm;
 	kvm_pfn_t pfn0, pfn1;
+	gfn_t gfn0, gfn1;
 	long tlb_lo[2];
 	int ret;
 
@@ -152,18 +153,24 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 			VPN2_MASK & (PAGE_MASK << 1)))
 		tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0;
 
-	if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[0])
-				   >> PAGE_SHIFT) < 0)
+	gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT;
+	gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT;
+	if (gfn0 >= kvm->arch.guest_pmap_npages ||
+	    gfn1 >= kvm->arch.guest_pmap_npages) {
+		kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n",
+			__func__, gfn0, gfn1, tlb->tlb_hi);
+		kvm_mips_dump_guest_tlbs(vcpu);
+		return -1;
+	}
+
+	if (kvm_mips_map_page(kvm, gfn0) < 0)
 		return -1;
 
-	if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb_lo[1])
-				   >> PAGE_SHIFT) < 0)
+	if (kvm_mips_map_page(kvm, gfn1) < 0)
 		return -1;
 
-	pfn0 = kvm->arch.guest_pmap[
-		mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT];
-	pfn1 = kvm->arch.guest_pmap[
-		mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT];
+	pfn0 = kvm->arch.guest_pmap[gfn0];
+	pfn1 = kvm->arch.guest_pmap[gfn1];
 
 	/* Get attributes from the Guest TLB */
 	entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
-- 
cgit v1.2.3


From 0741f52d1b980dbeb290afe67d88fc2928edd8ab Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 11 Aug 2016 11:58:14 +0100
Subject: MIPS: KVM: Fix gfn range check in kseg0 tlb faults
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two consecutive gfns are loaded into host TLB, so ensure the range check
isn't off by one if guest_pmap_npages is odd.

Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: <stable@vger.kernel.org> # 3.10.x-
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/mips/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 701c1b64cef8..6a8a21859502 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -99,7 +99,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 	}
 
 	gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
-	if (gfn >= kvm->arch.guest_pmap_npages) {
+	if ((gfn | 1) >= kvm->arch.guest_pmap_npages) {
 		kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
 			gfn, badvaddr);
 		kvm_mips_dump_host_tlbs();
-- 
cgit v1.2.3


From 9b731bcfdec4c159ad2e4312e25d69221709b96a Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Thu, 11 Aug 2016 11:58:15 +0100
Subject: MIPS: KVM: Propagate kseg0/mapped tlb fault errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Propagate errors from kvm_mips_handle_kseg0_tlb_fault() and
kvm_mips_handle_mapped_seg_tlb_fault(), usually triggering an internal
error since they normally indicate the guest accessed bad physical
memory or the commpage in an unexpected way.

Fixes: 858dd5d45733 ("KVM/MIPS32: MMU/TLB operations for the Guest.")
Fixes: e685c689f3a8 ("KVM/MIPS32: Privileged instruction/target branch emulation.")
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Radim Krčmář" <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: kvm@vger.kernel.org
Cc: <stable@vger.kernel.org> # 3.10.x-
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/mips/kvm/emulate.c | 35 ++++++++++++++++++++++++++---------
 arch/mips/kvm/mmu.c     | 12 +++++++++---
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 6eb52b9c9818..e788515f766b 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1642,8 +1642,14 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
 
 	preempt_disable();
 	if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
-		if (kvm_mips_host_tlb_lookup(vcpu, va) < 0)
-			kvm_mips_handle_kseg0_tlb_fault(va, vcpu);
+		if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 &&
+		    kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) {
+			kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n",
+				__func__, va, vcpu, read_c0_entryhi());
+			er = EMULATE_FAIL;
+			preempt_enable();
+			goto done;
+		}
 	} else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) ||
 		   KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
 		int index;
@@ -1680,12 +1686,18 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
 								run, vcpu);
 				preempt_enable();
 				goto dont_update_pc;
-			} else {
-				/*
-				 * We fault an entry from the guest tlb to the
-				 * shadow host TLB
-				 */
-				kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
+			}
+			/*
+			 * We fault an entry from the guest tlb to the
+			 * shadow host TLB
+			 */
+			if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) {
+				kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
+					__func__, va, index, vcpu,
+					read_c0_entryhi());
+				er = EMULATE_FAIL;
+				preempt_enable();
+				goto done;
 			}
 		}
 	} else {
@@ -2659,7 +2671,12 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
 			 * OK we have a Guest TLB entry, now inject it into the
 			 * shadow host TLB
 			 */
-			kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
+			if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) {
+				kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
+					__func__, va, index, vcpu,
+					read_c0_entryhi());
+				er = EMULATE_FAIL;
+			}
 		}
 	}
 
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 6a8a21859502..6cfdcf55572d 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -368,9 +368,15 @@ u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
 				local_irq_restore(flags);
 				return KVM_INVALID_INST;
 			}
-			kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
-							     &vcpu->arch.
-							     guest_tlb[index]);
+			if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
+						&vcpu->arch.guest_tlb[index])) {
+				kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n",
+					__func__, opc, index, vcpu,
+					read_c0_entryhi());
+				kvm_mips_dump_guest_tlbs(vcpu);
+				local_irq_restore(flags);
+				return KVM_INVALID_INST;
+			}
 			inst = *(opc);
 		}
 		local_irq_restore(flags);
-- 
cgit v1.2.3


From 9adeb8e72dbfe976709df01e259ed556ee60e779 Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Tue, 9 Aug 2016 18:25:26 -0700
Subject: arm64: Handle el1 synchronous instruction aborts cleanly

Executing from a non-executable area gives an ugly message:

lkdtm: Performing direct entry EXEC_RODATA
lkdtm: attempting ok execution at ffff0000084c0e08
lkdtm: attempting bad execution at ffff000008880700
Bad mode in Synchronous Abort handler detected on CPU2, code 0x8400000e -- IABT (current EL)
CPU: 2 PID: 998 Comm: sh Not tainted 4.7.0-rc2+ #13
Hardware name: linux,dummy-virt (DT)
task: ffff800077e35780 ti: ffff800077970000 task.ti: ffff800077970000
PC is at lkdtm_rodata_do_nothing+0x0/0x8
LR is at execute_location+0x74/0x88

The 'IABT (current EL)' indicates the error but it's a bit cryptic
without knowledge of the ARM ARM. There is also no indication of the
specific address which triggered the fault. The increase in kernel
page permissions makes hitting this case more likely as well.
Handling the case in the vectors gives a much more familiar looking
error message:

lkdtm: Performing direct entry EXEC_RODATA
lkdtm: attempting ok execution at ffff0000084c0840
lkdtm: attempting bad execution at ffff000008880680
Unable to handle kernel paging request at virtual address ffff000008880680
pgd = ffff8000089b2000
[ffff000008880680] *pgd=00000000489b4003, *pud=0000000048904003, *pmd=0000000000000000
Internal error: Oops: 8400000e [#1] PREEMPT SMP
Modules linked in:
CPU: 1 PID: 997 Comm: sh Not tainted 4.7.0-rc1+ #24
Hardware name: linux,dummy-virt (DT)
task: ffff800077f9f080 ti: ffff800008a1c000 task.ti: ffff800008a1c000
PC is at lkdtm_rodata_do_nothing+0x0/0x8
LR is at execute_location+0x74/0x88

Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/entry.S |  7 +++++++
 arch/arm64/mm/fault.c     | 14 ++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 96e4a2b64cc1..441420ca7d08 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -353,6 +353,8 @@ el1_sync:
 	lsr	x24, x1, #ESR_ELx_EC_SHIFT	// exception class
 	cmp	x24, #ESR_ELx_EC_DABT_CUR	// data abort in EL1
 	b.eq	el1_da
+	cmp	x24, #ESR_ELx_EC_IABT_CUR	// instruction abort in EL1
+	b.eq	el1_ia
 	cmp	x24, #ESR_ELx_EC_SYS64		// configurable trap
 	b.eq	el1_undef
 	cmp	x24, #ESR_ELx_EC_SP_ALIGN	// stack alignment exception
@@ -364,6 +366,11 @@ el1_sync:
 	cmp	x24, #ESR_ELx_EC_BREAKPT_CUR	// debug exception in EL1
 	b.ge	el1_dbg
 	b	el1_inv
+
+el1_ia:
+	/*
+	 * Fall through to the Data abort case
+	 */
 el1_da:
 	/*
 	 * Data abort handling
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c8beaa0da7df..05d2bd776c69 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -153,6 +153,11 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
 }
 #endif
 
+static bool is_el1_instruction_abort(unsigned int esr)
+{
+	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
+}
+
 /*
  * The kernel tried to access some page that wasn't present.
  */
@@ -161,8 +166,9 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
 {
 	/*
 	 * Are we prepared to handle this kernel fault?
+	 * We are almost certainly not prepared to handle instruction faults.
 	 */
-	if (fixup_exception(regs))
+	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
 		return;
 
 	/*
@@ -267,7 +273,8 @@ static inline bool is_permission_fault(unsigned int esr)
 	unsigned int ec       = ESR_ELx_EC(esr);
 	unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
 
-	return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM);
+	return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) ||
+	       (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM);
 }
 
 static bool is_el0_instruction_abort(unsigned int esr)
@@ -312,6 +319,9 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 		if (regs->orig_addr_limit == KERNEL_DS)
 			die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
 
+		if (is_el1_instruction_abort(esr))
+			die("Attempting to execute userspace memory", regs, esr);
+
 		if (!search_exception_tables(regs->pc))
 			die("Accessing user space memory outside uaccess.h routines", regs, esr);
 	}
-- 
cgit v1.2.3


From 0194e760f7d2f42adb5e1db31b27a4331dd89c2f Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 11 Aug 2016 14:11:05 +0100
Subject: arm64: hibernate: avoid potential TLB conflict

In create_safe_exec_page we install a set of global mappings in TTBR0,
then subsequently invalidate TLBs. While TTBR0 points at the zero page,
and the TLBs should be free of stale global entries, we may have stale
ASID-tagged entries (e.g. from the EFI runtime services mappings) for
the same VAs. Per the ARM ARM these ASID-tagged entries may conflict
with newly-allocated global entries, and we must follow a
Break-Before-Make approach to avoid issues resulting from this.

This patch reworks create_safe_exec_page to invalidate TLBs while the
zero page is still in place, ensuring that there are no potential
conflicts when the new TTBR0 value is installed. As a single CPU is
online while this code executes, we do not need to perform broadcast TLB
maintenance, and can call local_flush_tlb_all(), which also subsumes
some barriers. The remaining assembly is converted to use write_sysreg()
and isb().

Other than this, we safely manipulate TTBRs in the hibernate dance. The
code we install as part of the new TTBR0 mapping (the hibernated
kernel's swsusp_arch_suspend_exit) installs a zero page into TTBR1,
invalidates TLBs, then installs its preferred value. Upon being restored
to the middle of swsusp_arch_suspend, the new image will call
__cpu_suspend_exit, which will call cpu_uninstall_idmap, installing the
zero page in TTBR0 and invalidating all TLB entries.

Fixes: 82869ac57b5d ("arm64: kernel: Add support for hibernate/suspend-to-disk")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: James Morse <james.morse@arm.com>
Tested-by: James Morse <james.morse@arm.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: <stable@vger.kernel.org> # 4.7+
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/hibernate.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 21ab5df9fa76..b2e7de8726e8 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -35,6 +35,7 @@
 #include <asm/sections.h>
 #include <asm/smp.h>
 #include <asm/suspend.h>
+#include <asm/sysreg.h>
 #include <asm/virt.h>
 
 /*
@@ -217,12 +218,22 @@ static int create_safe_exec_page(void *src_start, size_t length,
 	set_pte(pte, __pte(virt_to_phys((void *)dst) |
 			 pgprot_val(PAGE_KERNEL_EXEC)));
 
-	/* Load our new page tables */
-	asm volatile("msr	ttbr0_el1, %0;"
-		     "isb;"
-		     "tlbi	vmalle1is;"
-		     "dsb	ish;"
-		     "isb" : : "r"(virt_to_phys(pgd)));
+	/*
+	 * Load our new page tables. A strict BBM approach requires that we
+	 * ensure that TLBs are free of any entries that may overlap with the
+	 * global mappings we are about to install.
+	 *
+	 * For a real hibernate/resume cycle TTBR0 currently points to a zero
+	 * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI
+	 * runtime services), while for a userspace-driven test_resume cycle it
+	 * points to userspace page tables (and we must point it at a zero page
+	 * ourselves). Elsewhere we only (un)install the idmap with preemption
+	 * disabled, so T0SZ should be as required regardless.
+	 */
+	cpu_set_reserved_ttbr0();
+	local_flush_tlb_all();
+	write_sysreg(virt_to_phys(pgd), ttbr0_el1);
+	isb();
 
 	*phys_dst_addr = virt_to_phys((void *)dst);
 
-- 
cgit v1.2.3


From dfbca61af0b654990b9af8297ac574a9986d8275 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 11 Aug 2016 14:11:06 +0100
Subject: arm64: hibernate: handle allocation failures

In create_safe_exec_page(), we create a copy of the hibernate exit text,
along with some page tables to map this via TTBR0. We then install the
new tables in TTBR0.

In swsusp_arch_resume() we call create_safe_exec_page() before trying a
number of operations which may fail (e.g. copying the linear map page
tables). If these fail, we bail out of swsusp_arch_resume() and return
an error code, but leave TTBR0 as-is. Subsequently, the core hibernate
code will call free_basic_memory_bitmaps(), which will free all of the
memory allocations we made, including the page tables installed in
TTBR0.

Thus, we may have TTBR0 pointing at dangling freed memory for some
period of time. If the hibernate attempt was triggered by a user
requesting a hibernate test via the reboot syscall, we may return to
userspace with the clobbered TTBR0 value.

Avoid these issues by reorganising swsusp_arch_resume() such that we
have no failure paths after create_safe_exec_page(). We also add a check
that the zero page allocation succeeded, matching what we have for other
allocations.

Fixes: 82869ac57b5d ("arm64: kernel: Add support for hibernate/suspend-to-disk")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: James Morse <james.morse@arm.com>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: <stable@vger.kernel.org> # 4.7+
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/hibernate.c | 59 +++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index b2e7de8726e8..65d81f965e74 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -404,6 +404,38 @@ int swsusp_arch_resume(void)
 	void __noreturn (*hibernate_exit)(phys_addr_t, phys_addr_t, void *,
 					  void *, phys_addr_t, phys_addr_t);
 
+	/*
+	 * Restoring the memory image will overwrite the ttbr1 page tables.
+	 * Create a second copy of just the linear map, and use this when
+	 * restoring.
+	 */
+	tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
+	if (!tmp_pg_dir) {
+		pr_err("Failed to allocate memory for temporary page tables.");
+		rc = -ENOMEM;
+		goto out;
+	}
+	rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0);
+	if (rc)
+		goto out;
+
+	/*
+	 * Since we only copied the linear map, we need to find restore_pblist's
+	 * linear map address.
+	 */
+	lm_restore_pblist = LMADDR(restore_pblist);
+
+	/*
+	 * We need a zero page that is zero before & after resume in order to
+	 * to break before make on the ttbr1 page tables.
+	 */
+	zero_page = (void *)get_safe_page(GFP_ATOMIC);
+	if (!zero_page) {
+		pr_err("Failed to allocate zero page.");
+		rc = -ENOMEM;
+		goto out;
+	}
+
 	/*
 	 * Locate the exit code in the bottom-but-one page, so that *NULL
 	 * still has disastrous affects.
@@ -429,27 +461,6 @@ int swsusp_arch_resume(void)
 	 */
 	__flush_dcache_area(hibernate_exit, exit_size);
 
-	/*
-	 * Restoring the memory image will overwrite the ttbr1 page tables.
-	 * Create a second copy of just the linear map, and use this when
-	 * restoring.
-	 */
-	tmp_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
-	if (!tmp_pg_dir) {
-		pr_err("Failed to allocate memory for temporary page tables.");
-		rc = -ENOMEM;
-		goto out;
-	}
-	rc = copy_page_tables(tmp_pg_dir, PAGE_OFFSET, 0);
-	if (rc)
-		goto out;
-
-	/*
-	 * Since we only copied the linear map, we need to find restore_pblist's
-	 * linear map address.
-	 */
-	lm_restore_pblist = LMADDR(restore_pblist);
-
 	/*
 	 * KASLR will cause the el2 vectors to be in a different location in
 	 * the resumed kernel. Load hibernate's temporary copy into el2.
@@ -464,12 +475,6 @@ int swsusp_arch_resume(void)
 		__hyp_set_vectors(el2_vectors);
 	}
 
-	/*
-	 * We need a zero page that is zero before & after resume in order to
-	 * to break before make on the ttbr1 page tables.
-	 */
-	zero_page = (void *)get_safe_page(GFP_ATOMIC);
-
 	hibernate_exit(virt_to_phys(tmp_pg_dir), resume_hdr.ttbr1_el1,
 		       resume_hdr.reenter_kernel, lm_restore_pblist,
 		       resume_hdr.__hyp_stub_vectors, virt_to_phys(zero_page));
-- 
cgit v1.2.3


From 2323439fd0c844b0e84d16968699c419809d480e Mon Sep 17 00:00:00 2001
From: Riku Voipio <riku.voipio@linaro.org>
Date: Wed, 15 Jun 2016 11:27:59 +0300
Subject: arm64: defconfig: add options for virtualization and containers

Enable options commonly needed by popular virtualization
and container applications. Use modules when possible to
avoid too much overhead for users not interested.

- add namespace and cgroup options needed
- add seccomp - optional, but enhances Qemu etc
- bridge, nat, veth, macvtap and multicast for routing
  guests and containers
- btfrs and overlayfs modules for container COW backends
- while near it, make fuse a module instead of built-in.

Generated with make saveconfig and dropping unrelated spurious
change hunks while commiting. bloat-o-meter old-vmlinux vmlinux:

add/remove: 905/390 grow/shrink: 767/229 up/down: 183513/-94861 (88652)
....
Total: Before=10515408, After=10604060, chg +0.84%

Signed-off-by: Riku Voipio <riku.voipio@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/configs/defconfig | 52 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 0555b7caaf2c..2016444601f8 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -15,10 +15,14 @@ CONFIG_IKCONFIG_PROC=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_MEMCG=y
 CONFIG_MEMCG_SWAP=y
+CONFIG_BLK_CGROUP=y
+CONFIG_CGROUP_PIDS=y
 CONFIG_CGROUP_HUGETLB=y
-# CONFIG_UTS_NS is not set
-# CONFIG_IPC_NS is not set
-# CONFIG_NET_NS is not set
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_PERF=y
+CONFIG_USER_NS=y
 CONFIG_SCHED_AUTOGROUP=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_KALLSYMS_ALL=y
@@ -71,6 +75,7 @@ CONFIG_PREEMPT=y
 CONFIG_KSM=y
 CONFIG_TRANSPARENT_HUGEPAGE=y
 CONFIG_CMA=y
+CONFIG_SECCOMP=y
 CONFIG_XEN=y
 CONFIG_KEXEC=y
 # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
@@ -84,10 +89,37 @@ CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_DHCP=y
 CONFIG_IP_PNP_BOOTP=y
-# CONFIG_IPV6 is not set
+CONFIG_IPV6=m
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m
+CONFIG_NETFILTER_XT_TARGET_LOG=m
+CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NF_CONNTRACK_IPV4=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_NF_CONNTRACK_IPV6=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_NAT=m
+CONFIG_IP6_NF_TARGET_MASQUERADE=m
+CONFIG_BRIDGE=m
+CONFIG_BRIDGE_VLAN_FILTERING=y
+CONFIG_VLAN_8021Q=m
+CONFIG_VLAN_8021Q_GVRP=y
+CONFIG_VLAN_8021Q_MVRP=y
 CONFIG_BPF_JIT=y
 CONFIG_CFG80211=m
 CONFIG_MAC80211=m
@@ -103,6 +135,7 @@ CONFIG_MTD=y
 CONFIG_MTD_M25P80=y
 CONFIG_MTD_SPI_NOR=y
 CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_NBD=m
 CONFIG_VIRTIO_BLK=y
 CONFIG_SRAM=y
 # CONFIG_SCSI_PROC_FS is not set
@@ -120,7 +153,10 @@ CONFIG_SATA_SIL24=y
 CONFIG_PATA_PLATFORM=y
 CONFIG_PATA_OF_PLATFORM=y
 CONFIG_NETDEVICES=y
+CONFIG_MACVLAN=m
+CONFIG_MACVTAP=m
 CONFIG_TUN=y
+CONFIG_VETH=m
 CONFIG_VIRTIO_NET=y
 CONFIG_AMD_XGBE=y
 CONFIG_NET_XGENE=y
@@ -350,12 +386,16 @@ CONFIG_EXYNOS_ADC=y
 CONFIG_PWM_SAMSUNG=y
 CONFIG_EXT2_FS=y
 CONFIG_EXT3_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_POSIX_ACL=y
 CONFIG_FANOTIFY=y
 CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y
 CONFIG_QUOTA=y
 CONFIG_AUTOFS4_FS=y
-CONFIG_FUSE_FS=y
-CONFIG_CUSE=y
+CONFIG_FUSE_FS=m
+CONFIG_CUSE=m
+CONFIG_OVERLAY_FS=m
 CONFIG_VFAT_FS=y
 CONFIG_TMPFS=y
 CONFIG_HUGETLBFS=y
-- 
cgit v1.2.3


From 53fb45d3df6fb64eff6c314b3fa2e279a2496e5b Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Wed, 8 Jun 2016 21:26:23 +0900
Subject: arm64: defconfig: enable CONFIG_LOCALVERSION_AUTO

When CONFIG_LOCALVERSION_AUTO is disabled, the version string is
just a tag name (or with a '+' appended if HEAD is not a tagged
commit).

During the development (and especially when git-bisecting), longer
version string would be helpful to identify the commit we are running.

This is a default y option, so drop the unset to enable it.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/configs/defconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 2016444601f8..eadf4855ad2d 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1,4 +1,3 @@
-# CONFIG_LOCALVERSION_AUTO is not set
 CONFIG_SYSVIPC=y
 CONFIG_POSIX_MQUEUE=y
 CONFIG_AUDIT=y
-- 
cgit v1.2.3


From dd257933fa4b9fea66a1195f8a15111029810abc Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 11 Aug 2016 10:37:39 -0400
Subject: nfsd: don't return an unhashed lock stateid after taking mutex

nfsd4_lock will take the st_mutex before working with the stateid it
gets, but between the time when we drop the cl_lock and take the mutex,
the stateid could become unhashed (a'la FREE_STATEID). If that happens
the lock stateid returned to the client will be forgotten.

Fix this by first moving the st_mutex acquisition into
lookup_or_create_lock_state. Then, have it check to see if the lock
stateid is still hashed after taking the mutex. If it's not, then put
the stateid and try the find/create again.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Tested-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Cc: stable@vger.kernel.org # feb9dad5 nfsd: Always lock state exclusively.
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0edc1822f144..a204d7e109d4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5523,7 +5523,7 @@ static __be32
 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
 			    struct nfs4_ol_stateid *ost,
 			    struct nfsd4_lock *lock,
-			    struct nfs4_ol_stateid **lst, bool *new)
+			    struct nfs4_ol_stateid **plst, bool *new)
 {
 	__be32 status;
 	struct nfs4_file *fi = ost->st_stid.sc_file;
@@ -5531,7 +5531,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
 	struct nfs4_client *cl = oo->oo_owner.so_client;
 	struct inode *inode = d_inode(cstate->current_fh.fh_dentry);
 	struct nfs4_lockowner *lo;
+	struct nfs4_ol_stateid *lst;
 	unsigned int strhashval;
+	bool hashed;
 
 	lo = find_lockowner_str(cl, &lock->lk_new_owner);
 	if (!lo) {
@@ -5547,12 +5549,27 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
 			goto out;
 	}
 
-	*lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
-	if (*lst == NULL) {
+retry:
+	lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
+	if (lst == NULL) {
 		status = nfserr_jukebox;
 		goto out;
 	}
+
+	mutex_lock(&lst->st_mutex);
+
+	/* See if it's still hashed to avoid race with FREE_STATEID */
+	spin_lock(&cl->cl_lock);
+	hashed = !list_empty(&lst->st_perfile);
+	spin_unlock(&cl->cl_lock);
+
+	if (!hashed) {
+		mutex_unlock(&lst->st_mutex);
+		nfs4_put_stid(&lst->st_stid);
+		goto retry;
+	}
 	status = nfs_ok;
+	*plst = lst;
 out:
 	nfs4_put_stateowner(&lo->lo_owner);
 	return status;
@@ -5619,8 +5636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 		status = lookup_or_create_lock_state(cstate, open_stp, lock,
 							&lock_stp, &new);
-		if (status == nfs_ok)
-			mutex_lock(&lock_stp->st_mutex);
 	} else {
 		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid,
-- 
cgit v1.2.3


From 62822e2ec4ad091ba31f823f577ef80db52e3c2c Mon Sep 17 00:00:00 2001
From: Thomas Garnier <thgarnie@google.com>
Date: Thu, 11 Aug 2016 14:49:29 -0700
Subject: PM / hibernate: Restore processor state before using per-CPU
 variables

Restore the processor state before calling any other functions to
ensure per-CPU variables can be used with KASLR memory randomization.

Tracing functions use per-CPU variables (GS based on x86) and one was
called just before restoring the processor state fully. It resulted
in a double fault when both the tracing & the exception handler
functions tried to use a per-CPU variable.

Fixes: bb3632c6101b (PM / sleep: trace events for suspend/resume)
Reported-and-tested-by: Borislav Petkov <bp@suse.de>
Reported-by: Jiri Kosina <jikos@kernel.org>
Tested-by: Rafael J. Wysocki <rafael@kernel.org>
Tested-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Thomas Garnier <thgarnie@google.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/hibernate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 0ee1df0a0bd6..61761aa7cc19 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -300,12 +300,12 @@ static int create_image(int platform_mode)
 	save_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
 	error = swsusp_arch_suspend();
+	/* Restore control flow magically appears here */
+	restore_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
 	if (error)
 		printk(KERN_ERR "PM: Error %d creating hibernation image\n",
 			error);
-	/* Restore control flow magically appears here */
-	restore_processor_state();
 	if (!in_suspend)
 		events_check_enabled = false;
 
-- 
cgit v1.2.3


From 783011b13095430262333fd64e17666954064664 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Mon, 21 Mar 2016 04:20:53 -0700
Subject: unicore32: mm: Add missing parameter to arch_vma_access_permitted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

unicore32 fails to compile with the following errors.

mm/memory.c: In function ‘__handle_mm_fault’:
mm/memory.c:3381: error:
	too many arguments to function ‘arch_vma_access_permitted’
mm/gup.c: In function ‘check_vma_flags’:
mm/gup.c:456: error:
	too many arguments to function ‘arch_vma_access_permitted’
mm/gup.c: In function ‘vma_permits_fault’:
mm/gup.c:640: error:
	too many arguments to function ‘arch_vma_access_permitted’

Fixes: d61172b4b695b ("mm/core, x86/mm/pkeys: Differentiate instruction fetches")
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Acked-by: Guan Xuetao <gxt@mprc.pku.edu.cn>
---
 arch/unicore32/include/asm/mmu_context.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index e35632ef23c7..62dfc644c908 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -98,7 +98,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 }
 
 static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
-		bool write, bool foreign)
+		bool write, bool execute, bool foreign)
 {
 	/* by default, allow everything */
 	return true;
-- 
cgit v1.2.3


From 2b05980d895e51fc46d3d27c8d1bcc696dddca99 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Wed, 8 Jun 2016 20:11:58 -0700
Subject: h8300: Add missing include file to asm/io.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

h8300 builds fail with

arch/h8300/include/asm/io.h:9:15: error: unknown type name ‘u8’
arch/h8300/include/asm/io.h:15:15: error: unknown type name ‘u16’
arch/h8300/include/asm/io.h:21:15: error: unknown type name ‘u32’

and many related errors.

Fixes: 23c82d41bdf4 ("kexec-allow-architectures-to-override-boot-mapping-fix")
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 arch/h8300/include/asm/io.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/h8300/include/asm/io.h b/arch/h8300/include/asm/io.h
index 2e221c5f0203..f86918aed9e1 100644
--- a/arch/h8300/include/asm/io.h
+++ b/arch/h8300/include/asm/io.h
@@ -3,6 +3,8 @@
 
 #ifdef __KERNEL__
 
+#include <linux/types.h>
+
 /* H8/300 internal I/O functions */
 
 #define __raw_readb __raw_readb
-- 
cgit v1.2.3


From 694d0d0bb2030d2e36df73e2d23d5770511dbc8d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 14 Aug 2016 19:11:36 -0700
Subject: Linux 4.8-rc2

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 8c504f324154..5c18baad7218 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 4
 PATCHLEVEL = 8
 SUBLEVEL = 0
-EXTRAVERSION = -rc1
+EXTRAVERSION = -rc2
 NAME = Psychotic Stoned Sheep
 
 # *DOCUMENTATION*
-- 
cgit v1.2.3


From c7ef587bd4abe75705197fed7c1aac899624ca81 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Thu, 11 Aug 2016 15:16:43 +0200
Subject: mfd: pm8921: Add support for pm8018

In order to support the Qualcomm MDM9615 PMIC, add support for the
pm8018 in pm8921 MFD driver.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/pm8921-core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/mfd/pm8921-core.c b/drivers/mfd/pm8921-core.c
index 1b7ec0870c2a..0e3a2ea25942 100644
--- a/drivers/mfd/pm8921-core.c
+++ b/drivers/mfd/pm8921-core.c
@@ -309,6 +309,7 @@ static const struct regmap_config ssbi_regmap_config = {
 };
 
 static const struct of_device_id pm8921_id_table[] = {
+	{ .compatible = "qcom,pm8018", },
 	{ .compatible = "qcom,pm8058", },
 	{ .compatible = "qcom,pm8921", },
 	{ }
-- 
cgit v1.2.3


From 08655bca2712880c9f30c8f3f8fb50e010920e74 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Thu, 11 Aug 2016 15:16:44 +0200
Subject: rtc: rtc-pm8xxx: Add support for pm8018 rtc

In order to support RTC on Qualcomm MDM9615 SoC, add support for
the pm8018 rtc in rtc-pm8xxx driver.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Acked-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 Documentation/devicetree/bindings/mfd/qcom-pm8xxx.txt | 1 +
 drivers/rtc/rtc-pm8xxx.c                              | 1 +
 2 files changed, 2 insertions(+)

diff --git a/Documentation/devicetree/bindings/mfd/qcom-pm8xxx.txt b/Documentation/devicetree/bindings/mfd/qcom-pm8xxx.txt
index f24f33409164..37a088f9a648 100644
--- a/Documentation/devicetree/bindings/mfd/qcom-pm8xxx.txt
+++ b/Documentation/devicetree/bindings/mfd/qcom-pm8xxx.txt
@@ -62,6 +62,7 @@ The below bindings specify the set of valid subnodes.
 		    "qcom,pm8058-rtc"
 		    "qcom,pm8921-rtc"
 		    "qcom,pm8941-rtc"
+		    "qcom,pm8018-rtc"
 
 - reg:
 	Usage: required
diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c
index 795fcbd02ea3..fac835530671 100644
--- a/drivers/rtc/rtc-pm8xxx.c
+++ b/drivers/rtc/rtc-pm8xxx.c
@@ -428,6 +428,7 @@ static const struct pm8xxx_rtc_regs pm8941_regs = {
  */
 static const struct of_device_id pm8xxx_id_table[] = {
 	{ .compatible = "qcom,pm8921-rtc", .data = &pm8921_regs },
+	{ .compatible = "qcom,pm8018-rtc", .data = &pm8921_regs },
 	{ .compatible = "qcom,pm8058-rtc", .data = &pm8058_regs },
 	{ .compatible = "qcom,pm8941-rtc", .data = &pm8941_regs },
 	{ },
-- 
cgit v1.2.3


From eb87a669dd4a79fdbca14c402fdbcdb8695739ff Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Thu, 11 Aug 2016 15:16:45 +0200
Subject: mfd: qcom-rpm: Add support for pm8018 RPM Regulator

In order to support the Qualcomm MDM9615 SoC, add support for the
RPM regulator entries in the qcom-rpm driver.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Acked-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 Documentation/devicetree/bindings/mfd/qcom-rpm.txt |  1 +
 drivers/mfd/qcom_rpm.c                             | 51 ++++++++++++++++++++++
 include/dt-bindings/mfd/qcom-rpm.h                 | 22 ++++++++++
 3 files changed, 74 insertions(+)

diff --git a/Documentation/devicetree/bindings/mfd/qcom-rpm.txt b/Documentation/devicetree/bindings/mfd/qcom-rpm.txt
index b98b291a31ba..72d68d497efd 100644
--- a/Documentation/devicetree/bindings/mfd/qcom-rpm.txt
+++ b/Documentation/devicetree/bindings/mfd/qcom-rpm.txt
@@ -13,6 +13,7 @@ frequencies.
 		    "qcom,rpm-msm8660"
 		    "qcom,rpm-msm8960"
 		    "qcom,rpm-ipq8064"
+		    "qcom,rpm-mdm9615"
 
 - reg:
 	Usage: required
diff --git a/drivers/mfd/qcom_rpm.c b/drivers/mfd/qcom_rpm.c
index 2e44323455dd..a74210df5969 100644
--- a/drivers/mfd/qcom_rpm.c
+++ b/drivers/mfd/qcom_rpm.c
@@ -388,11 +388,62 @@ static const struct qcom_rpm_data ipq806x_template = {
 	.ack_sel_size = 7,
 };
 
+static const struct qcom_rpm_resource mdm9615_rpm_resource_table[] = {
+	[QCOM_RPM_CXO_CLK] =			{ 25, 9, 5, 1 },
+	[QCOM_RPM_SYS_FABRIC_CLK] =		{ 26, 10, 9, 1 },
+	[QCOM_RPM_DAYTONA_FABRIC_CLK] =		{ 27, 11, 11, 1 },
+	[QCOM_RPM_SFPB_CLK] =			{ 28, 12, 12, 1 },
+	[QCOM_RPM_CFPB_CLK] =			{ 29, 13, 13, 1 },
+	[QCOM_RPM_EBI1_CLK] =			{ 30, 14, 16, 1 },
+	[QCOM_RPM_APPS_FABRIC_HALT] =		{ 31, 15, 22, 2 },
+	[QCOM_RPM_APPS_FABRIC_MODE] =		{ 33, 16, 23, 3 },
+	[QCOM_RPM_APPS_FABRIC_IOCTL] =		{ 36, 17, 24, 1 },
+	[QCOM_RPM_APPS_FABRIC_ARB] =		{ 37, 18, 25, 27 },
+	[QCOM_RPM_PM8018_SMPS1] =		{ 64, 19, 30, 2 },
+	[QCOM_RPM_PM8018_SMPS2] =		{ 66, 21, 31, 2 },
+	[QCOM_RPM_PM8018_SMPS3] =		{ 68, 23, 32, 2 },
+	[QCOM_RPM_PM8018_SMPS4] =		{ 70, 25, 33, 2 },
+	[QCOM_RPM_PM8018_SMPS5] =		{ 72, 27, 34, 2 },
+	[QCOM_RPM_PM8018_LDO1] =		{ 74, 29, 35, 2 },
+	[QCOM_RPM_PM8018_LDO2] =		{ 76, 31, 36, 2 },
+	[QCOM_RPM_PM8018_LDO3] =		{ 78, 33, 37, 2 },
+	[QCOM_RPM_PM8018_LDO4] =		{ 80, 35, 38, 2 },
+	[QCOM_RPM_PM8018_LDO5] =		{ 82, 37, 39, 2 },
+	[QCOM_RPM_PM8018_LDO6] =		{ 84, 39, 40, 2 },
+	[QCOM_RPM_PM8018_LDO7] =		{ 86, 41, 41, 2 },
+	[QCOM_RPM_PM8018_LDO8] =		{ 88, 43, 42, 2 },
+	[QCOM_RPM_PM8018_LDO9] =		{ 90, 45, 43, 2 },
+	[QCOM_RPM_PM8018_LDO10] =		{ 92, 47, 44, 2 },
+	[QCOM_RPM_PM8018_LDO11] =		{ 94, 49, 45, 2 },
+	[QCOM_RPM_PM8018_LDO12] =		{ 96, 51, 46, 2 },
+	[QCOM_RPM_PM8018_LDO13] =		{ 98, 53, 47, 2 },
+	[QCOM_RPM_PM8018_LDO14] =		{ 100, 55, 48, 2 },
+	[QCOM_RPM_PM8018_LVS1] =		{ 102, 57, 49, 1 },
+	[QCOM_RPM_PM8018_NCP] =			{ 103, 58, 80, 2 },
+	[QCOM_RPM_CXO_BUFFERS] =		{ 105, 60, 81, 1 },
+	[QCOM_RPM_USB_OTG_SWITCH] =		{ 106, 61, 82, 1 },
+	[QCOM_RPM_HDMI_SWITCH] =		{ 107, 62, 83, 1 },
+	[QCOM_RPM_VOLTAGE_CORNER] =		{ 109, 64, 87, 1 },
+};
+
+static const struct qcom_rpm_data mdm9615_template = {
+	.version = 3,
+	.resource_table = mdm9615_rpm_resource_table,
+	.n_resources = ARRAY_SIZE(mdm9615_rpm_resource_table),
+	.req_ctx_off = 3,
+	.req_sel_off = 11,
+	.ack_ctx_off = 15,
+	.ack_sel_off = 23,
+	.req_sel_size = 4,
+	.ack_sel_size = 7,
+};
+
 static const struct of_device_id qcom_rpm_of_match[] = {
 	{ .compatible = "qcom,rpm-apq8064", .data = &apq8064_template },
 	{ .compatible = "qcom,rpm-msm8660", .data = &msm8660_template },
 	{ .compatible = "qcom,rpm-msm8960", .data = &msm8960_template },
 	{ .compatible = "qcom,rpm-ipq8064", .data = &ipq806x_template },
+	{ .compatible = "qcom,rpm-mdm9615", .data = &mdm9615_template },
 	{ }
 };
 MODULE_DEVICE_TABLE(of, qcom_rpm_of_match);
diff --git a/include/dt-bindings/mfd/qcom-rpm.h b/include/dt-bindings/mfd/qcom-rpm.h
index 13a9d4bf2662..54aef5e21763 100644
--- a/include/dt-bindings/mfd/qcom-rpm.h
+++ b/include/dt-bindings/mfd/qcom-rpm.h
@@ -147,6 +147,28 @@
 #define QCOM_RPM_SMB208_S1b			137
 #define QCOM_RPM_SMB208_S2a			138
 #define QCOM_RPM_SMB208_S2b			139
+#define QCOM_RPM_PM8018_SMPS1			140
+#define QCOM_RPM_PM8018_SMPS2			141
+#define QCOM_RPM_PM8018_SMPS3			142
+#define QCOM_RPM_PM8018_SMPS4			143
+#define QCOM_RPM_PM8018_SMPS5			144
+#define QCOM_RPM_PM8018_LDO1			145
+#define QCOM_RPM_PM8018_LDO2			146
+#define QCOM_RPM_PM8018_LDO3			147
+#define QCOM_RPM_PM8018_LDO4			148
+#define QCOM_RPM_PM8018_LDO5			149
+#define QCOM_RPM_PM8018_LDO6			150
+#define QCOM_RPM_PM8018_LDO7			151
+#define QCOM_RPM_PM8018_LDO8			152
+#define QCOM_RPM_PM8018_LDO9			153
+#define QCOM_RPM_PM8018_LDO10			154
+#define QCOM_RPM_PM8018_LDO11			155
+#define QCOM_RPM_PM8018_LDO12			156
+#define QCOM_RPM_PM8018_LDO13			157
+#define QCOM_RPM_PM8018_LDO14			158
+#define QCOM_RPM_PM8018_LVS1			159
+#define QCOM_RPM_PM8018_NCP			160
+#define QCOM_RPM_VOLTAGE_CORNER			161
 
 /*
  * Constants used to select force mode for regulators.
-- 
cgit v1.2.3


From ddc085d44dd001e65eecd0a8bf05064bff063668 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <narmstrong@baylibre.com>
Date: Thu, 11 Aug 2016 15:16:46 +0200
Subject: regulator: qcom_rpm-regulator: Add support for pm8018 rpm regulator

In order to support the Qualcomm MDM9615 SoC, add support for the
PM8018 RPM regulator in the qcom_rpm-regulator driver.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 Documentation/devicetree/bindings/mfd/qcom-rpm.txt | 14 +++++
 drivers/regulator/qcom_rpm-regulator.c             | 66 ++++++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/Documentation/devicetree/bindings/mfd/qcom-rpm.txt b/Documentation/devicetree/bindings/mfd/qcom-rpm.txt
index 72d68d497efd..485bc59fcc48 100644
--- a/Documentation/devicetree/bindings/mfd/qcom-rpm.txt
+++ b/Documentation/devicetree/bindings/mfd/qcom-rpm.txt
@@ -60,6 +60,7 @@ Regulator nodes are identified by their compatible:
 		    "qcom,rpm-pm8058-regulators"
 		    "qcom,rpm-pm8901-regulators"
 		    "qcom,rpm-pm8921-regulators"
+		    "qcom,rpm-pm8018-regulators"
 
 - vdd_l0_l1_lvs-supply:
 - vdd_l2_l11_l12-supply:
@@ -138,6 +139,15 @@ Regulator nodes are identified by their compatible:
 	Definition: reference to regulator supplying the input pin, as
 		    described in the data sheet
 
+- vin_lvs1-supply:
+- vdd_l7-supply:
+- vdd_l8-supply:
+- vdd_l9_l10_l11_l12-supply:
+	Usage: optional (pm8018 only)
+	Value type: <phandle>
+	Definition: reference to regulator supplying the input pin, as
+		    described in the data sheet
+
 The regulator node houses sub-nodes for each regulator within the device. Each
 sub-node is identified using the node's name, with valid values listed for each
 of the pmics below.
@@ -157,6 +167,10 @@ pm8921:
 	l29, lvs1, lvs2, lvs3, lvs4, lvs5, lvs6, lvs7, usb-switch, hdmi-switch,
 	ncp
 
+pm8018:
+	s1, s2, s3, s4, s5, , l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11,
+	l12, l14, lvs1
+
 The content of each sub-node is defined by the standard binding for regulators -
 see regulator.txt - with additional custom properties described below:
 
diff --git a/drivers/regulator/qcom_rpm-regulator.c b/drivers/regulator/qcom_rpm-regulator.c
index e254272585b2..1b2acc43fea1 100644
--- a/drivers/regulator/qcom_rpm-regulator.c
+++ b/drivers/regulator/qcom_rpm-regulator.c
@@ -447,6 +447,44 @@ static struct regulator_ops switch_ops = {
 	.is_enabled = rpm_reg_is_enabled,
 };
 
+/*
+ * PM8018 regulators
+ */
+static const struct qcom_rpm_reg pm8018_pldo = {
+	.desc.linear_ranges = pldo_ranges,
+	.desc.n_linear_ranges = ARRAY_SIZE(pldo_ranges),
+	.desc.n_voltages = 161,
+	.desc.ops = &uV_ops,
+	.parts = &rpm8960_ldo_parts,
+	.supports_force_mode_auto = false,
+	.supports_force_mode_bypass = false,
+};
+
+static const struct qcom_rpm_reg pm8018_nldo = {
+	.desc.linear_ranges = nldo_ranges,
+	.desc.n_linear_ranges = ARRAY_SIZE(nldo_ranges),
+	.desc.n_voltages = 64,
+	.desc.ops = &uV_ops,
+	.parts = &rpm8960_ldo_parts,
+	.supports_force_mode_auto = false,
+	.supports_force_mode_bypass = false,
+};
+
+static const struct qcom_rpm_reg pm8018_smps = {
+	.desc.linear_ranges = smps_ranges,
+	.desc.n_linear_ranges = ARRAY_SIZE(smps_ranges),
+	.desc.n_voltages = 154,
+	.desc.ops = &uV_ops,
+	.parts = &rpm8960_smps_parts,
+	.supports_force_mode_auto = false,
+	.supports_force_mode_bypass = false,
+};
+
+static const struct qcom_rpm_reg pm8018_switch = {
+	.desc.ops = &switch_ops,
+	.parts = &rpm8960_switch_parts,
+};
+
 /*
  * PM8058 regulators
  */
@@ -755,6 +793,32 @@ struct rpm_regulator_data {
 	const char *supply;
 };
 
+static const struct rpm_regulator_data rpm_pm8018_regulators[] = {
+	{ "s1",  QCOM_RPM_PM8018_SMPS1, &pm8018_smps, "vdd_s1" },
+	{ "s2",  QCOM_RPM_PM8018_SMPS2, &pm8018_smps, "vdd_s2" },
+	{ "s3",  QCOM_RPM_PM8018_SMPS3, &pm8018_smps, "vdd_s3" },
+	{ "s4",  QCOM_RPM_PM8018_SMPS4, &pm8018_smps, "vdd_s4" },
+	{ "s5",  QCOM_RPM_PM8018_SMPS5, &pm8018_smps, "vdd_s5" },
+
+	{ "l2",  QCOM_RPM_PM8018_LDO2,  &pm8018_pldo, "vdd_l2" },
+	{ "l3",  QCOM_RPM_PM8018_LDO3,  &pm8018_pldo, "vdd_l3" },
+	{ "l4",  QCOM_RPM_PM8018_LDO4,  &pm8018_pldo, "vdd_l4" },
+	{ "l5",  QCOM_RPM_PM8018_LDO5,  &pm8018_pldo, "vdd_l5" },
+	{ "l6",  QCOM_RPM_PM8018_LDO6,  &pm8018_pldo, "vdd_l7" },
+	{ "l7",  QCOM_RPM_PM8018_LDO7,  &pm8018_pldo, "vdd_l7" },
+	{ "l8",  QCOM_RPM_PM8018_LDO8,  &pm8018_nldo, "vdd_l8" },
+	{ "l9",  QCOM_RPM_PM8018_LDO9,  &pm8921_nldo1200,
+						      "vdd_l9_l10_l11_l12" },
+	{ "l10", QCOM_RPM_PM8018_LDO10, &pm8018_nldo, "vdd_l9_l10_l11_l12" },
+	{ "l11", QCOM_RPM_PM8018_LDO11, &pm8018_nldo, "vdd_l9_l10_l11_l12" },
+	{ "l12", QCOM_RPM_PM8018_LDO12, &pm8018_nldo, "vdd_l9_l10_l11_l12" },
+	{ "l14", QCOM_RPM_PM8018_LDO14, &pm8018_pldo, "vdd_l14" },
+
+	{ "lvs1", QCOM_RPM_PM8018_LVS1, &pm8018_switch, "lvs1_in" },
+
+	{ }
+};
+
 static const struct rpm_regulator_data rpm_pm8058_regulators[] = {
 	{ "l0",   QCOM_RPM_PM8058_LDO0,   &pm8058_nldo, "vdd_l0_l1_lvs"	},
 	{ "l1",   QCOM_RPM_PM8058_LDO1,   &pm8058_nldo, "vdd_l0_l1_lvs" },
@@ -870,6 +934,8 @@ static const struct rpm_regulator_data rpm_pm8921_regulators[] = {
 };
 
 static const struct of_device_id rpm_of_match[] = {
+	{ .compatible = "qcom,rpm-pm8018-regulators",
+		.data = &rpm_pm8018_regulators },
 	{ .compatible = "qcom,rpm-pm8058-regulators", .data = &rpm_pm8058_regulators },
 	{ .compatible = "qcom,rpm-pm8901-regulators", .data = &rpm_pm8901_regulators },
 	{ .compatible = "qcom,rpm-pm8921-regulators", .data = &rpm_pm8921_regulators },
-- 
cgit v1.2.3


From 6f1d912b687d3d17c1731f5bda3b5d6703bce4a0 Mon Sep 17 00:00:00 2001
From: Vic Yang <victoryang@google.com>
Date: Wed, 10 Aug 2016 19:05:24 +0200
Subject: mfd: cros_ec: Add MKBP event support

Newer revisions of the ChromeOS EC add more events besides the keyboard
ones. So handle interrupts in the MFD driver and let consumers register
for notifications for the events they might care.

To keep backward compatibility, if the EC doesn't support MKBP event, we
fall back to the old MKBP key matrix host command.

Cc: Randall Spangler <rspangler@chromium.org>
Cc: Vincent Palatin <vpalatin@chromium.org>
Cc: Benson Leung <bleung@chromium.org>
Signed-off-by: Vic Yang <victoryang@google.com>
Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Tested-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/cros_ec.c                   | 58 +++++++++++++++++++--
 drivers/platform/chrome/cros_ec_proto.c | 92 +++++++++++++++++++++++++++++++++
 include/linux/mfd/cros_ec.h             | 18 +++++++
 include/linux/mfd/cros_ec_commands.h    | 34 ++++++++++++
 4 files changed, 199 insertions(+), 3 deletions(-)

diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c
index 0eee63542038..abd83424b498 100644
--- a/drivers/mfd/cros_ec.c
+++ b/drivers/mfd/cros_ec.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/cros_ec.h>
+#include <asm/unaligned.h>
 
 #define CROS_EC_DEV_EC_INDEX 0
 #define CROS_EC_DEV_PD_INDEX 1
@@ -49,11 +50,28 @@ static const struct mfd_cell ec_pd_cell = {
 	.pdata_size = sizeof(pd_p),
 };
 
+static irqreturn_t ec_irq_thread(int irq, void *data)
+{
+	struct cros_ec_device *ec_dev = data;
+	int ret;
+
+	if (device_may_wakeup(ec_dev->dev))
+		pm_wakeup_event(ec_dev->dev, 0);
+
+	ret = cros_ec_get_next_event(ec_dev);
+	if (ret > 0)
+		blocking_notifier_call_chain(&ec_dev->event_notifier,
+					     0, ec_dev);
+	return IRQ_HANDLED;
+}
+
 int cros_ec_register(struct cros_ec_device *ec_dev)
 {
 	struct device *dev = ec_dev->dev;
 	int err = 0;
 
+	BLOCKING_INIT_NOTIFIER_HEAD(&ec_dev->event_notifier);
+
 	ec_dev->max_request = sizeof(struct ec_params_hello);
 	ec_dev->max_response = sizeof(struct ec_response_get_protocol_info);
 	ec_dev->max_passthru = 0;
@@ -70,13 +88,24 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
 
 	cros_ec_query_all(ec_dev);
 
+	if (ec_dev->irq) {
+		err = request_threaded_irq(ec_dev->irq, NULL, ec_irq_thread,
+					   IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+					   "chromeos-ec", ec_dev);
+		if (err) {
+			dev_err(dev, "Failed to request IRQ %d: %d",
+				ec_dev->irq, err);
+			return err;
+		}
+	}
+
 	err = mfd_add_devices(ec_dev->dev, PLATFORM_DEVID_AUTO, &ec_cell, 1,
 			      NULL, ec_dev->irq, NULL);
 	if (err) {
 		dev_err(dev,
 			"Failed to register Embedded Controller subdevice %d\n",
 			err);
-		return err;
+		goto fail_mfd;
 	}
 
 	if (ec_dev->max_passthru) {
@@ -94,7 +123,7 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
 			dev_err(dev,
 				"Failed to register Power Delivery subdevice %d\n",
 				err);
-			return err;
+			goto fail_mfd;
 		}
 	}
 
@@ -103,13 +132,18 @@ int cros_ec_register(struct cros_ec_device *ec_dev)
 		if (err) {
 			mfd_remove_devices(dev);
 			dev_err(dev, "Failed to register sub-devices\n");
-			return err;
+			goto fail_mfd;
 		}
 	}
 
 	dev_info(dev, "Chrome EC device registered\n");
 
 	return 0;
+
+fail_mfd:
+	if (ec_dev->irq)
+		free_irq(ec_dev->irq, ec_dev);
+	return err;
 }
 EXPORT_SYMBOL(cros_ec_register);
 
@@ -136,13 +170,31 @@ int cros_ec_suspend(struct cros_ec_device *ec_dev)
 }
 EXPORT_SYMBOL(cros_ec_suspend);
 
+static void cros_ec_drain_events(struct cros_ec_device *ec_dev)
+{
+	while (cros_ec_get_next_event(ec_dev) > 0)
+		blocking_notifier_call_chain(&ec_dev->event_notifier,
+					     1, ec_dev);
+}
+
 int cros_ec_resume(struct cros_ec_device *ec_dev)
 {
 	enable_irq(ec_dev->irq);
 
+	/*
+	 * In some cases, we need to distinguish between events that occur
+	 * during suspend if the EC is not a wake source. For example,
+	 * keypresses during suspend should be discarded if it does not wake
+	 * the system.
+	 *
+	 * If the EC is not a wake source, drain the event queue and mark them
+	 * as "queued during suspend".
+	 */
 	if (ec_dev->wake_enabled) {
 		disable_irq_wake(ec_dev->irq);
 		ec_dev->wake_enabled = 0;
+	} else {
+		cros_ec_drain_events(ec_dev);
 	}
 
 	return 0;
diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c
index 6c084b266651..04053fe1e980 100644
--- a/drivers/platform/chrome/cros_ec_proto.c
+++ b/drivers/platform/chrome/cros_ec_proto.c
@@ -19,6 +19,7 @@
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <asm/unaligned.h>
 
 #define EC_COMMAND_RETRIES	50
 
@@ -234,11 +235,44 @@ static int cros_ec_host_command_proto_query_v2(struct cros_ec_device *ec_dev)
 	return ret;
 }
 
+static int cros_ec_get_host_command_version_mask(struct cros_ec_device *ec_dev,
+	u16 cmd, u32 *mask)
+{
+	struct ec_params_get_cmd_versions *pver;
+	struct ec_response_get_cmd_versions *rver;
+	struct cros_ec_command *msg;
+	int ret;
+
+	msg = kmalloc(sizeof(*msg) + max(sizeof(*rver), sizeof(*pver)),
+		      GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	msg->version = 0;
+	msg->command = EC_CMD_GET_CMD_VERSIONS;
+	msg->insize = sizeof(*rver);
+	msg->outsize = sizeof(*pver);
+
+	pver = (struct ec_params_get_cmd_versions *)msg->data;
+	pver->cmd = cmd;
+
+	ret = cros_ec_cmd_xfer(ec_dev, msg);
+	if (ret > 0) {
+		rver = (struct ec_response_get_cmd_versions *)msg->data;
+		*mask = rver->version_mask;
+	}
+
+	kfree(msg);
+
+	return ret;
+}
+
 int cros_ec_query_all(struct cros_ec_device *ec_dev)
 {
 	struct device *dev = ec_dev->dev;
 	struct cros_ec_command *proto_msg;
 	struct ec_response_get_protocol_info *proto_info;
+	u32 ver_mask = 0;
 	int ret;
 
 	proto_msg = kzalloc(sizeof(*proto_msg) + sizeof(*proto_info),
@@ -328,6 +362,15 @@ int cros_ec_query_all(struct cros_ec_device *ec_dev)
 		goto exit;
 	}
 
+	/* Probe if MKBP event is supported */
+	ret = cros_ec_get_host_command_version_mask(ec_dev,
+						    EC_CMD_GET_NEXT_EVENT,
+						    &ver_mask);
+	if (ret < 0 || ver_mask == 0)
+		ec_dev->mkbp_event_supported = 0;
+	else
+		ec_dev->mkbp_event_supported = 1;
+
 exit:
 	kfree(proto_msg);
 	return ret;
@@ -397,3 +440,52 @@ int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev,
 	return ret;
 }
 EXPORT_SYMBOL(cros_ec_cmd_xfer_status);
+
+static int get_next_event(struct cros_ec_device *ec_dev)
+{
+	u8 buffer[sizeof(struct cros_ec_command) + sizeof(ec_dev->event_data)];
+	struct cros_ec_command *msg = (struct cros_ec_command *)&buffer;
+	int ret;
+
+	msg->version = 0;
+	msg->command = EC_CMD_GET_NEXT_EVENT;
+	msg->insize = sizeof(ec_dev->event_data);
+	msg->outsize = 0;
+
+	ret = cros_ec_cmd_xfer(ec_dev, msg);
+	if (ret > 0) {
+		ec_dev->event_size = ret - 1;
+		memcpy(&ec_dev->event_data, msg->data,
+		       sizeof(ec_dev->event_data));
+	}
+
+	return ret;
+}
+
+static int get_keyboard_state_event(struct cros_ec_device *ec_dev)
+{
+	u8 buffer[sizeof(struct cros_ec_command) +
+		  sizeof(ec_dev->event_data.data)];
+	struct cros_ec_command *msg = (struct cros_ec_command *)&buffer;
+
+	msg->version = 0;
+	msg->command = EC_CMD_MKBP_STATE;
+	msg->insize = sizeof(ec_dev->event_data.data);
+	msg->outsize = 0;
+
+	ec_dev->event_size = cros_ec_cmd_xfer(ec_dev, msg);
+	ec_dev->event_data.event_type = EC_MKBP_EVENT_KEY_MATRIX;
+	memcpy(&ec_dev->event_data.data, msg->data,
+	       sizeof(ec_dev->event_data.data));
+
+	return ec_dev->event_size;
+}
+
+int cros_ec_get_next_event(struct cros_ec_device *ec_dev)
+{
+	if (ec_dev->mkbp_event_supported)
+		return get_next_event(ec_dev);
+	else
+		return get_keyboard_state_event(ec_dev);
+}
+EXPORT_SYMBOL(cros_ec_get_next_event);
diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h
index d641a18abacb..76f7ef4d3a0d 100644
--- a/include/linux/mfd/cros_ec.h
+++ b/include/linux/mfd/cros_ec.h
@@ -109,6 +109,10 @@ struct cros_ec_command {
  *     should check msg.result for the EC's result code.
  * @pkt_xfer: send packet to EC and get response
  * @lock: one transaction at a time
+ * @mkbp_event_supported: true if this EC supports the MKBP event protocol.
+ * @event_notifier: interrupt event notifier for transport devices.
+ * @event_data: raw payload transferred with the MKBP event.
+ * @event_size: size in bytes of the event data.
  */
 struct cros_ec_device {
 
@@ -137,6 +141,11 @@ struct cros_ec_device {
 	int (*pkt_xfer)(struct cros_ec_device *ec,
 			struct cros_ec_command *msg);
 	struct mutex lock;
+	bool mkbp_event_supported;
+	struct blocking_notifier_head event_notifier;
+
+	struct ec_response_get_next_event event_data;
+	int event_size;
 };
 
 /* struct cros_ec_platform - ChromeOS EC platform information
@@ -269,6 +278,15 @@ int cros_ec_register(struct cros_ec_device *ec_dev);
  */
 int cros_ec_query_all(struct cros_ec_device *ec_dev);
 
+/**
+ * cros_ec_get_next_event -  Fetch next event from the ChromeOS EC
+ *
+ * @ec_dev: Device to fetch event from
+ *
+ * Returns: 0 on success, Linux error number on failure
+ */
+int cros_ec_get_next_event(struct cros_ec_device *ec_dev);
+
 /* sysfs stuff */
 extern struct attribute_group cros_ec_attr_group;
 extern struct attribute_group cros_ec_lightbar_attr_group;
diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h
index 7e7a8d4b4551..76728ff37d01 100644
--- a/include/linux/mfd/cros_ec_commands.h
+++ b/include/linux/mfd/cros_ec_commands.h
@@ -1793,6 +1793,40 @@ struct ec_result_keyscan_seq_ctrl {
 	};
 } __packed;
 
+/*
+ * Command for retrieving the next pending MKBP event from the EC device
+ *
+ * The device replies with UNAVAILABLE if there aren't any pending events.
+ */
+#define EC_CMD_GET_NEXT_EVENT 0x67
+
+enum ec_mkbp_event {
+	/* Keyboard matrix changed. The event data is the new matrix state. */
+	EC_MKBP_EVENT_KEY_MATRIX = 0,
+
+	/* New host event. The event data is 4 bytes of host event flags. */
+	EC_MKBP_EVENT_HOST_EVENT = 1,
+
+	/* New Sensor FIFO data. The event data is fifo_info structure. */
+	EC_MKBP_EVENT_SENSOR_FIFO = 2,
+
+	/* Number of MKBP events */
+	EC_MKBP_EVENT_COUNT,
+};
+
+union ec_response_get_next_data {
+	uint8_t   key_matrix[13];
+
+	/* Unaligned */
+	uint32_t  host_event;
+} __packed;
+
+struct ec_response_get_next_event {
+	uint8_t event_type;
+	/* Followed by event data if any */
+	union ec_response_get_next_data data;
+} __packed;
+
 /*****************************************************************************/
 /* Temperature sensor commands */
 
-- 
cgit v1.2.3


From 44051a6825d58c02820c07ded54a010479460308 Mon Sep 17 00:00:00 2001
From: Vic Yang <victoryang@google.com>
Date: Wed, 10 Aug 2016 19:05:25 +0200
Subject: Input: cros_ec_keyb: Stop handling interrupts directly

Because events other that keyboard ones will be handled by now on by
other drivers, stop directly handling interrupts and instead listen to
the new notifier in the MFD driver.

Signed-off-by: Vic Yang <victoryang@google.com>
Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
Tested-by: Enric Balletbo i Serra <enric.balletbo@collabora.com>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/input/keyboard/cros_ec_keyb.c | 135 ++++++++--------------------------
 1 file changed, 31 insertions(+), 104 deletions(-)

diff --git a/drivers/input/keyboard/cros_ec_keyb.c b/drivers/input/keyboard/cros_ec_keyb.c
index 4b0878f35471..25943e9bc8bf 100644
--- a/drivers/input/keyboard/cros_ec_keyb.c
+++ b/drivers/input/keyboard/cros_ec_keyb.c
@@ -27,6 +27,7 @@
 #include <linux/input.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
+#include <linux/notifier.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/input/matrix_keypad.h>
@@ -44,6 +45,7 @@
  * @dev: Device pointer
  * @idev: Input device
  * @ec: Top level ChromeOS device to use to talk to EC
+ * @notifier: interrupt event notifier for transport devices
  */
 struct cros_ec_keyb {
 	unsigned int rows;
@@ -57,6 +59,7 @@ struct cros_ec_keyb {
 	struct device *dev;
 	struct input_dev *idev;
 	struct cros_ec_device *ec;
+	struct notifier_block notifier;
 };
 
 
@@ -146,67 +149,44 @@ static void cros_ec_keyb_process(struct cros_ec_keyb *ckdev,
 	input_sync(ckdev->idev);
 }
 
-static int cros_ec_keyb_get_state(struct cros_ec_keyb *ckdev, uint8_t *kb_state)
-{
-	int ret = 0;
-	struct cros_ec_command *msg;
-
-	msg = kmalloc(sizeof(*msg) + ckdev->cols, GFP_KERNEL);
-	if (!msg)
-		return -ENOMEM;
-
-	msg->version = 0;
-	msg->command = EC_CMD_MKBP_STATE;
-	msg->insize = ckdev->cols;
-	msg->outsize = 0;
-
-	ret = cros_ec_cmd_xfer(ckdev->ec, msg);
-	if (ret < 0) {
-		dev_err(ckdev->dev, "Error transferring EC message %d\n", ret);
-		goto exit;
-	}
-
-	memcpy(kb_state, msg->data, ckdev->cols);
-exit:
-	kfree(msg);
-	return ret;
-}
-
-static irqreturn_t cros_ec_keyb_irq(int irq, void *data)
+static int cros_ec_keyb_open(struct input_dev *dev)
 {
-	struct cros_ec_keyb *ckdev = data;
-	struct cros_ec_device *ec = ckdev->ec;
-	int ret;
-	uint8_t kb_state[ckdev->cols];
-
-	if (device_may_wakeup(ec->dev))
-		pm_wakeup_event(ec->dev, 0);
-
-	ret = cros_ec_keyb_get_state(ckdev, kb_state);
-	if (ret >= 0)
-		cros_ec_keyb_process(ckdev, kb_state, ret);
-	else
-		dev_err(ckdev->dev, "failed to get keyboard state: %d\n", ret);
+	struct cros_ec_keyb *ckdev = input_get_drvdata(dev);
 
-	return IRQ_HANDLED;
+	return blocking_notifier_chain_register(&ckdev->ec->event_notifier,
+						&ckdev->notifier);
 }
 
-static int cros_ec_keyb_open(struct input_dev *dev)
+static void cros_ec_keyb_close(struct input_dev *dev)
 {
 	struct cros_ec_keyb *ckdev = input_get_drvdata(dev);
-	struct cros_ec_device *ec = ckdev->ec;
 
-	return request_threaded_irq(ec->irq, NULL, cros_ec_keyb_irq,
-					IRQF_TRIGGER_LOW | IRQF_ONESHOT,
-					"cros_ec_keyb", ckdev);
+	blocking_notifier_chain_unregister(&ckdev->ec->event_notifier,
+					   &ckdev->notifier);
 }
 
-static void cros_ec_keyb_close(struct input_dev *dev)
+static int cros_ec_keyb_work(struct notifier_block *nb,
+			     unsigned long queued_during_suspend, void *_notify)
 {
-	struct cros_ec_keyb *ckdev = input_get_drvdata(dev);
-	struct cros_ec_device *ec = ckdev->ec;
+	struct cros_ec_keyb *ckdev = container_of(nb, struct cros_ec_keyb,
+						  notifier);
 
-	free_irq(ec->irq, ckdev);
+	if (ckdev->ec->event_data.event_type != EC_MKBP_EVENT_KEY_MATRIX)
+		return NOTIFY_DONE;
+	/*
+	 * If EC is not the wake source, discard key state changes during
+	 * suspend.
+	 */
+	if (queued_during_suspend)
+		return NOTIFY_OK;
+	if (ckdev->ec->event_size != ckdev->cols) {
+		dev_err(ckdev->dev,
+			"Discarded incomplete key matrix event.\n");
+		return NOTIFY_OK;
+	}
+	cros_ec_keyb_process(ckdev, ckdev->ec->event_data.data.key_matrix,
+			     ckdev->ec->event_size);
+	return NOTIFY_OK;
 }
 
 /*
@@ -265,12 +245,8 @@ static int cros_ec_keyb_probe(struct platform_device *pdev)
 	if (!idev)
 		return -ENOMEM;
 
-	if (!ec->irq) {
-		dev_err(dev, "no EC IRQ specified\n");
-		return -EINVAL;
-	}
-
 	ckdev->ec = ec;
+	ckdev->notifier.notifier_call = cros_ec_keyb_work;
 	ckdev->dev = dev;
 	dev_set_drvdata(dev, ckdev);
 
@@ -311,54 +287,6 @@ static int cros_ec_keyb_probe(struct platform_device *pdev)
 	return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-/* Clear any keys in the buffer */
-static void cros_ec_keyb_clear_keyboard(struct cros_ec_keyb *ckdev)
-{
-	uint8_t old_state[ckdev->cols];
-	uint8_t new_state[ckdev->cols];
-	unsigned long duration;
-	int i, ret;
-
-	/*
-	 * Keep reading until we see that the scan state does not change.
-	 * That indicates that we are done.
-	 *
-	 * Assume that the EC keyscan buffer is at most 32 deep.
-	 */
-	duration = jiffies;
-	ret = cros_ec_keyb_get_state(ckdev, new_state);
-	for (i = 1; !ret && i < 32; i++) {
-		memcpy(old_state, new_state, sizeof(old_state));
-		ret = cros_ec_keyb_get_state(ckdev, new_state);
-		if (0 == memcmp(old_state, new_state, sizeof(old_state)))
-			break;
-	}
-	duration = jiffies - duration;
-	dev_info(ckdev->dev, "Discarded %d keyscan(s) in %dus\n", i,
-		jiffies_to_usecs(duration));
-}
-
-static int cros_ec_keyb_resume(struct device *dev)
-{
-	struct cros_ec_keyb *ckdev = dev_get_drvdata(dev);
-
-	/*
-	 * When the EC is not a wake source, then it could not have caused the
-	 * resume, so we clear the EC's key scan buffer. If the EC was a
-	 * wake source (e.g. the lid is open and the user might press a key to
-	 * wake) then the key scan buffer should be preserved.
-	 */
-	if (!ckdev->ec->was_wake_device)
-		cros_ec_keyb_clear_keyboard(ckdev);
-
-	return 0;
-}
-
-#endif
-
-static SIMPLE_DEV_PM_OPS(cros_ec_keyb_pm_ops, NULL, cros_ec_keyb_resume);
-
 #ifdef CONFIG_OF
 static const struct of_device_id cros_ec_keyb_of_match[] = {
 	{ .compatible = "google,cros-ec-keyb" },
@@ -372,7 +300,6 @@ static struct platform_driver cros_ec_keyb_driver = {
 	.driver = {
 		.name = "cros-ec-keyb",
 		.of_match_table = of_match_ptr(cros_ec_keyb_of_match),
-		.pm	= &cros_ec_keyb_pm_ops,
 	},
 };
 
-- 
cgit v1.2.3


From dc21c7ad3a8aad79cb14128c321833a47dc921c2 Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Wed, 31 Aug 2016 14:28:10 +0530
Subject: mfd: lp873x: Add lp873x PMIC support

The LP873X chip is a power management IC for Portable Navigation Systems
    and Tablet Computing devices. It contains the following components:

     - Regulators.
     - Configurable General Purpose Output Signals (GPO).

PMIC interacts with the main processor through i2c. PMIC has
couple of LDOs (Linear Regulators), couple of BUCKs (Step-Down DC-DC
Converter Cores) and GPOs (General Purpose Output Signals).

Signed-off-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig        |  14 +++
 drivers/mfd/Makefile       |   2 +
 drivers/mfd/lp873x.c       |  99 +++++++++++++++++
 include/linux/mfd/lp873x.h | 269 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 384 insertions(+)
 create mode 100644 drivers/mfd/lp873x.c
 create mode 100644 include/linux/mfd/lp873x.h

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 2d1fb6420592..69e51d0250bf 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1224,6 +1224,20 @@ config MFD_TPS65217
 	  This driver can also be built as a module.  If so, the module
 	  will be called tps65217.
 
+config MFD_TI_LP873X
+	tristate "TI LP873X Power Management IC"
+	depends on I2C
+	select MFD_CORE
+	select REGMAP_I2C
+	help
+	  If you say yes here then you get support for the LP873X series of
+	  Power Management Integrated Circuits (PMIC).
+	  These include voltage regulators, thermal protection, configurable
+	  General Purpose Outputs (GPO) that are used in portable devices.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called lp873x.
+
 config MFD_TPS65218
 	tristate "TI TPS65218 Power Management chips"
 	depends on I2C
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 2ba3ba35f745..42acbcdeb4f6 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -22,6 +22,8 @@ obj-$(CONFIG_HTC_EGPIO)		+= htc-egpio.o
 obj-$(CONFIG_HTC_PASIC3)	+= htc-pasic3.o
 obj-$(CONFIG_HTC_I2CPLD)	+= htc-i2cpld.o
 
+obj-$(CONFIG_MFD_TI_LP873X)	+= lp873x.o
+
 obj-$(CONFIG_MFD_DAVINCI_VOICECODEC)	+= davinci_voicecodec.o
 obj-$(CONFIG_MFD_DM355EVM_MSP)	+= dm355evm_msp.o
 obj-$(CONFIG_MFD_TI_AM335X_TSCADC)	+= ti_am335x_tscadc.o
diff --git a/drivers/mfd/lp873x.c b/drivers/mfd/lp873x.c
new file mode 100644
index 000000000000..9af064c940ee
--- /dev/null
+++ b/drivers/mfd/lp873x.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2016 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * Author: Keerthy <j-keerthy@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/mfd/core.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/regmap.h>
+
+#include <linux/mfd/lp873x.h>
+
+static const struct regmap_config lp873x_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = LP873X_REG_MAX,
+};
+
+static const struct mfd_cell lp873x_cells[] = {
+	{ .name = "lp873x-regulator", },
+	{ .name = "lp873x-gpio", },
+};
+
+static int lp873x_probe(struct i2c_client *client,
+			const struct i2c_device_id *ids)
+{
+	struct lp873x *lp873;
+	int ret;
+	unsigned int otpid;
+
+	lp873 = devm_kzalloc(&client->dev, sizeof(*lp873), GFP_KERNEL);
+	if (!lp873)
+		return -ENOMEM;
+
+	lp873->dev = &client->dev;
+
+	lp873->regmap = devm_regmap_init_i2c(client, &lp873x_regmap_config);
+	if (IS_ERR(lp873->regmap)) {
+		ret = PTR_ERR(lp873->regmap);
+		dev_err(lp873->dev,
+			"Failed to initialize register map: %d\n", ret);
+		return ret;
+	}
+
+	mutex_init(&lp873->lock);
+
+	ret = regmap_read(lp873->regmap, LP873X_REG_OTP_REV, &otpid);
+	if (ret) {
+		dev_err(lp873->dev, "Failed to read OTP ID\n");
+		return ret;
+	}
+
+	lp873->rev = otpid & LP873X_OTP_REV_OTP_ID;
+
+	i2c_set_clientdata(client, lp873);
+
+	ret = mfd_add_devices(lp873->dev, PLATFORM_DEVID_AUTO, lp873x_cells,
+			      ARRAY_SIZE(lp873x_cells), NULL, 0, NULL);
+
+	return ret;
+}
+
+static const struct of_device_id of_lp873x_match_table[] = {
+	{ .compatible = "ti,lp8733", },
+	{ .compatible = "ti,lp8732", },
+	{}
+};
+MODULE_DEVICE_TABLE(of, of_lp873x_match_table);
+
+static const struct i2c_device_id lp873x_id_table[] = {
+	{ "lp873x", 0 },
+	{ },
+};
+MODULE_DEVICE_TABLE(i2c, lp873x_id_table);
+
+static struct i2c_driver lp873x_driver = {
+	.driver	= {
+		.name	= "lp873x",
+		.of_match_table = of_lp873x_match_table,
+	},
+	.probe		= lp873x_probe,
+	.id_table	= lp873x_id_table,
+};
+module_i2c_driver(lp873x_driver);
+
+MODULE_AUTHOR("J Keerthy <j-keerthy@ti.com>");
+MODULE_DESCRIPTION("LP873X chip family Multi-Function Device driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/lp873x.h b/include/linux/mfd/lp873x.h
new file mode 100644
index 000000000000..83b1bd7588be
--- /dev/null
+++ b/include/linux/mfd/lp873x.h
@@ -0,0 +1,269 @@
+/*
+ * Functions to access LP873X power management chip.
+ *
+ * Copyright (C) 2016 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __LINUX_MFD_LP873X_H
+#define __LINUX_MFD_LP873X_H
+
+#include <linux/i2c.h>
+#include <linux/regulator/driver.h>
+#include <linux/regulator/machine.h>
+
+/* LP873x chip id list */
+#define LP873X			0x00
+
+/* All register addresses */
+#define LP873X_REG_DEV_REV		0X00
+#define LP873X_REG_OTP_REV		0X01
+#define LP873X_REG_BUCK0_CTRL_1		0X02
+#define LP873X_REG_BUCK0_CTRL_2		0X03
+#define LP873X_REG_BUCK1_CTRL_1		0X04
+#define LP873X_REG_BUCK1_CTRL_2		0X05
+#define LP873X_REG_BUCK0_VOUT		0X06
+#define LP873X_REG_BUCK1_VOUT		0X07
+#define LP873X_REG_LDO0_CTRL		0X08
+#define LP873X_REG_LDO1_CTRL            0X09
+#define LP873X_REG_LDO0_VOUT		0X0A
+#define LP873X_REG_LDO1_VOUT		0X0B
+#define LP873X_REG_BUCK0_DELAY		0X0C
+#define LP873X_REG_BUCK1_DELAY		0X0D
+#define LP873X_REG_LDO0_DELAY		0X0E
+#define LP873X_REG_LDO1_DELAY		0X0F
+#define LP873X_REG_GPO_DELAY		0X10
+#define LP873X_REG_GPO2_DELAY		0X11
+#define LP873X_REG_GPO_CTRL		0X12
+#define LP873X_REG_CONFIG		0X13
+#define LP873X_REG_PLL_CTRL		0X14
+#define LP873X_REG_PGOOD_CTRL1		0X15
+#define LP873X_REG_PGOOD_CTRL2		0X16
+#define LP873X_REG_PG_FAULT		0X17
+#define LP873X_REG_RESET		0X18
+#define LP873X_REG_INT_TOP_1		0X19
+#define LP873X_REG_INT_TOP_2		0X1A
+#define LP873X_REG_INT_BUCK		0X1B
+#define LP873X_REG_INT_LDO		0X1C
+#define LP873X_REG_TOP_STAT		0X1D
+#define LP873X_REG_BUCK_STAT		0X1E
+#define LP873X_REG_LDO_STAT		0x1F
+#define LP873X_REG_TOP_MASK_1		0x20
+#define LP873X_REG_TOP_MASK_2		0x21
+#define LP873X_REG_BUCK_MASK		0x22
+#define LP873X_REG_LDO_MASK		0x23
+#define LP873X_REG_SEL_I_LOAD		0x24
+#define LP873X_REG_I_LOAD_2		0x25
+#define LP873X_REG_I_LOAD_1		0x26
+
+#define LP873X_REG_MAX			LP873X_REG_I_LOAD_1
+
+/* Register field definitions */
+#define LP873X_DEV_REV_DEV_ID			0xC0
+#define LP873X_DEV_REV_ALL_LAYER		0x30
+#define LP873X_DEV_REV_METAL_LAYER		0x0F
+
+#define LP873X_OTP_REV_OTP_ID			0xFF
+
+#define LP873X_BUCK0_CTRL_1_BUCK0_FPWM		BIT(3)
+#define LP873X_BUCK0_CTRL_1_BUCK0_RDIS_EN	BIT(2)
+#define LP873X_BUCK0_CTRL_1_BUCK0_EN_PIN_CTRL	BIT(1)
+#define LP873X_BUCK0_CTRL_1_BUCK0_EN		BIT(0)
+
+#define LP873X_BUCK0_CTRL_2_BUCK0_ILIM		0x38
+#define LP873X_BUCK0_CTRL_2_BUCK0_SLEW_RATE	0x07
+
+#define LP873X_BUCK1_CTRL_1_BUCK1_FPWM		BIT(3)
+#define LP873X_BUCK1_CTRL_1_BUCK1_RDIS_EN	BIT(2)
+#define LP873X_BUCK1_CTRL_1_BUCK1_EN_PIN_CTRL	BIT(1)
+#define LP873X_BUCK1_CTRL_1_BUCK1_EN		BIT(0)
+
+#define LP873X_BUCK1_CTRL_2_BUCK1_ILIM		0x38
+#define LP873X_BUCK1_CTRL_2_BUCK1_SLEW_RATE	0x07
+
+#define LP873X_BUCK0_VOUT_BUCK0_VSET		0xFF
+
+#define LP873X_BUCK1_VOUT_BUCK1_VSET		0xFF
+
+#define LP873X_LDO0_CTRL_LDO0_RDIS_EN		BIT(2)
+#define LP873X_LDO0_CTRL_LDO0_EN_PIN_CTRL	BIT(1)
+#define LP873X_LDO0_CTRL_LDO0_EN		BIT(0)
+
+#define LP873X_LDO1_CTRL_LDO1_RDIS_EN		BIT(2)
+#define LP873X_LDO1_CTRL_LDO1_EN_PIN_CTRL	BIT(1)
+#define LP873X_LDO1_CTRL_LDO1_EN		BIT(0)
+
+#define LP873X_LDO0_VOUT_LDO0_VSET		0x1F
+
+#define LP873X_LDO1_VOUT_LDO1_VSET		0x1F
+
+#define LP873X_BUCK0_DELAY_BUCK0_SD_DELAY	0xF0
+#define LP873X_BUCK0_DELAY_BUCK0_SU_DELAY	0x0F
+
+#define LP873X_BUCK1_DELAY_BUCK1_SD_DELAY	0xF0
+#define LP873X_BUCK1_DELAY_BUCK1_SU_DELAY	0x0F
+
+#define LP873X_LDO0_DELAY_LDO0_SD_DELAY	0xF0
+#define LP873X_LDO0_DELAY_LDO0_SU_DELAY	0x0F
+
+#define LP873X_LDO1_DELAY_LDO1_SD_DELAY	0xF0
+#define LP873X_LDO1_DELAY_LDO1_SU_DELAY	0x0F
+
+#define LP873X_GPO_DELAY_GPO_SD_DELAY		0xF0
+#define LP873X_GPO_DELAY_GPO_SU_DELAY		0x0F
+
+#define LP873X_GPO2_DELAY_GPO2_SD_DELAY	0xF0
+#define LP873X_GPO2_DELAY_GPO2_SU_DELAY	0x0F
+
+#define LP873X_GPO_CTRL_GPO2_OD		BIT(6)
+#define LP873X_GPO_CTRL_GPO2_EN_PIN_CTRL	BIT(5)
+#define LP873X_GPO_CTRL_GPO2_EN		BIT(4)
+#define LP873X_GPO_CTRL_GPO_OD			BIT(2)
+#define LP873X_GPO_CTRL_GPO_EN_PIN_CTRL	BIT(1)
+#define LP873X_GPO_CTRL_GPO_EN			BIT(0)
+
+#define LP873X_CONFIG_SU_DELAY_SEL		BIT(6)
+#define LP873X_CONFIG_SD_DELAY_SEL		BIT(5)
+#define LP873X_CONFIG_CLKIN_PIN_SEL		BIT(4)
+#define LP873X_CONFIG_CLKIN_PD			BIT(3)
+#define LP873X_CONFIG_EN_PD			BIT(2)
+#define LP873X_CONFIG_TDIE_WARN_LEVEL		BIT(1)
+#define LP873X_EN_SPREAD_SPEC			BIT(0)
+
+#define LP873X_PLL_CTRL_EN_PLL			BIT(6)
+#define LP873X_EXT_CLK_FREQ			0x1F
+
+#define LP873X_PGOOD_CTRL1_PGOOD_POL		BIT(7)
+#define LP873X_PGOOD_CTRL1_PGOOD_OD		BIT(6)
+#define LP873X_PGOOD_CTRL1_PGOOD_WINDOW_LDO	BIT(5)
+#define LP873X_PGOOD_CTRL1_PGOOD_WINDOWN_BUCK	BIT(4)
+#define LP873X_PGOOD_CTRL1_PGOOD_EN_PGOOD_LDO1	BIT(3)
+#define LP873X_PGOOD_CTRL1_PGOOD_EN_PGOOD_LDO0	BIT(2)
+#define LP873X_PGOOD_CTRL1_PGOOD_EN_PGOOD_BUCK1	BIT(1)
+#define LP873X_PGOOD_CTRL1_PGOOD_EN_PGOOD_BUCK0	BIT(0)
+
+#define LP873X_PGOOD_CTRL2_EN_PGOOD_TWARN	BIT(2)
+#define LP873X_PGOOD_CTRL2_EN_PG_FAULT_GATE	BIT(1)
+#define LP873X_PGOOD_CTRL2_PGOOD_MODE		BIT(0)
+
+#define LP873X_PG_FAULT_PG_FAULT_LDO1		BIT(3)
+#define LP873X_PG_FAULT_PG_FAULT_LDO0		BIT(2)
+#define LP873X_PG_FAULT_PG_FAULT_BUCK1		BIT(1)
+#define LP873X_PG_FAULT_PG_FAULT_BUCK0		BIT(0)
+
+#define LP873X_RESET_SW_RESET			BIT(0)
+
+#define LP873X_INT_TOP_1_PGOOD_INT		BIT(7)
+#define LP873X_INT_TOP_1_LDO_INT		BIT(6)
+#define LP873X_INT_TOP_1_BUCK_INT		BIT(5)
+#define LP873X_INT_TOP_1_SYNC_CLK_INT		BIT(4)
+#define LP873X_INT_TOP_1_TDIE_SD_INT		BIT(3)
+#define LP873X_INT_TOP_1_TDIE_WARN_INT		BIT(2)
+#define LP873X_INT_TOP_1_OVP_INT		BIT(1)
+#define LP873X_INT_TOP_1_I_MEAS_INT		BIT(0)
+
+#define LP873X_INT_TOP_2_RESET_REG_INT		BIT(0)
+
+#define LP873X_INT_BUCK_BUCK1_PG_INT		BIT(6)
+#define LP873X_INT_BUCK_BUCK1_SC_INT		BIT(5)
+#define LP873X_INT_BUCK_BUCK1_ILIM_INT		BIT(4)
+#define LP873X_INT_BUCK_BUCK0_PG_INT		BIT(2)
+#define LP873X_INT_BUCK_BUCK0_SC_INT		BIT(1)
+#define LP873X_INT_BUCK_BUCK0_ILIM_INT		BIT(0)
+
+#define LP873X_INT_LDO_LDO1_PG_INT		BIT(6)
+#define LP873X_INT_LDO_LDO1_SC_INT		BIT(5)
+#define LP873X_INT_LDO_LDO1_ILIM_INT		BIT(4)
+#define LP873X_INT_LDO_LDO0_PG_INT		BIT(2)
+#define LP873X_INT_LDO_LDO0_SC_INT		BIT(1)
+#define LP873X_INT_LDO_LDO0_ILIM_INT		BIT(0)
+
+#define LP873X_TOP_STAT_PGOOD_STAT		BIT(7)
+#define LP873X_TOP_STAT_SYNC_CLK_STAT		BIT(4)
+#define LP873X_TOP_STAT_TDIE_SD_STAT		BIT(3)
+#define LP873X_TOP_STAT_TDIE_WARN_STAT		BIT(2)
+#define LP873X_TOP_STAT_OVP_STAT		BIT(1)
+
+#define LP873X_BUCK_STAT_BUCK1_STAT		BIT(7)
+#define LP873X_BUCK_STAT_BUCK1_PG_STAT		BIT(6)
+#define LP873X_BUCK_STAT_BUCK1_ILIM_STAT	BIT(4)
+#define LP873X_BUCK_STAT_BUCK0_STAT		BIT(3)
+#define LP873X_BUCK_STAT_BUCK0_PG_STAT		BIT(2)
+#define LP873X_BUCK_STAT_BUCK0_ILIM_STAT	BIT(0)
+
+#define LP873X_LDO_STAT_LDO1_STAT		BIT(7)
+#define LP873X_LDO_STAT_LDO1_PG_STAT		BIT(6)
+#define LP873X_LDO_STAT_LDO1_ILIM_STAT		BIT(4)
+#define LP873X_LDO_STAT_LDO0_STAT		BIT(3)
+#define LP873X_LDO_STAT_LDO0_PG_STAT		BIT(2)
+#define LP873X_LDO_STAT_LDO0_ILIM_STAT		BIT(0)
+
+#define LP873X_TOP_MASK_1_PGOOD_INT_MASK	BIT(7)
+#define LP873X_TOP_MASK_1_SYNC_CLK_MASK	BIT(4)
+#define LP873X_TOP_MASK_1_TDIE_WARN_MASK	BIT(2)
+#define LP873X_TOP_MASK_1_I_MEAS_MASK		BIT(0)
+
+#define LP873X_TOP_MASK_2_RESET_REG_MASK	BIT(0)
+
+#define LP873X_BUCK_MASK_BUCK1_PGF_MASK	BIT(7)
+#define LP873X_BUCK_MASK_BUCK1_PGR_MASK	BIT(6)
+#define LP873X_BUCK_MASK_BUCK1_ILIM_MASK	BIT(4)
+#define LP873X_BUCK_MASK_BUCK0_PGF_MASK	BIT(3)
+#define LP873X_BUCK_MASK_BUCK0_PGR_MASK	BIT(2)
+#define LP873X_BUCK_MASK_BUCK0_ILIM_MASK	BIT(0)
+
+#define LP873X_LDO_MASK_LDO1_PGF_MASK		BIT(7)
+#define LP873X_LDO_MASK_LDO1_PGR_MASK		BIT(6)
+#define LP873X_LDO_MASK_LDO1_ILIM_MASK		BIT(4)
+#define LP873X_LDO_MASK_LDO0_PGF_MASK		BIT(3)
+#define LP873X_LDO_MASK_LDO0_PGR_MASK		BIT(2)
+#define LP873X_LDO_MASK_LDO0_ILIM_MASK		BIT(0)
+
+#define LP873X_SEL_I_LOAD_CURRENT_BUCK_SELECT	BIT(0)
+
+#define LP873X_I_LOAD_2_BUCK_LOAD_CURRENT	BIT(0)
+
+#define LP873X_I_LOAD_1_BUCK_LOAD_CURRENT	0xFF
+
+#define LP873X_MAX_REG_ID		LP873X_LDO_1
+
+/* Number of step-down converters available */
+#define LP873X_NUM_BUCK		2
+/* Number of LDO voltage regulators available */
+#define LP873X_NUM_LDO		2
+/* Number of total regulators available */
+#define LP873X_NUM_REGULATOR		(LP873X_NUM_BUCK + LP873X_NUM_LDO)
+
+enum lp873x_regulator_id {
+	/* BUCK's */
+	LP873X_BUCK_0,
+	LP873X_BUCK_1,
+	/* LDOs */
+	LP873X_LDO_0,
+	LP873X_LDO_1,
+};
+
+/**
+ * struct lp873x - state holder for the lp873x driver
+ * @dev: struct device pointer for MFD device
+ * @rev: revision of the lp873x
+ * @lock: lock guarding the data structure
+ * @regmap: register map of the lp873x PMIC
+ *
+ * Device data may be used to access the LP873X chip
+ */
+struct lp873x {
+	struct device *dev;
+	u8 rev;
+	struct mutex lock;	/* lock guarding the data structure */
+	struct regmap *regmap;
+};
+#endif /* __LINUX_MFD_LP873X_H */
-- 
cgit v1.2.3


From 83f141030cec8861969121582f13ab2caff5c4ba Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Wed, 31 Aug 2016 14:28:11 +0530
Subject: gpio: lp873x: Add support for General Purpose Outputs

Add driver for lp873x PMIC family GPOs. Two GPOs are supported
and can be configured in Open-drain output or Push-pull output.

Signed-off-by: Keerthy <j-keerthy@ti.com>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/gpio/Kconfig       |  10 +++
 drivers/gpio/Makefile      |   1 +
 drivers/gpio/gpio-lp873x.c | 193 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 204 insertions(+)
 create mode 100644 drivers/gpio/gpio-lp873x.c

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 98dd47a30fc7..346e9a9b4397 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -874,6 +874,16 @@ config GPIO_LP3943
 	  LP3943 can be used as a GPIO expander which provides up to 16 GPIOs.
 	  Open drain outputs are required for this usage.
 
+config GPIO_LP873X
+	tristate "TI LP873X GPO"
+	depends on MFD_TI_LP873X
+	help
+	  This driver supports the GPO on TI Lp873x PMICs. 2 GPOs are present
+	  on LP873X PMICs.
+
+	  This driver can also be built as a module. If so, the module will be
+          called gpio-lp873x.
+
 config GPIO_MAX77620
 	tristate "GPIO support for PMIC MAX77620 and MAX20024"
 	depends on MFD_MAX77620
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 2a035ed8f168..d60432f2a126 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -56,6 +56,7 @@ obj-$(CONFIG_GPIO_LOONGSON)	+= gpio-loongson.o
 obj-$(CONFIG_GPIO_LP3943)	+= gpio-lp3943.o
 obj-$(CONFIG_GPIO_LPC18XX)	+= gpio-lpc18xx.o
 obj-$(CONFIG_ARCH_LPC32XX)	+= gpio-lpc32xx.o
+obj-$(CONFIG_GPIO_LP873X)	+= gpio-lp873x.o
 obj-$(CONFIG_GPIO_LYNXPOINT)	+= gpio-lynxpoint.o
 obj-$(CONFIG_GPIO_MAX730X)	+= gpio-max730x.o
 obj-$(CONFIG_GPIO_MAX7300)	+= gpio-max7300.o
diff --git a/drivers/gpio/gpio-lp873x.c b/drivers/gpio/gpio-lp873x.c
new file mode 100644
index 000000000000..f10d49da1554
--- /dev/null
+++ b/drivers/gpio/gpio-lp873x.c
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2016 Texas Instruments Incorporated - http://www.ti.com/
+ *	Keerthy <j-keerthy@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether expressed or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.
+ *
+ * Based on the TPS65218 driver
+ */
+
+#include <linux/gpio.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+
+#include <linux/mfd/lp873x.h>
+
+#define BITS_PER_GPO		0x4
+#define LP873X_GPO_CTRL_OD	0x2
+
+struct lp873x_gpio {
+	struct gpio_chip chip;
+	struct lp873x *lp873;
+};
+
+static int lp873x_gpio_get_direction(struct gpio_chip *chip,
+				     unsigned int offset)
+{
+	/* This device is output only */
+	return 0;
+}
+
+static int lp873x_gpio_direction_input(struct gpio_chip *chip,
+				       unsigned int offset)
+{
+	/* This device is output only */
+	return -EINVAL;
+}
+
+static int lp873x_gpio_direction_output(struct gpio_chip *chip,
+					unsigned int offset, int value)
+{
+	struct lp873x_gpio *gpio = gpiochip_get_data(chip);
+
+	/* Set the initial value */
+	return regmap_update_bits(gpio->lp873->regmap, LP873X_REG_GPO_CTRL,
+				  BIT(offset * BITS_PER_GPO),
+				  value ? BIT(offset * BITS_PER_GPO) : 0);
+}
+
+static int lp873x_gpio_get(struct gpio_chip *chip, unsigned int offset)
+{
+	struct lp873x_gpio *gpio = gpiochip_get_data(chip);
+	int ret, val;
+
+	ret = regmap_read(gpio->lp873->regmap, LP873X_REG_GPO_CTRL, &val);
+	if (ret < 0)
+		return ret;
+
+	return val & BIT(offset * BITS_PER_GPO);
+}
+
+static void lp873x_gpio_set(struct gpio_chip *chip, unsigned int offset,
+			    int value)
+{
+	struct lp873x_gpio *gpio = gpiochip_get_data(chip);
+
+	regmap_update_bits(gpio->lp873->regmap, LP873X_REG_GPO_CTRL,
+			   BIT(offset * BITS_PER_GPO),
+			   value ? BIT(offset * BITS_PER_GPO) : 0);
+}
+
+static int lp873x_gpio_request(struct gpio_chip *gc, unsigned int offset)
+{
+	struct lp873x_gpio *gpio = gpiochip_get_data(gc);
+	int ret;
+
+	switch (offset) {
+	case 0:
+		/* No MUX Set up Needed for GPO */
+		break;
+	case 1:
+		/* Setup the CLKIN_PIN_SEL MUX to GPO2 */
+		ret = regmap_update_bits(gpio->lp873->regmap, LP873X_REG_CONFIG,
+					 LP873X_CONFIG_CLKIN_PIN_SEL, 0);
+		if (ret)
+			return ret;
+
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int lp873x_gpio_set_single_ended(struct gpio_chip *gc,
+					unsigned int offset,
+					enum single_ended_mode mode)
+{
+	struct lp873x_gpio *gpio = gpiochip_get_data(gc);
+
+	switch (mode) {
+	case LINE_MODE_OPEN_DRAIN:
+		return regmap_update_bits(gpio->lp873->regmap,
+					  LP873X_REG_GPO_CTRL,
+					  BIT(offset * BITS_PER_GPO +
+					  LP873X_GPO_CTRL_OD),
+					  BIT(offset * BITS_PER_GPO +
+					  LP873X_GPO_CTRL_OD));
+	case LINE_MODE_PUSH_PULL:
+		return regmap_update_bits(gpio->lp873->regmap,
+					  LP873X_REG_GPO_CTRL,
+					  BIT(offset * BITS_PER_GPO +
+					  LP873X_GPO_CTRL_OD), 0);
+	default:
+		return -ENOTSUPP;
+	}
+}
+
+static struct gpio_chip template_chip = {
+	.label			= "lp873x-gpio",
+	.owner			= THIS_MODULE,
+	.request		= lp873x_gpio_request,
+	.get_direction		= lp873x_gpio_get_direction,
+	.direction_input	= lp873x_gpio_direction_input,
+	.direction_output	= lp873x_gpio_direction_output,
+	.get			= lp873x_gpio_get,
+	.set			= lp873x_gpio_set,
+	.set_single_ended	= lp873x_gpio_set_single_ended,
+	.base			= -1,
+	.ngpio			= 2,
+	.can_sleep		= true,
+};
+
+static int lp873x_gpio_probe(struct platform_device *pdev)
+{
+	struct lp873x_gpio *gpio;
+	int ret;
+
+	gpio = devm_kzalloc(&pdev->dev, sizeof(*gpio), GFP_KERNEL);
+	if (!gpio)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, gpio);
+
+	gpio->lp873 = dev_get_drvdata(pdev->dev.parent);
+	gpio->chip = template_chip;
+	gpio->chip.parent = gpio->lp873->dev;
+
+	ret = gpiochip_add_data(&gpio->chip, gpio);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Could not register gpiochip, %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int lp873x_gpio_remove(struct platform_device *pdev)
+{
+	struct lp873x_gpio *gpio = platform_get_drvdata(pdev);
+
+	gpiochip_remove(&gpio->chip);
+
+	return 0;
+}
+
+static const struct platform_device_id lp873x_gpio_id_table[] = {
+	{ "lp873x-gpio", },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(platform, lp873x_gpio_id_table);
+
+static struct platform_driver lp873x_gpio_driver = {
+	.driver = {
+		.name = "lp873x-gpio",
+	},
+	.probe = lp873x_gpio_probe,
+	.remove = lp873x_gpio_remove,
+	.id_table = lp873x_gpio_id_table,
+};
+module_platform_driver(lp873x_gpio_driver);
+
+MODULE_AUTHOR("Keerthy <j-keerthy@ti.com>");
+MODULE_DESCRIPTION("LP873X GPIO driver");
+MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3


From f618ed218dc01207882a8f02b1310c7daaf5156a Mon Sep 17 00:00:00 2001
From: Keerthy <j-keerthy@ti.com>
Date: Wed, 31 Aug 2016 14:28:12 +0530
Subject: regulator: lp873x: Change the MFD config option as per latest naming

Change the MFD config option as per latest naming

Signed-off-by: Keerthy <j-keerthy@ti.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/regulator/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 6c88e31c01f7..97dc4cc3ef43 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -323,7 +323,7 @@ config REGULATOR_LP872X
 
 config REGULATOR_LP873X
 	tristate "TI LP873X Power regulators"
-	depends on MFD_LP873X && OF
+	depends on MFD_TI_LP873X && OF
 	help
 	  This driver supports LP873X voltage regulator chips. LP873X
 	  provides two step-down converters and two general-purpose LDO
-- 
cgit v1.2.3


From 2eedcbfc0612c87e22c6325fde49ecf140e5873a Mon Sep 17 00:00:00 2001
From: Wadim Egorov <w.egorov@phytec.de>
Date: Mon, 29 Aug 2016 13:07:58 +0200
Subject: mfd: rk808: Add RK818 support

The RK818 chip is a Power Management IC (PMIC) for multimedia and handheld
devices. It contains the following components:

- Regulators
- RTC
- Clocking
- Battery support

Both RK808 and RK818 chips are using a similar register map,
so we can reuse the RTC and Clocking functionality.

Signed-off-by: Wadim Egorov <w.egorov@phytec.de>
Tested-by: Andy Yan <andy.yan@rock-chips.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/Kconfig       |   4 +-
 drivers/mfd/rk808.c       | 226 +++++++++++++++++++++++++++++++++++++++-------
 include/linux/mfd/rk808.h | 154 +++++++++++++++++++++++++++++--
 3 files changed, 342 insertions(+), 42 deletions(-)

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 2d1fb6420592..a55be9520b76 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -852,13 +852,13 @@ config MFD_RC5T583
 	  different functionality of the device.
 
 config MFD_RK808
-	tristate "Rockchip RK808 Power Management chip"
+	tristate "Rockchip RK808/RK818 Power Management Chip"
 	depends on I2C && OF
 	select MFD_CORE
 	select REGMAP_I2C
 	select REGMAP_IRQ
 	help
-	  If you say yes here you get support for the RK808
+	  If you say yes here you get support for the RK808 and RK818
 	  Power Management chips.
 	  This driver provides common support for accessing the device
 	  through I2C interface. The device supports multiple sub-devices
diff --git a/drivers/mfd/rk808.c b/drivers/mfd/rk808.c
index 49d7f624fc94..0f8acc5882a4 100644
--- a/drivers/mfd/rk808.c
+++ b/drivers/mfd/rk808.c
@@ -1,11 +1,15 @@
 /*
- * MFD core driver for Rockchip RK808
+ * MFD core driver for Rockchip RK808/RK818
  *
  * Copyright (c) 2014, Fuzhou Rockchip Electronics Co., Ltd
  *
  * Author: Chris Zhong <zyw@rock-chips.com>
  * Author: Zhang Qing <zhangqing@rock-chips.com>
  *
+ * Copyright (C) 2016 PHYTEC Messtechnik GmbH
+ *
+ * Author: Wadim Egorov <w.egorov@phytec.de>
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
  * version 2, as published by the Free Software Foundation.
@@ -21,6 +25,7 @@
 #include <linux/mfd/rk808.h>
 #include <linux/mfd/core.h>
 #include <linux/module.h>
+#include <linux/of_device.h>
 #include <linux/regmap.h>
 
 struct rk808_reg_data {
@@ -57,6 +62,14 @@ static bool rk808_is_volatile_reg(struct device *dev, unsigned int reg)
 	return false;
 }
 
+static const struct regmap_config rk818_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = RK818_USB_CTRL_REG,
+	.cache_type = REGCACHE_RBTREE,
+	.volatile_reg = rk808_is_volatile_reg,
+};
+
 static const struct regmap_config rk808_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
@@ -79,11 +92,21 @@ static const struct mfd_cell rk808s[] = {
 	{
 		.name = "rk808-rtc",
 		.num_resources = ARRAY_SIZE(rtc_resources),
-		.resources = &rtc_resources[0],
+		.resources = rtc_resources,
 	},
 };
 
-static const struct rk808_reg_data pre_init_reg[] = {
+static const struct mfd_cell rk818s[] = {
+	{ .name = "rk808-clkout", },
+	{ .name = "rk808-regulator", },
+	{
+		.name = "rk808-rtc",
+		.num_resources = ARRAY_SIZE(rtc_resources),
+		.resources = rtc_resources,
+	},
+};
+
+static const struct rk808_reg_data rk808_pre_init_reg[] = {
 	{ RK808_BUCK3_CONFIG_REG, BUCK_ILMIN_MASK,  BUCK_ILMIN_150MA },
 	{ RK808_BUCK4_CONFIG_REG, BUCK_ILMIN_MASK,  BUCK_ILMIN_200MA },
 	{ RK808_BOOST_CONFIG_REG, BOOST_ILMIN_MASK, BOOST_ILMIN_100MA },
@@ -94,6 +117,24 @@ static const struct rk808_reg_data pre_init_reg[] = {
 						    VB_LO_SEL_3500MV },
 };
 
+static const struct rk808_reg_data rk818_pre_init_reg[] = {
+	/* improve efficiency */
+	{ RK818_BUCK2_CONFIG_REG, BUCK2_RATE_MASK,  BUCK_ILMIN_250MA },
+	{ RK818_BUCK4_CONFIG_REG, BUCK_ILMIN_MASK,  BUCK_ILMIN_250MA },
+	{ RK818_BOOST_CONFIG_REG, BOOST_ILMIN_MASK, BOOST_ILMIN_100MA },
+	{ RK818_USB_CTRL_REG,	  RK818_USB_ILIM_SEL_MASK,
+						    RK818_USB_ILMIN_2000MA },
+	/* close charger when usb lower then 3.4V */
+	{ RK818_USB_CTRL_REG,	  RK818_USB_CHG_SD_VSEL_MASK,
+						    (0x7 << 4) },
+	/* no action when vref */
+	{ RK818_H5V_EN_REG,	  BIT(1),	    RK818_REF_RDY_CTRL },
+	/* enable HDMI 5V */
+	{ RK818_H5V_EN_REG,	  BIT(0),	    RK818_H5V_EN },
+	{ RK808_VB_MON_REG,	  MASK_ALL,	    VB_LO_ACT |
+						    VB_LO_SEL_3500MV },
+};
+
 static const struct regmap_irq rk808_irqs[] = {
 	/* INT_STS */
 	[RK808_IRQ_VOUT_LO] = {
@@ -136,6 +177,76 @@ static const struct regmap_irq rk808_irqs[] = {
 	},
 };
 
+static const struct regmap_irq rk818_irqs[] = {
+	/* INT_STS */
+	[RK818_IRQ_VOUT_LO] = {
+		.mask = RK818_IRQ_VOUT_LO_MSK,
+		.reg_offset = 0,
+	},
+	[RK818_IRQ_VB_LO] = {
+		.mask = RK818_IRQ_VB_LO_MSK,
+		.reg_offset = 0,
+	},
+	[RK818_IRQ_PWRON] = {
+		.mask = RK818_IRQ_PWRON_MSK,
+		.reg_offset = 0,
+	},
+	[RK818_IRQ_PWRON_LP] = {
+		.mask = RK818_IRQ_PWRON_LP_MSK,
+		.reg_offset = 0,
+	},
+	[RK818_IRQ_HOTDIE] = {
+		.mask = RK818_IRQ_HOTDIE_MSK,
+		.reg_offset = 0,
+	},
+	[RK818_IRQ_RTC_ALARM] = {
+		.mask = RK818_IRQ_RTC_ALARM_MSK,
+		.reg_offset = 0,
+	},
+	[RK818_IRQ_RTC_PERIOD] = {
+		.mask = RK818_IRQ_RTC_PERIOD_MSK,
+		.reg_offset = 0,
+	},
+	[RK818_IRQ_USB_OV] = {
+		.mask = RK818_IRQ_USB_OV_MSK,
+		.reg_offset = 0,
+	},
+
+	/* INT_STS2 */
+	[RK818_IRQ_PLUG_IN] = {
+		.mask = RK818_IRQ_PLUG_IN_MSK,
+		.reg_offset = 1,
+	},
+	[RK818_IRQ_PLUG_OUT] = {
+		.mask = RK818_IRQ_PLUG_OUT_MSK,
+		.reg_offset = 1,
+	},
+	[RK818_IRQ_CHG_OK] = {
+		.mask = RK818_IRQ_CHG_OK_MSK,
+		.reg_offset = 1,
+	},
+	[RK818_IRQ_CHG_TE] = {
+		.mask = RK818_IRQ_CHG_TE_MSK,
+		.reg_offset = 1,
+	},
+	[RK818_IRQ_CHG_TS1] = {
+		.mask = RK818_IRQ_CHG_TS1_MSK,
+		.reg_offset = 1,
+	},
+	[RK818_IRQ_TS2] = {
+		.mask = RK818_IRQ_TS2_MSK,
+		.reg_offset = 1,
+	},
+	[RK818_IRQ_CHG_CVTLIM] = {
+		.mask = RK818_IRQ_CHG_CVTLIM_MSK,
+		.reg_offset = 1,
+	},
+	[RK818_IRQ_DISCHG_ILIM] = {
+		.mask = RK818_IRQ_DISCHG_ILIM_MSK,
+		.reg_offset = 1,
+	},
+};
+
 static struct regmap_irq_chip rk808_irq_chip = {
 	.name = "rk808",
 	.irqs = rk808_irqs,
@@ -148,6 +259,18 @@ static struct regmap_irq_chip rk808_irq_chip = {
 	.init_ack_masked = true,
 };
 
+static struct regmap_irq_chip rk818_irq_chip = {
+	.name = "rk818",
+	.irqs = rk818_irqs,
+	.num_irqs = ARRAY_SIZE(rk818_irqs),
+	.num_regs = 2,
+	.irq_reg_stride = 2,
+	.status_base = RK818_INT_STS_REG1,
+	.mask_base = RK818_INT_STS_MSK_REG1,
+	.ack_base = RK818_INT_STS_REG1,
+	.init_ack_masked = true,
+};
+
 static struct i2c_client *rk808_i2c_client;
 static void rk808_device_shutdown(void)
 {
@@ -167,55 +290,100 @@ static void rk808_device_shutdown(void)
 		dev_err(&rk808_i2c_client->dev, "power off error!\n");
 }
 
+static const struct of_device_id rk808_of_match[] = {
+	{ .compatible = "rockchip,rk808" },
+	{ .compatible = "rockchip,rk818" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, rk808_of_match);
+
 static int rk808_probe(struct i2c_client *client,
 		       const struct i2c_device_id *id)
 {
 	struct device_node *np = client->dev.of_node;
 	struct rk808 *rk808;
+	const struct rk808_reg_data *pre_init_reg;
+	const struct mfd_cell *cells;
+	int nr_pre_init_regs;
+	int nr_cells;
 	int pm_off = 0;
 	int ret;
 	int i;
 
-	if (!client->irq) {
-		dev_err(&client->dev, "No interrupt support, no core IRQ\n");
-		return -EINVAL;
-	}
-
 	rk808 = devm_kzalloc(&client->dev, sizeof(*rk808), GFP_KERNEL);
 	if (!rk808)
 		return -ENOMEM;
 
-	rk808->regmap = devm_regmap_init_i2c(client, &rk808_regmap_config);
+	rk808->variant = i2c_smbus_read_word_data(client, RK808_ID_MSB);
+	if (rk808->variant < 0) {
+		dev_err(&client->dev, "Failed to read the chip id at 0x%02x\n",
+			RK808_ID_MSB);
+		return rk808->variant;
+	}
+
+	dev_dbg(&client->dev, "Chip id: 0x%x\n", (unsigned int)rk808->variant);
+
+	switch (rk808->variant) {
+	case RK808_ID:
+		rk808->regmap_cfg = &rk808_regmap_config;
+		rk808->regmap_irq_chip = &rk808_irq_chip;
+		pre_init_reg = rk808_pre_init_reg;
+		nr_pre_init_regs = ARRAY_SIZE(rk808_pre_init_reg);
+		cells = rk808s;
+		nr_cells = ARRAY_SIZE(rk808s);
+		break;
+	case RK818_ID:
+		rk808->regmap_cfg = &rk818_regmap_config;
+		rk808->regmap_irq_chip = &rk818_irq_chip;
+		pre_init_reg = rk818_pre_init_reg;
+		nr_pre_init_regs = ARRAY_SIZE(rk818_pre_init_reg);
+		cells = rk818s;
+		nr_cells = ARRAY_SIZE(rk818s);
+		break;
+	default:
+		dev_err(&client->dev, "Unsupported RK8XX ID %lu\n",
+			rk808->variant);
+		return -EINVAL;
+	}
+
+	rk808->i2c = client;
+	i2c_set_clientdata(client, rk808);
+
+	rk808->regmap = devm_regmap_init_i2c(client, rk808->regmap_cfg);
 	if (IS_ERR(rk808->regmap)) {
 		dev_err(&client->dev, "regmap initialization failed\n");
 		return PTR_ERR(rk808->regmap);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(pre_init_reg); i++) {
-		ret = regmap_update_bits(rk808->regmap, pre_init_reg[i].addr,
-					 pre_init_reg[i].mask,
-					 pre_init_reg[i].value);
-		if (ret) {
-			dev_err(&client->dev,
-				"0x%x write err\n", pre_init_reg[i].addr);
-			return ret;
-		}
+	if (!client->irq) {
+		dev_err(&client->dev, "No interrupt support, no core IRQ\n");
+		return -EINVAL;
 	}
 
 	ret = regmap_add_irq_chip(rk808->regmap, client->irq,
 				  IRQF_ONESHOT, -1,
-				  &rk808_irq_chip, &rk808->irq_data);
+				  rk808->regmap_irq_chip, &rk808->irq_data);
 	if (ret) {
 		dev_err(&client->dev, "Failed to add irq_chip %d\n", ret);
 		return ret;
 	}
 
-	rk808->i2c = client;
-	i2c_set_clientdata(client, rk808);
+	for (i = 0; i < nr_pre_init_regs; i++) {
+		ret = regmap_update_bits(rk808->regmap,
+					pre_init_reg[i].addr,
+					pre_init_reg[i].mask,
+					pre_init_reg[i].value);
+		if (ret) {
+			dev_err(&client->dev,
+				"0x%x write err\n",
+				pre_init_reg[i].addr);
+			return ret;
+		}
+	}
 
-	ret = devm_mfd_add_devices(&client->dev, -1,
-				   rk808s, ARRAY_SIZE(rk808s), NULL, 0,
-				   regmap_irq_get_domain(rk808->irq_data));
+	ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_NONE,
+			      cells, nr_cells, NULL, 0,
+			      regmap_irq_get_domain(rk808->irq_data));
 	if (ret) {
 		dev_err(&client->dev, "failed to add MFD devices %d\n", ret);
 		goto err_irq;
@@ -245,14 +413,9 @@ static int rk808_remove(struct i2c_client *client)
 	return 0;
 }
 
-static const struct of_device_id rk808_of_match[] = {
-	{ .compatible = "rockchip,rk808" },
-	{ },
-};
-MODULE_DEVICE_TABLE(of, rk808_of_match);
-
 static const struct i2c_device_id rk808_ids[] = {
 	{ "rk808" },
+	{ "rk818" },
 	{ },
 };
 MODULE_DEVICE_TABLE(i2c, rk808_ids);
@@ -272,4 +435,5 @@ module_i2c_driver(rk808_i2c_driver);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Chris Zhong <zyw@rock-chips.com>");
 MODULE_AUTHOR("Zhang Qing <zhangqing@rock-chips.com>");
-MODULE_DESCRIPTION("RK808 PMIC driver");
+MODULE_AUTHOR("Wadim Egorov <w.egorov@phytec.de>");
+MODULE_DESCRIPTION("RK808/RK818 PMIC driver");
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index 441b6ee72691..fc5db6fcb57d 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -1,11 +1,15 @@
 /*
- * rk808.h for Rockchip RK808
+ * Register definitions for Rockchip's RK808/RK818 PMIC
  *
  * Copyright (c) 2014, Fuzhou Rockchip Electronics Co., Ltd
  *
  * Author: Chris Zhong <zyw@rock-chips.com>
  * Author: Zhang Qing <zhangqing@rock-chips.com>
  *
+ * Copyright (C) 2016 PHYTEC Messtechnik GmbH
+ *
+ * Author: Wadim Egorov <w.egorov@phytec.de>
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
  * version 2, as published by the Free Software Foundation.
@@ -16,8 +20,8 @@
  * more details.
  */
 
-#ifndef __LINUX_REGULATOR_rk808_H
-#define __LINUX_REGULATOR_rk808_H
+#ifndef __LINUX_REGULATOR_RK808_H
+#define __LINUX_REGULATOR_RK808_H
 
 #include <linux/regulator/machine.h>
 #include <linux/regmap.h>
@@ -28,7 +32,7 @@
 
 #define RK808_DCDC1	0 /* (0+RK808_START) */
 #define RK808_LDO1	4 /* (4+RK808_START) */
-#define RK808_NUM_REGULATORS   14
+#define RK808_NUM_REGULATORS	14
 
 enum rk808_reg {
 	RK808_ID_DCDC1,
@@ -65,6 +69,8 @@ enum rk808_reg {
 #define RK808_RTC_INT_REG	0x12
 #define RK808_RTC_COMP_LSB_REG	0x13
 #define RK808_RTC_COMP_MSB_REG	0x14
+#define RK808_ID_MSB		0x17
+#define RK808_ID_LSB		0x18
 #define RK808_CLK32OUT_REG	0x20
 #define RK808_VB_MON_REG	0x21
 #define RK808_THERMAL_REG	0x22
@@ -115,7 +121,92 @@ enum rk808_reg {
 #define RK808_INT_STS_MSK_REG2	0x4f
 #define RK808_IO_POL_REG	0x50
 
-/* IRQ Definitions */
+/* RK818 */
+#define RK818_DCDC1			0
+#define RK818_LDO1			4
+#define RK818_NUM_REGULATORS		17
+
+enum rk818_reg {
+	RK818_ID_DCDC1,
+	RK818_ID_DCDC2,
+	RK818_ID_DCDC3,
+	RK818_ID_DCDC4,
+	RK818_ID_BOOST,
+	RK818_ID_LDO1,
+	RK818_ID_LDO2,
+	RK818_ID_LDO3,
+	RK818_ID_LDO4,
+	RK818_ID_LDO5,
+	RK818_ID_LDO6,
+	RK818_ID_LDO7,
+	RK818_ID_LDO8,
+	RK818_ID_LDO9,
+	RK818_ID_SWITCH,
+	RK818_ID_HDMI_SWITCH,
+	RK818_ID_OTG_SWITCH,
+};
+
+#define RK818_DCDC_EN_REG		0x23
+#define RK818_LDO_EN_REG		0x24
+#define RK818_SLEEP_SET_OFF_REG1	0x25
+#define RK818_SLEEP_SET_OFF_REG2	0x26
+#define RK818_DCDC_UV_STS_REG		0x27
+#define RK818_DCDC_UV_ACT_REG		0x28
+#define RK818_LDO_UV_STS_REG		0x29
+#define RK818_LDO_UV_ACT_REG		0x2a
+#define RK818_DCDC_PG_REG		0x2b
+#define RK818_LDO_PG_REG		0x2c
+#define RK818_VOUT_MON_TDB_REG		0x2d
+#define RK818_BUCK1_CONFIG_REG		0x2e
+#define RK818_BUCK1_ON_VSEL_REG		0x2f
+#define RK818_BUCK1_SLP_VSEL_REG	0x30
+#define RK818_BUCK2_CONFIG_REG		0x32
+#define RK818_BUCK2_ON_VSEL_REG		0x33
+#define RK818_BUCK2_SLP_VSEL_REG	0x34
+#define RK818_BUCK3_CONFIG_REG		0x36
+#define RK818_BUCK4_CONFIG_REG		0x37
+#define RK818_BUCK4_ON_VSEL_REG		0x38
+#define RK818_BUCK4_SLP_VSEL_REG	0x39
+#define RK818_BOOST_CONFIG_REG		0x3a
+#define RK818_LDO1_ON_VSEL_REG		0x3b
+#define RK818_LDO1_SLP_VSEL_REG		0x3c
+#define RK818_LDO2_ON_VSEL_REG		0x3d
+#define RK818_LDO2_SLP_VSEL_REG		0x3e
+#define RK818_LDO3_ON_VSEL_REG		0x3f
+#define RK818_LDO3_SLP_VSEL_REG		0x40
+#define RK818_LDO4_ON_VSEL_REG		0x41
+#define RK818_LDO4_SLP_VSEL_REG		0x42
+#define RK818_LDO5_ON_VSEL_REG		0x43
+#define RK818_LDO5_SLP_VSEL_REG		0x44
+#define RK818_LDO6_ON_VSEL_REG		0x45
+#define RK818_LDO6_SLP_VSEL_REG		0x46
+#define RK818_LDO7_ON_VSEL_REG		0x47
+#define RK818_LDO7_SLP_VSEL_REG		0x48
+#define RK818_LDO8_ON_VSEL_REG		0x49
+#define RK818_LDO8_SLP_VSEL_REG		0x4a
+#define RK818_BOOST_LDO9_ON_VSEL_REG	0x54
+#define RK818_BOOST_LDO9_SLP_VSEL_REG	0x55
+#define RK818_DEVCTRL_REG		0x4b
+#define RK818_INT_STS_REG1		0X4c
+#define RK818_INT_STS_MSK_REG1		0x4d
+#define RK818_INT_STS_REG2		0x4e
+#define RK818_INT_STS_MSK_REG2		0x4f
+#define RK818_IO_POL_REG		0x50
+#define RK818_H5V_EN_REG		0x52
+#define RK818_SLEEP_SET_OFF_REG3	0x53
+#define RK818_BOOST_LDO9_ON_VSEL_REG	0x54
+#define RK818_BOOST_LDO9_SLP_VSEL_REG	0x55
+#define RK818_BOOST_CTRL_REG		0x56
+#define RK818_DCDC_ILMAX		0x90
+#define RK818_USB_CTRL_REG		0xa1
+
+#define RK818_H5V_EN			BIT(0)
+#define RK818_REF_RDY_CTRL		BIT(1)
+#define RK818_USB_ILIM_SEL_MASK		0xf
+#define RK818_USB_ILMIN_2000MA		0x7
+#define RK818_USB_CHG_SD_VSEL_MASK	0x70
+
+/* RK808 IRQ Definitions */
 #define RK808_IRQ_VOUT_LO	0
 #define RK808_IRQ_VB_LO		1
 #define RK808_IRQ_PWRON		2
@@ -137,6 +228,43 @@ enum rk808_reg {
 #define RK808_IRQ_PLUG_IN_INT_MSK	BIT(0)
 #define RK808_IRQ_PLUG_OUT_INT_MSK	BIT(1)
 
+/* RK818 IRQ Definitions */
+#define RK818_IRQ_VOUT_LO	0
+#define RK818_IRQ_VB_LO		1
+#define RK818_IRQ_PWRON		2
+#define RK818_IRQ_PWRON_LP	3
+#define RK818_IRQ_HOTDIE	4
+#define RK818_IRQ_RTC_ALARM	5
+#define RK818_IRQ_RTC_PERIOD	6
+#define RK818_IRQ_USB_OV	7
+#define RK818_IRQ_PLUG_IN	8
+#define RK818_IRQ_PLUG_OUT	9
+#define RK818_IRQ_CHG_OK	10
+#define RK818_IRQ_CHG_TE	11
+#define RK818_IRQ_CHG_TS1	12
+#define RK818_IRQ_TS2		13
+#define RK818_IRQ_CHG_CVTLIM	14
+#define RK818_IRQ_DISCHG_ILIM	7
+
+#define RK818_IRQ_VOUT_LO_MSK		BIT(0)
+#define RK818_IRQ_VB_LO_MSK		BIT(1)
+#define RK818_IRQ_PWRON_MSK		BIT(2)
+#define RK818_IRQ_PWRON_LP_MSK		BIT(3)
+#define RK818_IRQ_HOTDIE_MSK		BIT(4)
+#define RK818_IRQ_RTC_ALARM_MSK		BIT(5)
+#define RK818_IRQ_RTC_PERIOD_MSK	BIT(6)
+#define RK818_IRQ_USB_OV_MSK		BIT(7)
+#define RK818_IRQ_PLUG_IN_MSK		BIT(0)
+#define RK818_IRQ_PLUG_OUT_MSK		BIT(1)
+#define RK818_IRQ_CHG_OK_MSK		BIT(2)
+#define RK818_IRQ_CHG_TE_MSK		BIT(3)
+#define RK818_IRQ_CHG_TS1_MSK		BIT(4)
+#define RK818_IRQ_TS2_MSK		BIT(5)
+#define RK818_IRQ_CHG_CVTLIM_MSK	BIT(6)
+#define RK818_IRQ_DISCHG_ILIM_MSK	BIT(7)
+
+#define RK818_NUM_IRQ		16
+
 #define RK808_VBAT_LOW_2V8	0x00
 #define RK808_VBAT_LOW_2V9	0x01
 #define RK808_VBAT_LOW_3V0	0x02
@@ -191,9 +319,17 @@ enum {
 	BOOST_ILMIN_250MA,
 };
 
+enum {
+	RK808_ID = 0x0000,
+	RK818_ID = 0x8181,
+};
+
 struct rk808 {
-	struct i2c_client *i2c;
-	struct regmap_irq_chip_data *irq_data;
-	struct regmap *regmap;
+	struct i2c_client		*i2c;
+	struct regmap_irq_chip_data	*irq_data;
+	struct regmap			*regmap;
+	long				variant;
+	const struct regmap_config	*regmap_cfg;
+	const struct regmap_irq_chip	*regmap_irq_chip;
 };
-#endif /* __LINUX_REGULATOR_rk808_H */
+#endif /* __LINUX_REGULATOR_RK808_H */
-- 
cgit v1.2.3


From 11375293530bb8434946c8c043f1adf5ffb6be10 Mon Sep 17 00:00:00 2001
From: Wadim Egorov <w.egorov@phytec.de>
Date: Mon, 29 Aug 2016 13:07:59 +0200
Subject: regulator: rk808: Add regulator driver for RK818

Add support for the rk818 regulator. The regulator module consists
of 4 DCDCs, 9 LDOs, 1 switch and 1 BOOST converter which is used to
power OTG and HDMI5V.

The output voltages are configurable and are meant to supply power
to the main processor and other components.

Signed-off-by: Wadim Egorov <w.egorov@phytec.de>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/regulator/Kconfig           |   4 +-
 drivers/regulator/rk808-regulator.c | 143 ++++++++++++++++++++++++++++++++++--
 2 files changed, 138 insertions(+), 9 deletions(-)

diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 6c88e31c01f7..d3a86a8cce86 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -635,11 +635,11 @@ config REGULATOR_RC5T583
 	  outputs which can be controlled by i2c communication.
 
 config REGULATOR_RK808
-	tristate "Rockchip RK808 Power regulators"
+	tristate "Rockchip RK808/RK818 Power regulators"
 	depends on MFD_RK808
 	help
 	  Select this option to enable the power regulator of ROCKCHIP
-	  PMIC RK808.
+	  PMIC RK808 and RK818.
 	  This driver supports the control of different power rails of device
 	  through regulator interface. The device supports multiple DCDC/LDO
 	  outputs which can be controlled by i2c communication.
diff --git a/drivers/regulator/rk808-regulator.c b/drivers/regulator/rk808-regulator.c
index 40d07ba036e7..5f412a5e35e3 100644
--- a/drivers/regulator/rk808-regulator.c
+++ b/drivers/regulator/rk808-regulator.c
@@ -1,11 +1,15 @@
 /*
- * Regulator driver for Rockchip RK808
+ * Regulator driver for Rockchip RK808/RK818
  *
  * Copyright (c) 2014, Fuzhou Rockchip Electronics Co., Ltd
  *
  * Author: Chris Zhong <zyw@rock-chips.com>
  * Author: Zhang Qing <zhangqing@rock-chips.com>
  *
+ * Copyright (C) 2016 PHYTEC Messtechnik GmbH
+ *
+ * Author: Wadim Egorov <w.egorov@phytec.de>
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
  * version 2, as published by the Free Software Foundation.
@@ -32,6 +36,12 @@
 #define RK808_BUCK4_VSEL_MASK	0xf
 #define RK808_LDO_VSEL_MASK	0x1f
 
+#define RK818_BUCK_VSEL_MASK		0x3f
+#define RK818_BUCK4_VSEL_MASK		0x1f
+#define RK818_LDO_VSEL_MASK		0x1f
+#define RK818_LDO3_ON_VSEL_MASK		0xf
+#define RK818_BOOST_ON_VSEL_MASK	0xe0
+
 /* Ramp rate definitions for buck1 / buck2 only */
 #define RK808_RAMP_RATE_OFFSET		3
 #define RK808_RAMP_RATE_MASK		(3 << RK808_RAMP_RATE_OFFSET)
@@ -454,6 +464,108 @@ static const struct regulator_desc rk808_reg[] = {
 		RK808_DCDC_EN_REG, BIT(6)),
 };
 
+static const struct regulator_desc rk818_reg[] = {
+	{
+		.name = "DCDC_REG1",
+		.supply_name = "vcc1",
+		.of_match = of_match_ptr("DCDC_REG1"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK818_ID_DCDC1,
+		.ops = &rk808_reg_ops,
+		.type = REGULATOR_VOLTAGE,
+		.min_uV = 712500,
+		.uV_step = 12500,
+		.n_voltages = 64,
+		.vsel_reg = RK818_BUCK1_ON_VSEL_REG,
+		.vsel_mask = RK818_BUCK_VSEL_MASK,
+		.enable_reg = RK818_DCDC_EN_REG,
+		.enable_mask = BIT(0),
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG2",
+		.supply_name = "vcc2",
+		.of_match = of_match_ptr("DCDC_REG2"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK818_ID_DCDC2,
+		.ops = &rk808_reg_ops,
+		.type = REGULATOR_VOLTAGE,
+		.min_uV = 712500,
+		.uV_step = 12500,
+		.n_voltages = 64,
+		.vsel_reg = RK818_BUCK2_ON_VSEL_REG,
+		.vsel_mask = RK818_BUCK_VSEL_MASK,
+		.enable_reg = RK818_DCDC_EN_REG,
+		.enable_mask = BIT(1),
+		.owner = THIS_MODULE,
+	}, {
+		.name = "DCDC_REG3",
+		.supply_name = "vcc3",
+		.of_match = of_match_ptr("DCDC_REG3"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK818_ID_DCDC3,
+		.ops = &rk808_switch_ops,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = 1,
+		.enable_reg = RK818_DCDC_EN_REG,
+		.enable_mask = BIT(2),
+		.owner = THIS_MODULE,
+	},
+	RK8XX_DESC(RK818_ID_DCDC4, "DCDC_REG4", "vcc4", 1800, 3600, 100,
+		RK818_BUCK4_ON_VSEL_REG, RK818_BUCK4_VSEL_MASK,
+		RK818_DCDC_EN_REG, BIT(3), 0),
+	RK8XX_DESC(RK818_ID_BOOST, "DCDC_BOOST", "boost", 4700, 5400, 100,
+		RK818_BOOST_LDO9_ON_VSEL_REG, RK818_BOOST_ON_VSEL_MASK,
+		RK818_DCDC_EN_REG, BIT(4), 0),
+	RK8XX_DESC(RK818_ID_LDO1, "LDO_REG1", "vcc6", 1800, 3400, 100,
+		RK818_LDO1_ON_VSEL_REG, RK818_LDO_VSEL_MASK, RK818_LDO_EN_REG,
+		BIT(0), 400),
+	RK8XX_DESC(RK818_ID_LDO2, "LDO_REG2", "vcc6", 1800, 3400, 100,
+		RK818_LDO1_ON_VSEL_REG, RK818_LDO_VSEL_MASK, RK818_LDO_EN_REG,
+		BIT(1), 400),
+	{
+		.name = "LDO_REG3",
+		.supply_name = "vcc7",
+		.of_match = of_match_ptr("LDO_REG3"),
+		.regulators_node = of_match_ptr("regulators"),
+		.id = RK818_ID_LDO3,
+		.ops = &rk808_reg_ops_ranges,
+		.type = REGULATOR_VOLTAGE,
+		.n_voltages = 16,
+		.linear_ranges = rk808_ldo3_voltage_ranges,
+		.n_linear_ranges = ARRAY_SIZE(rk808_ldo3_voltage_ranges),
+		.vsel_reg = RK818_LDO3_ON_VSEL_REG,
+		.vsel_mask = RK818_LDO3_ON_VSEL_MASK,
+		.enable_reg = RK818_LDO_EN_REG,
+		.enable_mask = BIT(2),
+		.enable_time = 400,
+		.owner = THIS_MODULE,
+	},
+	RK8XX_DESC(RK818_ID_LDO4, "LDO_REG4", "vcc8", 1800, 3400, 100,
+		RK818_LDO4_ON_VSEL_REG, RK818_LDO_VSEL_MASK, RK818_LDO_EN_REG,
+		BIT(3), 400),
+	RK8XX_DESC(RK818_ID_LDO5, "LDO_REG5", "vcc7", 1800, 3400, 100,
+		RK818_LDO5_ON_VSEL_REG, RK818_LDO_VSEL_MASK, RK818_LDO_EN_REG,
+		BIT(4), 400),
+	RK8XX_DESC(RK818_ID_LDO6, "LDO_REG6", "vcc8", 800, 2500, 100,
+		RK818_LDO6_ON_VSEL_REG, RK818_LDO_VSEL_MASK, RK818_LDO_EN_REG,
+		BIT(5), 400),
+	RK8XX_DESC(RK818_ID_LDO7, "LDO_REG7", "vcc7", 800, 2500, 100,
+		RK818_LDO7_ON_VSEL_REG, RK818_LDO_VSEL_MASK, RK818_LDO_EN_REG,
+		BIT(6), 400),
+	RK8XX_DESC(RK818_ID_LDO8, "LDO_REG8", "vcc8", 1800, 3400, 100,
+		RK818_LDO8_ON_VSEL_REG, RK818_LDO_VSEL_MASK, RK818_LDO_EN_REG,
+		BIT(7), 400),
+	RK8XX_DESC(RK818_ID_LDO9, "LDO_REG9", "vcc9", 1800, 3400, 100,
+		RK818_BOOST_LDO9_ON_VSEL_REG, RK818_LDO_VSEL_MASK,
+		RK818_DCDC_EN_REG, BIT(5), 400),
+	RK8XX_DESC_SWITCH(RK818_ID_SWITCH, "SWITCH_REG", "vcc9",
+		RK818_DCDC_EN_REG, BIT(6)),
+	RK8XX_DESC_SWITCH(RK818_ID_HDMI_SWITCH, "HDMI_SWITCH", "h_5v",
+		RK818_H5V_EN_REG, BIT(0)),
+	RK8XX_DESC_SWITCH(RK818_ID_OTG_SWITCH, "OTG_SWITCH", "usb",
+		RK818_DCDC_EN_REG, BIT(7)),
+};
+
 static int rk808_regulator_dt_parse_pdata(struct device *dev,
 				   struct device *client_dev,
 				   struct regmap *map,
@@ -499,7 +611,8 @@ static int rk808_regulator_probe(struct platform_device *pdev)
 	struct regulator_config config = {};
 	struct regulator_dev *rk808_rdev;
 	struct rk808_regulator_data *pdata;
-	int ret, i;
+	const struct regulator_desc *regulators;
+	int ret, i, nregulators;
 
 	pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
 	if (!pdata)
@@ -512,14 +625,29 @@ static int rk808_regulator_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, pdata);
 
+	switch (rk808->variant) {
+	case RK808_ID:
+		regulators = rk808_reg;
+		nregulators = RK808_NUM_REGULATORS;
+		break;
+	case RK818_ID:
+		regulators = rk818_reg;
+		nregulators = RK818_NUM_REGULATORS;
+		break;
+	default:
+		dev_err(&client->dev, "unsupported RK8XX ID %lu\n",
+			rk808->variant);
+		return -EINVAL;
+	}
+
 	config.dev = &client->dev;
 	config.driver_data = pdata;
 	config.regmap = rk808->regmap;
 
 	/* Instantiate the regulators */
-	for (i = 0; i < RK808_NUM_REGULATORS; i++) {
+	for (i = 0; i < nregulators; i++) {
 		rk808_rdev = devm_regulator_register(&pdev->dev,
-						     &rk808_reg[i], &config);
+						     &regulators[i], &config);
 		if (IS_ERR(rk808_rdev)) {
 			dev_err(&client->dev,
 				"failed to register %d regulator\n", i);
@@ -540,8 +668,9 @@ static struct platform_driver rk808_regulator_driver = {
 
 module_platform_driver(rk808_regulator_driver);
 
-MODULE_DESCRIPTION("regulator driver for the rk808 series PMICs");
-MODULE_AUTHOR("Chris Zhong<zyw@rock-chips.com>");
-MODULE_AUTHOR("Zhang Qing<zhangqing@rock-chips.com>");
+MODULE_DESCRIPTION("regulator driver for the RK808/RK818 series PMICs");
+MODULE_AUTHOR("Chris Zhong <zyw@rock-chips.com>");
+MODULE_AUTHOR("Zhang Qing <zhangqing@rock-chips.com>");
+MODULE_AUTHOR("Wadim Egorov <w.egorov@phytec.de>");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("platform:rk808-regulator");
-- 
cgit v1.2.3


From 71557e50583373ced165d4b3822f8d0a2490fee1 Mon Sep 17 00:00:00 2001
From: Wadim Egorov <w.egorov@phytec.de>
Date: Mon, 29 Aug 2016 13:08:00 +0200
Subject: mfd: dt-bindings: Add RK818 device tree bindings document

Add device tree bindings documentation for Rockchip's RK818 PMIC.

Signed-off-by: Wadim Egorov <w.egorov@phytec.de>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 Documentation/devicetree/bindings/mfd/rk808.txt | 37 +++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/mfd/rk808.txt b/Documentation/devicetree/bindings/mfd/rk808.txt
index 4ca6aab4273a..9636ae8d8d41 100644
--- a/Documentation/devicetree/bindings/mfd/rk808.txt
+++ b/Documentation/devicetree/bindings/mfd/rk808.txt
@@ -1,7 +1,11 @@
-RK808 Power Management Integrated Circuit
+RK8XX Power Management Integrated Circuit
+
+The rk8xx family current members:
+rk808
+rk818
 
 Required properties:
-- compatible: "rockchip,rk808"
+- compatible: "rockchip,rk808", "rockchip,rk818"
 - reg: I2C slave address
 - interrupt-parent: The parent interrupt controller.
 - interrupts: the interrupt outputs of the controller.
@@ -13,6 +17,8 @@ Optional properties:
   default output clock name
 - rockchip,system-power-controller: Telling whether or not this pmic is controlling
   the system power.
+
+Optional RK808 properties:
 - vcc1-supply:  The input supply for DCDC_REG1
 - vcc2-supply:  The input supply for DCDC_REG2
 - vcc3-supply:  The input supply for DCDC_REG3
@@ -29,7 +35,20 @@ Optional properties:
   the gpio controller. If DVS GPIOs aren't present, voltage changes will happen
   very quickly with no slow ramp time.
 
-Regulators: All the regulators of RK808 to be instantiated shall be
+Optional RK818 properties:
+- vcc1-supply:  The input supply for DCDC_REG1
+- vcc2-supply:  The input supply for DCDC_REG2
+- vcc3-supply:  The input supply for DCDC_REG3
+- vcc4-supply:  The input supply for DCDC_REG4
+- boost-supply: The input supply for DCDC_BOOST
+- vcc6-supply:  The input supply for LDO_REG1 and LDO_REG2
+- vcc7-supply:  The input supply for LDO_REG3, LDO_REG5 and LDO_REG7
+- vcc8-supply:  The input supply for LDO_REG4, LDO_REG6 and LDO_REG8
+- vcc9-supply:  The input supply for LDO_REG9 and SWITCH_REG
+- h_5v-supply:  The input supply for HDMI_SWITCH
+- usb-supply:   The input supply for OTG_SWITCH
+
+Regulators: All the regulators of RK8XX to be instantiated shall be
 listed in a child node named 'regulators'. Each regulator is represented
 by a child node of the 'regulators' node.
 
@@ -48,6 +67,18 @@ number as described in RK808 datasheet.
 	- SWITCH_REGn
 		- valid values for n are 1 to 2
 
+Following regulators of the RK818 PMIC block are supported. Note that
+the 'n' in regulator name, as in DCDC_REGn or LDOn, represents the DCDC or LDO
+number as described in RK818 datasheet.
+
+	- DCDC_REGn
+		- valid values for n are 1 to 4.
+	- LDO_REGn
+		- valid values for n are 1 to 9.
+	- SWITCH_REG
+	- HDMI_SWITCH
+	- OTG_SWITCH
+
 Standard regulator bindings are used inside regulator subnodes. Check
   Documentation/devicetree/bindings/regulator/regulator.txt
 for more details
-- 
cgit v1.2.3


From 5d2fd5c79e169547fcd4bf363f96cc544fd3489e Mon Sep 17 00:00:00 2001
From: Wadim Egorov <w.egorov@phytec.de>
Date: Mon, 29 Aug 2016 13:08:01 +0200
Subject: rtc: Kconfig: Name RK818 in Kconfig for RTC_DRV_RK808

The RK808 and RK818 PMICs are using a similar register map.
We can reuse the rtc driver for the RK818 PMIC. So let's add
the RK818 in the Kconfig description.

Signed-off-by: Wadim Egorov <w.egorov@phytec.de>
Acked-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/rtc/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index e215f50794b6..1102e8f63608 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -328,11 +328,11 @@ config RTC_DRV_MAX77686
 	  will be called rtc-max77686.
 
 config RTC_DRV_RK808
-	tristate "Rockchip RK808 RTC"
+	tristate "Rockchip RK808/RK818 RTC"
 	depends on MFD_RK808
 	help
 	  If you say yes here you will get support for the
-	  RTC of RK808 PMIC.
+	  RTC of RK808 and RK818 PMIC.
 
 	  This driver can also be built as a module. If so, the module
 	  will be called rk808-rtc.
-- 
cgit v1.2.3


From 2ca342d391e3d8b56ed64626db8cfba8101b7c1d Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Sat, 27 Aug 2016 15:55:39 +0800
Subject: regulator: axp20x: Support AXP806 variant

The X-Powers AXP806 PMIC has a new set of buck and LDO regulators, and
also a switch. The buck regulators support teaming into multi-phase
groups, with A+B, A+B+C, D+E groupings.

Some registers controlling DCDC converter work settings are at different
offsets. Deal with them as well.

Add support for this new variant.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/regulator/axp20x-regulator.c | 118 ++++++++++++++++++++++++++++++++---
 1 file changed, 111 insertions(+), 7 deletions(-)

diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c
index 6d9ac76a772f..54382ef902c6 100644
--- a/drivers/regulator/axp20x-regulator.c
+++ b/drivers/regulator/axp20x-regulator.c
@@ -244,16 +244,64 @@ static const struct regulator_desc axp22x_drivevbus_regulator = {
 	.ops		= &axp20x_ops_sw,
 };
 
-static const struct regulator_linear_range axp809_dcdc4_ranges[] = {
-	REGULATOR_LINEAR_RANGE(600000, 0x0, 0x2f, 20000),
-	REGULATOR_LINEAR_RANGE(1800000, 0x30, 0x38, 100000),
+static const struct regulator_linear_range axp806_dcdca_ranges[] = {
+	REGULATOR_LINEAR_RANGE(600000, 0x0, 0x32, 10000),
+	REGULATOR_LINEAR_RANGE(1120000, 0x33, 0x47, 20000),
 };
 
-static const struct regulator_linear_range axp809_dldo1_ranges[] = {
+static const struct regulator_linear_range axp806_dcdcd_ranges[] = {
+	REGULATOR_LINEAR_RANGE(600000, 0x0, 0x2d, 20000),
+	REGULATOR_LINEAR_RANGE(1600000, 0x2e, 0x3f, 100000),
+};
+
+static const struct regulator_linear_range axp806_cldo2_ranges[] = {
 	REGULATOR_LINEAR_RANGE(700000, 0x0, 0x1a, 100000),
 	REGULATOR_LINEAR_RANGE(3400000, 0x1b, 0x1f, 200000),
 };
 
+static const struct regulator_desc axp806_regulators[] = {
+	AXP_DESC_RANGES(AXP806, DCDCA, "dcdca", "vina", axp806_dcdca_ranges,
+			72, AXP806_DCDCA_V_CTRL, 0x7f, AXP806_PWR_OUT_CTRL1,
+			BIT(0)),
+	AXP_DESC(AXP806, DCDCB, "dcdcb", "vinb", 1000, 2550, 50,
+		 AXP806_DCDCB_V_CTRL, 0x1f, AXP806_PWR_OUT_CTRL1, BIT(1)),
+	AXP_DESC_RANGES(AXP806, DCDCC, "dcdcc", "vinc", axp806_dcdca_ranges,
+			72, AXP806_DCDCC_V_CTRL, 0x7f, AXP806_PWR_OUT_CTRL1,
+			BIT(2)),
+	AXP_DESC_RANGES(AXP806, DCDCD, "dcdcd", "vind", axp806_dcdcd_ranges,
+			64, AXP806_DCDCD_V_CTRL, 0x3f, AXP806_PWR_OUT_CTRL1,
+			BIT(3)),
+	AXP_DESC(AXP806, DCDCE, "dcdce", "vine", 1100, 3400, 100,
+		 AXP806_DCDCB_V_CTRL, 0x1f, AXP806_PWR_OUT_CTRL1, BIT(4)),
+	AXP_DESC(AXP806, ALDO1, "aldo1", "aldoin", 700, 3300, 100,
+		 AXP806_ALDO1_V_CTRL, 0x1f, AXP806_PWR_OUT_CTRL1, BIT(5)),
+	AXP_DESC(AXP806, ALDO2, "aldo2", "aldoin", 700, 3400, 100,
+		 AXP806_ALDO2_V_CTRL, 0x1f, AXP806_PWR_OUT_CTRL1, BIT(6)),
+	AXP_DESC(AXP806, ALDO3, "aldo3", "aldoin", 700, 3300, 100,
+		 AXP806_ALDO3_V_CTRL, 0x1f, AXP806_PWR_OUT_CTRL1, BIT(7)),
+	AXP_DESC(AXP806, BLDO1, "bldo1", "bldoin", 700, 1900, 100,
+		 AXP806_BLDO1_V_CTRL, 0x0f, AXP806_PWR_OUT_CTRL2, BIT(0)),
+	AXP_DESC(AXP806, BLDO2, "bldo2", "bldoin", 700, 1900, 100,
+		 AXP806_BLDO2_V_CTRL, 0x0f, AXP806_PWR_OUT_CTRL2, BIT(1)),
+	AXP_DESC(AXP806, BLDO3, "bldo3", "bldoin", 700, 1900, 100,
+		 AXP806_BLDO3_V_CTRL, 0x0f, AXP806_PWR_OUT_CTRL2, BIT(2)),
+	AXP_DESC(AXP806, BLDO4, "bldo4", "bldoin", 700, 1900, 100,
+		 AXP806_BLDO4_V_CTRL, 0x0f, AXP806_PWR_OUT_CTRL2, BIT(3)),
+	AXP_DESC(AXP806, CLDO1, "cldo1", "cldoin", 700, 3300, 100,
+		 AXP806_CLDO1_V_CTRL, 0x1f, AXP806_PWR_OUT_CTRL2, BIT(4)),
+	AXP_DESC_RANGES(AXP806, CLDO2, "cldo2", "cldoin", axp806_cldo2_ranges,
+			32, AXP806_CLDO2_V_CTRL, 0x1f, AXP806_PWR_OUT_CTRL2,
+			BIT(5)),
+	AXP_DESC(AXP806, CLDO3, "cldo3", "cldoin", 700, 3300, 100,
+		 AXP806_CLDO3_V_CTRL, 0x1f, AXP806_PWR_OUT_CTRL2, BIT(6)),
+	AXP_DESC_SW(AXP806, SW, "sw", "swin", AXP806_PWR_OUT_CTRL2, BIT(7)),
+};
+
+static const struct regulator_linear_range axp809_dcdc4_ranges[] = {
+	REGULATOR_LINEAR_RANGE(600000, 0x0, 0x2f, 20000),
+	REGULATOR_LINEAR_RANGE(1800000, 0x30, 0x38, 100000),
+};
+
 static const struct regulator_desc axp809_regulators[] = {
 	AXP_DESC(AXP809, DCDC1, "dcdc1", "vin1", 1600, 3400, 100,
 		 AXP22X_DCDC1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL1, BIT(1)),
@@ -278,7 +326,7 @@ static const struct regulator_desc axp809_regulators[] = {
 		 AXP22X_ALDO2_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL1, BIT(7)),
 	AXP_DESC(AXP809, ALDO3, "aldo3", "aldoin", 700, 3300, 100,
 		 AXP22X_ALDO3_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2, BIT(5)),
-	AXP_DESC_RANGES(AXP809, DLDO1, "dldo1", "dldoin", axp809_dldo1_ranges,
+	AXP_DESC_RANGES(AXP809, DLDO1, "dldo1", "dldoin", axp806_cldo2_ranges,
 			32, AXP22X_DLDO1_V_OUT, 0x1f, AXP22X_PWR_OUT_CTRL2,
 			BIT(3)),
 	AXP_DESC(AXP809, DLDO2, "dldo2", "dldoin", 700, 3300, 100,
@@ -302,6 +350,7 @@ static const struct regulator_desc axp809_regulators[] = {
 static int axp20x_set_dcdc_freq(struct platform_device *pdev, u32 dcdcfreq)
 {
 	struct axp20x_dev *axp20x = dev_get_drvdata(pdev->dev.parent);
+	unsigned int reg = AXP20X_DCDC_FREQ;
 	u32 min, max, def, step;
 
 	switch (axp20x->variant) {
@@ -312,6 +361,14 @@ static int axp20x_set_dcdc_freq(struct platform_device *pdev, u32 dcdcfreq)
 		def = 1500;
 		step = 75;
 		break;
+	case AXP806_ID:
+		/*
+		 * AXP806 DCDC work frequency setting has the same range and
+		 * step as AXP22X, but at a different register.
+		 * Fall through to the check below.
+		 * (See include/linux/mfd/axp20x.h)
+		 */
+		reg = AXP806_DCDC_FREQ_CTRL;
 	case AXP221_ID:
 	case AXP223_ID:
 	case AXP809_ID:
@@ -343,7 +400,7 @@ static int axp20x_set_dcdc_freq(struct platform_device *pdev, u32 dcdcfreq)
 
 	dcdcfreq = (dcdcfreq - min) / step;
 
-	return regmap_update_bits(axp20x->regmap, AXP20X_DCDC_FREQ,
+	return regmap_update_bits(axp20x->regmap, reg,
 				  AXP20X_FREQ_DCDC_MASK, dcdcfreq);
 }
 
@@ -377,6 +434,7 @@ static int axp20x_regulator_parse_dt(struct platform_device *pdev)
 static int axp20x_set_dcdc_workmode(struct regulator_dev *rdev, int id, u32 workmode)
 {
 	struct axp20x_dev *axp20x = rdev_get_drvdata(rdev);
+	unsigned int reg = AXP20X_DCDC_MODE;
 	unsigned int mask;
 
 	switch (axp20x->variant) {
@@ -392,6 +450,13 @@ static int axp20x_set_dcdc_workmode(struct regulator_dev *rdev, int id, u32 work
 		workmode <<= ffs(mask) - 1;
 		break;
 
+	case AXP806_ID:
+		reg = AXP806_DCDC_MODE_CTRL2;
+		/*
+		 * AXP806 DCDC regulator IDs have the same range as AXP22X.
+		 * Fall through to the check below.
+		 * (See include/linux/mfd/axp20x.h)
+		 */
 	case AXP221_ID:
 	case AXP223_ID:
 	case AXP809_ID:
@@ -408,7 +473,34 @@ static int axp20x_set_dcdc_workmode(struct regulator_dev *rdev, int id, u32 work
 		return -EINVAL;
 	}
 
-	return regmap_update_bits(rdev->regmap, AXP20X_DCDC_MODE, mask, workmode);
+	return regmap_update_bits(rdev->regmap, reg, mask, workmode);
+}
+
+/*
+ * This function checks whether a regulator is part of a poly-phase
+ * output setup based on the registers settings. Returns true if it is.
+ */
+static bool axp20x_is_polyphase_slave(struct axp20x_dev *axp20x, int id)
+{
+	u32 reg = 0;
+
+	/* Only AXP806 has poly-phase outputs */
+	if (axp20x->variant != AXP806_ID)
+		return false;
+
+	regmap_read(axp20x->regmap, AXP806_DCDC_MODE_CTRL2, &reg);
+
+	switch (id) {
+	case AXP806_DCDCB:
+		return (((reg & GENMASK(7, 6)) == BIT(6)) ||
+			((reg & GENMASK(7, 6)) == BIT(7)));
+	case AXP806_DCDCC:
+		return ((reg & GENMASK(7, 6)) == BIT(7));
+	case AXP806_DCDCE:
+		return !!(reg & BIT(5));
+	}
+
+	return false;
 }
 
 static int axp20x_regulator_probe(struct platform_device *pdev)
@@ -440,6 +532,10 @@ static int axp20x_regulator_probe(struct platform_device *pdev)
 		drivevbus = of_property_read_bool(pdev->dev.parent->of_node,
 						  "x-powers,drive-vbus-en");
 		break;
+	case AXP806_ID:
+		regulators = axp806_regulators;
+		nregulators = AXP806_REG_ID_MAX;
+		break;
 	case AXP809_ID:
 		regulators = axp809_regulators;
 		nregulators = AXP809_REG_ID_MAX;
@@ -457,6 +553,14 @@ static int axp20x_regulator_probe(struct platform_device *pdev)
 		const struct regulator_desc *desc = &regulators[i];
 		struct regulator_desc *new_desc;
 
+		/*
+		 * If this regulator is a slave in a poly-phase setup,
+		 * skip it, as its controls are bound to the master
+		 * regulator and won't work.
+		 */
+		if (axp20x_is_polyphase_slave(axp20x, i))
+			continue;
+
 		/*
 		 * Regulators DC1SW and DC5LDO are connected internally,
 		 * so we have to handle their supply names separately.
-- 
cgit v1.2.3


From 204ae2963e101344e50e3a679912f795e6b852c5 Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Sat, 27 Aug 2016 15:55:37 +0800
Subject: mfd: axp20x: Add bindings for AXP806 PMIC

This patch adds the basic and regulator bindings for the X-Powers AXP806
PMIC.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 Documentation/devicetree/bindings/mfd/axp20x.txt | 28 ++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/mfd/axp20x.txt b/Documentation/devicetree/bindings/mfd/axp20x.txt
index 585a95546288..8f3ad9ab4637 100644
--- a/Documentation/devicetree/bindings/mfd/axp20x.txt
+++ b/Documentation/devicetree/bindings/mfd/axp20x.txt
@@ -10,7 +10,8 @@ axp809 (X-Powers)
 
 Required properties:
 - compatible: "x-powers,axp152", "x-powers,axp202", "x-powers,axp209",
-	      "x-powers,axp221", "x-powers,axp223", "x-powers,axp809"
+	      "x-powers,axp221", "x-powers,axp223", "x-powers,axp806",
+	      "x-powers,axp809"
 - reg: The I2C slave address or RSB hardware address for the AXP chip
 - interrupt-parent: The parent interrupt controller
 - interrupts: SoC NMI / GPIO interrupt connected to the PMIC's IRQ pin
@@ -47,7 +48,6 @@ Optional properties for DCDC regulators:
 			  probably makes sense for HiFi audio related
 			  applications that aren't battery constrained.
 
-
 AXP202/AXP209 regulators, type, and corresponding input supply names:
 
 Regulator	  Type		  Supply Name		  Notes
@@ -86,6 +86,30 @@ LDO_IO1		: LDO		: ips-supply		: GPIO 1
 RTC_LDO		: LDO		: ips-supply		: always on
 DRIVEVBUS	: Enable output	: drivevbus-supply	: external regulator
 
+AXP806 regulators, type, and corresponding input supply names:
+
+Regulator	  Type		  Supply Name		  Notes
+---------	  ----		  -----------		  -----
+DCDCA		: DC-DC buck	: vina-supply		: poly-phase capable
+DCDCB		: DC-DC buck	: vinb-supply		: poly-phase capable
+DCDCC		: DC-DC	buck	: vinc-supply		: poly-phase capable
+DCDCD		: DC-DC	buck	: vind-supply		: poly-phase capable
+DCDCE		: DC-DC	buck	: vine-supply		: poly-phase capable
+ALDO1		: LDO		: aldoin-supply		: shared supply
+ALDO2		: LDO		: aldoin-supply		: shared supply
+ALDO3		: LDO		: aldoin-supply		: shared supply
+BLDO1		: LDO		: bldoin-supply		: shared supply
+BLDO2		: LDO		: bldoin-supply		: shared supply
+BLDO3		: LDO		: bldoin-supply		: shared supply
+BLDO4		: LDO		: bldoin-supply		: shared supply
+CLDO1		: LDO		: cldoin-supply		: shared supply
+CLDO2		: LDO		: cldoin-supply		: shared supply
+CLDO3		: LDO		: cldoin-supply		: shared supply
+SW		: On/Off Switch : swin-supply
+
+Additionally, the AXP806 DC-DC regulators support poly-phase arrangements
+for higher output current. The possible groupings are: A+B, A+B+C, D+E.
+
 AXP809 regulators, type, and corresponding input supply names:
 
 Regulator	  Type		  Supply Name		  Notes
-- 
cgit v1.2.3


From 8824ee8573483e1c91691b5be3d3730e75551dce Mon Sep 17 00:00:00 2001
From: Chen-Yu Tsai <wens@csie.org>
Date: Sat, 27 Aug 2016 15:55:38 +0800
Subject: mfd: axp20x: Add support for AXP806 PMIC

The X-Powers AXP806 is a new PMIC that is paired with Allwinner's A80
SoC, along with a master AXP809 PMIC.

This PMIC has a new register layout, and supports some functions not
seen in other X-Powers PMICs, such as master-slave mode, or having
multiple AXP806 PMICs on the same bus with address space extension,
or supporting both I2C and RSB mode. I2C has not been tested.

This patch adds support for the interrupts of the PMIC. A regulator
sub-device is enabled, but actual regulator support will come in a
later patch.

Signed-off-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/axp20x-rsb.c   |  1 +
 drivers/mfd/axp20x.c       | 72 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h | 60 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 133 insertions(+)

diff --git a/drivers/mfd/axp20x-rsb.c b/drivers/mfd/axp20x-rsb.c
index a407527bcd09..a732cb50bcff 100644
--- a/drivers/mfd/axp20x-rsb.c
+++ b/drivers/mfd/axp20x-rsb.c
@@ -61,6 +61,7 @@ static int axp20x_rsb_remove(struct sunxi_rsb_device *rdev)
 
 static const struct of_device_id axp20x_rsb_of_match[] = {
 	{ .compatible = "x-powers,axp223", .data = (void *)AXP223_ID },
+	{ .compatible = "x-powers,axp806", .data = (void *)AXP806_ID },
 	{ .compatible = "x-powers,axp809", .data = (void *)AXP809_ID },
 	{ },
 };
diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index fd80b0981f0f..96102753847f 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -38,6 +38,7 @@ static const char * const axp20x_model_names[] = {
 	"AXP221",
 	"AXP223",
 	"AXP288",
+	"AXP806",
 	"AXP809",
 };
 
@@ -129,6 +130,27 @@ static const struct regmap_access_table axp288_volatile_table = {
 	.n_yes_ranges	= ARRAY_SIZE(axp288_volatile_ranges),
 };
 
+static const struct regmap_range axp806_writeable_ranges[] = {
+	regmap_reg_range(AXP20X_DATACACHE(0), AXP20X_DATACACHE(3)),
+	regmap_reg_range(AXP806_PWR_OUT_CTRL1, AXP806_CLDO3_V_CTRL),
+	regmap_reg_range(AXP20X_IRQ1_EN, AXP20X_IRQ2_EN),
+	regmap_reg_range(AXP20X_IRQ1_STATE, AXP20X_IRQ2_STATE),
+};
+
+static const struct regmap_range axp806_volatile_ranges[] = {
+	regmap_reg_range(AXP20X_IRQ1_STATE, AXP20X_IRQ2_STATE),
+};
+
+static const struct regmap_access_table axp806_writeable_table = {
+	.yes_ranges	= axp806_writeable_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(axp806_writeable_ranges),
+};
+
+static const struct regmap_access_table axp806_volatile_table = {
+	.yes_ranges	= axp806_volatile_ranges,
+	.n_yes_ranges	= ARRAY_SIZE(axp806_volatile_ranges),
+};
+
 static struct resource axp152_pek_resources[] = {
 	DEFINE_RES_IRQ_NAMED(AXP152_IRQ_PEK_RIS_EDGE, "PEK_DBR"),
 	DEFINE_RES_IRQ_NAMED(AXP152_IRQ_PEK_FAL_EDGE, "PEK_DBF"),
@@ -278,6 +300,15 @@ static const struct regmap_config axp288_regmap_config = {
 	.cache_type	= REGCACHE_RBTREE,
 };
 
+static const struct regmap_config axp806_regmap_config = {
+	.reg_bits	= 8,
+	.val_bits	= 8,
+	.wr_table	= &axp806_writeable_table,
+	.volatile_table	= &axp806_volatile_table,
+	.max_register	= AXP806_VREF_TEMP_WARN_L,
+	.cache_type	= REGCACHE_RBTREE,
+};
+
 #define INIT_REGMAP_IRQ(_variant, _irq, _off, _mask)			\
 	[_variant##_IRQ_##_irq] = { .reg_offset = (_off), .mask = BIT(_mask) }
 
@@ -409,6 +440,21 @@ static const struct regmap_irq axp288_regmap_irqs[] = {
 	INIT_REGMAP_IRQ(AXP288, BC_USB_CHNG,            5, 1),
 };
 
+static const struct regmap_irq axp806_regmap_irqs[] = {
+	INIT_REGMAP_IRQ(AXP806, DIE_TEMP_HIGH_LV1,	0, 0),
+	INIT_REGMAP_IRQ(AXP806, DIE_TEMP_HIGH_LV2,	0, 1),
+	INIT_REGMAP_IRQ(AXP806, DCDCA_V_LOW,		0, 3),
+	INIT_REGMAP_IRQ(AXP806, DCDCB_V_LOW,		0, 4),
+	INIT_REGMAP_IRQ(AXP806, DCDCC_V_LOW,		0, 5),
+	INIT_REGMAP_IRQ(AXP806, DCDCD_V_LOW,		0, 6),
+	INIT_REGMAP_IRQ(AXP806, DCDCE_V_LOW,		0, 7),
+	INIT_REGMAP_IRQ(AXP806, PWROK_LONG,		1, 0),
+	INIT_REGMAP_IRQ(AXP806, PWROK_SHORT,		1, 1),
+	INIT_REGMAP_IRQ(AXP806, WAKEUP,			1, 4),
+	INIT_REGMAP_IRQ(AXP806, PWROK_FALL,		1, 5),
+	INIT_REGMAP_IRQ(AXP806, PWROK_RISE,		1, 6),
+};
+
 static const struct regmap_irq axp809_regmap_irqs[] = {
 	INIT_REGMAP_IRQ(AXP809, ACIN_OVER_V,		0, 7),
 	INIT_REGMAP_IRQ(AXP809, ACIN_PLUGIN,		0, 6),
@@ -494,6 +540,18 @@ static const struct regmap_irq_chip axp288_regmap_irq_chip = {
 
 };
 
+static const struct regmap_irq_chip axp806_regmap_irq_chip = {
+	.name			= "axp806",
+	.status_base		= AXP20X_IRQ1_STATE,
+	.ack_base		= AXP20X_IRQ1_STATE,
+	.mask_base		= AXP20X_IRQ1_EN,
+	.mask_invert		= true,
+	.init_ack_masked	= true,
+	.irqs			= axp806_regmap_irqs,
+	.num_irqs		= ARRAY_SIZE(axp806_regmap_irqs),
+	.num_regs		= 2,
+};
+
 static const struct regmap_irq_chip axp809_regmap_irq_chip = {
 	.name			= "axp809",
 	.status_base		= AXP20X_IRQ1_STATE,
@@ -660,12 +718,20 @@ static struct mfd_cell axp288_cells[] = {
 	},
 };
 
+static struct mfd_cell axp806_cells[] = {
+	{
+		.id			= 2,
+		.name			= "axp20x-regulator",
+	},
+};
+
 static struct mfd_cell axp809_cells[] = {
 	{
 		.name			= "axp20x-pek",
 		.num_resources		= ARRAY_SIZE(axp809_pek_resources),
 		.resources		= axp809_pek_resources,
 	}, {
+		.id			= 1,
 		.name			= "axp20x-regulator",
 	},
 };
@@ -732,6 +798,12 @@ int axp20x_match_device(struct axp20x_dev *axp20x)
 		axp20x->regmap_cfg = &axp288_regmap_config;
 		axp20x->regmap_irq_chip = &axp288_regmap_irq_chip;
 		break;
+	case AXP806_ID:
+		axp20x->nr_cells = ARRAY_SIZE(axp806_cells);
+		axp20x->cells = axp806_cells;
+		axp20x->regmap_cfg = &axp806_regmap_config;
+		axp20x->regmap_irq_chip = &axp806_regmap_irq_chip;
+		break;
 	case AXP809_ID:
 		axp20x->nr_cells = ARRAY_SIZE(axp809_cells);
 		axp20x->cells = axp809_cells;
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 0be4982f08fe..fec597fb34cb 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -20,6 +20,7 @@ enum {
 	AXP221_ID,
 	AXP223_ID,
 	AXP288_ID,
+	AXP806_ID,
 	AXP809_ID,
 	NR_AXP20X_VARIANTS,
 };
@@ -91,6 +92,30 @@ enum {
 #define AXP22X_ALDO3_V_OUT		0x2a
 #define AXP22X_CHRG_CTRL3		0x35
 
+#define AXP806_STARTUP_SRC		0x00
+#define AXP806_CHIP_ID			0x03
+#define AXP806_PWR_OUT_CTRL1		0x10
+#define AXP806_PWR_OUT_CTRL2		0x11
+#define AXP806_DCDCA_V_CTRL		0x12
+#define AXP806_DCDCB_V_CTRL		0x13
+#define AXP806_DCDCC_V_CTRL		0x14
+#define AXP806_DCDCD_V_CTRL		0x15
+#define AXP806_DCDCE_V_CTRL		0x16
+#define AXP806_ALDO1_V_CTRL		0x17
+#define AXP806_ALDO2_V_CTRL		0x18
+#define AXP806_ALDO3_V_CTRL		0x19
+#define AXP806_DCDC_MODE_CTRL1		0x1a
+#define AXP806_DCDC_MODE_CTRL2		0x1b
+#define AXP806_DCDC_FREQ_CTRL		0x1c
+#define AXP806_BLDO1_V_CTRL		0x20
+#define AXP806_BLDO2_V_CTRL		0x21
+#define AXP806_BLDO3_V_CTRL		0x22
+#define AXP806_BLDO4_V_CTRL		0x23
+#define AXP806_CLDO1_V_CTRL		0x24
+#define AXP806_CLDO2_V_CTRL		0x25
+#define AXP806_CLDO3_V_CTRL		0x26
+#define AXP806_VREF_TEMP_WARN_L		0xf3
+
 /* Interrupt */
 #define AXP152_IRQ1_EN			0x40
 #define AXP152_IRQ2_EN			0x41
@@ -265,6 +290,26 @@ enum {
 	AXP22X_REG_ID_MAX,
 };
 
+enum {
+	AXP806_DCDCA = 0,
+	AXP806_DCDCB,
+	AXP806_DCDCC,
+	AXP806_DCDCD,
+	AXP806_DCDCE,
+	AXP806_ALDO1,
+	AXP806_ALDO2,
+	AXP806_ALDO3,
+	AXP806_BLDO1,
+	AXP806_BLDO2,
+	AXP806_BLDO3,
+	AXP806_BLDO4,
+	AXP806_CLDO1,
+	AXP806_CLDO2,
+	AXP806_CLDO3,
+	AXP806_SW,
+	AXP806_REG_ID_MAX,
+};
+
 enum {
 	AXP809_DCDC1 = 0,
 	AXP809_DCDC2,
@@ -414,6 +459,21 @@ enum axp288_irqs {
 	AXP288_IRQ_BC_USB_CHNG,
 };
 
+enum axp806_irqs {
+	AXP806_IRQ_DIE_TEMP_HIGH_LV1,
+	AXP806_IRQ_DIE_TEMP_HIGH_LV2,
+	AXP806_IRQ_DCDCA_V_LOW,
+	AXP806_IRQ_DCDCB_V_LOW,
+	AXP806_IRQ_DCDCC_V_LOW,
+	AXP806_IRQ_DCDCD_V_LOW,
+	AXP806_IRQ_DCDCE_V_LOW,
+	AXP806_IRQ_PWROK_LONG,
+	AXP806_IRQ_PWROK_SHORT,
+	AXP806_IRQ_WAKEUP,
+	AXP806_IRQ_PWROK_FALL,
+	AXP806_IRQ_PWROK_RISE,
+};
+
 enum axp809_irqs {
 	AXP809_IRQ_ACIN_OVER_V = 1,
 	AXP809_IRQ_ACIN_PLUGIN,
-- 
cgit v1.2.3


From cdd8da8cc66b3d205120560649e530978ccbc567 Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki <s.nawrocki@samsung.com>
Date: Fri, 2 Sep 2016 16:52:46 +0100
Subject: mfd: arizona: Add gating of external MCLKn clocks

This patch adds requesting of the clocks supplied on MCLK1, MCLK2 pins,
gating of the 32k clock is added to the arizona_clk32k_enable(),
arizona_clk32k_disable() helpers.

It's a temporary change until the CODEC's clock controller gets exposed
through the clk API and is helpful for board configurations where the
MCLK clocks are not provided by always on oscillators.

Signed-off-by: Sylwester Nawrocki <s.nawrocki@samsung.com>
Signed-off-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 drivers/mfd/arizona-core.c       | 30 ++++++++++++++++++++++++++++--
 include/linux/mfd/arizona/core.h |  9 +++++++++
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c
index e4f97b3c824b..4fa0fae51006 100644
--- a/drivers/mfd/arizona-core.c
+++ b/drivers/mfd/arizona-core.c
@@ -10,6 +10,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/gpio.h>
@@ -49,7 +50,15 @@ int arizona_clk32k_enable(struct arizona *arizona)
 		case ARIZONA_32KZ_MCLK1:
 			ret = pm_runtime_get_sync(arizona->dev);
 			if (ret != 0)
-				goto out;
+				goto err_ref;
+			ret = clk_prepare_enable(arizona->mclk[ARIZONA_MCLK1]);
+			if (ret != 0)
+				goto err_pm;
+			break;
+		case ARIZONA_32KZ_MCLK2:
+			ret = clk_prepare_enable(arizona->mclk[ARIZONA_MCLK2]);
+			if (ret != 0)
+				goto err_ref;
 			break;
 		}
 
@@ -58,7 +67,9 @@ int arizona_clk32k_enable(struct arizona *arizona)
 					 ARIZONA_CLK_32K_ENA);
 	}
 
-out:
+err_pm:
+	pm_runtime_put_sync(arizona->dev);
+err_ref:
 	if (ret != 0)
 		arizona->clk32k_ref--;
 
@@ -83,6 +94,10 @@ int arizona_clk32k_disable(struct arizona *arizona)
 		switch (arizona->pdata.clk32k_src) {
 		case ARIZONA_32KZ_MCLK1:
 			pm_runtime_put_sync(arizona->dev);
+			clk_disable_unprepare(arizona->mclk[ARIZONA_MCLK1]);
+			break;
+		case ARIZONA_32KZ_MCLK2:
+			clk_disable_unprepare(arizona->mclk[ARIZONA_MCLK2]);
 			break;
 		}
 	}
@@ -1000,6 +1015,7 @@ static const struct mfd_cell wm8998_devs[] = {
 
 int arizona_dev_init(struct arizona *arizona)
 {
+	const char * const mclk_name[] = { "mclk1", "mclk2" };
 	struct device *dev = arizona->dev;
 	const char *type_name = NULL;
 	unsigned int reg, val, mask;
@@ -1016,6 +1032,16 @@ int arizona_dev_init(struct arizona *arizona)
 	else
 		arizona_of_get_core_pdata(arizona);
 
+	BUILD_BUG_ON(ARRAY_SIZE(arizona->mclk) != ARRAY_SIZE(mclk_name));
+	for (i = 0; i < ARRAY_SIZE(arizona->mclk); i++) {
+		arizona->mclk[i] = devm_clk_get(arizona->dev, mclk_name[i]);
+		if (IS_ERR(arizona->mclk[i])) {
+			dev_info(arizona->dev, "Failed to get %s: %ld\n",
+				 mclk_name[i], PTR_ERR(arizona->mclk[i]));
+			arizona->mclk[i] = NULL;
+		}
+	}
+
 	regcache_cache_only(arizona->regmap, true);
 
 	switch (arizona->type) {
diff --git a/include/linux/mfd/arizona/core.h b/include/linux/mfd/arizona/core.h
index 58ab4c0fe761..b9909bb0642b 100644
--- a/include/linux/mfd/arizona/core.h
+++ b/include/linux/mfd/arizona/core.h
@@ -13,6 +13,7 @@
 #ifndef _WM_ARIZONA_CORE_H
 #define _WM_ARIZONA_CORE_H
 
+#include <linux/clk.h>
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
 #include <linux/regmap.h>
@@ -21,6 +22,12 @@
 
 #define ARIZONA_MAX_CORE_SUPPLIES 2
 
+enum {
+	ARIZONA_MCLK1,
+	ARIZONA_MCLK2,
+	ARIZONA_NUM_MCLK
+};
+
 enum arizona_type {
 	WM5102 = 1,
 	WM5110 = 2,
@@ -139,6 +146,8 @@ struct arizona {
 	struct mutex clk_lock;
 	int clk32k_ref;
 
+	struct clk *mclk[ARIZONA_NUM_MCLK];
+
 	bool ctrlif_error;
 
 	struct snd_soc_dapm_context *dapm;
-- 
cgit v1.2.3