From d0b6e0a8ef24b1b07078ababe5d91bcdf4f4264a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 12 Sep 2017 21:36:55 +0200
Subject: watchdog/hardlockup: Provide interface to stop/restart perf events

Provide an interface to stop and restart perf NMI watchdog events on all
CPUs. This is only usable during init and especially for handling the perf
HT bug on Intel machines. It's safe to use it this way as nothing can
start/stop the NMI watchdog in parallel.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.167649596@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a36abe2da13e..b24d4a58674a 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -75,7 +75,11 @@ static inline void hardlockup_detector_disable(void) {}
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void arch_touch_nmi_watchdog(void);
+extern void hardlockup_detector_perf_stop(void);
+extern void hardlockup_detector_perf_restart(void);
 #else
+static inline void hardlockup_detector_perf_stop(void) { }
+static inline void hardlockup_detector_perf_restart(void) { }
 #if !defined(CONFIG_HAVE_NMI_WATCHDOG)
 static inline void arch_touch_nmi_watchdog(void) {}
 #endif
-- 
cgit v1.2.3


From 6554fd8cf06db86f861bb24d7487b2873ca444c4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:36:57 +0200
Subject: watchdog/core: Provide interface to stop from poweroff()

PARISC has a a busy looping power off routine. If the watchdog is enabled
the watchdog timer will still fire, but the thread is not running, which
causes the softlockup watchdog to trigger.

Provide a interface which allows to turn the watchdog off.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: linux-parisc@vger.kernel.org
Link: http://lkml.kernel.org/r/20170912194146.327343752@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h |  6 +++---
 kernel/watchdog.c   | 14 +++++++++++++-
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b24d4a58674a..85bb268be39c 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -12,10 +12,10 @@
 
 #ifdef CONFIG_LOCKUP_DETECTOR
 void lockup_detector_init(void);
+void lockup_detector_soft_poweroff(void);
 #else
-static inline void lockup_detector_init(void)
-{
-}
+static inline void lockup_detector_init(void) { }
+static inline void lockup_detector_soft_poweroff(void) { }
 #endif
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f5d52024f6b7..f23e373aa3bf 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -333,7 +333,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
-	if (atomic_read(&watchdog_park_in_progress) != 0)
+	if (!watchdog_enabled ||
+	    atomic_read(&watchdog_park_in_progress) != 0)
 		return HRTIMER_NORESTART;
 
 	/* kick the hardlockup detector */
@@ -660,6 +661,17 @@ static void set_sample_period(void)
 }
 #endif /* SOFTLOCKUP */
 
+/**
+ * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
+ *
+ * Special interface for parisc. It prevents lockup detector warnings from
+ * the default pm_poweroff() function which busy loops forever.
+ */
+void lockup_detector_soft_poweroff(void)
+{
+	watchdog_enabled = 0;
+}
+
 /*
  * Suspend the hard and soft lockup detector by parking the watchdog threads.
  */
-- 
cgit v1.2.3


From 5490125d77a43016b26f629d4b485e2c62172551 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:36:59 +0200
Subject: watchdog/core: Remove broken suspend/resume interfaces

This interface has several issues:

 - It's causing recursive locking of the hotplug lock.

 - It's complete overkill to teardown all threads and then recreate them

The same can be achieved with the simple hardlockup_detector_perf_stop /
restart() interfaces. The abuse from the busy looping poweroff() loop of
PARISC has been solved as well.

Remove the cruft.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.487537732@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kernel/watchdog.c |  3 --
 include/linux/nmi.h            | 12 ------
 kernel/watchdog.c              | 89 +-----------------------------------------
 3 files changed, 1 insertion(+), 103 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 2f6eadd9408d..5ded171f02d6 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -310,9 +310,6 @@ static int start_wd_on_cpu(unsigned int cpu)
 	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
 		return 0;
 
-	if (watchdog_suspended)
-		return 0;
-
 	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
 		return 0;
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 85bb268be39c..7eefe7abf44b 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -164,7 +164,6 @@ extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 extern struct cpumask watchdog_cpumask;
 extern unsigned long *watchdog_cpumask_bits;
-extern int __read_mostly watchdog_suspended;
 #ifdef CONFIG_SMP
 extern int sysctl_softlockup_all_cpu_backtrace;
 extern int sysctl_hardlockup_all_cpu_backtrace;
@@ -192,17 +191,6 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
-extern int lockup_detector_suspend(void);
-extern void lockup_detector_resume(void);
-#else
-static inline int lockup_detector_suspend(void)
-{
-	return 0;
-}
-
-static inline void lockup_detector_resume(void)
-{
-}
 #endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f23e373aa3bf..b2d46757917e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -97,19 +97,6 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
  * unregistered/stopped, so it is an indicator whether the threads exist.
  */
 static int __read_mostly watchdog_running;
-/*
- * If a subsystem has a need to deactivate the watchdog temporarily, it
- * can use the suspend/resume interface to achieve this. The content of
- * the 'watchdog_suspended' variable reflects this state. Existing threads
- * are parked/unparked by the lockup_detector_{suspend|resume} functions
- * (see comment blocks pertaining to those functions for further details).
- *
- * 'watchdog_suspended' also prevents threads from being registered/started
- * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
- * of 'watchdog_running' cannot change while the watchdog is deactivated
- * temporarily (see related code in 'proc' handlers).
- */
-int __read_mostly watchdog_suspended;
 
 /*
  * These functions can be overridden if an architecture implements its
@@ -136,7 +123,6 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
  * - watchdog_cpumask
  * - sysctl_hardlockup_all_cpu_backtrace
  * - hardlockup_panic
- * - watchdog_suspended
  */
 void __weak watchdog_nmi_reconfigure(void)
 {
@@ -672,61 +658,6 @@ void lockup_detector_soft_poweroff(void)
 	watchdog_enabled = 0;
 }
 
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
- */
-int lockup_detector_suspend(void)
-{
-	int ret = 0;
-
-	get_online_cpus();
-	mutex_lock(&watchdog_proc_mutex);
-	/*
-	 * Multiple suspend requests can be active in parallel (counted by
-	 * the 'watchdog_suspended' variable). If the watchdog threads are
-	 * running, the first caller takes care that they will be parked.
-	 * The state of 'watchdog_running' cannot change while a suspend
-	 * request is active (see related code in 'proc' handlers).
-	 */
-	if (watchdog_running && !watchdog_suspended)
-		ret = watchdog_park_threads();
-
-	if (ret == 0)
-		watchdog_suspended++;
-	else {
-		watchdog_disable_all_cpus();
-		pr_err("Failed to suspend lockup detectors, disabled\n");
-		watchdog_enabled = 0;
-	}
-
-	watchdog_nmi_reconfigure();
-
-	mutex_unlock(&watchdog_proc_mutex);
-
-	return ret;
-}
-
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
- */
-void lockup_detector_resume(void)
-{
-	mutex_lock(&watchdog_proc_mutex);
-
-	watchdog_suspended--;
-	/*
-	 * The watchdog threads are unparked if they were previously running
-	 * and if there is no more active suspend request.
-	 */
-	if (watchdog_running && !watchdog_suspended)
-		watchdog_unpark_threads();
-
-	watchdog_nmi_reconfigure();
-
-	mutex_unlock(&watchdog_proc_mutex);
-	put_online_cpus();
-}
-
 #ifdef CONFIG_SYSCTL
 
 /*
@@ -775,12 +706,6 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 	get_online_cpus();
 	mutex_lock(&watchdog_proc_mutex);
 
-	if (watchdog_suspended) {
-		/* no parameter changes allowed while watchdog is suspended */
-		err = -EAGAIN;
-		goto out;
-	}
-
 	/*
 	 * If the parameter is being read return the state of the corresponding
 	 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,12 +797,6 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
 	get_online_cpus();
 	mutex_lock(&watchdog_proc_mutex);
 
-	if (watchdog_suspended) {
-		/* no parameter changes allowed while watchdog is suspended */
-		err = -EAGAIN;
-		goto out;
-	}
-
 	old = ACCESS_ONCE(watchdog_thresh);
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
@@ -917,12 +836,6 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 	get_online_cpus();
 	mutex_lock(&watchdog_proc_mutex);
 
-	if (watchdog_suspended) {
-		/* no parameter changes allowed while watchdog is suspended */
-		err = -EAGAIN;
-		goto out;
-	}
-
 	err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
 	if (!err && write) {
 		/* Remove impossible cpus to keep sysctl output cleaner. */
@@ -941,7 +854,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 
 		watchdog_nmi_reconfigure();
 	}
-out:
+
 	mutex_unlock(&watchdog_proc_mutex);
 	put_online_cpus();
 	return err;
-- 
cgit v1.2.3


From 941154bd6937a710ae9193a3c733c0029e5ae7b8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:04 +0200
Subject: watchdog/hardlockup/perf: Prevent CPU hotplug deadlock

The following deadlock is possible in the watchdog hotplug code:

  cpus_write_lock()
    ...
      takedown_cpu()
        smpboot_park_threads()
          smpboot_park_thread()
            kthread_park()
              ->park() := watchdog_disable()
                watchdog_nmi_disable()
                  perf_event_release_kernel();
                    put_event()
                      _free_event()
                        ->destroy() := hw_perf_event_destroy()
                          x86_release_hardware()
                            release_ds_buffers()
                              get_online_cpus()

when a per cpu watchdog perf event is destroyed which drops the last
reference to the PMU hardware. The cleanup code there invokes
get_online_cpus() which instantly deadlocks because the hotplug percpu
rwsem is write locked.

To solve this add a deferring mechanism:

  cpus_write_lock()
			   kthread_park()
			    watchdog_nmi_disable(deferred)
			      perf_event_disable(event);
			      move_event_to_deferred(event);
			   ....
  cpus_write_unlock()
  cleaup_deferred_events()
    perf_event_release_kernel()

This is still properly serialized against concurrent hotplug via the
cpu_add_remove_lock, which is held by the task which initiated the hotplug
event.

This is also used to handle event destruction when the watchdog threads are
parked via other mechanisms than CPU hotplug.

Analyzed-by: Peter Zijlstra <peterz@infradead.org>

Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194146.884469246@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h   |  6 ++++++
 kernel/cpu.c          |  6 ++++++
 kernel/watchdog.c     | 25 +++++++++++++++++++++++++
 kernel/watchdog_hld.c | 34 ++++++++++++++++++++++++++++------
 4 files changed, 65 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 7eefe7abf44b..80354e6fa86d 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -13,9 +13,11 @@
 #ifdef CONFIG_LOCKUP_DETECTOR
 void lockup_detector_init(void);
 void lockup_detector_soft_poweroff(void);
+void lockup_detector_cleanup(void);
 #else
 static inline void lockup_detector_init(void) { }
 static inline void lockup_detector_soft_poweroff(void) { }
+static inline void lockup_detector_cleanup(void) { }
 #endif
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
@@ -77,9 +79,13 @@ static inline void hardlockup_detector_disable(void) {}
 extern void arch_touch_nmi_watchdog(void);
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
+extern void hardlockup_detector_perf_disable(void);
+extern void hardlockup_detector_perf_cleanup(void);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
+static inline void hardlockup_detector_perf_disable(void) { }
+static inline void hardlockup_detector_perf_cleanup(void) { }
 #if !defined(CONFIG_HAVE_NMI_WATCHDOG)
 static inline void arch_touch_nmi_watchdog(void) {}
 #endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..a96b348591df 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -24,6 +24,7 @@
 #include <linux/lockdep.h>
 #include <linux/tick.h>
 #include <linux/irq.h>
+#include <linux/nmi.h>
 #include <linux/smpboot.h>
 #include <linux/relay.h>
 #include <linux/slab.h>
@@ -734,6 +735,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 
 out:
 	cpus_write_unlock();
+	/*
+	 * Do post unplug cleanup. This is still protected against
+	 * concurrent CPU hotplug via cpu_add_remove_lock.
+	 */
+	lockup_detector_cleanup();
 	return ret;
 }
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index af000956286c..dd1fd59683c5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -109,8 +109,10 @@ int __weak watchdog_nmi_enable(unsigned int cpu)
 {
 	return 0;
 }
+
 void __weak watchdog_nmi_disable(unsigned int cpu)
 {
+	hardlockup_detector_perf_disable();
 }
 
 /*
@@ -193,6 +195,8 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 #endif
 #endif
 
+static void __lockup_detector_cleanup(void);
+
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-
  * lockups can have false positives under extreme conditions. So we generally
@@ -631,6 +635,24 @@ static void set_sample_period(void)
 }
 #endif /* SOFTLOCKUP */
 
+static void __lockup_detector_cleanup(void)
+{
+	lockdep_assert_held(&watchdog_mutex);
+	hardlockup_detector_perf_cleanup();
+}
+
+/**
+ * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
+ *
+ * Caller must not hold the cpu hotplug rwsem.
+ */
+void lockup_detector_cleanup(void)
+{
+	mutex_lock(&watchdog_mutex);
+	__lockup_detector_cleanup();
+	mutex_unlock(&watchdog_mutex);
+}
+
 /**
  * lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
  *
@@ -665,6 +687,8 @@ static int proc_watchdog_update(void)
 
 	watchdog_nmi_reconfigure();
 
+	__lockup_detector_cleanup();
+
 	return err;
 
 }
@@ -837,6 +861,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 		}
 
 		watchdog_nmi_reconfigure();
+		__lockup_detector_cleanup();
 	}
 
 	mutex_unlock(&watchdog_mutex);
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 7b602714ea53..94111ccb09b5 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -21,6 +21,8 @@
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+static DEFINE_PER_CPU(struct perf_event *, dead_event);
+static struct cpumask dead_events_mask;
 
 static unsigned long hardlockup_allcpu_dumped;
 static bool hardlockup_detector_disabled;
@@ -239,16 +241,18 @@ out:
 	return 0;
 }
 
-void watchdog_nmi_disable(unsigned int cpu)
+/**
+ * hardlockup_detector_perf_disable - Disable the local event
+ */
+void hardlockup_detector_perf_disable(void)
 {
-	struct perf_event *event = per_cpu(watchdog_ev, cpu);
+	struct perf_event *event = this_cpu_read(watchdog_ev);
 
 	if (event) {
 		perf_event_disable(event);
-		per_cpu(watchdog_ev, cpu) = NULL;
-
-		/* should be in cleanup, but blocks oprofile */
-		perf_event_release_kernel(event);
+		this_cpu_write(watchdog_ev, NULL);
+		this_cpu_write(dead_event, event);
+		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
 
 		/* watchdog_nmi_enable() expects this to be zero initially. */
 		if (atomic_dec_and_test(&watchdog_cpus))
@@ -256,6 +260,24 @@ void watchdog_nmi_disable(unsigned int cpu)
 	}
 }
 
+/**
+ * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
+ *
+ * Called from lockup_detector_cleanup(). Serialized by the caller.
+ */
+void hardlockup_detector_perf_cleanup(void)
+{
+	int cpu;
+
+	for_each_cpu(cpu, &dead_events_mask) {
+		struct perf_event *event = per_cpu(dead_event, cpu);
+
+		per_cpu(dead_event, cpu) = NULL;
+		perf_event_release_kernel(event);
+	}
+	cpumask_clear(&dead_events_mask);
+}
+
 /**
  * hardlockup_detector_perf_stop - Globally stop watchdog events
  *
-- 
cgit v1.2.3


From 01f0a02701cbcf32d22cfc9d1ab9a3f0ff2ba68c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:05 +0200
Subject: watchdog/core: Remove the park_in_progress obfuscation

Commit:

  b94f51183b06 ("kernel/watchdog: prevent false hardlockup on overloaded system")

tries to fix the following issue:

proc_write()
   set_sample_period()    <--- New sample period becoms visible
			  <----- Broken starts
   proc_watchdog_update()
     watchdog_enable_all_cpus()		watchdog_hrtimer_fn()
     update_watchdog_all_cpus()		   restart_timer(sample_period)
        watchdog_park_threads()

					thread->park()
					  disable_nmi()
			  <----- Broken ends

The reason why this is broken is that the update of the watchdog threshold
becomes immediately effective and visible for the hrtimer function which
uses that value to rearm the timer. But the NMI/perf side still uses the
old value up to the point where it is disabled. If the rate has been
lowered then the NMI can run fast enough to 'detect' a hard lockup because
the timer has not fired due to the longer period.

The patch 'fixed' this by adding a variable:

proc_write()
   set_sample_period()
					<----- Broken starts
   proc_watchdog_update()
     watchdog_enable_all_cpus()		watchdog_hrtimer_fn()
     update_watchdog_all_cpus()		   restart_timer(sample_period)
         watchdog_park_threads()
	  park_in_progress = 1
					<----- Broken ends
				        nmi_watchdog()
					  if (park_in_progress)
					     return;

The only effect of this variable was to make the window where the breakage
can hit small enough that it was not longer observable in testing. From a
correctness point of view it is a pointless bandaid which merily papers
over the root cause: the unsychronized update of the variable.

Looking deeper into the related code pathes unearthed similar problems in
the watchdog_start()/stop() functions.

 watchdog_start()
	perf_nmi_event_start()
	hrtimer_start()

 watchdog_stop()
	hrtimer_cancel()
	perf_nmi_event_stop()

In both cases the call order is wrong because if the tasks gets preempted
or the VM gets scheduled out long enough after the first call, then there is
a chance that the next NMI will see a stale hrtimer interrupt count and
trigger a false positive hard lockup splat.

Get rid of park_in_progress so the code can be gradually deobfuscated and
pruned from several layers of duct tape papering over the root cause,
which has been either ignored or not understood at all.

Once this is removed the underlying problem will be fixed by rewriting the
proc interface to do a proper synchronized update.

Address the start/stop() ordering problem as well by reverting the call
order, so this part is at least correct now.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1709052038270.2393@nanos
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h   |  1 -
 kernel/watchdog.c     | 37 +++++++++++++++++--------------------
 kernel/watchdog_hld.c |  7 ++-----
 3 files changed, 19 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 80354e6fa86d..91a3a4a4c8ae 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,7 +27,6 @@ extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
 extern int soft_watchdog_enabled;
-extern atomic_t watchdog_park_in_progress;
 #else
 static inline void touch_softlockup_watchdog_sched(void)
 {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index dd1fd59683c5..c290135fb415 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -136,8 +136,6 @@ void __weak watchdog_nmi_reconfigure(void)
 #define for_each_watchdog_cpu(cpu) \
 	for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
 
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
-
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -322,8 +320,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	int duration;
 	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
 
-	if (!watchdog_enabled ||
-	    atomic_read(&watchdog_park_in_progress) != 0)
+	if (!watchdog_enabled)
 		return HRTIMER_NORESTART;
 
 	/* kick the hardlockup detector */
@@ -437,32 +434,37 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
 
 static void watchdog_enable(unsigned int cpu)
 {
-	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
-	/* kick off the timer for the hardlockup detector */
+	/*
+	 * Start the timer first to prevent the NMI watchdog triggering
+	 * before the timer has a chance to fire.
+	 */
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = watchdog_timer_fn;
+	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
+		      HRTIMER_MODE_REL_PINNED);
 
+	/* Initialize timestamp */
+	__touch_watchdog();
 	/* Enable the perf event */
 	watchdog_nmi_enable(cpu);
 
-	/* done here because hrtimer_start can only pin to smp_processor_id() */
-	hrtimer_start(hrtimer, ns_to_ktime(sample_period),
-		      HRTIMER_MODE_REL_PINNED);
-
-	/* initialize timestamp */
 	watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
-	__touch_watchdog();
 }
 
 static void watchdog_disable(unsigned int cpu)
 {
-	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
+	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 
 	watchdog_set_prio(SCHED_NORMAL, 0);
-	hrtimer_cancel(hrtimer);
-	/* disable the perf event */
+	/*
+	 * Disable the perf event first. That prevents that a large delay
+	 * between disabling the timer and disabling the perf event causes
+	 * the perf NMI to detect a false positive.
+	 */
 	watchdog_nmi_disable(cpu);
+	hrtimer_cancel(hrtimer);
 }
 
 static void watchdog_cleanup(unsigned int cpu, bool online)
@@ -518,16 +520,11 @@ static int watchdog_park_threads(void)
 {
 	int cpu, ret = 0;
 
-	atomic_set(&watchdog_park_in_progress, 1);
-
 	for_each_watchdog_cpu(cpu) {
 		ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
 		if (ret)
 			break;
 	}
-
-	atomic_set(&watchdog_park_in_progress, 0);
-
 	return ret;
 }
 
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 94111ccb09b5..0aa191ee3d51 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -106,15 +106,12 @@ static struct perf_event_attr wd_hw_attr = {
 
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
-		 struct perf_sample_data *data,
-		 struct pt_regs *regs)
+				       struct perf_sample_data *data,
+				       struct pt_regs *regs)
 {
 	/* Ensure the watchdog never gets throttled */
 	event->hw.interrupts = 0;
 
-	if (atomic_read(&watchdog_park_in_progress) != 0)
-		return;
-
 	if (__this_cpu_read(watchdog_nmi_touch) == true) {
 		__this_cpu_write(watchdog_nmi_touch, false);
 		return;
-- 
cgit v1.2.3


From 0d85923c7a81719567311ba0eae8ecb2efd4c8a0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:09 +0200
Subject: smpboot/threads, watchdog/core: Avoid runtime allocation

smpboot_update_cpumask_threads_percpu() allocates a temporary cpumask at
runtime. This is suboptimal because the call site needs more code size for
proper error handling than a statically allocated temporary mask requires
data size.

Add static temporary cpumask. The function is globaly serialized, so no
further protection required.

Remove the half baken error handling in the watchdog code and get rid of
the export as there are no in tree modular users of that function.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.297288838@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/smpboot.h |  4 ++--
 kernel/smpboot.c        | 22 +++++++---------------
 kernel/watchdog.c       | 21 +++++----------------
 3 files changed, 14 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index 12910cf19869..c149aa7bedf3 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -55,7 +55,7 @@ smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
 }
 
 void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-					 const struct cpumask *);
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+					  const struct cpumask *);
 
 #endif
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 1d71c051a951..ed7507b69b48 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -344,39 +344,31 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
  * by the client, but only by calling this function.
  * This function can only be called on a registered smp_hotplug_thread.
  */
-int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
-					 const struct cpumask *new)
+void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
+					  const struct cpumask *new)
 {
 	struct cpumask *old = plug_thread->cpumask;
-	cpumask_var_t tmp;
+	static struct cpumask tmp;
 	unsigned int cpu;
 
-	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
-		return -ENOMEM;
-
 	get_online_cpus();
 	mutex_lock(&smpboot_threads_lock);
 
 	/* Park threads that were exclusively enabled on the old mask. */
-	cpumask_andnot(tmp, old, new);
-	for_each_cpu_and(cpu, tmp, cpu_online_mask)
+	cpumask_andnot(&tmp, old, new);
+	for_each_cpu_and(cpu, &tmp, cpu_online_mask)
 		smpboot_park_thread(plug_thread, cpu);
 
 	/* Unpark threads that are exclusively enabled on the new mask. */
-	cpumask_andnot(tmp, new, old);
-	for_each_cpu_and(cpu, tmp, cpu_online_mask)
+	cpumask_andnot(&tmp, new, old);
+	for_each_cpu_and(cpu, &tmp, cpu_online_mask)
 		smpboot_unpark_thread(plug_thread, cpu);
 
 	cpumask_copy(old, new);
 
 	mutex_unlock(&smpboot_threads_lock);
 	put_online_cpus();
-
-	free_cpumask_var(tmp);
-
-	return 0;
 }
-EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread);
 
 static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index cedf45ab4d81..8935a3a4c2fb 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -787,31 +787,20 @@ out:
 	return err;
 }
 
-static int watchdog_update_cpus(void)
+static void watchdog_update_cpus(void)
 {
-	if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR)) {
-		return smpboot_update_cpumask_percpu_thread(&watchdog_threads,
-							    &watchdog_cpumask);
+	if (IS_ENABLED(CONFIG_SOFTLOCKUP_DETECTOR) && watchdog_running) {
+		smpboot_update_cpumask_percpu_thread(&watchdog_threads,
+						     &watchdog_cpumask);
 		__lockup_detector_cleanup();
 	}
-	return 0;
 }
 
 static void proc_watchdog_cpumask_update(void)
 {
 	/* Remove impossible cpus to keep sysctl output clean. */
 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
-
-	if (watchdog_running) {
-		/*
-		 * Failure would be due to being unable to allocate a
-		 * temporary cpumask, so we are likely not in a position to
-		 * do much else to make things better.
-		 */
-		if (watchdog_update_cpus() != 0)
-			pr_err("cpumask update failed\n");
-	}
-
+	watchdog_update_cpus();
 	watchdog_nmi_reconfigure();
 }
 
-- 
cgit v1.2.3


From 3b371b5936e7777c819619c00ca60f196a8e13fa Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:13 +0200
Subject: watchdog/core: Clean up header mess

Having the same #ifdef in various places does not make it more
readable. Collect stuff into one place.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.627096864@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h | 60 ++++++++++++++++++++++-------------------------------
 1 file changed, 25 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 91a3a4a4c8ae..cfebb3bc4eed 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -14,11 +14,29 @@
 void lockup_detector_init(void);
 void lockup_detector_soft_poweroff(void);
 void lockup_detector_cleanup(void);
+bool is_hardlockup(void);
+
+extern int watchdog_user_enabled;
+extern int nmi_watchdog_enabled;
+extern int soft_watchdog_enabled;
+extern int watchdog_thresh;
+extern unsigned long watchdog_enabled;
+
+extern struct cpumask watchdog_cpumask;
+extern unsigned long *watchdog_cpumask_bits;
+#ifdef CONFIG_SMP
+extern int sysctl_softlockup_all_cpu_backtrace;
+extern int sysctl_hardlockup_all_cpu_backtrace;
 #else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#define sysctl_hardlockup_all_cpu_backtrace 0
+#endif /* !CONFIG_SMP */
+
+#else /* CONFIG_LOCKUP_DETECTOR */
 static inline void lockup_detector_init(void) { }
 static inline void lockup_detector_soft_poweroff(void) { }
 static inline void lockup_detector_cleanup(void) { }
-#endif
+#endif /* !CONFIG_LOCKUP_DETECTOR */
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 extern void touch_softlockup_watchdog_sched(void);
@@ -26,28 +44,17 @@ extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern unsigned int  softlockup_panic;
-extern int soft_watchdog_enabled;
 #else
-static inline void touch_softlockup_watchdog_sched(void)
-{
-}
-static inline void touch_softlockup_watchdog(void)
-{
-}
-static inline void touch_softlockup_watchdog_sync(void)
-{
-}
-static inline void touch_all_softlockup_watchdogs(void)
-{
-}
+static inline void touch_softlockup_watchdog_sched(void) { }
+static inline void touch_softlockup_watchdog(void) { }
+static inline void touch_softlockup_watchdog_sync(void) { }
+static inline void touch_all_softlockup_watchdogs(void) { }
 #endif
 
 #ifdef CONFIG_DETECT_HUNG_TASK
 void reset_hung_task_detector(void);
 #else
-static inline void reset_hung_task_detector(void)
-{
-}
+static inline void reset_hung_task_detector(void) { }
 #endif
 
 /*
@@ -92,7 +99,7 @@ static inline void arch_touch_nmi_watchdog(void) {}
 
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
- * 
+ *
  * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
  * may be used to reset the timeout - for code which intentionally
  * disables interrupts for a long time. This call is stateless.
@@ -162,21 +169,6 @@ static inline bool trigger_single_cpu_backtrace(int cpu)
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
 #endif
 
-#ifdef CONFIG_LOCKUP_DETECTOR
-extern int nmi_watchdog_enabled;
-extern int watchdog_user_enabled;
-extern int watchdog_thresh;
-extern unsigned long watchdog_enabled;
-extern struct cpumask watchdog_cpumask;
-extern unsigned long *watchdog_cpumask_bits;
-#ifdef CONFIG_SMP
-extern int sysctl_softlockup_all_cpu_backtrace;
-extern int sysctl_hardlockup_all_cpu_backtrace;
-#else
-#define sysctl_softlockup_all_cpu_backtrace 0
-#define sysctl_hardlockup_all_cpu_backtrace 0
-#endif
-
 #if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \
     defined(CONFIG_HARDLOCKUP_DETECTOR)
 void watchdog_update_hrtimer_threshold(u64 period);
@@ -184,7 +176,6 @@ void watchdog_update_hrtimer_threshold(u64 period);
 static inline void watchdog_update_hrtimer_threshold(u64 period) { }
 #endif
 
-extern bool is_hardlockup(void);
 struct ctl_table;
 extern int proc_watchdog(struct ctl_table *, int ,
 			 void __user *, size_t *, loff_t *);
@@ -196,7 +187,6 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
 				void __user *, size_t *, loff_t *);
 extern int proc_watchdog_cpumask(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
-#endif
 
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 #include <asm/nmi.h>
-- 
cgit v1.2.3


From 51d4052b01ca555e0d1d5fe297b309beb6c64aa0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:14 +0200
Subject: watchdog/sysctl: Get rid of the #ifdeffery

The sysctl of the nmi_watchdog file prevents writes by setting:

    min = max = 0

if none of the users is enabled. That involves ifdeffery and is competely
non obvious.

If none of the facilities is enabeld, then the file can simply be made read
only. Move the ifdeffery into the header and use a constant for file
permissions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.706073616@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h | 6 ++++++
 kernel/sysctl.c     | 6 +-----
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index cfebb3bc4eed..5774b443dba1 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -81,6 +81,12 @@ extern unsigned int hardlockup_panic;
 static inline void hardlockup_detector_disable(void) {}
 #endif
 
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+# define NMI_WATCHDOG_SYSCTL_PERM	0644
+#else
+# define NMI_WATCHDOG_SYSCTL_PERM	0444
+#endif
+
 #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
 extern void arch_touch_nmi_watchdog(void);
 extern void hardlockup_detector_perf_stop(void);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..539cb4e97bb8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -891,14 +891,10 @@ static struct ctl_table kern_table[] = {
 		.procname       = "nmi_watchdog",
 		.data           = &nmi_watchdog_enabled,
 		.maxlen         = sizeof (int),
-		.mode           = 0644,
+		.mode		= NMI_WATCHDOG_SYSCTL_PERM,
 		.proc_handler   = proc_nmi_watchdog,
 		.extra1		= &zero,
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
 		.extra2		= &one,
-#else
-		.extra2		= &zero,
-#endif
 	},
 	{
 		.procname	= "watchdog_cpumask",
-- 
cgit v1.2.3


From 7feeb9cd4f5b34476ffb9e6d58d58c5416375b19 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:15 +0200
Subject: watchdog/sysctl: Clean up sysctl variable name space

Reflect that these variables are user interface related and remove the
whitespace damage in the sysctl table while at it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194147.783210221@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h | 16 ++++++++--------
 kernel/sysctl.c     | 16 ++++++++--------
 kernel/watchdog.c   | 41 ++++++++++++++++++++---------------------
 3 files changed, 36 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 5774b443dba1..4a8d1037364e 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -17,8 +17,8 @@ void lockup_detector_cleanup(void);
 bool is_hardlockup(void);
 
 extern int watchdog_user_enabled;
-extern int nmi_watchdog_enabled;
-extern int soft_watchdog_enabled;
+extern int nmi_watchdog_user_enabled;
+extern int soft_watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 
@@ -62,12 +62,12 @@ static inline void reset_hung_task_detector(void) { }
  * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
  * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
  *
- * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
- * are variables that are only used as an 'interface' between the parameters
- * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
- * 'watchdog_thresh' variable is handled differently because its value is not
- * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
- * is equal zero.
+ * 'watchdog_user_enabled', 'nmi_watchdog_user_enabled' and
+ * 'soft_watchdog_user_enabled' are variables that are only used as an
+ * 'interface' between the parameters in /proc/sys/kernel and the internal
+ * state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is
+ * handled differently because its value is not boolean, and the lockup
+ * detectors are 'suspended' while 'watchdog_thresh' is equal zero.
  */
 #define NMI_WATCHDOG_ENABLED_BIT   0
 #define SOFT_WATCHDOG_ENABLED_BIT  1
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 539cb4e97bb8..4c08ed4a379e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -871,9 +871,9 @@ static struct ctl_table kern_table[] = {
 #if defined(CONFIG_LOCKUP_DETECTOR)
 	{
 		.procname       = "watchdog",
-		.data           = &watchdog_user_enabled,
-		.maxlen         = sizeof (int),
-		.mode           = 0644,
+		.data		= &watchdog_user_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
 		.proc_handler   = proc_watchdog,
 		.extra1		= &zero,
 		.extra2		= &one,
@@ -889,8 +889,8 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.procname       = "nmi_watchdog",
-		.data           = &nmi_watchdog_enabled,
-		.maxlen         = sizeof (int),
+		.data		= &nmi_watchdog_user_enabled,
+		.maxlen		= sizeof(int),
 		.mode		= NMI_WATCHDOG_SYSCTL_PERM,
 		.proc_handler   = proc_nmi_watchdog,
 		.extra1		= &zero,
@@ -906,9 +906,9 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 	{
 		.procname       = "soft_watchdog",
-		.data           = &soft_watchdog_enabled,
-		.maxlen         = sizeof (int),
-		.mode           = 0644,
+		.data		= &soft_watchdog_user_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
 		.proc_handler   = proc_soft_watchdog,
 		.extra1		= &zero,
 		.extra2		= &one,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index ca8747221e87..baae9fc95031 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,8 +31,6 @@
 
 static DEFINE_MUTEX(watchdog_mutex);
 
-int __read_mostly nmi_watchdog_enabled;
-
 #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
 unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
 						NMI_WATCHDOG_ENABLED;
@@ -40,6 +38,17 @@ unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
 unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
 #endif
 
+int __read_mostly nmi_watchdog_user_enabled;
+int __read_mostly soft_watchdog_user_enabled;
+int __read_mostly watchdog_user_enabled;
+int __read_mostly watchdog_thresh = 10;
+
+struct cpumask watchdog_allowed_mask __read_mostly;
+static bool softlockup_threads_initialized __read_mostly;
+
+struct cpumask watchdog_cpumask __read_mostly;
+unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
+
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 /*
  * Should we panic when a soft-lockup or hard-lockup occurs:
@@ -85,12 +94,6 @@ __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 # endif /* CONFIG_SMP */
 #endif /* CONFIG_HARDLOCKUP_DETECTOR */
 
-int __read_mostly watchdog_user_enabled;
-int __read_mostly watchdog_thresh = 10;
-
-struct cpumask watchdog_cpumask __read_mostly;
-unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
-
 /*
  * These functions can be overridden if an architecture implements its
  * own hardlockup detector.
@@ -113,7 +116,7 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
  * watchdog_nmi_reconfigure can be implemented to be notified after any
  * watchdog configuration change. The arch hardlockup watchdog should
  * respond to the following variables:
- * - nmi_watchdog_enabled
+ * - watchdog_enabled
  * - watchdog_thresh
  * - watchdog_cpumask
  * - sysctl_hardlockup_all_cpu_backtrace
@@ -126,10 +129,6 @@ void __weak watchdog_nmi_reconfigure(void) { }
 /* Global variables, exported for sysctl */
 unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-int __read_mostly soft_watchdog_enabled;
-
-struct cpumask watchdog_allowed_mask __read_mostly;
-static bool softlockup_threads_initialized __read_mostly;
 
 static u64 __read_mostly sample_period;
 
@@ -606,14 +605,14 @@ static void proc_watchdog_update(void)
 /*
  * common function for watchdog, nmi_watchdog and soft_watchdog parameter
  *
- * caller             | table->data points to | 'which' contains the flag(s)
- * -------------------|-----------------------|-----------------------------
- * proc_watchdog      | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed
- *                    |                       | with SOFT_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_nmi_watchdog  | nmi_watchdog_enabled  | NMI_WATCHDOG_ENABLED
- * -------------------|-----------------------|-----------------------------
- * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED
+ * caller             | table->data points to      | 'which'
+ * -------------------|----------------------------|--------------------------
+ * proc_watchdog      | watchdog_user_enabled      | NMI_WATCHDOG_ENABLED |
+ *                    |                            | SOFT_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_nmi_watchdog  | nmi_watchdog_user_enabled  | NMI_WATCHDOG_ENABLED
+ * -------------------|----------------------------|--------------------------
+ * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED
  */
 static int proc_watchdog_common(int which, struct ctl_table *table, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
-- 
cgit v1.2.3


From 6592ad2fcc8f15b4f99b36c1db7d9f65510c203b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:16 +0200
Subject: watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage

Both the perf reconfiguration and the powerpc watchdog_nmi_reconfigure()
need to be done in two steps.

     1) Stop all NMIs
     2) Read the new parameters and start NMIs

Right now watchdog_nmi_reconfigure() is a combination of both. To allow a
clean reconfiguration add a 'run' argument and split the functionality in
powerpc.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20170912194147.862865570@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/powerpc/kernel/watchdog.c | 17 +++++++++--------
 include/linux/nmi.h            |  2 ++
 kernel/watchdog.c              | 31 ++++++++++++++++++++++---------
 3 files changed, 33 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 5ded171f02d6..291af79a9826 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -355,17 +355,18 @@ static void watchdog_calc_timeouts(void)
 	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
 }
 
-void watchdog_nmi_reconfigure(void)
+void watchdog_nmi_reconfigure(bool run)
 {
 	int cpu;
 
-	watchdog_calc_timeouts();
-
-	for_each_cpu(cpu, &wd_cpus_enabled)
-		stop_wd_on_cpu(cpu);
-
-	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
-		start_wd_on_cpu(cpu);
+	if (!run) {
+		for_each_cpu(cpu, &wd_cpus_enabled)
+			stop_wd_on_cpu(cpu);
+	} else {
+		watchdog_calc_timeouts();
+		for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
+			start_wd_on_cpu(cpu);
+	}
 }
 
 /*
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 4a8d1037364e..eee255bc0fd6 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -103,6 +103,8 @@ static inline void arch_touch_nmi_watchdog(void) {}
 #endif
 #endif
 
+void watchdog_nmi_reconfigure(bool run);
+
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
  *
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index baae9fc95031..5693afd2b8ea 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -112,17 +112,25 @@ void __weak watchdog_nmi_disable(unsigned int cpu)
 	hardlockup_detector_perf_disable();
 }
 
-/*
- * watchdog_nmi_reconfigure can be implemented to be notified after any
- * watchdog configuration change. The arch hardlockup watchdog should
- * respond to the following variables:
+/**
+ * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs
+ * @run:	If false stop the watchdogs on all enabled CPUs
+ *		If true start the watchdogs on all enabled CPUs
+ *
+ * The core call order is:
+ * watchdog_nmi_reconfigure(false);
+ * update_variables();
+ * watchdog_nmi_reconfigure(true);
+ *
+ * The second call which starts the watchdogs again guarantees that the
+ * following variables are stable across the call.
  * - watchdog_enabled
  * - watchdog_thresh
  * - watchdog_cpumask
- * - sysctl_hardlockup_all_cpu_backtrace
- * - hardlockup_panic
+ *
+ * After the call the variables can be changed again.
  */
-void __weak watchdog_nmi_reconfigure(void) { }
+void __weak watchdog_nmi_reconfigure(bool run) { }
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
 
@@ -515,10 +523,12 @@ static void softlockup_unpark_threads(void)
 
 static void softlockup_reconfigure_threads(bool enabled)
 {
+	watchdog_nmi_reconfigure(false);
 	softlockup_park_all_threads();
 	set_sample_period();
 	if (enabled)
 		softlockup_unpark_threads();
+	watchdog_nmi_reconfigure(true);
 }
 
 /*
@@ -559,7 +569,11 @@ static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
 static inline void softlockup_init_threads(void) { }
-static inline void softlockup_reconfigure_threads(bool enabled) { }
+static void softlockup_reconfigure_threads(bool enabled)
+{
+	watchdog_nmi_reconfigure(false);
+	watchdog_nmi_reconfigure(true);
+}
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
 static void __lockup_detector_cleanup(void)
@@ -599,7 +613,6 @@ static void proc_watchdog_update(void)
 	/* Remove impossible cpus to keep sysctl output clean. */
 	cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
 	softlockup_reconfigure_threads(watchdog_enabled && watchdog_thresh);
-	watchdog_nmi_reconfigure();
 }
 
 /*
-- 
cgit v1.2.3


From 178b9f7a36d2c74a38274b66dd89f53611298a19 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:18 +0200
Subject: watchdog/hardlockup/perf: Implement init time perf validation

The watchdog tries to create perf events even after it figured out that
perf is not functional or the requested event is not supported.

That's braindead as this can be done once at init time and if not supported
the NMI watchdog can be turned off unconditonally.

Implement the perf hardlockup detector functionality for that. This creates
a new event create function, which will replace the unholy mess of the
existing one in later patches.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194148.019090547@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h   |  8 ++++++--
 kernel/watchdog_hld.c | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index eee255bc0fd6..72c62a809e92 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -93,14 +93,18 @@ extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 extern void hardlockup_detector_perf_disable(void);
 extern void hardlockup_detector_perf_cleanup(void);
+extern int hardlockup_detector_perf_init(void);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
 static inline void hardlockup_detector_perf_disable(void) { }
 static inline void hardlockup_detector_perf_cleanup(void) { }
-#if !defined(CONFIG_HAVE_NMI_WATCHDOG)
+# if !defined(CONFIG_HAVE_NMI_WATCHDOG)
+static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
 static inline void arch_touch_nmi_watchdog(void) {}
-#endif
+# else
+static inline int hardlockup_detector_perf_init(void) { return 0; }
+# endif
 #endif
 
 void watchdog_nmi_reconfigure(bool run);
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 0aa191ee3d51..f7e752e6e9b4 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -238,6 +238,27 @@ out:
 	return 0;
 }
 
+static int hardlockup_detector_event_create(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct perf_event_attr *wd_attr;
+	struct perf_event *evt;
+
+	wd_attr = &wd_hw_attr;
+	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+	/* Try to register using hardware perf events */
+	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+					       watchdog_overflow_callback, NULL);
+	if (IS_ERR(evt)) {
+		pr_info("Perf event create on CPU %d failed with %ld\n", cpu,
+			PTR_ERR(evt));
+		return PTR_ERR(evt);
+	}
+	this_cpu_write(watchdog_ev, evt);
+	return 0;
+}
+
 /**
  * hardlockup_detector_perf_disable - Disable the local event
  */
@@ -315,3 +336,19 @@ void __init hardlockup_detector_perf_restart(void)
 			perf_event_enable(event);
 	}
 }
+
+/**
+ * hardlockup_detector_perf_init - Probe whether NMI event is available at all
+ */
+int __init hardlockup_detector_perf_init(void)
+{
+	int ret = hardlockup_detector_event_create();
+
+	if (ret) {
+		pr_info("Perf NMI watchdog permanetely disabled\n");
+	} else {
+		perf_event_release_kernel(this_cpu_read(watchdog_ev));
+		this_cpu_write(watchdog_ev, NULL);
+	}
+	return ret;
+}
-- 
cgit v1.2.3


From 2a1b8ee4f5665b4291e43e4a25d964c3eb2f4c32 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 12 Sep 2017 21:37:20 +0200
Subject: watchdog/hardlockup/perf: Implement CPU enable replacement

watchdog_nmi_enable() is an unparseable mess, Provide a clean perf specific
implementation, which will be used when the existing setup/teardown mess is
replaced.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Don Zickus <dzickus@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Link: http://lkml.kernel.org/r/20170912194148.180215498@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/nmi.h   |  2 ++
 kernel/watchdog_hld.c | 11 +++++++++++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 72c62a809e92..89ba8b23c6fe 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -92,12 +92,14 @@ extern void arch_touch_nmi_watchdog(void);
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 extern void hardlockup_detector_perf_disable(void);
+extern void hardlockup_detector_perf_enable(void);
 extern void hardlockup_detector_perf_cleanup(void);
 extern int hardlockup_detector_perf_init(void);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
 static inline void hardlockup_detector_perf_disable(void) { }
+static inline void hardlockup_detector_perf_enable(void) { }
 static inline void hardlockup_detector_perf_cleanup(void) { }
 # if !defined(CONFIG_HAVE_NMI_WATCHDOG)
 static inline int hardlockup_detector_perf_init(void) { return -ENODEV; }
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index f7e752e6e9b4..99a3f22e48cc 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -259,6 +259,17 @@ static int hardlockup_detector_event_create(void)
 	return 0;
 }
 
+/**
+ * hardlockup_detector_perf_enable - Enable the local event
+ */
+void hardlockup_detector_perf_enable(void)
+{
+	if (hardlockup_detector_event_create())
+		return;
+
+	perf_event_enable(this_cpu_read(watchdog_ev));
+}
+
 /**
  * hardlockup_detector_perf_disable - Disable the local event
  */
-- 
cgit v1.2.3


From 13541226dc056fa3f54417ce12f18ba711a1591c Mon Sep 17 00:00:00 2001
From: Vineet Gupta <Vineet.Gupta1@synopsys.com>
Date: Thu, 31 Aug 2017 11:06:07 -0700
Subject: ARC: reset: remove the misleading v1 suffix all over

There is no plan yet to do a v2 board. And even if we were to do it only
some IPs would actually change, so it be best to add suffixes at that
point, not now !

Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
---
 .../devicetree/bindings/reset/snps,hsdk-reset.txt  |  28 +++++
 .../bindings/reset/snps,hsdk-v1-reset.txt          |  28 -----
 MAINTAINERS                                        |   6 +-
 drivers/reset/Kconfig                              |   6 +-
 drivers/reset/Makefile                             |   2 +-
 drivers/reset/reset-hsdk-v1.c                      | 137 ---------------------
 drivers/reset/reset-hsdk.c                         | 137 +++++++++++++++++++++
 include/dt-bindings/reset/snps,hsdk-reset.h        |  17 +++
 include/dt-bindings/reset/snps,hsdk-v1-reset.h     |  17 ---
 9 files changed, 189 insertions(+), 189 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt
 delete mode 100644 Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt
 delete mode 100644 drivers/reset/reset-hsdk-v1.c
 create mode 100644 drivers/reset/reset-hsdk.c
 create mode 100644 include/dt-bindings/reset/snps,hsdk-reset.h
 delete mode 100644 include/dt-bindings/reset/snps,hsdk-v1-reset.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt b/Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt
new file mode 100644
index 000000000000..830069b1c37c
--- /dev/null
+++ b/Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt
@@ -0,0 +1,28 @@
+Binding for the Synopsys HSDK reset controller
+
+This binding uses the common reset binding[1].
+
+[1] Documentation/devicetree/bindings/reset/reset.txt
+
+Required properties:
+- compatible: should be "snps,hsdk-reset".
+- reg: should always contain 2 pairs address - length: first for reset
+  configuration register and second for corresponding SW reset and status bits
+  register.
+- #reset-cells: from common reset binding; Should always be set to 1.
+
+Example:
+	reset: reset@880 {
+		compatible = "snps,hsdk-reset";
+		#reset-cells = <1>;
+		reg = <0x8A0 0x4>, <0xFF0 0x4>;
+	};
+
+Specifying reset lines connected to IP modules:
+	ethernet@.... {
+		....
+		resets = <&reset HSDK_V1_ETH_RESET>;
+		....
+	};
+
+The index could be found in <dt-bindings/reset/snps,hsdk-reset.h>
diff --git a/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt b/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt
deleted file mode 100644
index 6a68146ee353..000000000000
--- a/Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-Binding for the HSDK v1 reset controller
-
-This binding uses the common reset binding[1].
-
-[1] Documentation/devicetree/bindings/reset/reset.txt
-
-Required properties:
-- compatible: should be "snps,hsdk-v1.0-reset".
-- reg: should always contain 2 pairs address - length: first for reset
-  configuration register and second for corresponding SW reset and status bits
-  register.
-- #reset-cells: from common reset binding; Should always be set to 1.
-
-Example:
-	reset: reset@880 {
-		compatible = "snps,hsdk-v1.0-reset";
-		#reset-cells = <1>;
-		reg = <0x8A0 0x4>, <0xFF0 0x4>;
-	};
-
-Specifying reset lines connected to IP modules:
-	ethernet@.... {
-		....
-		resets = <&reset HSDK_V1_ETH_RESET>;
-		....
-	};
-
-The index could be found in <dt-bindings/reset/snps,hsdk-v1-reset.h>
diff --git a/MAINTAINERS b/MAINTAINERS
index 2281af4b41b6..3b9887b3644e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12915,9 +12915,9 @@ F:	drivers/mmc/host/dw_mmc*
 SYNOPSYS HSDK RESET CONTROLLER DRIVER
 M:	Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>
 S:	Supported
-F:	drivers/reset/reset-hsdk-v1.c
-F:	include/dt-bindings/reset/snps,hsdk-v1-reset.h
-F:	Documentation/devicetree/bindings/reset/snps,hsdk-v1-reset.txt
+F:	drivers/reset/reset-hsdk.c
+F:	include/dt-bindings/reset/snps,hsdk-reset.h
+F:	Documentation/devicetree/bindings/reset/snps,hsdk-reset.txt
 
 SYSTEM CONFIGURATION (SYSCON)
 M:	Lee Jones <lee.jones@linaro.org>
diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig
index d063bd5373dd..a7c7d5a8c089 100644
--- a/drivers/reset/Kconfig
+++ b/drivers/reset/Kconfig
@@ -34,12 +34,12 @@ config RESET_BERLIN
 	help
 	  This enables the reset controller driver for Marvell Berlin SoCs.
 
-config RESET_HSDK_V1
-	bool "HSDK v1 Reset Driver"
+config RESET_HSDK
+	bool "Synopsys HSDK Reset Driver"
 	depends on HAS_IOMEM
 	default n
 	help
-	  This enables the reset controller driver for HSDK v1.
+	  This enables the reset controller driver for HSDK board.
 
 config RESET_IMX7
 	bool "i.MX7 Reset Driver" if COMPILE_TEST
diff --git a/drivers/reset/Makefile b/drivers/reset/Makefile
index d368367110e5..af1c15c330b3 100644
--- a/drivers/reset/Makefile
+++ b/drivers/reset/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_ARCH_TEGRA) += tegra/
 obj-$(CONFIG_RESET_A10SR) += reset-a10sr.o
 obj-$(CONFIG_RESET_ATH79) += reset-ath79.o
 obj-$(CONFIG_RESET_BERLIN) += reset-berlin.o
-obj-$(CONFIG_RESET_HSDK_V1) += reset-hsdk-v1.o
+obj-$(CONFIG_RESET_HSDK) += reset-hsdk.o
 obj-$(CONFIG_RESET_IMX7) += reset-imx7.o
 obj-$(CONFIG_RESET_LANTIQ) += reset-lantiq.o
 obj-$(CONFIG_RESET_LPC18XX) += reset-lpc18xx.o
diff --git a/drivers/reset/reset-hsdk-v1.c b/drivers/reset/reset-hsdk-v1.c
deleted file mode 100644
index bca13e4bf622..000000000000
--- a/drivers/reset/reset-hsdk-v1.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (C) 2017 Synopsys.
- *
- * Synopsys HSDKv1 SDP reset driver.
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2. This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- */
-
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <linux/iopoll.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/platform_device.h>
-#include <linux/reset-controller.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-
-#define to_hsdkv1_rst(p)	container_of((p), struct hsdkv1_rst, rcdev)
-
-struct hsdkv1_rst {
-	void __iomem			*regs_ctl;
-	void __iomem			*regs_rst;
-	spinlock_t			lock;
-	struct reset_controller_dev	rcdev;
-};
-
-static const u32 rst_map[] = {
-	BIT(16), /* APB_RST  */
-	BIT(17), /* AXI_RST  */
-	BIT(18), /* ETH_RST  */
-	BIT(19), /* USB_RST  */
-	BIT(20), /* SDIO_RST */
-	BIT(21), /* HDMI_RST */
-	BIT(22), /* GFX_RST  */
-	BIT(25), /* DMAC_RST */
-	BIT(31), /* EBI_RST  */
-};
-
-#define HSDK_MAX_RESETS			ARRAY_SIZE(rst_map)
-
-#define CGU_SYS_RST_CTRL		0x0
-#define CGU_IP_SW_RESET			0x0
-#define CGU_IP_SW_RESET_DELAY_SHIFT	16
-#define CGU_IP_SW_RESET_DELAY_MASK	GENMASK(31, CGU_IP_SW_RESET_DELAY_SHIFT)
-#define CGU_IP_SW_RESET_DELAY		0
-#define CGU_IP_SW_RESET_RESET		BIT(0)
-#define SW_RESET_TIMEOUT		10000
-
-static void hsdkv1_reset_config(struct hsdkv1_rst *rst, unsigned long id)
-{
-	writel(rst_map[id], rst->regs_ctl + CGU_SYS_RST_CTRL);
-}
-
-static int hsdkv1_reset_do(struct hsdkv1_rst *rst)
-{
-	u32 reg;
-
-	reg = readl(rst->regs_rst + CGU_IP_SW_RESET);
-	reg &= ~CGU_IP_SW_RESET_DELAY_MASK;
-	reg |= CGU_IP_SW_RESET_DELAY << CGU_IP_SW_RESET_DELAY_SHIFT;
-	reg |= CGU_IP_SW_RESET_RESET;
-	writel(reg, rst->regs_rst + CGU_IP_SW_RESET);
-
-	/* wait till reset bit is back to 0 */
-	return readl_poll_timeout_atomic(rst->regs_rst + CGU_IP_SW_RESET, reg,
-		!(reg & CGU_IP_SW_RESET_RESET), 5, SW_RESET_TIMEOUT);
-}
-
-static int hsdkv1_reset_reset(struct reset_controller_dev *rcdev,
-			      unsigned long id)
-{
-	struct hsdkv1_rst *rst = to_hsdkv1_rst(rcdev);
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&rst->lock, flags);
-	hsdkv1_reset_config(rst, id);
-	ret = hsdkv1_reset_do(rst);
-	spin_unlock_irqrestore(&rst->lock, flags);
-
-	return ret;
-}
-
-static const struct reset_control_ops hsdkv1_reset_ops = {
-	.reset	= hsdkv1_reset_reset,
-};
-
-static int hsdkv1_reset_probe(struct platform_device *pdev)
-{
-	struct hsdkv1_rst *rst;
-	struct resource *mem;
-
-	rst = devm_kzalloc(&pdev->dev, sizeof(*rst), GFP_KERNEL);
-	if (!rst)
-		return -ENOMEM;
-
-	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	rst->regs_ctl = devm_ioremap_resource(&pdev->dev, mem);
-	if (IS_ERR(rst->regs_ctl))
-		return PTR_ERR(rst->regs_ctl);
-
-	mem = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	rst->regs_rst = devm_ioremap_resource(&pdev->dev, mem);
-	if (IS_ERR(rst->regs_rst))
-		return PTR_ERR(rst->regs_rst);
-
-	spin_lock_init(&rst->lock);
-
-	rst->rcdev.owner = THIS_MODULE;
-	rst->rcdev.ops = &hsdkv1_reset_ops;
-	rst->rcdev.of_node = pdev->dev.of_node;
-	rst->rcdev.nr_resets = HSDK_MAX_RESETS;
-	rst->rcdev.of_reset_n_cells = 1;
-
-	return reset_controller_register(&rst->rcdev);
-}
-
-static const struct of_device_id hsdkv1_reset_dt_match[] = {
-	{ .compatible = "snps,hsdk-v1.0-reset" },
-	{ },
-};
-
-static struct platform_driver hsdkv1_reset_driver = {
-	.probe	= hsdkv1_reset_probe,
-	.driver	= {
-		.name = "hsdk-v1.0-reset",
-		.of_match_table = hsdkv1_reset_dt_match,
-	},
-};
-builtin_platform_driver(hsdkv1_reset_driver);
-
-MODULE_AUTHOR("Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>");
-MODULE_DESCRIPTION("Synopsys HSDKv1 SDP reset driver");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/reset/reset-hsdk.c b/drivers/reset/reset-hsdk.c
new file mode 100644
index 000000000000..8bce391c6943
--- /dev/null
+++ b/drivers/reset/reset-hsdk.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2017 Synopsys.
+ *
+ * Synopsys HSDK Development platform reset driver.
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/reset-controller.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#define to_hsdk_rst(p)	container_of((p), struct hsdk_rst, rcdev)
+
+struct hsdk_rst {
+	void __iomem			*regs_ctl;
+	void __iomem			*regs_rst;
+	spinlock_t			lock;
+	struct reset_controller_dev	rcdev;
+};
+
+static const u32 rst_map[] = {
+	BIT(16), /* APB_RST  */
+	BIT(17), /* AXI_RST  */
+	BIT(18), /* ETH_RST  */
+	BIT(19), /* USB_RST  */
+	BIT(20), /* SDIO_RST */
+	BIT(21), /* HDMI_RST */
+	BIT(22), /* GFX_RST  */
+	BIT(25), /* DMAC_RST */
+	BIT(31), /* EBI_RST  */
+};
+
+#define HSDK_MAX_RESETS			ARRAY_SIZE(rst_map)
+
+#define CGU_SYS_RST_CTRL		0x0
+#define CGU_IP_SW_RESET			0x0
+#define CGU_IP_SW_RESET_DELAY_SHIFT	16
+#define CGU_IP_SW_RESET_DELAY_MASK	GENMASK(31, CGU_IP_SW_RESET_DELAY_SHIFT)
+#define CGU_IP_SW_RESET_DELAY		0
+#define CGU_IP_SW_RESET_RESET		BIT(0)
+#define SW_RESET_TIMEOUT		10000
+
+static void hsdk_reset_config(struct hsdk_rst *rst, unsigned long id)
+{
+	writel(rst_map[id], rst->regs_ctl + CGU_SYS_RST_CTRL);
+}
+
+static int hsdk_reset_do(struct hsdk_rst *rst)
+{
+	u32 reg;
+
+	reg = readl(rst->regs_rst + CGU_IP_SW_RESET);
+	reg &= ~CGU_IP_SW_RESET_DELAY_MASK;
+	reg |= CGU_IP_SW_RESET_DELAY << CGU_IP_SW_RESET_DELAY_SHIFT;
+	reg |= CGU_IP_SW_RESET_RESET;
+	writel(reg, rst->regs_rst + CGU_IP_SW_RESET);
+
+	/* wait till reset bit is back to 0 */
+	return readl_poll_timeout_atomic(rst->regs_rst + CGU_IP_SW_RESET, reg,
+		!(reg & CGU_IP_SW_RESET_RESET), 5, SW_RESET_TIMEOUT);
+}
+
+static int hsdk_reset_reset(struct reset_controller_dev *rcdev,
+			      unsigned long id)
+{
+	struct hsdk_rst *rst = to_hsdk_rst(rcdev);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&rst->lock, flags);
+	hsdk_reset_config(rst, id);
+	ret = hsdk_reset_do(rst);
+	spin_unlock_irqrestore(&rst->lock, flags);
+
+	return ret;
+}
+
+static const struct reset_control_ops hsdk_reset_ops = {
+	.reset	= hsdk_reset_reset,
+};
+
+static int hsdk_reset_probe(struct platform_device *pdev)
+{
+	struct hsdk_rst *rst;
+	struct resource *mem;
+
+	rst = devm_kzalloc(&pdev->dev, sizeof(*rst), GFP_KERNEL);
+	if (!rst)
+		return -ENOMEM;
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	rst->regs_ctl = devm_ioremap_resource(&pdev->dev, mem);
+	if (IS_ERR(rst->regs_ctl))
+		return PTR_ERR(rst->regs_ctl);
+
+	mem = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	rst->regs_rst = devm_ioremap_resource(&pdev->dev, mem);
+	if (IS_ERR(rst->regs_rst))
+		return PTR_ERR(rst->regs_rst);
+
+	spin_lock_init(&rst->lock);
+
+	rst->rcdev.owner = THIS_MODULE;
+	rst->rcdev.ops = &hsdk_reset_ops;
+	rst->rcdev.of_node = pdev->dev.of_node;
+	rst->rcdev.nr_resets = HSDK_MAX_RESETS;
+	rst->rcdev.of_reset_n_cells = 1;
+
+	return reset_controller_register(&rst->rcdev);
+}
+
+static const struct of_device_id hsdk_reset_dt_match[] = {
+	{ .compatible = "snps,hsdk-reset" },
+	{ },
+};
+
+static struct platform_driver hsdk_reset_driver = {
+	.probe	= hsdk_reset_probe,
+	.driver	= {
+		.name = "hsdk-reset",
+		.of_match_table = hsdk_reset_dt_match,
+	},
+};
+builtin_platform_driver(hsdk_reset_driver);
+
+MODULE_AUTHOR("Eugeniy Paltsev <Eugeniy.Paltsev@synopsys.com>");
+MODULE_DESCRIPTION("Synopsys HSDK SDP reset driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/dt-bindings/reset/snps,hsdk-reset.h b/include/dt-bindings/reset/snps,hsdk-reset.h
new file mode 100644
index 000000000000..e1a643e4bc91
--- /dev/null
+++ b/include/dt-bindings/reset/snps,hsdk-reset.h
@@ -0,0 +1,17 @@
+/**
+ * This header provides index for the HSDK reset controller.
+ */
+#ifndef _DT_BINDINGS_RESET_CONTROLLER_SNPS_HSDK
+#define _DT_BINDINGS_RESET_CONTROLLER_SNPS_HSDK
+
+#define HSDK_APB_RESET	0
+#define HSDK_AXI_RESET	1
+#define HSDK_ETH_RESET	2
+#define HSDK_USB_RESET	3
+#define HSDK_SDIO_RESET	4
+#define HSDK_HDMI_RESET	5
+#define HSDK_GFX_RESET	6
+#define HSDK_DMAC_RESET	7
+#define HSDK_EBI_RESET	8
+
+#endif /*_DT_BINDINGS_RESET_CONTROLLER_SNPS_HSDK*/
diff --git a/include/dt-bindings/reset/snps,hsdk-v1-reset.h b/include/dt-bindings/reset/snps,hsdk-v1-reset.h
deleted file mode 100644
index d898c89b7123..000000000000
--- a/include/dt-bindings/reset/snps,hsdk-v1-reset.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/**
- * This header provides index for the HSDK v1 reset controller.
- */
-#ifndef _DT_BINDINGS_RESET_CONTROLLER_HSDK_V1
-#define _DT_BINDINGS_RESET_CONTROLLER_HSDK_V1
-
-#define HSDK_V1_APB_RESET	0
-#define HSDK_V1_AXI_RESET	1
-#define HSDK_V1_ETH_RESET	2
-#define HSDK_V1_USB_RESET	3
-#define HSDK_V1_SDIO_RESET	4
-#define HSDK_V1_HDMI_RESET	5
-#define HSDK_V1_GFX_RESET	6
-#define HSDK_V1_DMAC_RESET	7
-#define HSDK_V1_EBI_RESET	8
-
-#endif /*_DT_BINDINGS_RESET_CONTROLLER_HSDK_V1*/
-- 
cgit v1.2.3


From 5a5d718f952b55e7fa96bfaf6f31f2c08babd77b Mon Sep 17 00:00:00 2001
From: Sriram Periyasamy <sriramx.periyasamy@intel.com>
Date: Tue, 19 Sep 2017 17:25:05 -0500
Subject: ALSA: hda - program ICT bits to support HBR audio

On recent Intel platforms (Haswell, Broadwell, Skylake, ApolloLake,
KabyLake, ...), the IEC Coding Type (ICT) bitfield in the Digital
Converter Control #3 needs to be set explicitly for HDMI/DisplayPort
High Bit Rate (HBR) audio playback to work. This was not required in
earlier platforms when HBR was first introduced. The ICT bits are
defined in Section 7.3.3.9 of the HDaudio 1.0a specification.

Since the ICT bitfield was not specified for HDAudio 1.0 devices
(before 2009), we only program it on machines more recent than
Haswell.

We tested that this fix is not needed on Baytrail-I (MinnowBoard
Turbot) and believe by extension it also does not apply to Braswell.

[ Moved AC_VERB_SET_DIGI_CONVERT_3 definition to the right place
  by tiwai ]

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=98797

Signed-off-by: Sriram Periyasamy <sriramx.periyasamy@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Subhransu S. Prusty <subhransu.s.prusty@intel.com>
Acked-by: Vinod Koul <vinod.koul@intel.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/hda_verbs.h  |  1 +
 sound/pci/hda/patch_hdmi.c | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/sound/hda_verbs.h b/include/sound/hda_verbs.h
index d0509db6d0ec..f89cd5ee1c7a 100644
--- a/include/sound/hda_verbs.h
+++ b/include/sound/hda_verbs.h
@@ -95,6 +95,7 @@ enum {
 #define AC_VERB_SET_EAPD_BTLENABLE		0x70c
 #define AC_VERB_SET_DIGI_CONVERT_1		0x70d
 #define AC_VERB_SET_DIGI_CONVERT_2		0x70e
+#define AC_VERB_SET_DIGI_CONVERT_3		0x73e
 #define AC_VERB_SET_VOLUME_KNOB_CONTROL		0x70f
 #define AC_VERB_SET_GPIO_DATA			0x715
 #define AC_VERB_SET_GPIO_MASK			0x716
diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index 2b64fabd5faa..c19c81d230bd 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -906,6 +906,7 @@ static int hdmi_setup_stream(struct hda_codec *codec, hda_nid_t cvt_nid,
 			      hda_nid_t pin_nid, u32 stream_tag, int format)
 {
 	struct hdmi_spec *spec = codec->spec;
+	unsigned int param;
 	int err;
 
 	err = spec->ops.pin_hbr_setup(codec, pin_nid, is_hbr_format(format));
@@ -915,6 +916,26 @@ static int hdmi_setup_stream(struct hda_codec *codec, hda_nid_t cvt_nid,
 		return err;
 	}
 
+	if (is_haswell_plus(codec)) {
+
+		/*
+		 * on recent platforms IEC Coding Type is required for HBR
+		 * support, read current Digital Converter settings and set
+		 * ICT bitfield if needed.
+		 */
+		param = snd_hda_codec_read(codec, cvt_nid, 0,
+					   AC_VERB_GET_DIGI_CONVERT_1, 0);
+
+		param = (param >> 16) & ~(AC_DIG3_ICT);
+
+		/* on recent platforms ICT mode is required for HBR support */
+		if (is_hbr_format(format))
+			param |= 0x1;
+
+		snd_hda_codec_write(codec, cvt_nid, 0,
+				    AC_VERB_SET_DIGI_CONVERT_3, param);
+	}
+
 	snd_hda_codec_setup_stream(codec, cvt_nid, stream_tag, 0, format);
 	return 0;
 }
-- 
cgit v1.2.3


From 62e082430ea4bb5b28909ca4375bb683931e22aa Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 20 Sep 2017 07:29:49 -0400
Subject: dm ioctl: fix alignment of event number in the device list

The size of struct dm_name_list is different on 32-bit and 64-bit
kernels (so "(nl + 1)" differs between 32-bit and 64-bit kernels).

This mismatch caused some harmless difference in padding when using 32-bit
or 64-bit kernel. Commit 23d70c5e52dd ("dm ioctl: report event number in
DM_LIST_DEVICES") added reporting event number in the output of
DM_LIST_DEVICES_CMD. This difference in padding makes it impossible for
userspace to determine the location of the event number (the location
would be different when running on 32-bit and 64-bit kernels).

Fix the padding by using offsetof(struct dm_name_list, name) instead of
sizeof(struct dm_name_list) to determine the location of entries.

Also, the ioctl version number is incremented to 37 so that userspace
can use the version number to determine that the event number is present
and correctly located.

In addition, a global event is now raised when a DM device is created,
removed, renamed or when table is swapped, so that the user can monitor
for device changes.

Reported-by: Eugene Syromiatnikov <esyr@redhat.com>
Fixes: 23d70c5e52dd ("dm ioctl: report event number in DM_LIST_DEVICES")
Cc: stable@vger.kernel.org # 4.13
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-core.h          |  1 +
 drivers/md/dm-ioctl.c         | 37 ++++++++++++++++++++++++-------------
 drivers/md/dm.c               | 10 ++++++++--
 include/uapi/linux/dm-ioctl.h |  4 ++--
 4 files changed, 35 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 24eddbdf2ab4..203144762f36 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -149,5 +149,6 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
 
 extern atomic_t dm_global_event_nr;
 extern wait_queue_head_t dm_global_eventq;
+void dm_issue_global_event(void);
 
 #endif
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 8756a6850431..e52676fa9832 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -477,9 +477,13 @@ static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_si
  * Round up the ptr to an 8-byte boundary.
  */
 #define ALIGN_MASK 7
+static inline size_t align_val(size_t val)
+{
+	return (val + ALIGN_MASK) & ~ALIGN_MASK;
+}
 static inline void *align_ptr(void *ptr)
 {
-	return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
+	return (void *)align_val((size_t)ptr);
 }
 
 /*
@@ -505,7 +509,7 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_
 	struct hash_cell *hc;
 	size_t len, needed = 0;
 	struct gendisk *disk;
-	struct dm_name_list *nl, *old_nl = NULL;
+	struct dm_name_list *orig_nl, *nl, *old_nl = NULL;
 	uint32_t *event_nr;
 
 	down_write(&_hash_lock);
@@ -516,17 +520,15 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_
 	 */
 	for (i = 0; i < NUM_BUCKETS; i++) {
 		list_for_each_entry (hc, _name_buckets + i, name_list) {
-			needed += sizeof(struct dm_name_list);
-			needed += strlen(hc->name) + 1;
-			needed += ALIGN_MASK;
-			needed += (sizeof(uint32_t) + ALIGN_MASK) & ~ALIGN_MASK;
+			needed += align_val(offsetof(struct dm_name_list, name) + strlen(hc->name) + 1);
+			needed += align_val(sizeof(uint32_t));
 		}
 	}
 
 	/*
 	 * Grab our output buffer.
 	 */
-	nl = get_result_buffer(param, param_size, &len);
+	nl = orig_nl = get_result_buffer(param, param_size, &len);
 	if (len < needed) {
 		param->flags |= DM_BUFFER_FULL_FLAG;
 		goto out;
@@ -549,11 +551,16 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_
 			strcpy(nl->name, hc->name);
 
 			old_nl = nl;
-			event_nr = align_ptr(((void *) (nl + 1)) + strlen(hc->name) + 1);
+			event_nr = align_ptr(nl->name + strlen(hc->name) + 1);
 			*event_nr = dm_get_event_nr(hc->md);
 			nl = align_ptr(event_nr + 1);
 		}
 	}
+	/*
+	 * If mismatch happens, security may be compromised due to buffer
+	 * overflow, so it's better to crash.
+	 */
+	BUG_ON((char *)nl - (char *)orig_nl != needed);
 
  out:
 	up_write(&_hash_lock);
@@ -1621,7 +1628,8 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para
  * which has a variable size, is not used by the function processing
  * the ioctl.
  */
-#define IOCTL_FLAGS_NO_PARAMS	1
+#define IOCTL_FLAGS_NO_PARAMS		1
+#define IOCTL_FLAGS_ISSUE_GLOBAL_EVENT	2
 
 /*-----------------------------------------------------------------
  * Implementation of open/close/ioctl on the special char
@@ -1635,12 +1643,12 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
 		ioctl_fn fn;
 	} _ioctls[] = {
 		{DM_VERSION_CMD, 0, NULL}, /* version is dealt with elsewhere */
-		{DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS, remove_all},
+		{DM_REMOVE_ALL_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, remove_all},
 		{DM_LIST_DEVICES_CMD, 0, list_devices},
 
-		{DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_create},
-		{DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS, dev_remove},
-		{DM_DEV_RENAME_CMD, 0, dev_rename},
+		{DM_DEV_CREATE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_create},
+		{DM_DEV_REMOVE_CMD, IOCTL_FLAGS_NO_PARAMS | IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_remove},
+		{DM_DEV_RENAME_CMD, IOCTL_FLAGS_ISSUE_GLOBAL_EVENT, dev_rename},
 		{DM_DEV_SUSPEND_CMD, IOCTL_FLAGS_NO_PARAMS, dev_suspend},
 		{DM_DEV_STATUS_CMD, IOCTL_FLAGS_NO_PARAMS, dev_status},
 		{DM_DEV_WAIT_CMD, 0, dev_wait},
@@ -1869,6 +1877,9 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us
 	    unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
 		DMERR("ioctl %d tried to output some data but has IOCTL_FLAGS_NO_PARAMS set", cmd);
 
+	if (!r && ioctl_flags & IOCTL_FLAGS_ISSUE_GLOBAL_EVENT)
+		dm_issue_global_event();
+
 	/*
 	 * Copy the results back to userland.
 	 */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6e54145969c5..4be85324f44d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -52,6 +52,12 @@ static struct workqueue_struct *deferred_remove_workqueue;
 atomic_t dm_global_event_nr = ATOMIC_INIT(0);
 DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
 
+void dm_issue_global_event(void)
+{
+	atomic_inc(&dm_global_event_nr);
+	wake_up(&dm_global_eventq);
+}
+
 /*
  * One of these is allocated per bio.
  */
@@ -1865,9 +1871,8 @@ static void event_callback(void *context)
 	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
 
 	atomic_inc(&md->event_nr);
-	atomic_inc(&dm_global_event_nr);
 	wake_up(&md->eventq);
-	wake_up(&dm_global_eventq);
+	dm_issue_global_event();
 }
 
 /*
@@ -2283,6 +2288,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	}
 
 	map = __bind(md, table, &limits);
+	dm_issue_global_event();
 
 out:
 	mutex_unlock(&md->suspend_lock);
diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h
index 412c06a624c8..ccaea525340b 100644
--- a/include/uapi/linux/dm-ioctl.h
+++ b/include/uapi/linux/dm-ioctl.h
@@ -269,9 +269,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	36
+#define DM_VERSION_MINOR	37
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2017-06-09)"
+#define DM_VERSION_EXTRA	"-ioctl (2017-09-20)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3


From b4391db42308c9940944b5d7be5ca4b78fb88dd0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 22 Sep 2017 23:29:19 +0200
Subject: netlink: fix nla_put_{u8,u16,u32} for KASAN

When CONFIG_KASAN is enabled, the "--param asan-stack=1" causes rather large
stack frames in some functions. This goes unnoticed normally because
CONFIG_FRAME_WARN is disabled with CONFIG_KASAN by default as of commit
3f181b4d8652 ("lib/Kconfig.debug: disable -Wframe-larger-than warnings with
KASAN=y").

The kernelci.org build bot however has the warning enabled and that led
me to investigate it a little further, as every build produces these warnings:

net/wireless/nl80211.c:4389:1: warning: the frame size of 2240 bytes is larger than 2048 bytes [-Wframe-larger-than=]
net/wireless/nl80211.c:1895:1: warning: the frame size of 3776 bytes is larger than 2048 bytes [-Wframe-larger-than=]
net/wireless/nl80211.c:1410:1: warning: the frame size of 2208 bytes is larger than 2048 bytes [-Wframe-larger-than=]
net/bridge/br_netlink.c:1282:1: warning: the frame size of 2544 bytes is larger than 2048 bytes [-Wframe-larger-than=]

Most of this problem is now solved in gcc-8, which can consolidate
the stack slots for the inline function arguments. On older compilers
we can add a workaround by declaring a local variable in each function
to pass the inline function argument.

Cc: stable@vger.kernel.org
Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 73 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index e51cf5f81597..14c289393071 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -773,7 +773,10 @@ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
  */
 static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value)
 {
-	return nla_put(skb, attrtype, sizeof(u8), &value);
+	/* temporary variables to work around GCC PR81715 with asan-stack=1 */
+	u8 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(u8), &tmp);
 }
 
 /**
@@ -784,7 +787,9 @@ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value)
  */
 static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value)
 {
-	return nla_put(skb, attrtype, sizeof(u16), &value);
+	u16 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(u16), &tmp);
 }
 
 /**
@@ -795,7 +800,9 @@ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value)
  */
 static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value)
 {
-	return nla_put(skb, attrtype, sizeof(__be16), &value);
+	__be16 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(__be16), &tmp);
 }
 
 /**
@@ -806,7 +813,9 @@ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value)
  */
 static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value)
 {
-	return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, value);
+	__be16 tmp = value;
+
+	return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp);
 }
 
 /**
@@ -817,7 +826,9 @@ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value)
  */
 static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value)
 {
-	return nla_put(skb, attrtype, sizeof(__le16), &value);
+	__le16 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(__le16), &tmp);
 }
 
 /**
@@ -828,7 +839,9 @@ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value)
  */
 static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
 {
-	return nla_put(skb, attrtype, sizeof(u32), &value);
+	u32 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(u32), &tmp);
 }
 
 /**
@@ -839,7 +852,9 @@ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
  */
 static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value)
 {
-	return nla_put(skb, attrtype, sizeof(__be32), &value);
+	__be32 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(__be32), &tmp);
 }
 
 /**
@@ -850,7 +865,9 @@ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value)
  */
 static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value)
 {
-	return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, value);
+	__be32 tmp = value;
+
+	return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp);
 }
 
 /**
@@ -861,7 +878,9 @@ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value)
  */
 static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value)
 {
-	return nla_put(skb, attrtype, sizeof(__le32), &value);
+	__le32 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(__le32), &tmp);
 }
 
 /**
@@ -874,7 +893,9 @@ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value)
 static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype,
 				    u64 value, int padattr)
 {
-	return nla_put_64bit(skb, attrtype, sizeof(u64), &value, padattr);
+	u64 tmp = value;
+
+	return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr);
 }
 
 /**
@@ -887,7 +908,9 @@ static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype,
 static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
 			       int padattr)
 {
-	return nla_put_64bit(skb, attrtype, sizeof(__be64), &value, padattr);
+	__be64 tmp = value;
+
+	return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr);
 }
 
 /**
@@ -900,7 +923,9 @@ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
 static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
 				int padattr)
 {
-	return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, value,
+	__be64 tmp = value;
+
+	return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp,
 			    padattr);
 }
 
@@ -914,7 +939,9 @@ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
 static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value,
 			       int padattr)
 {
-	return nla_put_64bit(skb, attrtype, sizeof(__le64), &value, padattr);
+	__le64 tmp = value;
+
+	return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr);
 }
 
 /**
@@ -925,7 +952,9 @@ static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value,
  */
 static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value)
 {
-	return nla_put(skb, attrtype, sizeof(s8), &value);
+	s8 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(s8), &tmp);
 }
 
 /**
@@ -936,7 +965,9 @@ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value)
  */
 static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value)
 {
-	return nla_put(skb, attrtype, sizeof(s16), &value);
+	s16 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(s16), &tmp);
 }
 
 /**
@@ -947,7 +978,9 @@ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value)
  */
 static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value)
 {
-	return nla_put(skb, attrtype, sizeof(s32), &value);
+	s32 tmp = value;
+
+	return nla_put(skb, attrtype, sizeof(s32), &tmp);
 }
 
 /**
@@ -960,7 +993,9 @@ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value)
 static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value,
 			      int padattr)
 {
-	return nla_put_64bit(skb, attrtype, sizeof(s64), &value, padattr);
+	s64 tmp = value;
+
+	return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr);
 }
 
 /**
@@ -1010,7 +1045,9 @@ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype,
 static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype,
 				  __be32 addr)
 {
-	return nla_put_be32(skb, attrtype, addr);
+	__be32 tmp = addr;
+
+	return nla_put_be32(skb, attrtype, tmp);
 }
 
 /**
-- 
cgit v1.2.3


From 99d3cd27f755d63fd6cf85169eaa873d90769aa5 Mon Sep 17 00:00:00 2001
From: Inbar Karmy <inbark@mellanox.com>
Date: Thu, 24 Aug 2017 17:21:44 +0300
Subject: net/mlx5: Fix FPGA capability location

Currently, FPGA capability is located in (mdev)->caps.hca_cur,
change the location to be (mdev)->caps.fpga,
since hca_cur is reserved for HCA device capabilities.

Fixes: e29341fb3a5b ("net/mlx5: FPGA, Add basic support for Innova")
Signed-off-by: Inbar Karmy <inbark@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c  | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h  | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c | 3 +--
 include/linux/mlx5/device.h                         | 5 ++---
 include/linux/mlx5/driver.h                         | 1 +
 5 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c
index e37453d838db..c0fd2212e890 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.c
@@ -71,11 +71,11 @@ int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr,
 	return 0;
 }
 
-int mlx5_fpga_caps(struct mlx5_core_dev *dev, u32 *caps)
+int mlx5_fpga_caps(struct mlx5_core_dev *dev)
 {
 	u32 in[MLX5_ST_SZ_DW(fpga_cap)] = {0};
 
-	return mlx5_core_access_reg(dev, in, sizeof(in), caps,
+	return mlx5_core_access_reg(dev, in, sizeof(in), dev->caps.fpga,
 				    MLX5_ST_SZ_BYTES(fpga_cap),
 				    MLX5_REG_FPGA_CAP, 0, 0);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
index 94bdfd47c3f0..d05233c9b4f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/cmd.h
@@ -65,7 +65,7 @@ struct mlx5_fpga_qp_counters {
 	u64 rx_total_drop;
 };
 
-int mlx5_fpga_caps(struct mlx5_core_dev *dev, u32 *caps);
+int mlx5_fpga_caps(struct mlx5_core_dev *dev);
 int mlx5_fpga_query(struct mlx5_core_dev *dev, struct mlx5_fpga_query *query);
 int mlx5_fpga_ctrl_op(struct mlx5_core_dev *dev, u8 op);
 int mlx5_fpga_access_reg(struct mlx5_core_dev *dev, u8 size, u64 addr,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index 9034e9960a76..dc8970346521 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -139,8 +139,7 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
 	if (err)
 		goto out;
 
-	err = mlx5_fpga_caps(fdev->mdev,
-			     fdev->mdev->caps.hca_cur[MLX5_CAP_FPGA]);
+	err = mlx5_fpga_caps(fdev->mdev);
 	if (err)
 		goto out;
 
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index eaf4ad209c8f..e32dbc4934db 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -980,7 +980,6 @@ enum mlx5_cap_type {
 	MLX5_CAP_RESERVED,
 	MLX5_CAP_VECTOR_CALC,
 	MLX5_CAP_QOS,
-	MLX5_CAP_FPGA,
 	/* NUM OF CAP Types */
 	MLX5_CAP_NUM
 };
@@ -1110,10 +1109,10 @@ enum mlx5_mcam_feature_groups {
 	MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld)
 
 #define MLX5_CAP_FPGA(mdev, cap) \
-	MLX5_GET(fpga_cap, (mdev)->caps.hca_cur[MLX5_CAP_FPGA], cap)
+	MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap)
 
 #define MLX5_CAP64_FPGA(mdev, cap) \
-	MLX5_GET64(fpga_cap, (mdev)->caps.hca_cur[MLX5_CAP_FPGA], cap)
+	MLX5_GET64(fpga_cap, (mdev)->caps.fpga, cap)
 
 enum {
 	MLX5_CMD_STAT_OK			= 0x0,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 02ff700e4f30..401c8972cc3a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -774,6 +774,7 @@ struct mlx5_core_dev {
 		u32 hca_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 		u32 pcam[MLX5_ST_SZ_DW(pcam_reg)];
 		u32 mcam[MLX5_ST_SZ_DW(mcam_reg)];
+		u32 fpga[MLX5_ST_SZ_DW(fpga_cap)];
 	} caps;
 	phys_addr_t		iseg_base;
 	struct mlx5_init_seg __iomem *iseg;
-- 
cgit v1.2.3


From 16f1c5bb3ed75b3cf3ced537db40f7e1a244debe Mon Sep 17 00:00:00 2001
From: Raed Salem <raeds@mellanox.com>
Date: Sun, 30 Jul 2017 11:02:51 +0300
Subject: net/mlx5: Check device capability for maximum flow counters

Added check for the maximal number of flow counters attached
to rule (FTE).

Fixes: bd5251dbf156b ('net/mlx5_core: Introduce flow steering destination of type counter')
Signed-off-by: Raed Salem <raeds@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c  |  8 ++++++++
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 11 +++++++++++
 include/linux/mlx5/mlx5_ifc.h                     |  3 ++-
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index e0d0efd903bc..36ecc2b2e187 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -293,6 +293,9 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 	}
 
 	if (fte->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		int max_list_size = BIT(MLX5_CAP_FLOWTABLE_TYPE(dev,
+					log_max_flow_counter,
+					ft->type));
 		int list_size = 0;
 
 		list_for_each_entry(dst, &fte->node.children, node.list) {
@@ -305,12 +308,17 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 			in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
 			list_size++;
 		}
+		if (list_size > max_list_size) {
+			err = -EINVAL;
+			goto err_out;
+		}
 
 		MLX5_SET(flow_context, in_flow_context, flow_counter_list_size,
 			 list_size);
 	}
 
 	err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out));
+err_out:
 	kvfree(in);
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 5509a752f98e..48dd78975062 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -52,6 +52,7 @@ enum fs_flow_table_type {
 	FS_FT_FDB             = 0X4,
 	FS_FT_SNIFFER_RX	= 0X5,
 	FS_FT_SNIFFER_TX	= 0X6,
+	FS_FT_MAX_TYPE = FS_FT_SNIFFER_TX,
 };
 
 enum fs_flow_table_op_mod {
@@ -260,4 +261,14 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
 #define fs_for_each_dst(pos, fte)			\
 	fs_list_for_each_entry(pos, &(fte)->node.children)
 
+#define MLX5_CAP_FLOWTABLE_TYPE(mdev, cap, type) (		\
+	(type == FS_FT_NIC_RX) ? MLX5_CAP_FLOWTABLE_NIC_RX(mdev, cap) :		\
+	(type == FS_FT_ESW_EGRESS_ACL) ? MLX5_CAP_ESW_EGRESS_ACL(mdev, cap) :		\
+	(type == FS_FT_ESW_INGRESS_ACL) ? MLX5_CAP_ESW_INGRESS_ACL(mdev, cap) :		\
+	(type == FS_FT_FDB) ? MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) :		\
+	(type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) :		\
+	(type == FS_FT_SNIFFER_TX) ? MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) :		\
+	(BUILD_BUG_ON_ZERO(FS_FT_SNIFFER_TX != FS_FT_MAX_TYPE))\
+	)
+
 #endif
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a528b35a022e..69772347f866 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -327,7 +327,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         reserved_at_80[0x18];
 	u8         log_max_destination[0x8];
 
-	u8         reserved_at_a0[0x18];
+	u8         log_max_flow_counter[0x8];
+	u8         reserved_at_a8[0x10];
 	u8         log_max_flow[0x8];
 
 	u8         reserved_at_c0[0x40];
-- 
cgit v1.2.3


From e6b72ee88a56bcfe63f72e9c30766484c45bec72 Mon Sep 17 00:00:00 2001
From: Artem Savkov <asavkov@redhat.com>
Date: Tue, 26 Sep 2017 18:35:45 +0200
Subject: netfilter: ebtables: fix race condition in frame_filter_net_init()

It is possible for ebt_in_hook to be triggered before ebt_table is assigned
resulting in a NULL-pointer dereference. Make sure hooks are
registered as the last step.

Fixes: aee12a0a3727 ("ebtables: remove nf_hook_register usage")
Signed-off-by: Artem Savkov <asavkov@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge/ebtables.h |  7 ++++---
 net/bridge/netfilter/ebtable_broute.c     |  4 ++--
 net/bridge/netfilter/ebtable_filter.c     |  4 ++--
 net/bridge/netfilter/ebtable_nat.c        |  4 ++--
 net/bridge/netfilter/ebtables.c           | 17 +++++++++--------
 5 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 2c2a5514b0df..528b24c78308 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -108,9 +108,10 @@ struct ebt_table {
 
 #define EBT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) & \
 		     ~(__alignof__(struct _xt_align)-1))
-extern struct ebt_table *ebt_register_table(struct net *net,
-					    const struct ebt_table *table,
-					    const struct nf_hook_ops *);
+extern int ebt_register_table(struct net *net,
+			      const struct ebt_table *table,
+			      const struct nf_hook_ops *ops,
+			      struct ebt_table **res);
 extern void ebt_unregister_table(struct net *net, struct ebt_table *table,
 				 const struct nf_hook_ops *);
 extern unsigned int ebt_do_table(struct sk_buff *skb,
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 2585b100ebbb..276b60262981 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -65,8 +65,8 @@ static int ebt_broute(struct sk_buff *skb)
 
 static int __net_init broute_net_init(struct net *net)
 {
-	net->xt.broute_table = ebt_register_table(net, &broute_table, NULL);
-	return PTR_ERR_OR_ZERO(net->xt.broute_table);
+	return ebt_register_table(net, &broute_table, NULL,
+				  &net->xt.broute_table);
 }
 
 static void __net_exit broute_net_exit(struct net *net)
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 45a00dbdbcad..c41da5fac84f 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_filter[] = {
 
 static int __net_init frame_filter_net_init(struct net *net)
 {
-	net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter);
-	return PTR_ERR_OR_ZERO(net->xt.frame_filter);
+	return ebt_register_table(net, &frame_filter, ebt_ops_filter,
+				  &net->xt.frame_filter);
 }
 
 static void __net_exit frame_filter_net_exit(struct net *net)
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 57cd5bb154e7..08df7406ecb3 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_nat[] = {
 
 static int __net_init frame_nat_net_init(struct net *net)
 {
-	net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat);
-	return PTR_ERR_OR_ZERO(net->xt.frame_nat);
+	return ebt_register_table(net, &frame_nat, ebt_ops_nat,
+				  &net->xt.frame_nat);
 }
 
 static void __net_exit frame_nat_net_exit(struct net *net)
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 83951f978445..3b3dcf719e07 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1169,9 +1169,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
 	kfree(table);
 }
 
-struct ebt_table *
-ebt_register_table(struct net *net, const struct ebt_table *input_table,
-		   const struct nf_hook_ops *ops)
+int ebt_register_table(struct net *net, const struct ebt_table *input_table,
+		       const struct nf_hook_ops *ops, struct ebt_table **res)
 {
 	struct ebt_table_info *newinfo;
 	struct ebt_table *t, *table;
@@ -1183,7 +1182,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table,
 	    repl->entries == NULL || repl->entries_size == 0 ||
 	    repl->counters != NULL || input_table->private != NULL) {
 		BUGPRINT("Bad table data for ebt_register_table!!!\n");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	/* Don't add one table to multiple lists. */
@@ -1252,16 +1251,18 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table,
 	list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]);
 	mutex_unlock(&ebt_mutex);
 
+	WRITE_ONCE(*res, table);
+
 	if (!ops)
-		return table;
+		return 0;
 
 	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
 	if (ret) {
 		__ebt_unregister_table(net, table);
-		return ERR_PTR(ret);
+		*res = NULL;
 	}
 
-	return table;
+	return ret;
 free_unlock:
 	mutex_unlock(&ebt_mutex);
 free_chainstack:
@@ -1276,7 +1277,7 @@ free_newinfo:
 free_table:
 	kfree(table);
 out:
-	return ERR_PTR(ret);
+	return ret;
 }
 
 void ebt_unregister_table(struct net *net, struct ebt_table *table,
-- 
cgit v1.2.3


From 7487449c86c65202b3b725c4524cb48dd65e4e6f Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 28 Sep 2017 15:51:36 +0200
Subject: IPv4: early demux can return an error code

Currently no error is emitted, but this infrastructure will
used by the next patch to allow source address validation
for mcast sockets.
Since early demux can do a route lookup and an ipv4 route
lookup can return an error code this is consistent with the
current ipv4 route infrastructure.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/protocol.h |  4 ++--
 include/net/tcp.h      |  2 +-
 include/net/udp.h      |  2 +-
 net/ipv4/ip_input.c    | 25 +++++++++++++++----------
 net/ipv4/tcp_ipv4.c    |  9 +++++----
 net/ipv4/udp.c         | 11 ++++++-----
 6 files changed, 30 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/protocol.h b/include/net/protocol.h
index 65ba335b0e7e..4fc75f7ae23b 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -39,8 +39,8 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-	void			(*early_demux)(struct sk_buff *skb);
-	void                    (*early_demux_handler)(struct sk_buff *skb);
+	int			(*early_demux)(struct sk_buff *skb);
+	int			(*early_demux_handler)(struct sk_buff *skb);
 	int			(*handler)(struct sk_buff *skb);
 	void			(*err_handler)(struct sk_buff *skb, u32 info);
 	unsigned int		no_policy:1,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3bc910a9bfc6..89974c5286d8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -345,7 +345,7 @@ void tcp_v4_err(struct sk_buff *skb, u32);
 
 void tcp_shutdown(struct sock *sk, int how);
 
-void tcp_v4_early_demux(struct sk_buff *skb);
+int tcp_v4_early_demux(struct sk_buff *skb);
 int tcp_v4_rcv(struct sk_buff *skb);
 
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
diff --git a/include/net/udp.h b/include/net/udp.h
index 12dfbfe2e2d7..6c759c8594e2 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -259,7 +259,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
 	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
 }
 
-void udp_v4_early_demux(struct sk_buff *skb);
+int udp_v4_early_demux(struct sk_buff *skb);
 bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
 int udp_get_port(struct sock *sk, unsigned short snum,
 		 int (*saddr_cmp)(const struct sock *,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fa2dc8f692c6..57fc13c6ab2b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -311,9 +311,10 @@ drop:
 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	struct rtable *rt;
+	int (*edemux)(struct sk_buff *skb);
 	struct net_device *dev = skb->dev;
-	void (*edemux)(struct sk_buff *skb);
+	struct rtable *rt;
+	int err;
 
 	/* if ingress device is enslaved to an L3 master device pass the
 	 * skb to its handler for processing
@@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 
 		ipprot = rcu_dereference(inet_protos[protocol]);
 		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
-			edemux(skb);
+			err = edemux(skb);
+			if (unlikely(err))
+				goto drop_error;
 			/* must reload iph, skb->head might have changed */
 			iph = ip_hdr(skb);
 		}
@@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (!skb_valid_dst(skb)) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, dev);
-		if (unlikely(err)) {
-			if (err == -EXDEV)
-				__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
-			goto drop;
-		}
+		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					   iph->tos, dev);
+		if (unlikely(err))
+			goto drop_error;
 	}
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 drop:
 	kfree_skb(skb);
 	return NET_RX_DROP;
+
+drop_error:
+	if (err == -EXDEV)
+		__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
+	goto drop;
 }
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d9416b5162bc..85164d4d3e53 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1503,23 +1503,23 @@ csum_err:
 }
 EXPORT_SYMBOL(tcp_v4_do_rcv);
 
-void tcp_v4_early_demux(struct sk_buff *skb)
+int tcp_v4_early_demux(struct sk_buff *skb)
 {
 	const struct iphdr *iph;
 	const struct tcphdr *th;
 	struct sock *sk;
 
 	if (skb->pkt_type != PACKET_HOST)
-		return;
+		return 0;
 
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
-		return;
+		return 0;
 
 	iph = ip_hdr(skb);
 	th = tcp_hdr(skb);
 
 	if (th->doff < sizeof(struct tcphdr) / 4)
-		return;
+		return 0;
 
 	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
 				       iph->saddr, th->source,
@@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb)
 				skb_dst_set_noref(skb, dst);
 		}
 	}
+	return 0;
 }
 
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ef29df8648e4..9b30f821fe96 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2221,7 +2221,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 	return NULL;
 }
 
-void udp_v4_early_demux(struct sk_buff *skb)
+int udp_v4_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 	const struct iphdr *iph;
@@ -2234,7 +2234,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
 
 	/* validate the packet */
 	if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
-		return;
+		return 0;
 
 	iph = ip_hdr(skb);
 	uh = udp_hdr(skb);
@@ -2244,14 +2244,14 @@ void udp_v4_early_demux(struct sk_buff *skb)
 		struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 
 		if (!in_dev)
-			return;
+			return 0;
 
 		/* we are supposed to accept bcast packets */
 		if (skb->pkt_type == PACKET_MULTICAST) {
 			ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
 					       iph->protocol);
 			if (!ours)
-				return;
+				return 0;
 		}
 
 		sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
@@ -2263,7 +2263,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
 	}
 
 	if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
-		return;
+		return 0;
 
 	skb->sk = sk;
 	skb->destructor = sock_efree;
@@ -2278,6 +2278,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
 		 */
 		skb_dst_set_noref(skb, dst);
 	}
+	return 0;
 }
 
 int udp_rcv(struct sk_buff *skb)
-- 
cgit v1.2.3


From bc044e8db7962e727a75b591b9851ff2ac5cf846 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 28 Sep 2017 15:51:37 +0200
Subject: udp: perform source validation for mcast early demux

The UDP early demux can leverate the rx dst cache even for
multicast unconnected sockets.

In such scenario the ipv4 source address is validated only on
the first packet in the given flow. After that, when we fetch
the dst entry  from the socket rx cache, we stop enforcing
the rp_filter and we even start accepting any kind of martian
addresses.

Disabling the dst cache for unconnected multicast socket will
cause large performace regression, nearly reducing by half the
max ingress tput.

Instead we factor out a route helper to completely validate an
skb source address for multicast packets and we call it from
the UDP early demux for mcast packets landing on unconnected
sockets, after successful fetching the related cached dst entry.

This still gives a measurable, but limited performance
regression:

		rp_filter = 0		rp_filter = 1
edmux disabled:	1182 Kpps		1127 Kpps
edmux before:	2238 Kpps		2238 Kpps
edmux after:	2037 Kpps		2019 Kpps

The above figures are on top of current net tree.
Applying the net-next commit 6e617de84e87 ("net: avoid a full
fib lookup when rp_filter is disabled.") the delta with
rp_filter == 0 will decrease even more.

Fixes: 421b3885bf6d ("udp: ipv4: Add udp early demux")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h |  4 +++-
 net/ipv4/route.c    | 46 ++++++++++++++++++++++++++--------------------
 net/ipv4/udp.c      | 13 ++++++++++++-
 3 files changed, 41 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 57dfc6850d37..d538e6db1afe 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -175,7 +175,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
 	fl4->fl4_gre_key = gre_key;
 	return ip_route_output_key(net, fl4);
 }
-
+int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			  u8 tos, struct net_device *dev,
+			  struct in_device *in_dev, u32 *itag);
 int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
 			 u8 tos, struct net_device *devin);
 int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 94d4cd2d5ea4..ac6fde5d45f1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 EXPORT_SYMBOL(rt_dst_alloc);
 
 /* called in rcu_read_lock() section */
-static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-				u8 tos, struct net_device *dev, int our)
+int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			  u8 tos, struct net_device *dev,
+			  struct in_device *in_dev, u32 *itag)
 {
-	struct rtable *rth;
-	struct in_device *in_dev = __in_dev_get_rcu(dev);
-	unsigned int flags = RTCF_MULTICAST;
-	u32 itag = 0;
 	int err;
 
 	/* Primary sanity checks. */
-
 	if (!in_dev)
 		return -EINVAL;
 
 	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
 	    skb->protocol != htons(ETH_P_IP))
-		goto e_inval;
+		return -EINVAL;
 
 	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
-		goto e_inval;
+		return -EINVAL;
 
 	if (ipv4_is_zeronet(saddr)) {
 		if (!ipv4_is_local_multicast(daddr))
-			goto e_inval;
+			return -EINVAL;
 	} else {
 		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
-					  in_dev, &itag);
+					  in_dev, itag);
 		if (err < 0)
-			goto e_err;
+			return err;
 	}
+	return 0;
+}
+
+/* called in rcu_read_lock() section */
+static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			     u8 tos, struct net_device *dev, int our)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	unsigned int flags = RTCF_MULTICAST;
+	struct rtable *rth;
+	u32 itag = 0;
+	int err;
+
+	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
+	if (err)
+		return err;
+
 	if (our)
 		flags |= RTCF_LOCAL;
 
 	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
 			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
 	if (!rth)
-		goto e_nobufs;
+		return -ENOBUFS;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	rth->dst.tclassid = itag;
@@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	skb_dst_set(skb, &rth->dst);
 	return 0;
-
-e_nobufs:
-	return -ENOBUFS;
-e_inval:
-	return -EINVAL;
-e_err:
-	return err;
 }
 
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 9b30f821fe96..5676237d2b0f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2224,6 +2224,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
 int udp_v4_early_demux(struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
+	struct in_device *in_dev = NULL;
 	const struct iphdr *iph;
 	const struct udphdr *uh;
 	struct sock *sk = NULL;
@@ -2241,7 +2242,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
 
 	if (skb->pkt_type == PACKET_BROADCAST ||
 	    skb->pkt_type == PACKET_MULTICAST) {
-		struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+		in_dev = __in_dev_get_rcu(skb->dev);
 
 		if (!in_dev)
 			return 0;
@@ -2272,11 +2273,21 @@ int udp_v4_early_demux(struct sk_buff *skb)
 	if (dst)
 		dst = dst_check(dst, 0);
 	if (dst) {
+		u32 itag = 0;
+
 		/* set noref for now.
 		 * any place which wants to hold dst has to call
 		 * dst_hold_safe()
 		 */
 		skb_dst_set_noref(skb, dst);
+
+		/* for unconnected multicast sockets we need to validate
+		 * the source on each packet
+		 */
+		if (!inet_sk(sk)->inet_daddr && in_dev)
+			return ip_mc_validate_source(skb, iph->daddr,
+						     iph->saddr, iph->tos,
+						     skb->dev, in_dev, &itag);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 28a0bc4120d38a394499382ba21d6965a67a3703 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 27 Sep 2017 21:35:12 -0400
Subject: scsi: sd: Implement blacklist option for WRITE SAME w/ UNMAP

SBC-4 states:

  "A MAXIMUM UNMAP LBA COUNT field set to a non-zero value indicates the
   maximum number of LBAs that may be unmapped by an UNMAP command"

  "A MAXIMUM WRITE SAME LENGTH field set to a non-zero value indicates
   the maximum number of contiguous logical blocks that the device server
   allows to be unmapped or written in a single WRITE SAME command."

Despite the spec being clear on the topic, some devices incorrectly
expect WRITE SAME commands with the UNMAP bit set to be limited to the
value reported in MAXIMUM UNMAP LBA COUNT in the Block Limits VPD.

Implement a blacklist option that can be used to accommodate devices
with this behavior.

Cc: <stable@vger.kernel.org>
Reported-by: Bill Kuzeja <William.Kuzeja@stratus.com>
Reported-by: Ewan D. Milne <emilne@redhat.com>
Reviewed-by: Ewan D. Milne <emilne@redhat.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_scan.c    |  3 +++
 drivers/scsi/sd.c           | 16 ++++++++++++----
 include/scsi/scsi_device.h  |  1 +
 include/scsi/scsi_devinfo.h |  1 +
 4 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index e7818afeda2b..15590a063ad9 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -956,6 +956,9 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
 	if (*bflags & BLIST_NO_DIF)
 		sdev->no_dif = 1;
 
+	if (*bflags & BLIST_UNMAP_LIMIT_WS)
+		sdev->unmap_limit_for_ws = 1;
+
 	sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
 
 	if (*bflags & BLIST_TRY_VPD_PAGES)
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index fb9f8b5f4673..3d26a729825c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -715,13 +715,21 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 		break;
 
 	case SD_LBP_WS16:
-		max_blocks = min_not_zero(sdkp->max_ws_blocks,
-					  (u32)SD_MAX_WS16_BLOCKS);
+		if (sdkp->device->unmap_limit_for_ws)
+			max_blocks = sdkp->max_unmap_blocks;
+		else
+			max_blocks = sdkp->max_ws_blocks;
+
+		max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS);
 		break;
 
 	case SD_LBP_WS10:
-		max_blocks = min_not_zero(sdkp->max_ws_blocks,
-					  (u32)SD_MAX_WS10_BLOCKS);
+		if (sdkp->device->unmap_limit_for_ws)
+			max_blocks = sdkp->max_unmap_blocks;
+		else
+			max_blocks = sdkp->max_ws_blocks;
+
+		max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS);
 		break;
 
 	case SD_LBP_ZERO:
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index 82e93ee94708..67c5a9f223f7 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -192,6 +192,7 @@ struct scsi_device {
 	unsigned no_dif:1;	/* T10 PI (DIF) should be disabled */
 	unsigned broken_fua:1;		/* Don't set FUA bit */
 	unsigned lun_in_cdb:1;		/* Store LUN bits in CDB[1] */
+	unsigned unmap_limit_for_ws:1;	/* Use the UNMAP limit for WRITE SAME */
 
 	atomic_t disk_events_disable_depth; /* disable depth for disk events */
 
diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h
index 9592570e092a..36b03013d629 100644
--- a/include/scsi/scsi_devinfo.h
+++ b/include/scsi/scsi_devinfo.h
@@ -29,5 +29,6 @@
 #define BLIST_TRY_VPD_PAGES	0x10000000 /* Attempt to read VPD pages */
 #define BLIST_NO_RSOC		0x20000000 /* don't try to issue RSOC */
 #define BLIST_MAX_1024		0x40000000 /* maximum 1024 sector cdb length */
+#define BLIST_UNMAP_LIMIT_WS	0x80000000 /* Use UNMAP limit for WRITE SAME */
 
 #endif
-- 
cgit v1.2.3


From 1c048a250aae1aaab0ba9dbec908f0c6cdb8614f Mon Sep 17 00:00:00 2001
From: Khazhismel Kumykov <khazhy@google.com>
Date: Thu, 13 Jul 2017 09:11:22 -0700
Subject: scsi: libiscsi: Remove iscsi_destroy_session

iscsi_session_teardown was the only user of this function. Function
currently is just short for iscsi_remove_session + iscsi_free_session.

Signed-off-by: Khazhismel Kumykov <khazhy@google.com>
Acked-by: Chris Leech <cleech@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 16 ----------------
 include/scsi/scsi_transport_iscsi.h |  1 -
 2 files changed, 17 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 0190aeff5f7f..7404d26895f5 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2210,22 +2210,6 @@ void iscsi_free_session(struct iscsi_cls_session *session)
 }
 EXPORT_SYMBOL_GPL(iscsi_free_session);
 
-/**
- * iscsi_destroy_session - destroy iscsi session
- * @session: iscsi_session
- *
- * Can be called by a LLD or iscsi_transport. There must not be
- * any running connections.
- */
-int iscsi_destroy_session(struct iscsi_cls_session *session)
-{
-	iscsi_remove_session(session);
-	ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n");
-	iscsi_free_session(session);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(iscsi_destroy_session);
-
 /**
  * iscsi_create_conn - create iscsi class connection
  * @session: iscsi cls session
diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h
index 6183d20a01fb..b266d2a3bcb1 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -434,7 +434,6 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost,
 						unsigned int target_id);
 extern void iscsi_remove_session(struct iscsi_cls_session *session);
 extern void iscsi_free_session(struct iscsi_cls_session *session);
-extern int iscsi_destroy_session(struct iscsi_cls_session *session);
 extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess,
 						int dd_size, uint32_t cid);
 extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn);
-- 
cgit v1.2.3


From 90caccdd8cc0215705f18b92771b449b01e2474a Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 3 Oct 2017 15:37:20 -0700
Subject: bpf: fix bpf_tail_call() x64 JIT

- bpf prog_array just like all other types of bpf array accepts 32-bit index.
  Clarify that in the comment.
- fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes
- tighten corresponding check in the interpreter to stay consistent

The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag
in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and
though JIT code is wrong it will check bounds correctly.
Hence two fixes tags. All other JITs don't have this problem.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation")
Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper")
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/x86/net/bpf_jit_comp.c | 4 ++--
 include/uapi/linux/bpf.h    | 2 +-
 kernel/bpf/core.c           | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 8c9573660d51..0554e8aef4d5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
 	/* if (index >= array->map.max_entries)
 	 *   goto out;
 	 */
-	EMIT4(0x48, 0x8B, 0x46,                   /* mov rax, qword ptr [rsi + 16] */
+	EMIT2(0x89, 0xD2);                        /* mov edx, edx */
+	EMIT3(0x39, 0x56,                         /* cmp dword ptr [rsi + 16], edx */
 	      offsetof(struct bpf_array, map.max_entries));
-	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
 #define OFFSET1 43 /* number of bytes to jump */
 	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
 	label1 = cnt;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c402f98..f90860d1f897 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -312,7 +312,7 @@ union bpf_attr {
  *     jump into another BPF program
  *     @ctx: context pointer passed to next program
  *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
- *     @index: index inside array that selects specific program to run
+ *     @index: 32-bit index inside array that selects specific program to run
  *     Return: 0 on success or negative error
  *
  * int bpf_clone_redirect(skb, ifindex, flags)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..7b62df86be1d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ select_insn:
 		struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
 		struct bpf_array *array = container_of(map, struct bpf_array, map);
 		struct bpf_prog *prog;
-		u64 index = BPF_R3;
+		u32 index = BPF_R3;
 
 		if (unlikely(index >= array->map.max_entries))
 			goto out;
-- 
cgit v1.2.3


From fa87b91c94a8fd2c8502b6761be2d08a8e9bcf55 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 3 Oct 2017 16:14:24 -0700
Subject: include/linux/mm.h: fix typo in VM_MPX definition

There's a typo in recent change of VM_MPX definition.  We want it to be
VM_HIGH_ARCH_4, not VM_HIGH_ARCH_BIT_4.

This bug does cause visible regressions.  In arch_vma_name the vmflags
are tested against VM_MPX.  With the incorrect value of VM_MPX, a number
of vmas (such as the stack) test positive and end up being marked as
"[mpx]" in /proc/N/maps instead of their correct names.

This confuses tools like rr which expect to be able to find familiar
vmas.

Fixes: df3735c5b40f ("x86,mpx: make mpx depend on x86-64 to free up VMA flag")
Link: http://lkml.kernel.org/r/20170918140253.36856-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Kyle Huey <me@kylehuey.com>
Cc: <stable@vger.kernel.org>	[4.14+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f8c10d336e42..065d99deb847 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -240,7 +240,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #if defined(CONFIG_X86_INTEL_MPX)
 /* MPX specific bounds table or bounds directory */
-# define VM_MPX		VM_HIGH_ARCH_BIT_4
+# define VM_MPX		VM_HIGH_ARCH_4
 #else
 # define VM_MPX		VM_NONE
 #endif
-- 
cgit v1.2.3


From 4d4bbd8526a8fbeb2c090ea360211fceff952383 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:14:50 -0700
Subject: mm, oom_reaper: skip mm structs with mmu notifiers

Andrea has noticed that the oom_reaper doesn't invalidate the range via
mmu notifiers (mmu_notifier_invalidate_range_start/end) and that can
corrupt the memory of the kvm guest for example.

tlb_flush_mmu_tlbonly already invokes mmu notifiers but that is not
sufficient as per Andrea:

 "mmu_notifier_invalidate_range cannot be used in replacement of
  mmu_notifier_invalidate_range_start/end. For KVM
  mmu_notifier_invalidate_range is a noop and rightfully so. A MMU
  notifier implementation has to implement either ->invalidate_range
  method or the invalidate_range_start/end methods, not both. And if you
  implement invalidate_range_start/end like KVM is forced to do, calling
  mmu_notifier_invalidate_range in common code is a noop for KVM.

  For those MMU notifiers that can get away only implementing
  ->invalidate_range, the ->invalidate_range is implicitly called by
  mmu_notifier_invalidate_range_end(). And only those secondary MMUs
  that share the same pagetable with the primary MMU (like AMD iommuv2)
  can get away only implementing ->invalidate_range"

As the callback is allowed to sleep and the implementation is out of
hand of the MM it is safer to simply bail out if there is an mmu
notifier registered.  In order to not fail too early make the
mm_has_notifiers check under the oom_lock and have a little nap before
failing to give the current oom victim some more time to exit.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20170913113427.2291-1-mhocko@kernel.org
Fixes: aac453635549 ("mm, oom: introduce oom reaper")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmu_notifier.h |  5 +++++
 mm/oom_kill.c                | 16 ++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 7b2e31b1745a..6866e8126982 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -400,6 +400,11 @@ extern void mmu_notifier_synchronize(void);
 
 #else /* CONFIG_MMU_NOTIFIER */
 
+static inline int mm_has_notifiers(struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 99736e026712..dee0f75c3013 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -40,6 +40,7 @@
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
 #include <linux/init.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -494,6 +495,21 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		goto unlock_oom;
 	}
 
+	/*
+	 * If the mm has notifiers then we would need to invalidate them around
+	 * unmap_page_range and that is risky because notifiers can sleep and
+	 * what they do is basically undeterministic.  So let's have a short
+	 * sleep to give the oom victim some more time.
+	 * TODO: we really want to get rid of this ugly hack and make sure that
+	 * notifiers cannot block for unbounded amount of time and add
+	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+	 */
+	if (mm_has_notifiers(mm)) {
+		up_read(&mm->mmap_sem);
+		schedule_timeout_idle(HZ);
+		goto unlock_oom;
+	}
+
 	/*
 	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
 	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
-- 
cgit v1.2.3


From a1b2289cef92ef0e9a92afcd2e1ea71d5bcaaf64 Mon Sep 17 00:00:00 2001
From: Sherry Yang <sherryy@android.com>
Date: Tue, 3 Oct 2017 16:15:00 -0700
Subject: android: binder: drop lru lock in isolate callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the global lru lock in isolate callback before calling
zap_page_range which calls cond_resched, and re-acquire the global lru
lock before returning.  Also change return code to LRU_REMOVED_RETRY.

Use mmput_async when fail to acquire mmap sem in an atomic context.

Fix "BUG: sleeping function called from invalid context"
errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled.

Also restore mmput_async, which was initially introduced in commit
ec8d7c14ea14 ("mm, oom_reaper: do not mmput synchronously from the oom
reaper context"), and was removed in commit 212925802454 ("mm: oom: let
oom_reap_task and exit_mmap run concurrently").

Link: http://lkml.kernel.org/r/20170914182231.90908-1-sherryy@android.com
Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder")
Signed-off-by: Sherry Yang <sherryy@android.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reported-by: Kyle Yan <kyan@codeaurora.org>
Acked-by: Arve Hjønnevåg <arve@android.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Martijn Coenen <maco@google.com>
Cc: Todd Kjos <tkjos@google.com>
Cc: Riley Andrews <riandrews@android.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hoeun Ryu <hoeun.ryu@gmail.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/android/binder_alloc.c | 18 ++++++++++++------
 include/linux/sched/mm.h       |  6 ++++++
 kernel/fork.c                  | 18 ++++++++++++++++++
 3 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 8fe165844e47..064f5e31ec55 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 	struct binder_alloc *alloc;
 	uintptr_t page_addr;
 	size_t index;
+	struct vm_area_struct *vma;
 
 	alloc = page->alloc;
 	if (!mutex_trylock(&alloc->mutex))
@@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 
 	index = page - alloc->pages;
 	page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE;
-	if (alloc->vma) {
+	vma = alloc->vma;
+	if (vma) {
 		mm = get_task_mm(alloc->tsk);
 		if (!mm)
 			goto err_get_task_mm_failed;
 		if (!down_write_trylock(&mm->mmap_sem))
 			goto err_down_write_mmap_sem_failed;
+	}
+
+	list_lru_isolate(lru, item);
+	spin_unlock(lock);
 
+	if (vma) {
 		trace_binder_unmap_user_start(alloc, index);
 
-		zap_page_range(alloc->vma,
+		zap_page_range(vma,
 			       page_addr + alloc->user_buffer_offset,
 			       PAGE_SIZE);
 
@@ -950,13 +957,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
 
 	trace_binder_unmap_kernel_end(alloc, index);
 
-	list_lru_isolate(lru, item);
-
+	spin_lock(lock);
 	mutex_unlock(&alloc->mutex);
-	return LRU_REMOVED;
+	return LRU_REMOVED_RETRY;
 
 err_down_write_mmap_sem_failed:
-	mmput(mm);
+	mmput_async(mm);
 err_get_task_mm_failed:
 err_page_already_freed:
 	mutex_unlock(&alloc->mutex);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 3a19c253bdb1..ae53e413fb13 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -84,6 +84,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
+#ifdef CONFIG_MMU
+/* same as above but performs the slow path from the async context. Can
+ * be called from the atomic context as well
+ */
+void mmput_async(struct mm_struct *);
+#endif
 
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..e702cb9ffbd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct,
+					    async_put_work);
+
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+	if (atomic_dec_and_test(&mm->mm_users)) {
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+#endif
+
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
  *
-- 
cgit v1.2.3


From c2315c187fa0d3ab363fdebe22718170b40473e3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 3 Oct 2017 16:15:42 -0700
Subject: exec: load_script: kill the onstack interp[BINPRM_BUF_SIZE] array

Patch series "exec: binfmt_misc: fix use-after-free, kill
iname[BINPRM_BUF_SIZE]".

It looks like this code was always wrong, then commit 948b701a607f
("binfmt_misc: add persistent opened binary handler for containers")
added more problems.

This patch (of 6):

load_script() can simply use i_name instead, it points into bprm->buf[]
and nobody can change this memory until we call prepare_binprm().

The only complication is that we need to also change the signature of
bprm_change_interp() but this change looks good too.

While at it, do whitespace/style cleanups.

NOTE: the real motivation for this change is that people want to
increase BINPRM_BUF_SIZE, we need to change load_misc_binary() too but
this looks more complicated because afaics it is very buggy.

Link: http://lkml.kernel.org/r/20170918163446.GA26793@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Travis Gummels <tgummels@redhat.com>
Cc: Ben Woodard <woodard@redhat.com>
Cc: Jim Foraker <foraker1@llnl.gov>
Cc: <tdhooge@llnl.gov>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_script.c      | 17 +++++++++--------
 fs/exec.c               |  2 +-
 include/linux/binfmts.h |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index afdf4e3cafc2..7cde3f46ad26 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -19,7 +19,6 @@ static int load_script(struct linux_binprm *bprm)
 	const char *i_arg, *i_name;
 	char *cp;
 	struct file *file;
-	char interp[BINPRM_BUF_SIZE];
 	int retval;
 
 	if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!'))
@@ -55,7 +54,7 @@ static int load_script(struct linux_binprm *bprm)
 			break;
 	}
 	for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++);
-	if (*cp == '\0') 
+	if (*cp == '\0')
 		return -ENOEXEC; /* No interpreter name found */
 	i_name = cp;
 	i_arg = NULL;
@@ -65,7 +64,6 @@ static int load_script(struct linux_binprm *bprm)
 		*cp++ = '\0';
 	if (*cp)
 		i_arg = cp;
-	strcpy (interp, i_name);
 	/*
 	 * OK, we've parsed out the interpreter name and
 	 * (optional) argument.
@@ -80,24 +78,27 @@ static int load_script(struct linux_binprm *bprm)
 	if (retval)
 		return retval;
 	retval = copy_strings_kernel(1, &bprm->interp, bprm);
-	if (retval < 0) return retval; 
+	if (retval < 0)
+		return retval;
 	bprm->argc++;
 	if (i_arg) {
 		retval = copy_strings_kernel(1, &i_arg, bprm);
-		if (retval < 0) return retval; 
+		if (retval < 0)
+			return retval;
 		bprm->argc++;
 	}
 	retval = copy_strings_kernel(1, &i_name, bprm);
-	if (retval) return retval; 
+	if (retval)
+		return retval;
 	bprm->argc++;
-	retval = bprm_change_interp(interp, bprm);
+	retval = bprm_change_interp(i_name, bprm);
 	if (retval < 0)
 		return retval;
 
 	/*
 	 * OK, now restart the process with the interpreter's dentry.
 	 */
-	file = open_exec(interp);
+	file = open_exec(i_name);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
diff --git a/fs/exec.c b/fs/exec.c
index ac34d9724684..5470d3c1892a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1410,7 +1410,7 @@ static void free_bprm(struct linux_binprm *bprm)
 	kfree(bprm);
 }
 
-int bprm_change_interp(char *interp, struct linux_binprm *bprm)
+int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
 {
 	/* If a binfmt changed the interp, free it first. */
 	if (bprm->interp != bprm->filename)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index fb44d6180ca0..18d05b5491f3 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -131,7 +131,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm,
 			   int executable_stack);
 extern int transfer_args_to_stack(struct linux_binprm *bprm,
 				  unsigned long *sp_location);
-extern int bprm_change_interp(char *interp, struct linux_binprm *bprm);
+extern int bprm_change_interp(const char *interp, struct linux_binprm *bprm);
 extern int copy_strings_kernel(int argc, const char *const *argv,
 			       struct linux_binprm *bprm);
 extern int prepare_bprm_creds(struct linux_binprm *bprm);
-- 
cgit v1.2.3


From 7240767450d6d8380fb153e2998a1bb4ede7b029 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 3 Oct 2017 16:16:04 -0700
Subject: include/linux/bitfield.h: remove 32bit from FIELD_GET comment block

I do not see anything that restricts this macro to 32 bit width.

Link: http://lkml.kernel.org/r/1505921975-23379-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitfield.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 8b9d6fff002d..f2deb71958b2 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -92,7 +92,7 @@
 /**
  * FIELD_GET() - extract a bitfield element
  * @_mask: shifted mask defining the field's length and position
- * @_reg:  32bit value of entire bitfield
+ * @_reg:  value of entire bitfield
  *
  * FIELD_GET() extracts the field specified by @_mask from the
  * bitfield passed in as @_reg by masking and shifting it down.
-- 
cgit v1.2.3


From 1dd2bfc86818ddbc95f98e312e7704350223fd7d Mon Sep 17 00:00:00 2001
From: YASUAKI ISHIMATSU <yasu.isimatu@gmail.com>
Date: Tue, 3 Oct 2017 16:16:29 -0700
Subject: mm/memory_hotplug: change pfn_to_section_nr/section_nr_to_pfn macro
 to inline function

pfn_to_section_nr() and section_nr_to_pfn() are defined as macro.
pfn_to_section_nr() has no issue even if it is defined as macro.  But
section_nr_to_pfn() has overflow issue if sec is defined as int.

section_nr_to_pfn() just shifts sec by PFN_SECTION_SHIFT.  If sec is
defined as unsigned long, section_nr_to_pfn() returns pfn as 64 bit value.
But if sec is defined as int, section_nr_to_pfn() returns pfn as 32 bit
value.

__remove_section() calculates start_pfn using section_nr_to_pfn() and
scn_nr defined as int.  So if hot-removed memory address is over 16TB,
overflow issue occurs and section_nr_to_pfn() does not calculate correct
pfn.

To make callers use proper arg, the patch changes the macros to inline
functions.

Fixes: 815121d2b5cd ("memory_hotplug: clear zone when removing the memory")
Link: http://lkml.kernel.org/r/e643a387-e573-6bbf-d418-c60c8ee3d15e@gmail.com
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 10 ++++++++--
 mm/memory_hotplug.c    |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 356a814e7c8e..c8f89417740b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1094,8 +1094,14 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
 #error Allocator MAX_ORDER exceeds SECTION_SIZE
 #endif
 
-#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
-#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
+static inline unsigned long pfn_to_section_nr(unsigned long pfn)
+{
+	return pfn >> PFN_SECTION_SHIFT;
+}
+static inline unsigned long section_nr_to_pfn(unsigned long sec)
+{
+	return sec << PFN_SECTION_SHIFT;
+}
 
 #define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
 #define SECTION_ALIGN_DOWN(pfn)	((pfn) & PAGE_SECTION_MASK)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 23d5bd968950..efd1ad37bb57 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -551,7 +551,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
 		return ret;
 
 	scn_nr = __section_nr(ms);
-	start_pfn = section_nr_to_pfn(scn_nr);
+	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
 	__remove_zone(zone, start_pfn);
 
 	sparse_remove_one_section(zone, ms, map_offset);
-- 
cgit v1.2.3


From 32e57c29e3c038ac802b7cc214a8795a4234055f Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Tue, 3 Oct 2017 16:16:54 -0700
Subject: include/linux/fs.h: fix comment about struct address_space

Before commit 9c5d760b8d22 ("mm: split gfp_mask and mapping flags into
separate fields") the private_* fields of struct adrress_space were
grouped together and using "ditto" in comments describing the last
fields was correct.

With introduction of gpf_mask between private_lock and private_list
"ditto" references the wrong description.

Fix it by using the elaborate description.

Link: http://lkml.kernel.org/r/1507009987-8746-1-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 339e73742e73..13dab191a23e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -403,7 +403,7 @@ struct address_space {
 	unsigned long		flags;		/* error bits */
 	spinlock_t		private_lock;	/* for use by the address_space */
 	gfp_t			gfp_mask;	/* implicit gfp mask for allocations */
-	struct list_head	private_list;	/* ditto */
+	struct list_head	private_list;	/* for use by the address_space */
 	void			*private_data;	/* ditto */
 	errseq_t		wb_err;
 } __attribute__((aligned(sizeof(long)))) __randomize_layout;
-- 
cgit v1.2.3


From de3ee99b097dd51938276e3af388cd4ad0f2750a Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 20 Sep 2017 10:56:14 +0200
Subject: mmc: Delete bounce buffer handling

In may, Steven sent a patch deleting the bounce buffer handling
and the CONFIG_MMC_BLOCK_BOUNCE option.

I chose the less invasive path of making it a runtime config
option, and we merged that successfully for kernel v4.12.

The code is however just standing in the way and taking up
space for seemingly no gain on any systems in wide use today.

Pierre says the code was there to improve speed on TI SDHCI
controllers on certain HP laptops and possibly some Ricoh
controllers as well. Early SDHCI controllers lacked the
scatter-gather feature, which made software bounce buffers
a significant speed boost.

We are clearly talking about the list of SDHCI PCI-based
MMC/SD card readers found in the pci_ids[] list in
drivers/mmc/host/sdhci-pci-core.c.

The TI SDHCI derivative is not supported by the upstream
kernel. This leaves the Ricoh.

What we can however notice is that the x86 defconfigs in the
kernel did not enable CONFIG_MMC_BLOCK_BOUNCE option, which
means that any such laptop would have to have a custom
configured kernel to actually take advantage of this
bounce buffer speed-up. It simply seems like there was
a speed optimization for the Ricoh controllers that noone
was using. (I have not checked the distro defconfigs but
I am pretty sure the situation is the same there.)

Bounce buffers increased performance on the OMAP HSMMC
at one point, and was part of the original submission in
commit a45c6cb81647 ("[ARM] 5369/1: omap mmc: Add new
   omap hsmmc controller for 2430 and 34xx, v3")

This optimization was removed in
commit 0ccd76d4c236 ("omap_hsmmc: Implement scatter-gather
   emulation")
which found that scatter-gather emulation provided even
better performance.

The same was introduced for SDHCI in
commit 2134a922c6e7 ("sdhci: scatter-gather (ADMA) support")

I am pretty positively convinced that software
scatter-gather emulation will do for any host controller what
the bounce buffers were doing. Essentially, the bounce buffer
was a reimplementation of software scatter-gather-emulation in
the MMC subsystem, and it should be done away with.

Cc: Pierre Ossman <pierre@ossman.eu>
Cc: Juha Yrjola <juha.yrjola@solidboot.com>
Cc: Steven J. Hill <Steven.Hill@cavium.com>
Cc: Shawn Lin <shawn.lin@rock-chips.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Suggested-by: Steven J. Hill <Steven.Hill@cavium.com>
Suggested-by: Shawn Lin <shawn.lin@rock-chips.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/block.c  |   3 --
 drivers/mmc/core/queue.c  | 125 ++++------------------------------------------
 drivers/mmc/core/queue.h  |   6 ---
 drivers/mmc/host/cavium.c |   2 +-
 drivers/mmc/host/pxamci.c |   6 +--
 include/linux/mmc/host.h  |   2 +-
 6 files changed, 12 insertions(+), 132 deletions(-)

(limited to 'include')

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 29fc1e662891..2ad7b5c69156 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -1634,8 +1634,6 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq,
 	}
 
 	mqrq->areq.mrq = &brq->mrq;
-
-	mmc_queue_bounce_pre(mqrq);
 }
 
 static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,
@@ -1829,7 +1827,6 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req)
 		brq = &mq_rq->brq;
 		old_req = mmc_queue_req_to_req(mq_rq);
 		type = rq_data_dir(old_req) == READ ? MMC_BLK_READ : MMC_BLK_WRITE;
-		mmc_queue_bounce_post(mq_rq);
 
 		switch (status) {
 		case MMC_BLK_SUCCESS:
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 74c663b1c0a7..0a4e77a5ba33 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -23,8 +23,6 @@
 #include "core.h"
 #include "card.h"
 
-#define MMC_QUEUE_BOUNCESZ	65536
-
 /*
  * Prepare a MMC request. This just filters out odd stuff.
  */
@@ -150,26 +148,6 @@ static void mmc_queue_setup_discard(struct request_queue *q,
 		queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q);
 }
 
-static unsigned int mmc_queue_calc_bouncesz(struct mmc_host *host)
-{
-	unsigned int bouncesz = MMC_QUEUE_BOUNCESZ;
-
-	if (host->max_segs != 1 || (host->caps & MMC_CAP_NO_BOUNCE_BUFF))
-		return 0;
-
-	if (bouncesz > host->max_req_size)
-		bouncesz = host->max_req_size;
-	if (bouncesz > host->max_seg_size)
-		bouncesz = host->max_seg_size;
-	if (bouncesz > host->max_blk_count * 512)
-		bouncesz = host->max_blk_count * 512;
-
-	if (bouncesz <= 512)
-		return 0;
-
-	return bouncesz;
-}
-
 /**
  * mmc_init_request() - initialize the MMC-specific per-request data
  * @q: the request queue
@@ -184,26 +162,9 @@ static int mmc_init_request(struct request_queue *q, struct request *req,
 	struct mmc_card *card = mq->card;
 	struct mmc_host *host = card->host;
 
-	if (card->bouncesz) {
-		mq_rq->bounce_buf = kmalloc(card->bouncesz, gfp);
-		if (!mq_rq->bounce_buf)
-			return -ENOMEM;
-		if (card->bouncesz > 512) {
-			mq_rq->sg = mmc_alloc_sg(1, gfp);
-			if (!mq_rq->sg)
-				return -ENOMEM;
-			mq_rq->bounce_sg = mmc_alloc_sg(card->bouncesz / 512,
-							gfp);
-			if (!mq_rq->bounce_sg)
-				return -ENOMEM;
-		}
-	} else {
-		mq_rq->bounce_buf = NULL;
-		mq_rq->bounce_sg = NULL;
-		mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp);
-		if (!mq_rq->sg)
-			return -ENOMEM;
-	}
+	mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp);
+	if (!mq_rq->sg)
+		return -ENOMEM;
 
 	return 0;
 }
@@ -212,13 +173,6 @@ static void mmc_exit_request(struct request_queue *q, struct request *req)
 {
 	struct mmc_queue_req *mq_rq = req_to_mmc_queue_req(req);
 
-	/* It is OK to kfree(NULL) so this will be smooth */
-	kfree(mq_rq->bounce_sg);
-	mq_rq->bounce_sg = NULL;
-
-	kfree(mq_rq->bounce_buf);
-	mq_rq->bounce_buf = NULL;
-
 	kfree(mq_rq->sg);
 	mq_rq->sg = NULL;
 }
@@ -242,12 +196,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 	if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
 		limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
 
-	/*
-	 * mmc_init_request() depends on card->bouncesz so it must be calculated
-	 * before blk_init_allocated_queue() starts allocating requests.
-	 */
-	card->bouncesz = mmc_queue_calc_bouncesz(host);
-
 	mq->card = card;
 	mq->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!mq->queue)
@@ -271,17 +219,11 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 	if (mmc_can_erase(card))
 		mmc_queue_setup_discard(mq->queue, card);
 
-	if (card->bouncesz) {
-		blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512);
-		blk_queue_max_segments(mq->queue, card->bouncesz / 512);
-		blk_queue_max_segment_size(mq->queue, card->bouncesz);
-	} else {
-		blk_queue_bounce_limit(mq->queue, limit);
-		blk_queue_max_hw_sectors(mq->queue,
-			min(host->max_blk_count, host->max_req_size / 512));
-		blk_queue_max_segments(mq->queue, host->max_segs);
-		blk_queue_max_segment_size(mq->queue, host->max_seg_size);
-	}
+	blk_queue_bounce_limit(mq->queue, limit);
+	blk_queue_max_hw_sectors(mq->queue,
+		min(host->max_blk_count, host->max_req_size / 512));
+	blk_queue_max_segments(mq->queue, host->max_segs);
+	blk_queue_max_segment_size(mq->queue, host->max_seg_size);
 
 	sema_init(&mq->thread_sem, 1);
 
@@ -370,56 +312,7 @@ void mmc_queue_resume(struct mmc_queue *mq)
  */
 unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq)
 {
-	unsigned int sg_len;
-	size_t buflen;
-	struct scatterlist *sg;
 	struct request *req = mmc_queue_req_to_req(mqrq);
-	int i;
-
-	if (!mqrq->bounce_buf)
-		return blk_rq_map_sg(mq->queue, req, mqrq->sg);
-
-	sg_len = blk_rq_map_sg(mq->queue, req, mqrq->bounce_sg);
-
-	mqrq->bounce_sg_len = sg_len;
-
-	buflen = 0;
-	for_each_sg(mqrq->bounce_sg, sg, sg_len, i)
-		buflen += sg->length;
-
-	sg_init_one(mqrq->sg, mqrq->bounce_buf, buflen);
-
-	return 1;
-}
-
-/*
- * If writing, bounce the data to the buffer before the request
- * is sent to the host driver
- */
-void mmc_queue_bounce_pre(struct mmc_queue_req *mqrq)
-{
-	if (!mqrq->bounce_buf)
-		return;
-
-	if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != WRITE)
-		return;
-
-	sg_copy_to_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len,
-		mqrq->bounce_buf, mqrq->sg[0].length);
-}
-
-/*
- * If reading, bounce the data from the buffer after the request
- * has been handled by the host driver
- */
-void mmc_queue_bounce_post(struct mmc_queue_req *mqrq)
-{
-	if (!mqrq->bounce_buf)
-		return;
-
-	if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != READ)
-		return;
 
-	sg_copy_from_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len,
-		mqrq->bounce_buf, mqrq->sg[0].length);
+	return blk_rq_map_sg(mq->queue, req, mqrq->sg);
 }
diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
index 04fc89360a7a..f18d3f656baa 100644
--- a/drivers/mmc/core/queue.h
+++ b/drivers/mmc/core/queue.h
@@ -49,9 +49,6 @@ enum mmc_drv_op {
 struct mmc_queue_req {
 	struct mmc_blk_request	brq;
 	struct scatterlist	*sg;
-	char			*bounce_buf;
-	struct scatterlist	*bounce_sg;
-	unsigned int		bounce_sg_len;
 	struct mmc_async_req	areq;
 	enum mmc_drv_op		drv_op;
 	int			drv_op_result;
@@ -81,11 +78,8 @@ extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
 extern void mmc_cleanup_queue(struct mmc_queue *);
 extern void mmc_queue_suspend(struct mmc_queue *);
 extern void mmc_queue_resume(struct mmc_queue *);
-
 extern unsigned int mmc_queue_map_sg(struct mmc_queue *,
 				     struct mmc_queue_req *);
-extern void mmc_queue_bounce_pre(struct mmc_queue_req *);
-extern void mmc_queue_bounce_post(struct mmc_queue_req *);
 
 extern int mmc_access_rpmb(struct mmc_queue *);
 
diff --git a/drivers/mmc/host/cavium.c b/drivers/mmc/host/cavium.c
index 27fb625cbcf3..fbd29f00fca0 100644
--- a/drivers/mmc/host/cavium.c
+++ b/drivers/mmc/host/cavium.c
@@ -1038,7 +1038,7 @@ int cvm_mmc_of_slot_probe(struct device *dev, struct cvm_mmc_host *host)
 	 */
 	mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED |
 		     MMC_CAP_ERASE | MMC_CAP_CMD23 | MMC_CAP_POWER_OFF_CARD |
-		     MMC_CAP_3_3V_DDR | MMC_CAP_NO_BOUNCE_BUFF;
+		     MMC_CAP_3_3V_DDR;
 
 	if (host->use_sg)
 		mmc->max_segs = 16;
diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c
index 59ab194cb009..c763b404510f 100644
--- a/drivers/mmc/host/pxamci.c
+++ b/drivers/mmc/host/pxamci.c
@@ -702,11 +702,7 @@ static int pxamci_probe(struct platform_device *pdev)
 
 	pxamci_init_ocr(host);
 
-	/*
-	 * This architecture used to disable bounce buffers through its
-	 * defconfig, now it is done at runtime as a host property.
-	 */
-	mmc->caps = MMC_CAP_NO_BOUNCE_BUFF;
+	mmc->caps = 0;
 	host->cmdat = 0;
 	if (!cpu_is_pxa25x()) {
 		mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_SDIO_IRQ;
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index f3f2d07feb2a..9a43763a68ad 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -316,7 +316,7 @@ struct mmc_host {
 #define MMC_CAP_UHS_SDR50	(1 << 18)	/* Host supports UHS SDR50 mode */
 #define MMC_CAP_UHS_SDR104	(1 << 19)	/* Host supports UHS SDR104 mode */
 #define MMC_CAP_UHS_DDR50	(1 << 20)	/* Host supports UHS DDR50 mode */
-#define MMC_CAP_NO_BOUNCE_BUFF	(1 << 21)	/* Disable bounce buffers on host */
+/* (1 << 21) is free for reuse */
 #define MMC_CAP_DRIVER_TYPE_A	(1 << 23)	/* Host supports Driver Type A */
 #define MMC_CAP_DRIVER_TYPE_C	(1 << 24)	/* Host supports Driver Type C */
 #define MMC_CAP_DRIVER_TYPE_D	(1 << 25)	/* Host supports Driver Type D */
-- 
cgit v1.2.3


From 6b9dc4806b28214a4a260517e59439e0ac12a15e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 2 Oct 2017 12:34:50 +0200
Subject: watchdog/core, powerpc: Replace watchdog_nmi_reconfigure()

The recent cleanup of the watchdog code split watchdog_nmi_reconfigure()
into two stages. One to stop the NMI and one to restart it after
reconfiguration. That was done by adding a boolean 'run' argument to the
code, which is functionally correct but not necessarily a piece of art.

Replace it by two explicit functions: watchdog_nmi_stop() and
watchdog_nmi_start().

Fixes: 6592ad2fcc8f ("watchdog/core, powerpc: Make watchdog_nmi_reconfigure() two stage")
Requested-by: Linus 'Nursing his pet-peeve' Torvalds <torvalds@linuxfoundation.org>
Signed-off-by: Thomas 'Mopping up garbage' Gleixner <tglx@linutronix.de>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1710021957480.2114@nanos
---
 arch/powerpc/kernel/watchdog.c | 23 ++++++++++++++---------
 include/linux/nmi.h            |  3 ++-
 kernel/watchdog.c              | 33 ++++++++++++++++++---------------
 3 files changed, 34 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index dfb067764480..2673ec8bec00 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -355,19 +355,24 @@ static void watchdog_calc_timeouts(void)
 	wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
 }
 
-void watchdog_nmi_reconfigure(bool run)
+void watchdog_nmi_stop(void)
 {
 	int cpu;
 
 	cpus_read_lock();
-	if (!run) {
-		for_each_cpu(cpu, &wd_cpus_enabled)
-			stop_wd_on_cpu(cpu);
-	} else {
-		watchdog_calc_timeouts();
-		for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
-			start_wd_on_cpu(cpu);
-	}
+	for_each_cpu(cpu, &wd_cpus_enabled)
+		stop_wd_on_cpu(cpu);
+	cpus_read_unlock();
+}
+
+void watchdog_nmi_start(void)
+{
+	int cpu;
+
+	cpus_read_lock();
+	watchdog_calc_timeouts();
+	for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
+		start_wd_on_cpu(cpu);
 	cpus_read_unlock();
 }
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 89ba8b23c6fe..0c9ed49fb21a 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -109,7 +109,8 @@ static inline int hardlockup_detector_perf_init(void) { return 0; }
 # endif
 #endif
 
-void watchdog_nmi_reconfigure(bool run);
+void watchdog_nmi_stop(void);
+void watchdog_nmi_start(void);
 
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f6ef163b72cd..6ad6226535d0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -123,24 +123,27 @@ int __weak __init watchdog_nmi_probe(void)
 }
 
 /**
- * watchdog_nmi_reconfigure - Optional function to reconfigure NMI watchdogs
- * @run:	If false stop the watchdogs on all enabled CPUs
- *		If true start the watchdogs on all enabled CPUs
+ * watchdog_nmi_stop - Stop the watchdog for reconfiguration
  *
- * The core call order is:
- * watchdog_nmi_reconfigure(false);
+ * The reconfiguration steps are:
+ * watchdog_nmi_stop();
  * update_variables();
- * watchdog_nmi_reconfigure(true);
+ * watchdog_nmi_start();
+ */
+void __weak watchdog_nmi_stop(void) { }
+
+/**
+ * watchdog_nmi_start - Start the watchdog after reconfiguration
  *
- * The second call which starts the watchdogs again guarantees that the
- * following variables are stable across the call.
+ * Counterpart to watchdog_nmi_stop().
+ *
+ * The following variables have been updated in update_variables() and
+ * contain the currently valid configuration:
  * - watchdog_enabled
  * - watchdog_thresh
  * - watchdog_cpumask
- *
- * After the call the variables can be changed again.
  */
-void __weak watchdog_nmi_reconfigure(bool run) { }
+void __weak watchdog_nmi_start(void) { }
 
 /**
  * lockup_detector_update_enable - Update the sysctl enable bit
@@ -551,13 +554,13 @@ static void softlockup_unpark_threads(void)
 
 static void softlockup_reconfigure_threads(void)
 {
-	watchdog_nmi_reconfigure(false);
+	watchdog_nmi_stop();
 	softlockup_park_all_threads();
 	set_sample_period();
 	lockup_detector_update_enable();
 	if (watchdog_enabled && watchdog_thresh)
 		softlockup_unpark_threads();
-	watchdog_nmi_reconfigure(true);
+	watchdog_nmi_start();
 }
 
 /*
@@ -602,9 +605,9 @@ static inline void watchdog_disable_all_cpus(void) { }
 static inline void softlockup_init_threads(void) { }
 static void softlockup_reconfigure_threads(void)
 {
-	watchdog_nmi_reconfigure(false);
+	watchdog_nmi_stop();
 	lockup_detector_update_enable();
-	watchdog_nmi_reconfigure(true);
+	watchdog_nmi_start();
 }
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
-- 
cgit v1.2.3


From 34ddaa3e5c0096fef52485186c7eb6cf56ddc686 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 3 Oct 2017 16:39:02 +0200
Subject: powerpc/watchdog: Make use of watchdog_nmi_probe()

The rework of the core hotplug code triggers the WARN_ON in start_wd_cpu()
on powerpc because it is called multiple times for the boot CPU.

The first call is via:

  start_wd_on_cpu+0x80/0x2f0
  watchdog_nmi_reconfigure+0x124/0x170
  softlockup_reconfigure_threads+0x110/0x130
  lockup_detector_init+0xbc/0xe0
  kernel_init_freeable+0x18c/0x37c
  kernel_init+0x2c/0x160
  ret_from_kernel_thread+0x5c/0xbc

And then again via the CPU hotplug registration:

  start_wd_on_cpu+0x80/0x2f0
  cpuhp_invoke_callback+0x194/0x620
  cpuhp_thread_fun+0x7c/0x1b0
  smpboot_thread_fn+0x290/0x2a0
  kthread+0x168/0x1b0
  ret_from_kernel_thread+0x5c/0xbc

This can be avoided by setting up the cpu hotplug state with nocalls and
move the initialization to the watchdog_nmi_probe() function. That
initializes the hotplug callbacks without invoking the callback and the
following core initialization function then configures the watchdog for the
online CPUs (in this case CPU0) via softlockup_reconfigure_threads().

Reported-and-tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/kernel/watchdog.c | 17 ++++++++---------
 include/linux/nmi.h            |  1 +
 kernel/watchdog.c              |  5 ++++-
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index f9b4c6352d24..c702a8981452 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -373,22 +373,21 @@ void watchdog_nmi_start(void)
 }
 
 /*
- * This runs after lockup_detector_init() which sets up watchdog_cpumask.
+ * Invoked from core watchdog init.
  */
-static int __init powerpc_watchdog_init(void)
+int __init watchdog_nmi_probe(void)
 {
 	int err;
 
-	watchdog_calc_timeouts();
-
-	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online",
-				start_wd_on_cpu, stop_wd_on_cpu);
-	if (err < 0)
+	err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					"powerpc/watchdog:online",
+					start_wd_on_cpu, stop_wd_on_cpu);
+	if (err < 0) {
 		pr_warn("Watchdog could not be initialized");
-
+		return err;
+	}
 	return 0;
 }
-arch_initcall(powerpc_watchdog_init);
 
 static void handle_backtrace_ipi(struct pt_regs *regs)
 {
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 0c9ed49fb21a..27e249ed7c5c 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -111,6 +111,7 @@ static inline int hardlockup_detector_perf_init(void) { return 0; }
 
 void watchdog_nmi_stop(void);
 void watchdog_nmi_start(void);
+int watchdog_nmi_probe(void);
 
 /**
  * touch_nmi_watchdog - restart NMI watchdog timeout.
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index fff90fe10007..5c6fb7cd9ae8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -608,7 +608,6 @@ static inline int watchdog_park_threads(void) { return 0; }
 static inline void watchdog_unpark_threads(void) { }
 static inline int watchdog_enable_all_cpus(void) { return 0; }
 static inline void watchdog_disable_all_cpus(void) { }
-static inline void softlockup_init_threads(void) { }
 static void softlockup_reconfigure_threads(void)
 {
 	cpus_read_lock();
@@ -617,6 +616,10 @@ static void softlockup_reconfigure_threads(void)
 	watchdog_nmi_start();
 	cpus_read_unlock();
 }
+static inline void softlockup_init_threads(void)
+{
+	softlockup_reconfigure_threads();
+}
 #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
 
 static void __lockup_detector_cleanup(void)
-- 
cgit v1.2.3


From 192b2d78722ffea188e5ec6ae5d55010dce05a4b Mon Sep 17 00:00:00 2001
From: "K. Y. Srinivasan" <kys@microsoft.com>
Date: Fri, 29 Sep 2017 21:09:36 -0700
Subject: Drivers: hv: vmbus: Fix bugs in rescind handling

This patch addresses the following bugs in the current rescind handling code:

1. Fixes a race condition where we may be invoking hv_process_channel_removal()
on an already freed channel.

2. Prevents indefinite wait when rescinding sub-channels by correctly setting
the probe_complete state.

I would like to thank Dexuan for patiently reviewing earlier versions of this
patch and identifying many of the issues fixed here.

Greg, please apply this to 4.14-final.

Fixes: '54a66265d675 ("Drivers: hv: vmbus: Fix rescind handling")'

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Cc: stable@vger.kernel.org # (4.13 and above)
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel.c      |  6 +++---
 drivers/hv/channel_mgmt.c | 37 ++++++++++++++++++-------------------
 drivers/hv/vmbus_drv.c    |  3 +--
 include/linux/hyperv.h    |  2 +-
 4 files changed, 23 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index efd5db743319..894b67ac2cae 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -640,6 +640,7 @@ void vmbus_close(struct vmbus_channel *channel)
 		 */
 		return;
 	}
+	mutex_lock(&vmbus_connection.channel_mutex);
 	/*
 	 * Close all the sub-channels first and then close the
 	 * primary channel.
@@ -648,16 +649,15 @@ void vmbus_close(struct vmbus_channel *channel)
 		cur_channel = list_entry(cur, struct vmbus_channel, sc_list);
 		vmbus_close_internal(cur_channel);
 		if (cur_channel->rescind) {
-			mutex_lock(&vmbus_connection.channel_mutex);
-			hv_process_channel_removal(cur_channel,
+			hv_process_channel_removal(
 					   cur_channel->offermsg.child_relid);
-			mutex_unlock(&vmbus_connection.channel_mutex);
 		}
 	}
 	/*
 	 * Now close the primary.
 	 */
 	vmbus_close_internal(channel);
+	mutex_unlock(&vmbus_connection.channel_mutex);
 }
 EXPORT_SYMBOL_GPL(vmbus_close);
 
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index bcbb031f7263..018d2e0f8ec5 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -159,7 +159,7 @@ static void vmbus_rescind_cleanup(struct vmbus_channel *channel)
 
 
 	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
-
+	channel->rescind = true;
 	list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list,
 				msglistentry) {
 
@@ -381,14 +381,21 @@ static void vmbus_release_relid(u32 relid)
 		       true);
 }
 
-void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid)
+void hv_process_channel_removal(u32 relid)
 {
 	unsigned long flags;
-	struct vmbus_channel *primary_channel;
+	struct vmbus_channel *primary_channel, *channel;
 
-	BUG_ON(!channel->rescind);
 	BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex));
 
+	/*
+	 * Make sure channel is valid as we may have raced.
+	 */
+	channel = relid2channel(relid);
+	if (!channel)
+		return;
+
+	BUG_ON(!channel->rescind);
 	if (channel->target_cpu != get_cpu()) {
 		put_cpu();
 		smp_call_function_single(channel->target_cpu,
@@ -515,6 +522,7 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 	if (!fnew) {
 		if (channel->sc_creation_callback != NULL)
 			channel->sc_creation_callback(newchannel);
+		newchannel->probe_done = true;
 		return;
 	}
 
@@ -834,7 +842,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 {
 	struct vmbus_channel_rescind_offer *rescind;
 	struct vmbus_channel *channel;
-	unsigned long flags;
 	struct device *dev;
 
 	rescind = (struct vmbus_channel_rescind_offer *)hdr;
@@ -873,16 +880,6 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 		return;
 	}
 
-	spin_lock_irqsave(&channel->lock, flags);
-	channel->rescind = true;
-	spin_unlock_irqrestore(&channel->lock, flags);
-
-	/*
-	 * Now that we have posted the rescind state, perform
-	 * rescind related cleanup.
-	 */
-	vmbus_rescind_cleanup(channel);
-
 	/*
 	 * Now wait for offer handling to complete.
 	 */
@@ -901,6 +898,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 	if (channel->device_obj) {
 		if (channel->chn_rescind_callback) {
 			channel->chn_rescind_callback(channel);
+			vmbus_rescind_cleanup(channel);
 			return;
 		}
 		/*
@@ -909,6 +907,7 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 		 */
 		dev = get_device(&channel->device_obj->device);
 		if (dev) {
+			vmbus_rescind_cleanup(channel);
 			vmbus_device_unregister(channel->device_obj);
 			put_device(dev);
 		}
@@ -921,16 +920,16 @@ static void vmbus_onoffer_rescind(struct vmbus_channel_message_header *hdr)
 		 * 1. Close all sub-channels first
 		 * 2. Then close the primary channel.
 		 */
+		mutex_lock(&vmbus_connection.channel_mutex);
+		vmbus_rescind_cleanup(channel);
 		if (channel->state == CHANNEL_OPEN_STATE) {
 			/*
 			 * The channel is currently not open;
 			 * it is safe for us to cleanup the channel.
 			 */
-			mutex_lock(&vmbus_connection.channel_mutex);
-			hv_process_channel_removal(channel,
-						channel->offermsg.child_relid);
-			mutex_unlock(&vmbus_connection.channel_mutex);
+			hv_process_channel_removal(rescind->child_relid);
 		}
+		mutex_unlock(&vmbus_connection.channel_mutex);
 	}
 }
 
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index a9d49f6f6501..937801ac2fe0 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -768,8 +768,7 @@ static void vmbus_device_release(struct device *device)
 	struct vmbus_channel *channel = hv_dev->channel;
 
 	mutex_lock(&vmbus_connection.channel_mutex);
-	hv_process_channel_removal(channel,
-				   channel->offermsg.child_relid);
+	hv_process_channel_removal(channel->offermsg.child_relid);
 	mutex_unlock(&vmbus_connection.channel_mutex);
 	kfree(hv_dev);
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index c458d7b7ad19..6431087816ba 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1403,7 +1403,7 @@ extern bool vmbus_prep_negotiate_resp(struct icmsg_hdr *icmsghdrp, u8 *buf,
 				const int *srv_version, int srv_vercnt,
 				int *nego_fw_version, int *nego_srv_version);
 
-void hv_process_channel_removal(struct vmbus_channel *channel, u32 relid);
+void hv_process_channel_removal(u32 relid);
 
 void vmbus_setevent(struct vmbus_channel *channel);
 /*
-- 
cgit v1.2.3


From 5803b023881857db32ffefa0d269c90280a67ee0 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 9 Oct 2017 10:02:56 +0200
Subject: ALSA: seq: Fix copy_from_user() call inside lock

The event handler in the virmidi sequencer code takes a read-lock for
the linked list traverse, while it's calling snd_seq_dump_var_event()
in the loop.  The latter function may expand the user-space data
depending on the event type.  It eventually invokes copy_from_user(),
which might be a potential dead-lock.

The sequencer core guarantees that the user-space data is passed only
with atomic=0 argument, but snd_virmidi_dev_receive_event() ignores it
and always takes read-lock().  For avoiding the problem above, this
patch introduces rwsem for non-atomic case, while keeping rwlock for
atomic case.

Also while we're at it: the superfluous irq flags is dropped in
snd_virmidi_input_open().

Reported-by: Jia-Ju Bai <baijiaju1990@163.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/seq_virmidi.h  |  1 +
 sound/core/seq/seq_virmidi.c | 27 +++++++++++++++++++--------
 2 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/sound/seq_virmidi.h b/include/sound/seq_virmidi.h
index a03acd0d398a..695257ae64ac 100644
--- a/include/sound/seq_virmidi.h
+++ b/include/sound/seq_virmidi.h
@@ -60,6 +60,7 @@ struct snd_virmidi_dev {
 	int port;			/* created/attached port */
 	unsigned int flags;		/* SNDRV_VIRMIDI_* */
 	rwlock_t filelist_lock;
+	struct rw_semaphore filelist_sem;
 	struct list_head filelist;
 };
 
diff --git a/sound/core/seq/seq_virmidi.c b/sound/core/seq/seq_virmidi.c
index 8d93a4021c78..f48a4cd24ffc 100644
--- a/sound/core/seq/seq_virmidi.c
+++ b/sound/core/seq/seq_virmidi.c
@@ -77,13 +77,17 @@ static void snd_virmidi_init_event(struct snd_virmidi *vmidi,
  * decode input event and put to read buffer of each opened file
  */
 static int snd_virmidi_dev_receive_event(struct snd_virmidi_dev *rdev,
-					 struct snd_seq_event *ev)
+					 struct snd_seq_event *ev,
+					 bool atomic)
 {
 	struct snd_virmidi *vmidi;
 	unsigned char msg[4];
 	int len;
 
-	read_lock(&rdev->filelist_lock);
+	if (atomic)
+		read_lock(&rdev->filelist_lock);
+	else
+		down_read(&rdev->filelist_sem);
 	list_for_each_entry(vmidi, &rdev->filelist, list) {
 		if (!vmidi->trigger)
 			continue;
@@ -97,7 +101,10 @@ static int snd_virmidi_dev_receive_event(struct snd_virmidi_dev *rdev,
 				snd_rawmidi_receive(vmidi->substream, msg, len);
 		}
 	}
-	read_unlock(&rdev->filelist_lock);
+	if (atomic)
+		read_unlock(&rdev->filelist_lock);
+	else
+		up_read(&rdev->filelist_sem);
 
 	return 0;
 }
@@ -115,7 +122,7 @@ int snd_virmidi_receive(struct snd_rawmidi *rmidi, struct snd_seq_event *ev)
 	struct snd_virmidi_dev *rdev;
 
 	rdev = rmidi->private_data;
-	return snd_virmidi_dev_receive_event(rdev, ev);
+	return snd_virmidi_dev_receive_event(rdev, ev, true);
 }
 #endif  /*  0  */
 
@@ -130,7 +137,7 @@ static int snd_virmidi_event_input(struct snd_seq_event *ev, int direct,
 	rdev = private_data;
 	if (!(rdev->flags & SNDRV_VIRMIDI_USE))
 		return 0; /* ignored */
-	return snd_virmidi_dev_receive_event(rdev, ev);
+	return snd_virmidi_dev_receive_event(rdev, ev, atomic);
 }
 
 /*
@@ -209,7 +216,6 @@ static int snd_virmidi_input_open(struct snd_rawmidi_substream *substream)
 	struct snd_virmidi_dev *rdev = substream->rmidi->private_data;
 	struct snd_rawmidi_runtime *runtime = substream->runtime;
 	struct snd_virmidi *vmidi;
-	unsigned long flags;
 
 	vmidi = kzalloc(sizeof(*vmidi), GFP_KERNEL);
 	if (vmidi == NULL)
@@ -223,9 +229,11 @@ static int snd_virmidi_input_open(struct snd_rawmidi_substream *substream)
 	vmidi->client = rdev->client;
 	vmidi->port = rdev->port;	
 	runtime->private_data = vmidi;
-	write_lock_irqsave(&rdev->filelist_lock, flags);
+	down_write(&rdev->filelist_sem);
+	write_lock_irq(&rdev->filelist_lock);
 	list_add_tail(&vmidi->list, &rdev->filelist);
-	write_unlock_irqrestore(&rdev->filelist_lock, flags);
+	write_unlock_irq(&rdev->filelist_lock);
+	up_write(&rdev->filelist_sem);
 	vmidi->rdev = rdev;
 	return 0;
 }
@@ -264,9 +272,11 @@ static int snd_virmidi_input_close(struct snd_rawmidi_substream *substream)
 	struct snd_virmidi_dev *rdev = substream->rmidi->private_data;
 	struct snd_virmidi *vmidi = substream->runtime->private_data;
 
+	down_write(&rdev->filelist_sem);
 	write_lock_irq(&rdev->filelist_lock);
 	list_del(&vmidi->list);
 	write_unlock_irq(&rdev->filelist_lock);
+	up_write(&rdev->filelist_sem);
 	snd_midi_event_free(vmidi->parser);
 	substream->runtime->private_data = NULL;
 	kfree(vmidi);
@@ -520,6 +530,7 @@ int snd_virmidi_new(struct snd_card *card, int device, struct snd_rawmidi **rrmi
 	rdev->rmidi = rmidi;
 	rdev->device = device;
 	rdev->client = -1;
+	init_rwsem(&rdev->filelist_sem);
 	rwlock_init(&rdev->filelist_lock);
 	INIT_LIST_HEAD(&rdev->filelist);
 	rdev->seq_mode = SNDRV_VIRMIDI_SEQ_DISPATCH;
-- 
cgit v1.2.3


From 98589a0998b8b13c4a8fa1ccb0e62751a019faa5 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Mon, 9 Oct 2017 15:27:15 +0300
Subject: netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'

Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced
support for attaching an eBPF object by an fd, with the
'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each
IPT_SO_SET_REPLACE call.

However this breaks subsequent iptables calls:

 # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT
 # iptables -A INPUT -s 5.6.7.8 -j ACCEPT
 iptables: Invalid argument. Run `dmesg' for more information.

That's because iptables works by loading existing rules using
IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with
the replacement set.

However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number
(from the initial "iptables -m bpf" invocation) - so when 2nd invocation
occurs, userspace passes a bogus fd number, which leads to
'bpf_mt_check_v1' to fail.

One suggested solution [1] was to hack iptables userspace, to perform a
"entries fixup" immediatley after IPT_SO_GET_ENTRIES, by opening a new,
process-local fd per every 'xt_bpf_info_v1' entry seen.

However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to
depricate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects.

This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given
'.fd' and instead perform an in-kernel lookup for the bpf object given
the provided '.path'.

It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named
XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is
expected to provide the path of the pinned object.

Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved.

References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2
            [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2

Reported-by: Rafael Buchbinder <rafi@rbk.ms>
Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/bpf.h                   |  5 +++++
 include/uapi/linux/netfilter/xt_bpf.h |  1 +
 kernel/bpf/inode.c                    |  1 +
 net/netfilter/xt_bpf.c                | 22 ++++++++++++++++++++--
 4 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859e79e7..f1af7d63d678 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -368,6 +368,11 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
+static inline int bpf_obj_get_user(const char __user *pathname)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 						       u32 key)
 {
diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h
index b97725af2ac0..da161b56c79e 100644
--- a/include/uapi/linux/netfilter/xt_bpf.h
+++ b/include/uapi/linux/netfilter/xt_bpf.h
@@ -23,6 +23,7 @@ enum xt_bpf_modes {
 	XT_BPF_MODE_FD_PINNED,
 	XT_BPF_MODE_FD_ELF,
 };
+#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED
 
 struct xt_bpf_info_v1 {
 	__u16 mode;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..be1dde967208 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -363,6 +363,7 @@ out:
 	putname(pname);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
 
 static void bpf_evict_inode(struct inode *inode)
 {
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 38986a95216c..29123934887b 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/syscalls.h>
 #include <linux/skbuff.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
@@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
 	return 0;
 }
 
+static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
+{
+	mm_segment_t oldfs = get_fs();
+	int retval, fd;
+
+	set_fs(KERNEL_DS);
+	fd = bpf_obj_get_user(path);
+	set_fs(oldfs);
+	if (fd < 0)
+		return fd;
+
+	retval = __bpf_mt_check_fd(fd, ret);
+	sys_close(fd);
+	return retval;
+}
+
 static int bpf_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_bpf_info *info = par->matchinfo;
@@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
 		return __bpf_mt_check_bytecode(info->bpf_program,
 					       info->bpf_program_num_elem,
 					       &info->filter);
-	else if (info->mode == XT_BPF_MODE_FD_PINNED ||
-		 info->mode == XT_BPF_MODE_FD_ELF)
+	else if (info->mode == XT_BPF_MODE_FD_ELF)
 		return __bpf_mt_check_fd(info->fd, &info->filter);
+	else if (info->mode == XT_BPF_MODE_PATH_PINNED)
+		return __bpf_mt_check_path(info->path, &info->filter);
 	else
 		return -EINVAL;
 }
-- 
cgit v1.2.3


From d153b153446f7d8832bb2ebd92309c8a6003b3bb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 Sep 2017 11:35:30 +0200
Subject: sched/core: Fix wake_affine() performance regression

Eric reported a sysbench regression against commit:

  3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")

Similarly, Rik was looking at the NAS-lu.C benchmark, which regressed
against his v3.10 enterprise kernel.

PRE (current tip/master):

 ivb-ep sysbench:

   2: [30 secs]     transactions:                        64110  (2136.94 per sec.)
   5: [30 secs]     transactions:                        143644 (4787.99 per sec.)
  10: [30 secs]     transactions:                        274298 (9142.93 per sec.)
  20: [30 secs]     transactions:                        418683 (13955.45 per sec.)
  40: [30 secs]     transactions:                        320731 (10690.15 per sec.)
  80: [30 secs]     transactions:                        355096 (11834.28 per sec.)

 hsw-ex NAS:

 OMP_PROC_BIND/lu.C.x_threads_144_run_1.log: Time in seconds =                    18.01
 OMP_PROC_BIND/lu.C.x_threads_144_run_2.log: Time in seconds =                    17.89
 OMP_PROC_BIND/lu.C.x_threads_144_run_3.log: Time in seconds =                    17.93
 lu.C.x_threads_144_run_1.log: Time in seconds =                   434.68
 lu.C.x_threads_144_run_2.log: Time in seconds =                   405.36
 lu.C.x_threads_144_run_3.log: Time in seconds =                   433.83

POST (+patch):

 ivb-ep sysbench:

   2: [30 secs]     transactions:                        64494  (2149.75 per sec.)
   5: [30 secs]     transactions:                        145114 (4836.99 per sec.)
  10: [30 secs]     transactions:                        278311 (9276.69 per sec.)
  20: [30 secs]     transactions:                        437169 (14571.60 per sec.)
  40: [30 secs]     transactions:                        669837 (22326.73 per sec.)
  80: [30 secs]     transactions:                        631739 (21055.88 per sec.)

 hsw-ex NAS:

 lu.C.x_threads_144_run_1.log: Time in seconds =                    23.36
 lu.C.x_threads_144_run_2.log: Time in seconds =                    22.96
 lu.C.x_threads_144_run_3.log: Time in seconds =                    22.52

This patch takes out all the shiny wake_affine() stuff and goes back to
utter basics. Between the two CPUs involved with the wakeup (the CPU
doing the wakeup and the CPU we ran on previously) pick the CPU we can
run on _now_.

This restores much of the regressions against the older kernels,
but leaves some ground in the overloaded case. The default-enabled
WA_WEIGHT (which will be introduced in the next patch) is an attempt
to address the overloaded situation.

Reported-by: Eric Farman <farman@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Rosato <mjrosato@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jinpuwang@gmail.com
Cc: vcaputo@pengaru.com
Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/topology.h |   8 ---
 kernel/sched/fair.c            | 126 +++++------------------------------------
 kernel/sched/features.h        |   1 +
 3 files changed, 16 insertions(+), 119 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index d7b6dab956ec..7d065abc7a47 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,14 +71,6 @@ struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
 	int		has_idle_cores;
-
-	/*
-	 * Some variables from the most recent sd_lb_stats for this domain,
-	 * used by wake_affine().
-	 */
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..28cabed85387 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5356,115 +5356,36 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
-
-static bool get_llc_stats(struct llc_stats *stats, int cpu)
-{
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
-
-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
-
-	return true;
-}
-
 /*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
  *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			will be) idle.
  */
+
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
-	s64 this_eff_load, prev_eff_load;
-	unsigned long task_load;
-
-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
-
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
-	if (sync) {
-		unsigned long current_load = task_h_load(current);
-
-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
-			return true;
-
-		this_stats.load -= current_load;
-	}
-
-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
+	if (idle_cpu(this_cpu))
 		return true;
 
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
-	task_load = task_h_load(p);
-
-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
-
-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;
 
-	return this_eff_load <= prev_eff_load;
+	return false;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
-
-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	bool affine = false;
 
-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7600,7 +7521,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7592,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running,	sds->total_running);
-	WRITE_ONCE(shared->load,	sds->total_load);
-	WRITE_ONCE(shared->capacity,	sds->total_capacity);
 }
 
 /**
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index d3fb15555291..0a519f8c224d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -81,3 +81,4 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
+SCHED_FEAT(WA_IDLE, true)
-- 
cgit v1.2.3


From 8a1ac5dc7be09883051b1bf89a5e57d7ad850fa5 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 13 Oct 2017 15:57:40 -0700
Subject: include/linux/of.h: provide of_n_{addr,size}_cells wrappers for
 !CONFIG_OF

The pci-rcar driver is enabled for compile tests, and this has shown that
the driver cannot build without CONFIG_OF, following the inclusion of
commit f8f2fe7355fb ("PCI: rcar: Use new OF interrupt mapping when possible"):

  drivers/pci/host/pcie-rcar.c: In function 'pci_dma_range_parser_init':
  drivers/pci/host/pcie-rcar.c:1039:2: error: implicit declaration of function 'of_n_addr_cells' [-Werror=implicit-function-declaration]
    parser->pna = of_n_addr_cells(node);
    ^

As pointed out by Ben Dooks and Geert Uytterhoeven, this is actually
supposed to build fine, which we can achieve if we make the declaration
of of_irq_parse_and_map_pci conditional on CONFIG_OF and provide an
empty inline function otherwise, as we do for a lot of other of
interfaces.

This lets us build the rcar_pci driver again without CONFIG_OF for build
testing.  All platforms using this driver select OF, so this doesn't
change anything for the users.

[akpm@linux-foundation.org: be consistent with surrounding code]
Link: http://lkml.kernel.org/r/20170911200805.3363318-1-arnd@arndb.de
Fixes: c25da4778803 ("PCI: rcar: Add Renesas R-Car PCIe driver")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Acked-by: Geert Uytterhoeven <geert+renesas@glider.be>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Magnus Damm <damm@opensource.se>
Cc: Ben Dooks <ben.dooks@codethink.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/of.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/of.h b/include/linux/of.h
index cfc34117fc92..b240ed69dc96 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -734,6 +734,16 @@ static inline struct device_node *of_get_cpu_node(int cpu,
 	return NULL;
 }
 
+static inline int of_n_addr_cells(struct device_node *np)
+{
+	return 0;
+
+}
+static inline int of_n_size_cells(struct device_node *np)
+{
+	return 0;
+}
+
 static inline int of_property_read_u64(const struct device_node *np,
 				       const char *propname, u64 *out_value)
 {
-- 
cgit v1.2.3


From e8c97af0c1f23d6ffedcaa3918861f2595e1db62 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 13 Oct 2017 15:58:11 -0700
Subject: linux/kernel.h: add/correct kernel-doc notation

Add kernel-doc notation for some macros.  Correct kernel-doc comments &
typos for a few macros.

Link: http://lkml.kernel.org/r/76fa1403-1511-be4c-e9c4-456b43edfad3@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 90 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 0ad4c3044cf9..91189bb0c818 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -44,6 +44,12 @@
 
 #define STACK_MAGIC	0xdeadbeef
 
+/**
+ * REPEAT_BYTE - repeat the value @x multiple times as an unsigned long value
+ * @x: value to repeat
+ *
+ * NOTE: @x is not checked for > 0xff; larger values produce odd results.
+ */
 #define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
 
 /* @a is a power of 2 value */
@@ -57,6 +63,10 @@
 #define READ			0
 #define WRITE			1
 
+/**
+ * ARRAY_SIZE - get the number of elements in array @arr
+ * @arr: array to be sized
+ */
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
 
 #define u64_to_user_ptr(x) (		\
@@ -76,7 +86,15 @@
 #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
 #define round_down(x, y) ((x) & ~__round_mask(x, y))
 
+/**
+ * FIELD_SIZEOF - get the size of a struct's field
+ * @t: the target struct
+ * @f: the target struct's field
+ * Return: the size of @f in the struct definition without having a
+ * declared instance of @t.
+ */
 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
+
 #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP
 
 #define DIV_ROUND_DOWN_ULL(ll, d) \
@@ -107,7 +125,7 @@
 /*
  * Divide positive or negative dividend by positive or negative divisor
  * and round to closest integer. Result is undefined for negative
- * divisors if he dividend variable type is unsigned and for negative
+ * divisors if the dividend variable type is unsigned and for negative
  * dividends if the divisor variable type is unsigned.
  */
 #define DIV_ROUND_CLOSEST(x, divisor)(			\
@@ -247,13 +265,13 @@ extern int _cond_resched(void);
  * @ep_ro: right open interval endpoint
  *
  * Perform a "reciprocal multiplication" in order to "scale" a value into
- * range [0, ep_ro), where the upper interval endpoint is right-open.
+ * range [0, @ep_ro), where the upper interval endpoint is right-open.
  * This is useful, e.g. for accessing a index of an array containing
- * ep_ro elements, for example. Think of it as sort of modulus, only that
+ * @ep_ro elements, for example. Think of it as sort of modulus, only that
  * the result isn't that of modulo. ;) Note that if initial input is a
  * small value, then result will return 0.
  *
- * Return: a result based on val in interval [0, ep_ro).
+ * Return: a result based on @val in interval [0, @ep_ro).
  */
 static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
 {
@@ -618,8 +636,8 @@ do {									\
  * trace_printk - printf formatting in the ftrace buffer
  * @fmt: the printf format for printing
  *
- * Note: __trace_printk is an internal function for trace_printk and
- *       the @ip is passed in via the trace_printk macro.
+ * Note: __trace_printk is an internal function for trace_printk() and
+ *       the @ip is passed in via the trace_printk() macro.
  *
  * This function allows a kernel developer to debug fast path sections
  * that printk is not appropriate for. By scattering in various
@@ -629,7 +647,7 @@ do {									\
  * This is intended as a debugging tool for the developer only.
  * Please refrain from leaving trace_printks scattered around in
  * your code. (Extra memory is used for special buffers that are
- * allocated when trace_printk() is used)
+ * allocated when trace_printk() is used.)
  *
  * A little optization trick is done here. If there's only one
  * argument, there's no need to scan the string for printf formats.
@@ -681,7 +699,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...);
  *       the @ip is passed in via the trace_puts macro.
  *
  * This is similar to trace_printk() but is made for those really fast
- * paths that a developer wants the least amount of "Heisenbug" affects,
+ * paths that a developer wants the least amount of "Heisenbug" effects,
  * where the processing of the print format is still too much.
  *
  * This function allows a kernel developer to debug fast path sections
@@ -692,7 +710,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...);
  * This is intended as a debugging tool for the developer only.
  * Please refrain from leaving trace_puts scattered around in
  * your code. (Extra memory is used for special buffers that are
- * allocated when trace_puts() is used)
+ * allocated when trace_puts() is used.)
  *
  * Returns: 0 if nothing was written, positive # if string was.
  *  (1 when __trace_bputs is used, strlen(str) when __trace_puts is used)
@@ -771,6 +789,12 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 	t2 min2 = (y);					\
 	(void) (&min1 == &min2);			\
 	min1 < min2 ? min1 : min2; })
+
+/**
+ * min - return minimum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
 #define min(x, y)					\
 	__min(typeof(x), typeof(y),			\
 	      __UNIQUE_ID(min1_), __UNIQUE_ID(min2_),	\
@@ -781,12 +805,31 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 	t2 max2 = (y);					\
 	(void) (&max1 == &max2);			\
 	max1 > max2 ? max1 : max2; })
+
+/**
+ * max - return maximum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
 #define max(x, y)					\
 	__max(typeof(x), typeof(y),			\
 	      __UNIQUE_ID(max1_), __UNIQUE_ID(max2_),	\
 	      x, y)
 
+/**
+ * min3 - return minimum of three values
+ * @x: first value
+ * @y: second value
+ * @z: third value
+ */
 #define min3(x, y, z) min((typeof(x))min(x, y), z)
+
+/**
+ * max3 - return maximum of three values
+ * @x: first value
+ * @y: second value
+ * @z: third value
+ */
 #define max3(x, y, z) max((typeof(x))max(x, y), z)
 
 /**
@@ -805,8 +848,8 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
  * @lo: lowest allowable value
  * @hi: highest allowable value
  *
- * This macro does strict typechecking of lo/hi to make sure they are of the
- * same type as val.  See the unnecessary pointer comparisons.
+ * This macro does strict typechecking of @lo/@hi to make sure they are of the
+ * same type as @val.  See the unnecessary pointer comparisons.
  */
 #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
 
@@ -816,11 +859,24 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
  *
  * Or not use min/max/clamp at all, of course.
  */
+
+/**
+ * min_t - return minimum of two values, using the specified type
+ * @type: data type to use
+ * @x: first value
+ * @y: second value
+ */
 #define min_t(type, x, y)				\
 	__min(type, type,				\
 	      __UNIQUE_ID(min1_), __UNIQUE_ID(min2_),	\
 	      x, y)
 
+/**
+ * max_t - return maximum of two values, using the specified type
+ * @type: data type to use
+ * @x: first value
+ * @y: second value
+ */
 #define max_t(type, x, y)				\
 	__max(type, type,				\
 	      __UNIQUE_ID(min1_), __UNIQUE_ID(min2_),	\
@@ -834,7 +890,7 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
  * @hi: maximum allowable value
  *
  * This macro does no typechecking and uses temporary variables of type
- * 'type' to make all the comparisons.
+ * @type to make all the comparisons.
  */
 #define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
 
@@ -845,15 +901,17 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
  * @hi: maximum allowable value
  *
  * This macro does no typechecking and uses temporary variables of whatever
- * type the input argument 'val' is.  This is useful when val is an unsigned
- * type and min and max are literals that will otherwise be assigned a signed
+ * type the input argument @val is.  This is useful when @val is an unsigned
+ * type and @lo and @hi are literals that will otherwise be assigned a signed
  * integer type.
  */
 #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
 
 
-/*
- * swap - swap value of @a and @b
+/**
+ * swap - swap values of @a and @b
+ * @a: first value
+ * @b: second value
  */
 #define swap(a, b) \
 	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
-- 
cgit v1.2.3


From f892760aa66a2d657deaf59538fb69433036767c Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Fri, 13 Oct 2017 15:58:15 -0700
Subject: fs/mpage.c: fix mpage_writepage() for pages with buffers

When using FAT on a block device which supports rw_page, we can hit
BUG_ON(!PageLocked(page)) in try_to_free_buffers().  This is because we
call clean_buffers() after unlocking the page we've written.  Introduce
a new clean_page_buffers() which cleans all buffers associated with a
page and call it from within bdev_write_page().

[akpm@linux-foundation.org: s/PAGE_SIZE/~0U/ per Linus and Matthew]
Link: http://lkml.kernel.org/r/20171006211541.GA7409@bombadil.infradead.org
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Reported-by: Toshi Kani <toshi.kani@hpe.com>
Reported-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Tested-by: Toshi Kani <toshi.kani@hpe.com>
Acked-by: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c              |  6 ++++--
 fs/mpage.c                  | 14 +++++++++++---
 include/linux/buffer_head.h |  1 +
 3 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 93d088ffc05c..789f55e851ae 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -716,10 +716,12 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 
 	set_page_writeback(page);
 	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, true);
-	if (result)
+	if (result) {
 		end_page_writeback(page);
-	else
+	} else {
+		clean_page_buffers(page);
 		unlock_page(page);
+	}
 	blk_queue_exit(bdev->bd_queue);
 	return result;
 }
diff --git a/fs/mpage.c b/fs/mpage.c
index 37bb77c1302c..c991faec70b9 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -468,6 +468,16 @@ static void clean_buffers(struct page *page, unsigned first_unmapped)
 		try_to_free_buffers(page);
 }
 
+/*
+ * For situations where we want to clean all buffers attached to a page.
+ * We don't need to calculate how many buffers are attached to the page,
+ * we just need to specify a number larger than the maximum number of buffers.
+ */
+void clean_page_buffers(struct page *page)
+{
+	clean_buffers(page, ~0U);
+}
+
 static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 		      void *data)
 {
@@ -605,10 +615,8 @@ alloc_new:
 	if (bio == NULL) {
 		if (first_unmapped == blocks_per_page) {
 			if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
-								page, wbc)) {
-				clean_buffers(page, first_unmapped);
+								page, wbc))
 				goto out;
-			}
 		}
 		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
 				BIO_MAX_PAGES, GFP_NOFS|__GFP_HIGH);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index c8dae555eccf..446b24cac67d 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -232,6 +232,7 @@ int generic_write_end(struct file *, struct address_space *,
 				loff_t, unsigned, unsigned,
 				struct page *, void *);
 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
+void clean_page_buffers(struct page *page);
 int cont_write_begin(struct file *, struct address_space *, loff_t,
 			unsigned, unsigned, struct page **, void **,
 			get_block_t *, loff_t *);
-- 
cgit v1.2.3


From ca182551857cc2c1e6a2b7f1e72090a137a15008 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Fri, 13 Oct 2017 15:58:22 -0700
Subject: kmemleak: clear stale pointers from task stacks

Kmemleak considers any pointers on task stacks as references.  This
patch clears newly allocated and reused vmap stacks.

Link: http://lkml.kernel.org/r/150728990124.744199.8403409836394318684.stgit@buzz
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/thread_info.h | 2 +-
 kernel/fork.c               | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 905d769d8ddc..5f7eeab990fe 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -42,7 +42,7 @@ enum {
 #define THREAD_ALIGN	THREAD_SIZE
 #endif
 
-#ifdef CONFIG_DEBUG_STACK_USAGE
+#if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK)
 # define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
 				 __GFP_ZERO)
 #else
diff --git a/kernel/fork.c b/kernel/fork.c
index e702cb9ffbd8..07cc743698d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -215,6 +215,10 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;
 
+#ifdef CONFIG_DEBUG_KMEMLEAK
+		/* Clear stale pointers from reused stack. */
+		memset(s->addr, 0, THREAD_SIZE);
+#endif
 		tsk->stack_vm_area = s;
 		return s->addr;
 	}
-- 
cgit v1.2.3