From fa07fb6a4ecad41d373eaf1574b9d54b4f0c1e75 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Sep 2016 08:54:06 -0400 Subject: workqueue: dump workqueue state on sanity check failures in destroy_workqueue() destroy_workqueue() performs a number of sanity checks to ensure that the workqueue is empty before proceeding with destruction. However, it's not always easy to tell what's going on just from the warning message. Let's dump workqueue state after sanity check failures to help debugging. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/CACT4Y+Zs6vkjHo9qHb4TrEiz3S4+quvvVQ9VWvj2Mx6pETGb9Q@mail.gmail.com Cc: Dmitry Vyukov --- kernel/workqueue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc3..4eaec8b86d65 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4021,6 +4021,7 @@ void destroy_workqueue(struct workqueue_struct *wq) for (i = 0; i < WORK_NR_COLORS; i++) { if (WARN_ON(pwq->nr_in_flight[i])) { mutex_unlock(&wq->mutex); + show_workqueue_state(); return; } } @@ -4029,6 +4030,7 @@ void destroy_workqueue(struct workqueue_struct *wq) WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { mutex_unlock(&wq->mutex); + show_workqueue_state(); return; } } -- cgit v1.2.3 From 3347fa0928210d96aaa2bd6cd5a8391d5e630873 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 Sep 2016 15:49:32 -0400 Subject: workqueue: make workqueue available early during boot Workqueue is currently initialized in an early init call; however, there are cases where early boot code has to be split and reordered to come after workqueue initialization, or the same code path which makes use of workqueues is used both before workqueue initialization and after. The latter cases have to gate workqueue usages with keventd_up() tests, which is nasty and easy to get wrong. Workqueue usages have become widespread and it'd be a lot more convenient if it could be used very early from boot. This patch splits workqueue initialization into two steps: workqueue_init_early(), which sets up the basic data structures so that workqueues can be created and work items queued, and workqueue_init(), which actually brings up workqueues online and starts executing queued work items. The former step can be done very early during boot once memory allocation, cpumasks and idr are initialized. The latter can be done right after kthreads become available. This allows work item queueing and canceling from very early boot, which is what most of these use cases want. * As system_wq being initialized doesn't indicate that workqueue is fully online anymore, update keventd_up() to test wq_online instead. The follow-up patches will get rid of all its usages and the function itself. * Flushing doesn't make sense before workqueue is fully initialized. The flush functions trigger WARN and return immediately before fully online. * Work items are never in-flight before fully online. Canceling can always succeed by skipping the flush step. * Some code paths can no longer assume to be called with irq enabled as irq is disabled during early boot. Use irqsave/restore operations instead. v2: Watchdog init, which requires timer to be running, moved from workqueue_init_early() to workqueue_init().
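
For illustration, a minimal sketch of what the two-stage model permits from early-boot code. This is not part of the patch; the caller early_setup() and the work function early_work_fn() are hypothetical names used only to show what becomes legal once workqueue_init_early() has run:

	#include <linux/workqueue.h>

	static void early_work_fn(struct work_struct *work)
	{
		/* Runs only once workqueue_init() has created kworkers. */
	}
	static DECLARE_WORK(early_work, early_work_fn);

	void __init early_setup(void)
	{
		/*
		 * Safe as soon as workqueue_init_early() has run; no
		 * keventd_up() gating needed. The item simply stays
		 * queued until kworkers come up.
		 */
		schedule_work(&early_work);

		/*
		 * Canceling is also safe: before wq_online, nothing can
		 * be in flight, so the flush step is skipped.
		 */
		cancel_work_sync(&early_work);

		/*
		 * flush_work()/flush_workqueue(), by contrast, WARN and
		 * return immediately until the system is fully online.
		 */
	}
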
Signed-off-by: Tejun Heo Suggested-by: Linus Torvalds Link: http://lkml.kernel.org/r/CA+55aFx0vPuMuxn00rBSM192n-Du5uxy+4AvKa0SBSOVJeuCGg@mail.gmail.com --- include/linux/workqueue.h | 7 ++++- init/main.c | 10 +++++++ kernel/workqueue.c | 76 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 76 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 26cc1df280d6..91d416f9c0a7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -358,6 +358,8 @@ extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; +extern bool wq_online; + extern struct workqueue_struct * __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); @@ -594,7 +596,7 @@ static inline bool schedule_delayed_work(struct delayed_work *dwork, */ static inline bool keventd_up(void) { - return system_wq != NULL; + return wq_online; } #ifndef CONFIG_SMP @@ -631,4 +633,7 @@ int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif +int __init workqueue_init_early(void); +int __init workqueue_init(void); + #endif diff --git a/init/main.c b/init/main.c index a8a58e2794a5..5c4fd68a8671 100644 --- a/init/main.c +++ b/init/main.c @@ -551,6 +551,14 @@ asmlinkage __visible void __init start_kernel(void) "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); idr_init_cache(); + + /* + * Allow workqueue creation and work item queueing/cancelling + * early. Work item execution depends on kthreads and starts after + * workqueue_init(). + */ + workqueue_init_early(); + rcu_init(); /* trace_printk() and trace points may be used after this */ @@ -1005,6 +1013,8 @@ static noinline void __init kernel_init_freeable(void) smp_prepare_cpus(setup_max_cpus); + workqueue_init(); + do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4eaec8b86d65..15d0811c9e91 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,6 +290,8 @@ module_param_named(disable_numa, wq_disable_numa, bool, 0444); static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); +bool wq_online; /* can kworkers be created yet? */ + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -2583,6 +2585,9 @@ void flush_workqueue(struct workqueue_struct *wq) }; int next_color; + if (WARN_ON(!wq_online)) + return; + lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); @@ -2843,6 +2848,9 @@ bool flush_work(struct work_struct *work) { struct wq_barrier barr; + if (WARN_ON(!wq_online)) + return false; + lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); @@ -2913,7 +2921,13 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) mark_work_canceling(work); local_irq_restore(flags); - flush_work(work); + /* + * This allows canceling during early boot. We know that @work + * isn't executing. 
+ */ + if (wq_online) + flush_work(work); + clear_work_data(work); /* @@ -3352,7 +3366,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* create and start the initial worker */ - if (!create_worker(pool)) + if (wq_online && !create_worker(pool)) goto fail; /* install */ @@ -3417,6 +3431,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; bool freezable = wq->flags & WQ_FREEZABLE; + unsigned long flags; /* for @wq->saved_max_active */ lockdep_assert_held(&wq->mutex); @@ -3425,7 +3440,8 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) if (!freezable && pwq->max_active == wq->saved_max_active) return; - spin_lock_irq(&pwq->pool->lock); + /* this function can be called during early boot w/ irq disabled */ + spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that @@ -3448,7 +3464,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock_irq(&pwq->pool->lock); + spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ @@ -5457,7 +5473,17 @@ static void __init wq_numa_init(void) wq_numa_enabled = true; } -static int __init init_workqueues(void) +/** + * workqueue_init_early - early init for workqueue subsystem + * + * This is the first half of two-staged workqueue subsystem initialization + * and invoked as soon as the bare basics - memory allocation, cpumasks and + * idr are up. It sets up all the data structures and system workqueues + * and allows early boot code to create workqueues and queue/cancel work + * items. Actual work item execution starts only after kthreads can be + * created and scheduled right before early initcalls. + */ +int __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; @@ -5490,16 +5516,6 @@ static int __init init_workqueues(void) } } - /* create the initial worker */ - for_each_online_cpu(cpu) { - struct worker_pool *pool; - - for_each_cpu_worker_pool(pool, cpu) { - pool->flags &= ~POOL_DISASSOCIATED; - BUG_ON(!create_worker(pool)); - } - } - /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; @@ -5536,8 +5552,36 @@ static int __init init_workqueues(void) !system_power_efficient_wq || !system_freezable_power_efficient_wq); + return 0; +} + +/** + * workqueue_init - bring workqueue subsystem fully online + * + * This is the latter half of two-staged workqueue subsystem initialization + * and invoked as soon as kthreads can be created and scheduled. + * Workqueues have been created and work items queued on them, but there + * are no kworkers executing the work items yet. Populate the worker pools + * with the initial workers and enable future kworker creations. 
+ */ +int __init workqueue_init(void) +{ + struct worker_pool *pool; + int cpu, bkt; + + /* create the initial workers */ + for_each_online_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + pool->flags &= ~POOL_DISASSOCIATED; + BUG_ON(!create_worker(pool)); + } + } + + hash_for_each(unbound_pool_hash, bkt, pool, hash_node) + BUG_ON(!create_worker(pool)); + + wq_online = true; wq_watchdog_init(); return 0; } -early_initcall(init_workqueues); -- cgit v1.2.3 From a81f80f3eb759de72d74d18e478873fc0575abc5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 Sep 2016 15:49:33 -0400 Subject: power, workqueue: remove keventd_up() usage Now that workqueue can handle work item queueing/cancelling from very early during boot, there is no need to gate cancel_delayed_work_sync() while !keventd_up(). Remove it. Signed-off-by: Tejun Heo Acked-by: Rafael J. Wysocki Cc: Qiao Zhou --- kernel/power/qos.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 168ff442ebde..97b0df71303e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,16 +482,7 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - /* - * This function may be called very early during boot, for example, - * from of_clk_init(), where irq needs to stay disabled. - * cancel_delayed_work_sync() assumes that irq is enabled on - * invocation and re-enables it on return. Avoid calling it until - * workqueue is initialized. - */ - if (keventd_up()) - cancel_delayed_work_sync(&req->work); - + cancel_delayed_work_sync(&req->work); __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); -- cgit v1.2.3 From 863b710b664bdcb90c0c682ee24adb368f497a5b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 Sep 2016 15:49:34 -0400 Subject: workqueue: remove keventd_up() keventd_up() no longer has in-kernel users. Remove it and make wq_online static. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 10 ---------- kernel/workqueue.c | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 91d416f9c0a7..56417133c672 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -358,8 +358,6 @@ extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; -extern bool wq_online; - extern struct workqueue_struct * __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); @@ -591,14 +589,6 @@ static inline bool schedule_delayed_work(struct delayed_work *dwork, return queue_delayed_work(system_wq, dwork, delay); } -/** - * keventd_up - is workqueue initialized yet? - */ -static inline bool keventd_up(void) -{ - return wq_online; -} - #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 15d0811c9e91..ad0cd439223b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,7 +290,7 @@ module_param_named(disable_numa, wq_disable_numa, bool, 0444); static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); -bool wq_online; /* can kworkers be created yet? */ +static bool wq_online; /* can kworkers be created yet? 
*/ static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ -- cgit v1.2.3 From 57a09bf0a416700676e77102c28f9cfcb48267e0 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 18 Oct 2016 19:51:19 +0200 Subject: bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers A BPF program is required to check the return register of a map_lookup_elem() call before accessing memory. The verifier keeps track of this by converting the type of the result register from PTR_TO_MAP_VALUE_OR_NULL to PTR_TO_MAP_VALUE after a conditional jump ensures safety. This check is currently exclusively performed for the result register 0. In the event the compiler reorders instructions, BPF_MOV64_REG instructions may be moved before the conditional jump, which causes them to keep their type PTR_TO_MAP_VALUE_OR_NULL, to which the verifier objects when the register is accessed: 0: (b7) r1 = 10 1: (7b) *(u64 *)(r10 -8) = r1 2: (bf) r2 = r10 3: (07) r2 += -8 4: (18) r1 = 0x59c00000 6: (85) call 1 7: (bf) r4 = r0 8: (15) if r0 == 0x0 goto pc+1 R0=map_value(ks=8,vs=8) R4=map_value_or_null(ks=8,vs=8) R10=fp 9: (7a) *(u64 *)(r4 +0) = 0 R4 invalid mem access 'map_value_or_null' This commit extends the verifier to keep track of all identical PTR_TO_MAP_VALUE_OR_NULL registers after a map_lookup_elem() by assigning them an ID and then marking them all when the conditional jump is observed. Signed-off-by: Thomas Graf Reviewed-by: Josef Bacik Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 61 +++++++++++++++++------- tools/testing/selftests/bpf/test_verifier.c | 72 +++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7035b997aaa5..ac5b393ee6b2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -23,13 +23,13 @@ struct bpf_reg_state { * result in a bad access.
*/ u64 min_value, max_value; + u32 id; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; /* valid when type == PTR_TO_PACKET* */ struct { - u32 id; u16 off; u16 range; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99a7e5b388f2..846d7ceaf202 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -212,9 +212,10 @@ static void print_verifier_state(struct bpf_verifier_state *state) else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL || t == PTR_TO_MAP_VALUE_ADJ) - verbose("(ks=%d,vs=%d)", + verbose("(ks=%d,vs=%d,id=%u)", reg->map_ptr->key_size, - reg->map_ptr->value_size); + reg->map_ptr->value_size, + reg->id); if (reg->min_value != BPF_REGISTER_MIN_RANGE) verbose(",min_value=%llu", (unsigned long long)reg->min_value); @@ -447,6 +448,7 @@ static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; + regs[regno].id = 0; regs[regno].imm = 0; } @@ -1252,6 +1254,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose("unknown return type %d of func %d\n", fn->ret_type, func_id); @@ -1644,8 +1647,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } - regs[insn->dst_reg].type = UNKNOWN_VALUE; - regs[insn->dst_reg].map_ptr = NULL; + mark_reg_unknown_value(regs, insn->dst_reg); } } else { /* case: R = imm @@ -1907,6 +1909,38 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(true_reg); } +static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, + enum bpf_reg_type type) +{ + struct bpf_reg_state *reg = ®s[regno]; + + if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + reg->type = type; + if (type == UNKNOWN_VALUE) + mark_reg_unknown_value(regs, regno); + } +} + +/* The logic is similar to find_good_pkt_pointers(), both could eventually + * be folded together at some point. + */ +static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, + enum bpf_reg_type type) +{ + struct bpf_reg_state *regs = state->regs; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + mark_map_reg(regs, i, regs[regno].id, type); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, + regs[regno].id, type); + } +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -1994,18 +2028,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (opcode == BPF_JEQ) { - /* next fallthrough insn can access memory via - * this register - */ - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - /* branch targer cannot access it, since reg == 0 */ - mark_reg_unknown_value(other_branch->regs, - insn->dst_reg); - } else { - other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - mark_reg_unknown_value(regs, insn->dst_reg); - } + /* Mark all identical map registers in each branch as either + * safe or unknown depending R == 0 or R != 0 conditional. + */ + mark_map_regs(this_branch, insn->dst_reg, + opcode == BPF_JEQ ? 
PTR_TO_MAP_VALUE : UNKNOWN_VALUE); + mark_map_regs(other_branch, insn->dst_reg, + opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index ff5df121b2f6..0ef8eaf6cea7 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2588,6 +2588,78 @@ static struct bpf_test tests[] = { .result_unpriv = REJECT, .result = REJECT, }, + { + "multiple registers share map_lookup_elem result", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "invalid memory access with multiple map_lookup_elem calls", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .result = REJECT, + .errstr = "R4 !read_ok", + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "valid indirect map_lookup_elem access with 2nd lookup in branch", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_2, 10), + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0, 3), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From 2186d9f940b6a04f263a3bacd48f2a7ba96df4cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2016 12:01:27 -0400 Subject: workqueue: move wq_numa_init() to workqueue_init() While splitting up workqueue initialization into two parts, ac8f73400782 ("workqueue: make workqueue available early during boot") put wq_numa_init() into workqueue_init_early(). 
Unfortunately, on some archs including power and arm64, cpu to node mapping isn't yet established by the time the early init is called, leading to incorrect NUMA initialization and subsequently the following oops due to zero cpumask on node-specific unbound pools. Unable to handle kernel paging request for data at address 0x00000038 Faulting instruction address: 0xc0000000000fc0cc Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.8.0-compiler_gcc-6.2.0-next-20161005 #94 task: c0000007f5400000 task.stack: c000001ffc084000 NIP: c0000000000fc0cc LR: c0000000000ed928 CTR: c0000000000fbfd0 REGS: c000001ffc087780 TRAP: 0300 Not tainted (4.8.0-compiler_gcc-6.2.0-next-20161005) MSR: 9000000002009033 CR: 48000424 XER: 00000000 CFAR: c0000000000089dc DAR: 0000000000000038 DSISR: 40000000 SOFTE: 0 GPR00: c0000000000ed928 c000001ffc087a00 c000000000e63200 c000000010d6d600 GPR04: c0000007f5409200 0000000000000021 000000000748e08c 000000000000001f GPR08: 0000000000000000 0000000000000021 000000000748f1f8 0000000000000000 GPR12: 0000000028000422 c00000000fb80000 c00000000000e0c8 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000021 0000000000000001 GPR20: ffffffffafb50401 0000000000000000 c000000010d6d600 000000000000ba7e GPR24: 000000000000ba7e c000000000d8bc58 afb504000afb5041 0000000000000001 GPR28: 0000000000000000 0000000000000004 c0000007f5409280 0000000000000000 NIP [c0000000000fc0cc] enqueue_task_fair+0xfc/0x18b0 LR [c0000000000ed928] activate_task+0x78/0xe0 Call Trace: [c000001ffc087a00] [c0000007f5409200] 0xc0000007f5409200 (unreliable) [c000001ffc087b10] [c0000000000ed928] activate_task+0x78/0xe0 [c000001ffc087b50] [c0000000000ede58] ttwu_do_activate+0x68/0xc0 [c000001ffc087b90] [c0000000000ef1b8] try_to_wake_up+0x208/0x4f0 [c000001ffc087c10] [c0000000000d3484] create_worker+0x144/0x250 [c000001ffc087cb0] [c000000000cd72d0] workqueue_init+0x124/0x150 [c000001ffc087d00] [c000000000cc0e74] kernel_init_freeable+0x158/0x360 [c000001ffc087dc0] [c00000000000e0e4] kernel_init+0x24/0x160 [c000001ffc087e30] [c00000000000bfa0] ret_from_kernel_thread+0x5c/0xbc Instruction dump: 62940401 3b800000 3aa00000 7f17c378 3a600001 3b600001 60000000 60000000 60420000 72490021 ebfe0150 2f890001 419e0de0 7fbee840 419e0e58 ---[ end trace 0000000000000000 ]--- Fix it by moving wq_numa_init() to workqueue_init(). As this means that the early initialization may not have full NUMA info for per-cpu pools and may ignore NUMA affinity for unbound pools, fix them up from workqueue_init() after wq_numa_init().
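
To make the fixup concrete, a minimal sketch of the stale-mapping hazard and the re-derivation idiom the patch applies. This is illustrative only; cached_node and both function names are hypothetical, not kernel code from this patch:

	#include <linux/cpumask.h>
	#include <linux/topology.h>

	static int cached_node[NR_CPUS] __initdata;

	void __init record_nodes_too_early(void)
	{
		int cpu;

		/* On power/arm64 this can still report node 0 for every CPU. */
		for_each_possible_cpu(cpu)
			cached_node[cpu] = cpu_to_node(cpu);
	}

	void __init fix_up_nodes(void)
	{
		int cpu;

		/*
		 * Re-derive once the arch has established the mapping,
		 * mirroring what workqueue_init() now does for pool->node.
		 */
		for_each_possible_cpu(cpu)
			cached_node[cpu] = cpu_to_node(cpu);
	}
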
Signed-off-by: Tejun Heo Reported-by: Michael Ellerman Link: http://lkml.kernel.org/r/87twck5wqo.fsf@concordia.ellerman.id.au Fixes: ac8f73400782 ("workqueue: make workqueue available early during boot") Signed-off-by: Tejun Heo --- kernel/workqueue.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ad0cd439223b..c56479b4884b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5495,8 +5495,6 @@ int __init workqueue_init_early(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - wq_numa_init(); - /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; @@ -5566,9 +5564,32 @@ int __init workqueue_init_early(void) */ int __init workqueue_init(void) { + struct workqueue_struct *wq; struct worker_pool *pool; int cpu, bkt; + /* + * It'd be simpler to initialize NUMA in workqueue_init_early() but + * CPU to node mapping may not be available that early on some + * archs such as power and arm64. As per-cpu pools created + * previously could be missing node hint and unbound pools NUMA + * affinity, fix them up. + */ + wq_numa_init(); + + mutex_lock(&wq_pool_mutex); + + for_each_possible_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + pool->node = cpu_to_node(cpu); + } + } + + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, smp_processor_id(), true); + + mutex_unlock(&wq_pool_mutex); + /* create the initial workers */ for_each_online_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { -- cgit v1.2.3 From 3c3fcb45d524feb5d14a14f332e3eec7f2aff8f3 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 19 Oct 2016 15:10:59 +0100 Subject: sched/fair: Kill the unused 'sched_shares_window_ns' tunable The last user of this tunable was removed in 2012 in commit: 82958366cfea ("sched: Replace update_shares weight distribution with per-entity computation") Delete it since its very existence confuses people. Signed-off-by: Matt Fleming Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161019141059.26408-1-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- include/linux/sched/sysctl.h | 1 - kernel/sched/fair.c | 7 ------- kernel/sysctl.c | 7 ------- 3 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 22db1e63707e..441145351301 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -36,7 +36,6 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d941c97dfbc3..79d464a04417 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -93,13 +93,6 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -/* - * The exponential sliding window over which load is averaged for shares - * distribution. 
- * (default: 10msec) - */ -unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; - #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..739fb17371af 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -347,13 +347,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_shares_window_ns", - .data = &sysctl_sched_shares_window, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", -- cgit v1.2.3 From 97dcaa0fcfd24daa9a36c212c1ad1d5a97759212 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 20 Oct 2016 18:15:16 -0700 Subject: kexec: Export kexec_in_progress to modules The bcm_sf2 driver uses kexec_in_progress to know whether it can power down an integrated PHY during shutdown, and can be built as a module. Other modules may be using this in the future, so export it. Fixes: 2399d6143f85 ("net: dsa: bcm_sf2: Prevent GPHY shutdown for kexec'd kernels") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- kernel/kexec_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 561675589511..786ab85a5452 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -59,6 +59,7 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +EXPORT_SYMBOL_GPL(kexec_in_progress); /* Location of the reserved area for the crash kernel */ -- cgit v1.2.3 From 0ce66d4f70d3bcfb67c89f473e76e7cf0a01288f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 21 Oct 2016 14:21:55 -0700 Subject: Revert "kexec: Export kexec_in_progress to modules" This reverts commit 97dcaa0fcfd24daa9a36c212c1ad1d5a97759212. Based on the review discussion with Eric, we will come up with a different fix for the bcm_sf2 driver which does not make it rely on the kexec_in_progress value. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- kernel/kexec_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 786ab85a5452..561675589511 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -59,7 +59,6 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; -EXPORT_SYMBOL_GPL(kexec_in_progress); /* Location of the reserved area for the crash kernel */ -- cgit v1.2.3 From 2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Oct 2016 12:46:33 +0200 Subject: bpf: add helper for retrieving current numa node id Use case is mainly for soreuseport to select sockets for the local numa node, but since generic, lets also add this for other networking and tracing program types. Suggested-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 2 ++ 6 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c201017b5730..edcd96ded8aa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -319,6 +319,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; +extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f09c70b97eca..374ef582ae18 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -426,6 +426,12 @@ enum bpf_func_id { */ BPF_FUNC_set_hash_invalid, + /** + * bpf_get_numa_node_id() + * Returns the id of the current NUMA node. + */ + BPF_FUNC_get_numa_node_id, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa6d98154106..82a04143368e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1043,6 +1043,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 39918402e6e9..045cbe673356 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -92,6 +93,17 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_numa_node_id) +{ + return numa_node_id(); +} + +const struct bpf_func_proto bpf_get_numa_node_id_proto = { + .func = bpf_get_numa_node_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5dcb99281259..fa77311dadb2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -422,6 +422,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; case BPF_FUNC_probe_write_user: diff --git a/net/core/filter.c b/net/core/filter.c index 00351cdf7d0c..cd9e2ba66b0e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2492,6 +2492,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: -- cgit v1.2.3 From 119a0798dc42ed4c4f96d39b8b676efcea73aec6 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 17 Oct 2016 
12:16:08 +0200 Subject: padata: Remove unused but set variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the unused but set variable pinst in padata_parallel_worker to fix the following warning when building with 'W=1': kernel/padata.c: In function ‘padata_parallel_worker’: kernel/padata.c:68:26: warning: variable ‘pinst’ set but not used [-Wunused-but-set-variable] Also remove the now unused variable pd which is only used to set pinst. Signed-off-by: Tobias Klauser Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 7848f0566403..05316c9f32da 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -64,15 +64,11 @@ static int padata_cpu_hash(struct parallel_data *pd) static void padata_parallel_worker(struct work_struct *parallel_work) { struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); pqueue = container_of(parallel_work, struct padata_parallel_queue, work); - pd = pqueue->pd; - pinst = pd->pinst; spin_lock(&pqueue->parallel.lock); list_replace_init(&pqueue->parallel.list, &local_list); -- cgit v1.2.3 From a225023828038a1aaea876a65313c863ec23fa44 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Oct 2016 15:45:27 +0200 Subject: sched/core: Explain sleep/wakeup in a better way There were a few questions wrt. how sleep-wakeup works. Try and explain it more. Requested-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 52 +++++++++++++++++++++++++++++++++++---------------- kernel/sched/core.c | 17 +++++++++-------- 2 files changed, 45 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..3762fe4e3a80 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -262,20 +262,9 @@ extern char ___assert_task_state[1 - 2*!!( #define set_task_state(tsk, state_value) \ do { \ (tsk)->task_state_change = _THIS_IP_; \ - smp_store_mb((tsk)->state, (state_value)); \ + smp_store_mb((tsk)->state, (state_value)); \ } while (0) -/* - * set_current_state() includes a barrier so that the write of current->state - * is correctly serialised wrt the caller's subsequent test of whether to - * actually sleep: - * - * set_current_state(TASK_UNINTERRUPTIBLE); - * if (do_i_need_to_sleep()) - * schedule(); - * - * If the caller does not need such serialisation then use __set_current_state() - */ #define __set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ @@ -284,11 +273,19 @@ extern char ___assert_task_state[1 - 2*!!( #define set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ - smp_store_mb(current->state, (state_value)); \ + smp_store_mb(current->state, (state_value)); \ } while (0) #else +/* + * @tsk had better be current, or you get to keep the pieces. + * + * The only reason is that computing current can be more expensive than + * using a pointer that's already available. + * + * Therefore, see set_current_state(). 
+ */ #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ @@ -299,11 +296,34 @@ extern char ___assert_task_state[1 - 2*!!( * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * + * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); - * if (do_i_need_to_sleep()) - * schedule(); + * if (!need_sleep) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * If the caller does not need such serialisation (because, for instance, the + * condition test and condition change and wakeup are under the same lock) then + * use __set_current_state(). + * + * The above is typically ordered against the wakeup, which does: + * + * need_sleep = false; + * wake_up_state(p, TASK_UNINTERRUPTIBLE); + * + * Where wake_up_state() (and all other wakeup primitives) imply enough + * barriers to order the store of the variable against wakeup. + * + * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, + * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a + * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). + * + * This is obviously fine, since they both store the exact same value. * - * If the caller does not need such serialisation then use __set_current_state() + * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { current->state = (state_value); } while (0) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..b8c86ba44ca9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1995,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. - * - * Return: %true if @p was woken up, %false if it was already running. - * or @state didn't match @p's state. + * If (@state & @p->state) @p->state = TASK_RUNNING. + * + * If the task was not queued/runnable, also place it back on a runqueue. + * + * Atomic against schedule() which would dequeue a task, also see + * set_current_state(). + * + * Return: %true if @p->state changes (an actual wakeup was done), + * %false otherwise. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) -- cgit v1.2.3 From 3ca0ff571b092ee4d807f1168caa428d95b0173b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 13:36:04 +0200 Subject: locking/mutex: Rework mutex::owner The current mutex implementation has an atomic lock word and a non-atomic owner field. This disparity leads to a number of issues with the current mutex code as it means that we can have a locked mutex without an explicit owner (because the owner field has not been set, or already cleared). This leads to a number of weird corner cases, esp. between the optimistic spinning and debug code. Where the optimistic spinning code needs the owner field updated inside the lock region, the debug code is more relaxed because the whole lock is serialized by the wait_lock. 
Also, the spinning code itself has a few corner cases where we need to deal with a held lock without an owner field. Furthermore, it becomes even more of a problem when trying to fix starvation cases in the current code. We end up stacking special case on special case. To solve this, rework the basic mutex implementation to be a single atomic word that contains the owner and uses the low bits for extra state. This matches how PI futexes and rt_mutex already work. By making the owner an integral part of the lock state, a lot of the problems disappear and we get a better option to deal with starvation cases: direct owner handoff. Changing the basic mutex does, however, invalidate all the arch-specific mutex code; this patch leaves that unused in place, and a later patch will remove it. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mutex-debug.h | 24 --- include/linux/mutex.h | 46 ++++-- kernel/locking/mutex-debug.c | 13 -- kernel/locking/mutex-debug.h | 10 -- kernel/locking/mutex.c | 371 ++++++++++++++++++------------------- kernel/locking/mutex.h | 26 --- kernel/sched/core.c | 2 +- 7 files changed, 187 insertions(+), 305 deletions(-) delete mode 100644 include/linux/mutex-debug.h (limited to 'kernel') diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h deleted file mode 100644 index 4ac8b1977b73..000000000000 --- a/include/linux/mutex-debug.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __LINUX_MUTEX_DEBUG_H -#define __LINUX_MUTEX_DEBUG_H - -#include -#include -#include - -/* - * Mutexes - debugging helpers: - */ - -#define __DEBUG_MUTEX_INITIALIZER(lockname) \ - , .magic = &lockname - -#define mutex_init(mutex) \ -do { \ - static struct lock_class_key __key; \ - \ - __mutex_init((mutex), #mutex, &__key); \ -} while (0) - -extern void mutex_destroy(struct mutex *lock); - -#endif diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2cb7531e7d7a..4d3bccabbea5 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -18,6 +18,7 @@ #include #include #include +#include /* * Simple, straightforward mutexes with strict semantics: @@ -48,16 +49,12 @@ * locks and tasks (and only those tasks) */ struct mutex { - /* 1: unlocked, 0: locked, negative: locked, possible waiters */ - atomic_t count; + atomic_long_t owner; spinlock_t wait_lock; - struct list_head wait_list; -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER) - struct task_struct *owner; -#endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif + struct list_head wait_list; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif @@ -66,6 +63,11 @@ struct mutex { #endif }; +static inline struct task_struct *__mutex_owner(struct mutex *lock) +{ + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x03); +} + /* * This is the control structure for tasks blocked on mutex, * which resides on the blocked task's kernel stack: @@ -79,9 +81,20 @@ struct mutex_waiter { }; #ifdef CONFIG_DEBUG_MUTEXES -# include + +#define __DEBUG_MUTEX_INITIALIZER(lockname) \ + , .magic = &lockname + +extern void mutex_destroy(struct mutex *lock); + +#else + +# define __DEBUG_MUTEX_INITIALIZER(lockname) + +static inline void mutex_destroy(struct mutex *lock) {} + +#endif + /** * mutex_init - initialize the mutex * @mutex: the mutex to be initialized @@ 
-90,14 +103,12 @@ struct mutex_waiter { * * It is not allowed to initialize an already locked mutex. */ -# define mutex_init(mutex) \ -do { \ - static struct lock_class_key __key; \ - \ - __mutex_init((mutex), #mutex, &__key); \ +#define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + __mutex_init((mutex), #mutex, &__key); \ } while (0) -static inline void mutex_destroy(struct mutex *lock) {} -#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ @@ -107,7 +118,7 @@ static inline void mutex_destroy(struct mutex *lock) {} #endif #define __MUTEX_INITIALIZER(lockname) \ - { .count = ATOMIC_INIT(1) \ + { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ __DEBUG_MUTEX_INITIALIZER(lockname) \ @@ -127,7 +138,10 @@ extern void __mutex_init(struct mutex *lock, const char *name, */ static inline int mutex_is_locked(struct mutex *lock) { - return atomic_read(&lock->count) != 1; + /* + * XXX think about spin_is_locked + */ + return __mutex_owner(lock) != NULL; } /* diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 9c951fade415..9aa713629387 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -73,21 +73,8 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic != lock); - - if (!lock->owner) - DEBUG_LOCKS_WARN_ON(!lock->owner); - else - DEBUG_LOCKS_WARN_ON(lock->owner != current); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } - - /* - * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug - * mutexes so that we can do it here after we've verified state. - */ - mutex_clear_owner(lock); - atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 57a871ae3c81..a459faa48987 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -27,16 +27,6 @@ extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} - #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a70b90db3909..de1ce0bae0d5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -27,41 +27,113 @@ #include #include -/* - * In the DEBUG case we are using the "NULL fastpath" for mutexes, - * which forces all calls into the slowpath: - */ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" -# include -/* - * Must be 0 for the debug case so we do not do the unlock outside of the - * wait_lock region. debug_mutex_unlock() will do the actual unlock in this - * case. 
- */ -# undef __mutex_slowpath_needs_to_unlock -# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" -# include #endif void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { - atomic_set(&lock->count, 1); + atomic_long_set(&lock->owner, 0); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); - mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif debug_mutex_init(lock, name, key); } - EXPORT_SYMBOL(__mutex_init); +/* + * @owner: contains: 'struct task_struct *' to the current lock owner, + * NULL means not owned. Since task_struct pointers are aligned at + * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low + * bits to store extra state. + * + * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + */ +#define MUTEX_FLAG_WAITERS 0x01 + +#define MUTEX_FLAGS 0x03 + +static inline struct task_struct *__owner_task(unsigned long owner) +{ + return (struct task_struct *)(owner & ~MUTEX_FLAGS); +} + +static inline unsigned long __owner_flags(unsigned long owner) +{ + return owner & MUTEX_FLAGS; +} + +/* + * Actual trylock that will work on any unlocked state. + */ +static inline bool __mutex_trylock(struct mutex *lock) +{ + unsigned long owner, curr = (unsigned long)current; + + owner = atomic_long_read(&lock->owner); + for (;;) { /* must loop, can race against a flag */ + unsigned long old; + + if (__owner_task(owner)) + return false; + + old = atomic_long_cmpxchg_acquire(&lock->owner, owner, + curr | __owner_flags(owner)); + if (old == owner) + return true; + + owner = old; + } +} + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * Lockdep annotations are contained to the slow paths for simplicity. + * There is nothing that would stop spreading the lockdep annotations outwards + * except more code. + */ + +/* + * Optimistic trylock that only works in the uncontended case. Make sure to + * follow with a __mutex_trylock() before failing. + */ +static __always_inline bool __mutex_trylock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + return true; + + return false; +} + +static __always_inline bool __mutex_unlock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) + return true; + + return false; +} +#endif + +static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_or(flag, &lock->owner); +} + +static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_andnot(flag, &lock->owner); +} + #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * We split the mutex lock/unlock logic into separate fastpath and @@ -69,7 +141,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); +static void __sched __mutex_lock_slowpath(struct mutex *lock); /** * mutex_lock - acquire the mutex @@ -95,14 +167,10 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); void __sched mutex_lock(struct mutex *lock) { might_sleep(); - /* - * The locking fastpath is the 1->0 transition from - * 'unlocked' into 'locked' state. 
- */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); - mutex_set_owner(lock); -} + if (!__mutex_trylock_fast(lock)) + __mutex_lock_slowpath(lock); +} EXPORT_SYMBOL(mutex_lock); #endif @@ -149,9 +217,6 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. - * - * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, - * as the fastpath and opportunistic spinning are disabled in that case. */ static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, @@ -176,7 +241,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, /* * Check if lock is contended, if not there is nobody to wake up */ - if (likely(atomic_read(&lock->base.count) == 0)) + if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) return; /* @@ -227,7 +292,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) bool ret = true; rcu_read_lock(); - while (lock->owner == owner) { + while (__mutex_owner(lock) == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking lock->owner still matches owner. If that fails, @@ -260,26 +325,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) return 0; rcu_read_lock(); - owner = READ_ONCE(lock->owner); + owner = __mutex_owner(lock); if (owner) retval = owner->on_cpu; rcu_read_unlock(); + /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. + * If lock->owner is not set, the mutex has been released. Return true + * such that we'll trylock in the spin path, which is a faster option + * than the blocking slow path. */ return retval; } -/* - * Atomically try to take the lock when it is available - */ -static inline bool mutex_try_to_acquire(struct mutex *lock) -{ - return !mutex_is_locked(lock) && - (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); -} - /* * Optimistic spinning. * @@ -288,13 +346,6 @@ static inline bool mutex_try_to_acquire(struct mutex *lock) * need to reschedule. The rationale is that if the lock owner is * running, it is likely to release the lock soon. * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. - * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - * * The mutex spinners are queued up using MCS lock so that only one * spinner can compete for the mutex. However, if mutex spinning isn't * going to happen, there is no point in going through the lock/unlock @@ -342,35 +393,16 @@ static bool mutex_optimistic_spin(struct mutex *lock, * If there's an owner, wait for it to either * release the lock or go to sleep. */ - owner = READ_ONCE(lock->owner); + owner = __mutex_owner(lock); if (owner && !mutex_spin_on_owner(lock, owner)) break; /* Try to acquire the mutex if it is unlocked. */ - if (mutex_try_to_acquire(lock)) { - lock_acquired(&lock->dep_map, ip); - - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - - ww_mutex_set_context_fastpath(ww, ww_ctx); - } - - mutex_set_owner(lock); + if (__mutex_trylock(lock)) { osq_unlock(&lock->osq); return true; } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. 
If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) - break; - /* * The cpu_relax() call is a compiler barrier which forces * everything in this loop to be re-loaded. We don't need @@ -406,8 +438,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, } #endif -__visible __used noinline -void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip); /** * mutex_unlock - release the mutex @@ -422,21 +453,12 @@ void __sched __mutex_unlock_slowpath(atomic_t *lock_count); */ void __sched mutex_unlock(struct mutex *lock) { - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(lock); +#ifndef CONFIG_DEBUG_LOCK_ALLOC + if (__mutex_unlock_fast(lock)) + return; #endif - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); + __mutex_unlock_slowpath(lock, _RET_IP_); } - EXPORT_SYMBOL(mutex_unlock); /** @@ -465,15 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) lock->ctx = NULL; } -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(&lock->base); -#endif - __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); + mutex_unlock(&lock->base); } EXPORT_SYMBOL(ww_mutex_unlock); @@ -520,20 +534,24 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { /* got the lock, yay! */ + lock_acquired(&lock->dep_map, ip); + if (use_ww_ctx) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } preempt_enable(); return 0; } spin_lock_mutex(&lock->wait_lock, flags); - /* - * Once more, try to acquire the lock. Only try-lock the mutex if - * it is unlocked to reduce unnecessary xchg() operations. + * After waiting to acquire the wait_lock, try again. */ - if (!mutex_is_locked(lock) && - (atomic_xchg_acquire(&lock->count, 0) == 1)) + if (__mutex_trylock(lock)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -543,21 +561,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + if (list_first_entry(&lock->wait_list, struct mutex_waiter, list) == &waiter) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + lock_contended(&lock->dep_map, ip); for (;;) { - /* - * Lets try to take the lock again - this is needed even if - * we get here for the first time (shortly after failing to - * acquire the lock), to make sure that we get a wakeup once - * it's unlocked. Later on, if we sleep, this is the - * operation that gives us the lock. We xchg it to -1, so - * that when we release the lock, we properly wake up the - * other waiters. 
We only attempt the xchg if the count is - * non-negative in order to avoid unnecessary xchg operations: - */ - if (atomic_read(&lock->count) >= 0 && - (atomic_xchg_acquire(&lock->count, -1) == 1)) + if (__mutex_trylock(lock)) break; /* @@ -585,15 +595,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); - /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); + __mutex_clear_flag(lock, MUTEX_FLAG_WAITERS); + debug_mutex_free_waiter(&waiter); skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - mutex_set_owner(lock); if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); @@ -631,7 +640,6 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched @@ -650,7 +658,6 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); static inline int @@ -715,29 +722,22 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); /* * Release the lock, slowpath: */ -static inline void -__mutex_unlock_common_slowpath(struct mutex *lock, int nested) +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { - unsigned long flags; + unsigned long owner, flags; WAKE_Q(wake_q); + mutex_release(&lock->dep_map, 1, ip); + /* - * As a performance measurement, release the lock before doing other - * wakeup related duties to follow. This allows other tasks to acquire - * the lock sooner, while still handling cleanups in past unlock calls. - * This can be done as we do not enforce strict equivalence between the - * mutex counter and wait_list. - * - * - * Some architectures leave the lock unlocked in the fastpath failure - * case, others need to leave it locked. In the later case we have to - * unlock it here - as the lock counter is currently 0 or negative. + * Release the lock before (potentially) taking the spinlock + * such that other contenders can get on with things ASAP. 
*/ - if (__mutex_slowpath_needs_to_unlock()) - atomic_set(&lock->count, 1); + owner = atomic_long_fetch_and_release(MUTEX_FLAGS, &lock->owner); + if (!__owner_flags(owner)) + return; spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { @@ -754,17 +754,6 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) wake_up_q(&wake_q); } -/* - * Release the lock, slowpath: - */ -__visible void -__mutex_unlock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - __mutex_unlock_common_slowpath(lock, 1); -} - #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * Here come the less common (and hence less performance-critical) APIs: @@ -789,38 +778,30 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock); */ int __sched mutex_lock_interruptible(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_interruptible_slowpath(lock); + + return __mutex_lock_interruptible_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_killable_slowpath(lock); + + return __mutex_lock_killable_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_killable); -__visible void __sched -__mutex_lock_slowpath(atomic_t *lock_count) +static noinline void __sched +__mutex_lock_slowpath(struct mutex *lock) { - struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_, NULL, 0); } @@ -856,37 +837,6 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, #endif -/* - * Spinlock based trylock, we take the spinlock and check whether we - * can get the lock: - */ -static inline int __mutex_trylock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - int prev; - - /* No need to trylock if the mutex is locked. 
*/ - if (mutex_is_locked(lock)) - return 0; - - spin_lock_mutex(&lock->wait_lock, flags); - - prev = atomic_xchg_acquire(&lock->count, -1); - if (likely(prev == 1)) { - mutex_set_owner(lock); - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } - - /* Set it back to 0 if there are no waiters: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - return prev == 1; -} - /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -903,13 +853,12 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) */ int __sched mutex_trylock(struct mutex *lock) { - int ret; + bool locked = __mutex_trylock(lock); - ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); - if (ret) - mutex_set_owner(lock); + if (locked) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return ret; + return locked; } EXPORT_SYMBOL(mutex_trylock); @@ -917,36 +866,28 @@ EXPORT_SYMBOL(mutex_trylock); int __sched __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock); int __sched __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_interruptible_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock_interruptible); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 6cd6b8e9efd7..4410a4af42a3 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -16,32 +16,6 @@ #define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * The mutex owner can get read and written to locklessly. - * We should use WRITE_ONCE when writing the owner value to - * avoid store tearing, otherwise, a thread could potentially - * read a partially written and incomplete owner value. 
- */ -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} -#else -static inline void mutex_set_owner(struct mutex *lock) -{ -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ -} -#endif - #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..8912aafd09e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -75,11 +75,11 @@ #include #include #include +#include #include #include #include -#include #ifdef CONFIG_PARAVIRT #include #endif -- cgit v1.2.3 From a3ea3d9b865c2a8f7fe455c7fa26db4b6fd066e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 13:45:15 +0200 Subject: locking/mutex: Allow MUTEX_SPIN_ON_OWNER when DEBUG_MUTEXES Now that mutex::count and mutex::owner are the same field, we can allow SPIN_ON_OWNER while DEBUG_MUTEXES. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index ebdb0043203a..84d882f3e299 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -225,7 +225,7 @@ config ARCH_SUPPORTS_ATOMIC_RMW config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config RWSEM_SPIN_ON_OWNER def_bool y -- cgit v1.2.3 From 9d659ae14b545c4296e812c70493bfdc999b5c1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 14:40:16 +0200 Subject: locking/mutex: Add lock handoff to avoid starvation Implement lock handoff to avoid lock starvation. Lock starvation is possible because mutex_lock() allows lock stealing, where a running (or optimistic spinning) task beats the woken waiter to the acquire. Lock stealing is an important performance optimization because waiting for a waiter to wake up and get runtime can take a significant time, during which everybody would stall on the lock. The down-side is of course that it allows for starvation. This patch has the waiter requesting a handoff if it fails to acquire the lock upon waking. This re-introduces some of the wait time, because once we do a handoff we have to wait for the waiter to wake up again. A future patch will add a round of optimistic spinning to attempt to alleviate this penalty, but if that turns out to not be enough, we can add a counter and only request handoff after multiple failed wakeups. There are a few tricky implementation details: - accepting a handoff must only be done in the wait-loop. Since the handoff condition is owner == current, it can easily cause recursive locking trouble. - accepting the handoff must be careful to provide the ACQUIRE semantics. - having the HANDOFF bit set on unlock requires care: we must not clear the owner. - we must be careful to not leave HANDOFF set after we've acquired the lock. The tricky scenario is setting the HANDOFF bit on an unlocked mutex. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 142 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 119 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index de1ce0bae0d5..b4ebd8b9bcd5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -54,8 +54,10 @@ EXPORT_SYMBOL(__mutex_init); * bits to store extra state. * * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + * Bit1 indicates unlock needs to hand the lock to the top-waiter */ #define MUTEX_FLAG_WAITERS 0x01 +#define MUTEX_FLAG_HANDOFF 0x02 #define MUTEX_FLAGS 0x03 @@ -71,20 +73,48 @@ static inline unsigned long __owner_flags(unsigned long owner) /* * Actual trylock that will work on any unlocked state. + * + * When setting the owner field, we must preserve the low flag bits. + * + * Be careful with @handoff, only set that in a wait-loop (where you set + * HANDOFF) to avoid recursive lock attempts. */ -static inline bool __mutex_trylock(struct mutex *lock) +static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ - unsigned long old; + unsigned long old, flags = __owner_flags(owner); + + if (__owner_task(owner)) { + if (handoff && unlikely(__owner_task(owner) == current)) { + /* + * Provide ACQUIRE semantics for the lock-handoff. + * + * We cannot easily use load-acquire here, since + * the actual load is a failed cmpxchg, which + * doesn't imply any barriers. + * + * Also, this is a fairly unlikely scenario, and + * this contains the cost. + */ + smp_mb(); /* ACQUIRE */ + return true; + } - if (__owner_task(owner)) return false; + } + + /* + * We set the HANDOFF bit, we must make sure it doesn't live + * past the point where we acquire it. This would be possible + * if we (accidentally) set the bit on an unlocked mutex. + */ + if (handoff) + flags &= ~MUTEX_FLAG_HANDOFF; - old = atomic_long_cmpxchg_acquire(&lock->owner, owner, - curr | __owner_flags(owner)); + old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); if (old == owner) return true; @@ -134,6 +164,39 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) atomic_long_andnot(flag, &lock->owner); } +static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter) +{ + return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; +} + +/* + * Give up ownership to a specific task, when @task = NULL, this is equivalent + * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE + * semantics like a regular unlock, the __mutex_trylock() provides matching + * ACQUIRE semantics for the handoff. 
+ */ +static void __mutex_handoff(struct mutex *lock, struct task_struct *task) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + for (;;) { + unsigned long old, new; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + new = (owner & MUTEX_FLAG_WAITERS); + new |= (unsigned long)task; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, new); + if (old == owner) + break; + + owner = old; + } +} + #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * We split the mutex lock/unlock logic into separate fastpath and @@ -398,7 +461,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, break; /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock)) { + if (__mutex_trylock(lock, false)) { osq_unlock(&lock->osq); return true; } @@ -523,6 +586,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; + bool first = false; int ret; if (use_ww_ctx) { @@ -534,7 +598,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + if (__mutex_trylock(lock, false) || + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (use_ww_ctx) { @@ -551,7 +616,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* * After waiting to acquire the wait_lock, try again. */ - if (__mutex_trylock(lock)) + if (__mutex_trylock(lock, false)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -561,13 +626,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - if (list_first_entry(&lock->wait_list, struct mutex_waiter, list) == &waiter) + if (__mutex_waiter_is_first(lock, &waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); lock_contended(&lock->dep_map, ip); for (;;) { - if (__mutex_trylock(lock)) + if (__mutex_trylock(lock, first)) break; /* @@ -586,17 +651,20 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, state); - - /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); + + if (!first && __mutex_waiter_is_first(lock, &waiter)) { + first = true; + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); + } } __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); if (likely(list_empty(&lock->wait_list))) - __mutex_clear_flag(lock, MUTEX_FLAG_WAITERS); + __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_free_waiter(&waiter); @@ -724,33 +792,61 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); */ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { + struct task_struct *next = NULL; unsigned long owner, flags; WAKE_Q(wake_q); mutex_release(&lock->dep_map, 1, ip); /* - * Release the lock before (potentially) taking the spinlock - * such that other contenders can get on with things ASAP. + * Release the lock before (potentially) taking the spinlock such that + * other contenders can get on with things ASAP. + * + * Except when HANDOFF, in that case we must not clear the owner field, + * but instead set it to the top waiter. 
*/ - owner = atomic_long_fetch_and_release(MUTEX_FLAGS, &lock->owner); - if (!__owner_flags(owner)) - return; + owner = atomic_long_read(&lock->owner); + for (;;) { + unsigned long old; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + if (owner & MUTEX_FLAG_HANDOFF) + break; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, + __owner_flags(owner)); + if (old == owner) { + if (owner & MUTEX_FLAG_WAITERS) + break; + + return; + } + + owner = old; + } spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = - list_entry(lock->wait_list.next, - struct mutex_waiter, list); + list_first_entry(&lock->wait_list, + struct mutex_waiter, list); + + next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - wake_q_add(&wake_q, waiter->task); + wake_q_add(&wake_q, next); } + if (owner & MUTEX_FLAG_HANDOFF) + __mutex_handoff(lock, next); + spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } @@ -853,7 +949,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock); + bool locked = __mutex_trylock(lock, false); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); -- cgit v1.2.3 From 5bbd7e644378334700889fb762d5893985a7311f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2016 13:42:12 +0200 Subject: locking/mutex: Restructure wait loop Doesn't really matter yet, but pull the HANDOFF and trylock out from under the wait_lock. The intention is to add an optimistic spin loop here, which requires we do not hold the wait_lock, so shuffle code around in preparation. Also clarify the purpose of taking the wait_lock in the wait loop: it's tempting to want to avoid it altogether, but the cancellation cases need it to avoid losing wakeups. Suggested-by: Waiman Long Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b4ebd8b9bcd5..8bb2304bb78d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -631,13 +631,21 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, lock_contended(&lock->dep_map, ip); + set_task_state(task, state); for (;;) { + /* + * Once we hold wait_lock, we're serialized against + * mutex_unlock() handing the lock off to us, do a trylock + * before testing the error conditions to make sure we pick up + * the handoff. + */ if (__mutex_trylock(lock, first)) - break; + goto acquired; /* - * got a signal? (This code gets eliminated in the - * TASK_UNINTERRUPTIBLE case.) + * Check for signals and wound conditions while holding + * wait_lock. This ensures the lock cancellation is ordered + * against mutex_unlock() and wake-ups do not go missing.
*/ if (unlikely(signal_pending_state(state, task))) { ret = -EINTR; @@ -650,16 +658,27 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - __set_task_state(task, state); spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); - spin_lock_mutex(&lock->wait_lock, flags); if (!first && __mutex_waiter_is_first(lock, &waiter)) { first = true; __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } + + set_task_state(task, state); + /* + * Here we order against unlock; we must either see it change + * state back to RUNNING and fall through the next schedule(), + * or we must see its unlock and acquire. + */ + if (__mutex_trylock(lock, first)) + break; + + spin_lock_mutex(&lock->wait_lock, flags); } + spin_lock_mutex(&lock->wait_lock, flags); +acquired: __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); @@ -682,6 +701,7 @@ skip_wait: return 0; err: + __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); -- cgit v1.2.3 From a40ca56577f628eb3f7af22b484e95edfdd047a2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 26 Aug 2016 19:35:08 -0400 Subject: locking/mutex: Simplify some ww_mutex code in __mutex_lock_common() This patch removes some of the redundant ww_mutex code in __mutex_lock_common(). Tested-by: Jason Low Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Ding Tianhong Cc: Imre Deak Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/1472254509-27508-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 8bb2304bb78d..6c0d3040e4dc 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -587,10 +587,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct mutex_waiter waiter; unsigned long flags; bool first = false; + struct ww_mutex *ww; int ret; if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + ww = container_of(lock, struct ww_mutex, base); if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; } @@ -602,12 +603,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - + if (use_ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); - } preempt_enable(); return 0; } @@ -691,10 +688,8 @@ skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (use_ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); - } spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable(); -- cgit v1.2.3 From b341afb325eb390f707a82cbefd65cda887302ab Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 26 Aug 2016 19:35:09 -0400 Subject: locking/mutex: Enable optimistic spinning of woken waiter This patch makes the waiter that sets the HANDOFF flag start spinning instead of sleeping until the handoff is complete or the owner sleeps. 
Otherwise, the handoff will cause the optimistic spinners to abort spinning as the handed-off owner may not be running. Tested-by: Jason Low Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Ding Tianhong Cc: Imre Deak Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/1472254509-27508-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 77 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 6c0d3040e4dc..17a88e929e6a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -416,24 +416,39 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) * * Returns true when the lock was taken, otherwise false, indicating * that we need to jump to the slowpath and sleep. + * + * The waiter flag is set to true if the spinner is a waiter in the wait + * queue. The waiter-spinner will spin on the lock directly and concurrently + * with the spinner at the head of the OSQ, if present, until the owner is + * changed to itself. */ static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { struct task_struct *task = current; - if (!mutex_can_spin_on_owner(lock)) - goto done; + if (!waiter) { + /* + * The purpose of the mutex_can_spin_on_owner() function is + * to eliminate the overhead of osq_lock() and osq_unlock() + * in case spinning isn't possible. As a waiter-spinner + * is not going to take OSQ lock anyway, there is no need + * to call mutex_can_spin_on_owner(). + */ + if (!mutex_can_spin_on_owner(lock)) + goto fail; - /* - * In order to avoid a stampede of mutex spinners trying to - * acquire the mutex all at once, the spinners need to take a - * MCS (queued) lock first before spinning on the owner field. - */ - if (!osq_lock(&lock->osq)) - goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ + if (!osq_lock(&lock->osq)) + goto fail; + } - while (true) { + for (;;) { struct task_struct *owner; if (use_ww_ctx && ww_ctx->acquired > 0) { @@ -449,7 +464,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * performed the optimistic spinning cannot be done. */ if (READ_ONCE(ww->ctx)) - break; + goto fail_unlock; } /* @@ -457,15 +472,20 @@ static bool mutex_optimistic_spin(struct mutex *lock, * release the lock or go to sleep. */ owner = __mutex_owner(lock); - if (owner && !mutex_spin_on_owner(lock, owner)) - break; + if (owner) { + if (waiter && owner == task) { + smp_mb(); /* ACQUIRE */ + break; + } - /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock, false)) { - osq_unlock(&lock->osq); - return true; + if (!mutex_spin_on_owner(lock, owner)) + goto fail_unlock; } + /* Try to acquire the mutex if it is unlocked. */ + if (__mutex_trylock(lock, waiter)) + break; + /* * The cpu_relax() call is a compiler barrier which forces * everything in this loop to be re-loaded. 
We don't need @@ -475,8 +495,17 @@ static bool mutex_optimistic_spin(struct mutex *lock, cpu_relax_lowlatency(); } - osq_unlock(&lock->osq); -done: + if (!waiter) + osq_unlock(&lock->osq); + + return true; + + +fail_unlock: + if (!waiter) + osq_unlock(&lock->osq); + +fail: /* * If we fell out of the spin path because of need_resched(), * reschedule now, before we try-lock the mutex. This avoids getting @@ -495,7 +524,8 @@ done: } #else static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { return false; } @@ -600,7 +630,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); if (__mutex_trylock(lock, false) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (use_ww_ctx) @@ -669,7 +699,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if (__mutex_trylock(lock, first)) + if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) || + __mutex_trylock(lock, first)) break; spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From 0ee1dd9f5e7eae4e55f95935b72d4beecb03de9c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Oct 2016 09:51:13 -0500 Subject: x86/dumpstack: Remove raw stack dump For mostly historical reasons, the x86 oops dump shows the raw stack values: ... [registers] Stack: ffff880079af7350 ffff880079905400 0000000000000000 ffffc900008f3ae0 ffffffffa0196610 0000000000000001 00010000ffffffff 0000000087654321 0000000000000002 0000000000000000 0000000000000000 0000000000000000 Call Trace: ... This seems to be an artifact from long ago, and probably isn't needed anymore. It generally just adds noise to the dump, and it can be actively harmful because it leaks kernel addresses. Linus says: "The stack dump actually goes back to forever, and it used to be useful back in 1992 or so. But it used to be useful mainly because stacks were simpler and we didn't have very good call traces anyway. I definitely remember having used them - I just do not remember having used them in the last ten+ years. Of course, it's still true that if you can trigger an oops, you've likely already lost the security game, but since the stack dump is so useless, let's aim to just remove it and make games like the above harder." This also removes the related 'kstack=' cmdline option and the 'kstack_depth_to_print' sysctl. Suggested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e83bd50df52d8fe88e94d2566426ae40d813bf8f.1477405374.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 3 -- Documentation/sysctl/kernel.txt | 8 ----- Documentation/x86/x86_64/boot-options.txt | 4 --- arch/x86/include/asm/stacktrace.h | 5 --- arch/x86/kernel/dumpstack.c | 21 ++---------- arch/x86/kernel/dumpstack_32.c | 33 +------------------ arch/x86/kernel/dumpstack_64.c | 53 +------------------------------ kernel/sysctl.c | 7 ---- 8 files changed, 4 insertions(+), 130 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 37babf91f2cb..049a9172ed22 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1958,9 +1958,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. kmemcheck=2 (one-shot mode) Default: 2 (one-shot mode) - kstack=N [X86] Print N words from the kernel stack - in oops dumps. - kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. Default is 0 (don't ignore, but inject #GP) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index ffab8b5caa60..065f18478c1c 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -40,7 +40,6 @@ show up in /proc/sys/kernel: - hung_task_warnings - kexec_load_disabled - kptr_restrict -- kstack_depth_to_print [ X86 only ] - l2cr [ PPC only ] - modprobe ==> Documentation/debugging-modules.txt - modules_disabled @@ -395,13 +394,6 @@ When kptr_restrict is set to (2), kernel pointers printed using ============================================================== -kstack_depth_to_print: (X86 only) - -Controls the number of words to print when dumping the raw -kernel stack. - -============================================================== - l2cr: (PPC only) This flag controls the L2 cache of G3 processor boards. If diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 0965a71f9942..61b611e9eeaf 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -277,10 +277,6 @@ IOMMU (input/output memory management unit) space might stop working. Use this option if you have devices that are accessed from userspace directly on some PCI host bridge. -Debugging - - kstack=N Print N words from the kernel stack in oops dumps. 
- Miscellaneous nogbpages diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 37f2e0b377ad..1e375b05cfa8 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -43,8 +43,6 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) addr + len > begin && addr + len <= end); } -extern int kstack_depth_to_print; - #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else @@ -86,9 +84,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl); -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl); - extern unsigned int code_bytes; /* The form of the top of the frame on the stack */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f967652500fa..499aa6f0fde5 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,7 +22,6 @@ int panic_on_unrecovered_nmi; int panic_on_io_nmi; unsigned int code_bytes = 64; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; bool in_task_stack(unsigned long *stack, struct task_struct *task, @@ -171,12 +170,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, KERN_DEFAULT); + show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -295,22 +294,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init kstack_setup(char *s) -{ - ssize_t ret; - unsigned long val; - - if (!s) - return -EINVAL; - - ret = kstrtoul(s, 0, &val); - if (ret) - return ret; - kstack_depth_to_print = val; - return 0; -} -early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { ssize_t ret; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 06eb322b5f9f..90cf460d50bd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -121,36 +121,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - sp = sp ? 
: get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %08lx", log_lvl, *stack++); - } else - pr_cont(" %08lx", *stack++); - touch_nmi_watchdog(); - } - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - - void show_regs(struct pt_regs *regs) { int i; @@ -168,8 +138,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - pr_emerg("Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_EMERG); + show_trace_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 36cf1a498227..310abf4542dc 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -140,56 +140,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *irq_stack_end; - unsigned long *irq_stack; - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); - irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - - sp = sp ? : get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - unsigned long word; - - if (stack >= irq_stack && stack <= irq_stack_end) { - if (stack == irq_stack_end) { - stack = (unsigned long *) (irq_stack_end[-1]); - pr_cont(" "); - } - } else { - if (kstack_end(stack)) - break; - } - - if (probe_kernel_address(stack, word)) - break; - - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %016lx", log_lvl, word); - } else - pr_cont(" %016lx", word); - - stack++; - touch_nmi_watchdog(); - } - - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - void show_regs(struct pt_regs *regs) { int i; @@ -207,8 +157,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..17a5a8253294 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -989,13 +989,6 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, - { - .procname = "kstack_depth_to_print", - .data = &kstack_depth_to_print, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "io_delay_type", .data = &io_delay_type, -- cgit v1.2.3 From ca7dfdbb33675151ad0854aea11340f95c38aff3 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 26 Oct 2016 16:37:53 +1100 Subject: kernel/smp: Define pr_fmt() for smp.c This makes all our pr_xxx()'s start with "smp: ", which helps pin down where they come from and generally looks nice. There is actually only one pr_xxx() use in smp.c at the moment, but we will add some more in the next commit. 
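For readers unfamiliar with the pr_fmt() convention, a minimal userspace sketch of the mechanism (illustrative only; this is not the kernel's printk machinery, and "smp" merely stands in for KBUILD_MODNAME):

	#include <stdio.h>

	/*
	 * pr_fmt() is expanded at every pr_*() call site, so defining it
	 * once near the top of a file prefixes all messages from that file.
	 */
	#define pr_fmt(fmt) "smp" ": " fmt
	#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

	int main(void)
	{
		pr_info("Brought up %d node%s, %d CPU%s\n", 2, "s", 1, "");
		/* prints: smp: Brought up 2 nodes, 1 CPU */
		return 0;
	}
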
Suggested-by: Borislav Petkov Signed-off-by: Michael Ellerman Cc: akpm@osdl.org Cc: jgross@suse.com Cc: ak@linux.intel.com Cc: tim.c.chen@linux.intel.com Cc: len.brown@intel.com Cc: peterz@infradead.org Cc: richard@nod.at Cc: jolsa@redhat.com Cc: boris.ostrovsky@oracle.com Cc: mgorman@techsingularity.net Link: http://lkml.kernel.org/r/1477460275-8266-1-git-send-email-mpe@ellerman.id.au Signed-off-by: Thomas Gleixner --- kernel/smp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index bba3b201668d..2d1f15d43022 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -3,6 +3,9 @@ * * (C) Jens Axboe 2008 */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -- cgit v1.2.3 From 92b23278298304f72bbc786a737f2646f4b9aa9d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 26 Oct 2016 16:37:54 +1100 Subject: kernel/smp: Make the SMP boot message common on all arches Currently after bringing up secondary CPUs all arches print "Brought up %d CPUs". On x86 they also print the number of nodes that were brought online. It would be nice to also print the number of nodes on other arches. Although we could override smp_announce() on the other ~10 NUMA aware arches, it seems simpler to just always print the number of nodes. On non-NUMA arches there is just always 1 node. Having done that, smp_announce() is no longer weak, and seems small enough to just pull directly into smp_init(). Also update the printing of "%d CPUs" to be smart when an SMP kernel is booted on a single CPU system, or when only one CPU is available, eg: smp: Brought up 2 nodes, 1 CPU Signed-off-by: Michael Ellerman Reviewed-by: Borislav Petkov Cc: akpm@osdl.org Cc: jgross@suse.com Cc: ak@linux.intel.com Cc: tim.c.chen@linux.intel.com Cc: len.brown@intel.com Cc: peterz@infradead.org Cc: richard@nod.at Cc: jolsa@redhat.com Cc: boris.ostrovsky@oracle.com Cc: mgorman@techsingularity.net Link: http://lkml.kernel.org/r/1477460275-8266-2-git-send-email-mpe@ellerman.id.au Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 8 -------- kernel/smp.c | 13 +++++++------ 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 42f5eb7b4f6c..b9f02383f372 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -821,14 +821,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -void smp_announce(void) -{ - int num_nodes = num_online_nodes(); - - printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n", - num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus()); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { diff --git a/kernel/smp.c b/kernel/smp.c index 2d1f15d43022..4323c5db7d26 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -546,14 +546,10 @@ void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -void __weak smp_announce(void) -{ - printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); -} - /* Called by boot processor to activate the rest. */ void __init smp_init(void) { + int num_nodes, num_cpus; unsigned int cpu; idle_threads_init(); @@ -567,8 +563,13 @@ void __init smp_init(void) cpu_up(cpu); } + num_nodes = num_online_nodes(); + num_cpus = num_online_cpus(); + pr_info("Brought up %d node%s, %d CPU%s\n", + num_nodes, (num_nodes > 1 ? 
"s" : ""), + num_cpus, (num_cpus > 1 ? "s" : "")); + /* Any cleanup work */ - smp_announce(); smp_cpus_done(setup_max_cpus); } -- cgit v1.2.3 From 51111dce2509506d16efd321939895ff7ffe1dc2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 26 Oct 2016 16:37:55 +1100 Subject: kernel/smp: Tell the user we're bringing up secondary CPUs Currently we don't print anything before starting to bring up secondary CPUs. This can be confusing if it takes a long time to bring up the secondaries, or if the kernel crashes while doing so and produces no further output. On x86 they work around this by detecting when the first secondary CPU comes up and printing a message (see announce_cpu()). But doing it in smp_init() is simpler and works for all arches. Signed-off-by: Michael Ellerman Reviewed-by: Borislav Petkov Cc: akpm@osdl.org Cc: jgross@suse.com Cc: ak@linux.intel.com Cc: tim.c.chen@linux.intel.com Cc: len.brown@intel.com Cc: peterz@infradead.org Cc: richard@nod.at Cc: jolsa@redhat.com Cc: boris.ostrovsky@oracle.com Cc: mgorman@techsingularity.net Link: http://lkml.kernel.org/r/1477460275-8266-3-git-send-email-mpe@ellerman.id.au Signed-off-by: Thomas Gleixner --- kernel/smp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4323c5db7d26..77fcdb9f2775 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -555,6 +555,8 @@ void __init smp_init(void) idle_threads_init(); cpuhp_threads_init(); + pr_info("Bringing up secondary CPUs ...\n"); + /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { if (num_online_cpus() >= setup_max_cpus) -- cgit v1.2.3 From 6c5e9059692567740a4ee51530dffe51a4b9584d Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 21 Oct 2016 08:58:50 -0700 Subject: timers: Fix usleep_range() in the context of wake_up_process() Users of usleep_range() expect that it will _never_ return in less time than the minimum passed parameter. However, nothing in the code ensures this, when the sleeping task is woken by wake_up_process() or any other mechanism which can wake a task from uninterruptible state. Neither usleep_range() nor schedule_hrtimeout_range*() have any protection against wakeups. schedule_hrtimeout_range*() is designed this way despite the fact that the API documentation does not mention it. msleep() already has code to handle this case since it will loop as long as there was still time left. usleep_range() has no such loop, add it. Presumably this problem was not detected before because usleep_range() is only used in a few places and the function is mostly used in contexts which are not exposed to wakeups of any form. An effort was made to look for users relying on the old behavior by looking for usleep_range() in the same file as wake_up_process(). No problems were found by this search, though it is conceivable that someone could have put the sleep and wakeup in two different files. An effort was made to ask several upstream maintainers if they were aware of people relying on wake_up_process() to wake up usleep_range(). No maintainers were aware of that but they were aware of many people relying on usleep_range() never returning before the minimum. 
Reported-by: Tao Huang Signed-off-by: Douglas Anderson Cc: heiko@sntech.de Cc: broonie@kernel.org Cc: briannorris@chromium.org Cc: Andreas Mohr Cc: linux-rockchip@lists.infradead.org Cc: tony.xie@rock-chips.com Cc: John Stultz Cc: djkurtz@chromium.org Cc: linux@roeck-us.net Cc: tskd08@gmail.com Link: http://lkml.kernel.org/r/1477065531-30342-1-git-send-email-dianders@chromium.org Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d47980a1bc4..12681c9a7683 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1896,16 +1896,6 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); -static void __sched do_usleep_range(unsigned long min, unsigned long max) -{ - ktime_t kmin; - u64 delta; - - kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (u64)(max - min) * NSEC_PER_USEC; - schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); -} - /** * usleep_range - Sleep for an approximate time * @min: Minimum time in usecs to sleep @@ -1919,7 +1909,14 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) */ void __sched usleep_range(unsigned long min, unsigned long max) { - __set_current_state(TASK_UNINTERRUPTIBLE); - do_usleep_range(min, max); + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + for (;;) { + __set_current_state(TASK_UNINTERRUPTIBLE); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } } EXPORT_SYMBOL(usleep_range); -- cgit v1.2.3 From 4b7e9cf9c84b09adc428e0433cd376b91f9c52a7 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 21 Oct 2016 08:58:51 -0700 Subject: timers: Fix documentation for schedule_timeout() and similar The documentation for schedule_timeout(), schedule_hrtimeout(), and schedule_hrtimeout_range() all claim that the routines couldn't possibly return early if the task state was TASK_UNINTERRUPTIBLE. This is simply not true since wake_up_process() will cause those routines to exit early. We cannot make schedule_[hr]timeout() loop until the timeout expires if the task state is uninterruptible because we have users which rely on the existing and designed behaviour. Make the documentation match the (correct) implementation. schedule_hrtimeout() returns -EINTR even when an uninterruptible task was woken up. This might look strange, but making the return code depend on the state is too much of an effort as it would affect all the call sites. There is no value in doing so, but we spell it out clearly in the documentation.
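Taken together, the two patches above document and enforce one rule: the wait primitive may return early on an explicit wakeup, so only the caller (or a wrapper such as msleep() or the fixed usleep_range()) can guarantee a minimum delay, by looping against an absolute deadline. A userspace analogue of that loop, offered as a sketch rather than kernel code (clock_nanosleep() plays the role of schedule_hrtimeout_range(), and a signal plays the role of wake_up_process()):

	#include <errno.h>
	#include <time.h>

	/*
	 * Sleep at least min_us microseconds, re-waiting after any early
	 * wakeup. TIMER_ABSTIME keeps every retry aimed at the original
	 * expiry, so wakeups cannot shorten the total sleep.
	 */
	static void sleep_at_least_us(unsigned long min_us)
	{
		struct timespec deadline;

		clock_gettime(CLOCK_MONOTONIC, &deadline);
		deadline.tv_sec += min_us / 1000000;
		deadline.tv_nsec += (long)(min_us % 1000000) * 1000;
		if (deadline.tv_nsec >= 1000000000) {
			deadline.tv_sec++;
			deadline.tv_nsec -= 1000000000;
		}

		while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
				       &deadline, NULL) == EINTR)
			; /* woken early: not done yet, go back to sleep */
	}
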
Suggested-by: Daniel Kurtz Signed-off-by: Douglas Anderson Cc: huangtao@rock-chips.com Cc: heiko@sntech.de Cc: broonie@kernel.org Cc: briannorris@chromium.org Cc: Andreas Mohr Cc: linux-rockchip@lists.infradead.org Cc: tony.xie@rock-chips.com Cc: John Stultz Cc: linux@roeck-us.net Cc: tskd08@gmail.com Link: http://lkml.kernel.org/r/1477065531-30342-2-git-send-email-dianders@chromium.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 20 ++++++++++++++------ kernel/time/timer.c | 11 +++++++---- 2 files changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bb5ec425dfe0..08be5c99d26b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1742,15 +1742,19 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) @@ -1772,15 +1776,19 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 12681c9a7683..88aab86a4594 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1691,11 +1691,12 @@ static void process_timeout(unsigned long __data) * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process())". * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time + * delivered to the current task or the current task is explicitly woken + * up. 
* * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. @@ -1704,7 +1705,9 @@ static void process_timeout(unsigned long __data) * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * - * In all cases the return value is guaranteed to be non-negative. + * Returns 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. In all cases the return value is guaranteed + * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { -- cgit v1.2.3 From a07ea4d9941af5a0c6f0be2a71b51ac9c083c5e5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:02 +0200 Subject: genetlink: no longer support using static family IDs Static family IDs have never really been used, the only use case was the workaround I introduced for those users that assumed their family ID was also their multicast group ID. Additionally, because static family IDs would never be reserved by the generic netlink code, using a relatively low ID would only work for built-in families that can be registered immediately after generic netlink is started, which is basically only the control family (apart from the workaround code, which I also had to add code for so it would reserve those IDs) Thus, anything other than GENL_ID_GENERATE is flawed and luckily not used except in the cases I mentioned. Move those workarounds into a few lines of code, and then get rid of GENL_ID_GENERATE entirely, making it more robust. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- drivers/acpi/event.c | 1 - drivers/net/gtp.c | 1 - drivers/net/macsec.c | 1 - drivers/net/team/team.c | 1 - drivers/net/wireless/mac80211_hwsim.c | 1 - drivers/scsi/pmcraid.c | 6 ------ drivers/target/target_core_user.c | 1 - drivers/thermal/thermal_core.c | 1 - fs/dlm/netlink.c | 1 - fs/quota/netlink.c | 7 ------- include/linux/genl_magic_func.h | 1 - include/net/genetlink.h | 7 ++----- include/uapi/linux/genetlink.h | 1 - kernel/taskstats.c | 1 - net/batman-adv/netlink.c | 1 - net/core/devlink.c | 1 - net/core/drop_monitor.c | 1 - net/hsr/hsr_netlink.c | 1 - net/ieee802154/netlink.c | 1 - net/ieee802154/nl802154.c | 1 - net/ipv4/fou.c | 1 - net/ipv4/tcp_metrics.c | 1 - net/ipv6/ila/ila_xlat.c | 1 - net/irda/irnetlink.c | 1 - net/l2tp/l2tp_netlink.c | 1 - net/netfilter/ipvs/ip_vs_ctl.c | 1 - net/netlabel/netlabel_calipso.c | 1 - net/netlabel/netlabel_cipso_v4.c | 1 - net/netlabel/netlabel_mgmt.c | 1 - net/netlabel/netlabel_unlabeled.c | 1 - net/netlink/genetlink.c | 37 +++++++++++++++++++++-------------- net/nfc/netlink.c | 1 - net/openvswitch/datapath.c | 4 ---- net/tipc/netlink.c | 1 - net/tipc/netlink_compat.c | 1 - net/wimax/stack.c | 1 - net/wireless/nl80211.c | 1 - 37 files changed, 24 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index e24ea4e796e4..8dfca3d53131 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -83,7 +83,6 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = { }; static struct genl_family acpi_event_genl_family = { - .id = GENL_ID_GENERATE, .name = ACPI_GENL_FAMILY_NAME, .version = ACPI_GENL_VERSION, .maxattr = ACPI_GENL_ATTR_MAX, diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 97e0cbca0a08..f66737ba1299 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1095,7 +1095,6 @@ static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info) } static struct genl_family 
gtp_genl_family = { - .id = GENL_ID_GENERATE, .name = "gtp", .version = 0, .hdrsize = 0, diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 1a134cb2d52c..a5309b81a786 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1422,7 +1422,6 @@ static void clear_tx_sa(struct macsec_tx_sa *tx_sa) } static struct genl_family macsec_fam = { - .id = GENL_ID_GENERATE, .name = MACSEC_GENL_NAME, .hdrsize = 0, .version = MACSEC_GENL_VERSION, diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index a380649bf6b5..0b50205764ff 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2151,7 +2151,6 @@ static struct rtnl_link_ops team_link_ops __read_mostly = { ***********************************/ static struct genl_family team_nl_family = { - .id = GENL_ID_GENERATE, .name = TEAM_GENL_NAME, .version = TEAM_GENL_VERSION, .maxattr = TEAM_ATTR_MAX, diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index e95b79bccf9b..54b6cd62676e 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -589,7 +589,6 @@ struct hwsim_radiotap_ack_hdr { /* MAC80211_HWSIM netlinf family */ static struct genl_family hwsim_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "MAC80211_HWSIM", .version = 1, diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index 68a5c347fae9..cc50eb87b28a 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -1369,12 +1369,6 @@ static struct genl_multicast_group pmcraid_mcgrps[] = { }; static struct genl_family pmcraid_event_family = { - /* - * Due to prior multicast group abuse (the code having assumed that - * the family ID can be used as a multicast group ID) we need to - * statically allocate a family (and thus group) ID. 
- */ - .id = GENL_ID_PMCRAID, .name = "pmcraid", .version = 1, .maxattr = PMCRAID_AEN_ATTR_MAX, diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 62bf4fe5704a..313a0ef3cda7 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -148,7 +148,6 @@ static const struct genl_multicast_group tcmu_mcgrps[] = { /* Our generic netlink family */ static struct genl_family tcmu_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "TCM-USER", .version = 1, diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 226b0b4aced6..68d7503f6417 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -2164,7 +2164,6 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = { }; static struct genl_family thermal_event_genl_family = { - .id = GENL_ID_GENERATE, .name = THERMAL_GENL_FAMILY_NAME, .version = THERMAL_GENL_VERSION, .maxattr = THERMAL_GENL_ATTR_MAX, diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 1e6e227134d7..00d226956264 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -17,7 +17,6 @@ static uint32_t dlm_nl_seqnum; static uint32_t listener_nlportid; static struct genl_family family = { - .id = GENL_ID_GENERATE, .name = DLM_GENL_NAME, .version = DLM_GENL_VERSION, }; diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 8b252673d454..3965a5cdfaa2 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -13,13 +13,6 @@ static const struct genl_multicast_group quota_mcgrps[] = { /* Netlink family structure for quota */ static struct genl_family quota_genl_family = { - /* - * Needed due to multicast group ID abuse - old code assumed - * the family ID was also a valid multicast group ID (which - * isn't true) and userspace might thus rely on it. Assign a - * static ID for this group to make dealing with that easier. - */ - .id = GENL_ID_VFS_DQUOT, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 667c31101b8b..7c070c1fe457 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -260,7 +260,6 @@ static struct genl_ops ZZZ_genl_ops[] __read_mostly = { */ #define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) static struct genl_family ZZZ_genl_family __read_mostly = { - .id = GENL_ID_GENERATE, .name = __stringify(GENL_MAGIC_FAMILY), .version = GENL_MAGIC_VERSION, #ifdef GENL_MAGIC_FAMILY_HDRSZ diff --git a/include/net/genetlink.h b/include/net/genetlink.h index ef9defb3f5bc..43a5c3975a2f 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -20,7 +20,7 @@ struct genl_info; /** * struct genl_family - generic netlink family - * @id: protocol family idenfitier + * @id: protocol family identifier (private) * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version @@ -48,7 +48,7 @@ struct genl_info; * @n_ops: number of operations supported by this family (private) */ struct genl_family { - unsigned int id; + unsigned int id; /* private */ unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; @@ -149,9 +149,6 @@ static inline int genl_register_family(struct genl_family *family) * Registers the specified family and operations from the specified table. * Only one family may be registered with the same family name or identifier. * - * The family id may equal GENL_ID_GENERATE causing an unique id to - * be automatically generated and assigned. 
- * * Either a doit or dumpit callback must be specified for every registered * operation or the function will fail. Only one operation structure per * command identifier may be registered. diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 5512c90af7e3..d9b2db4a29c6 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -26,7 +26,6 @@ struct genlmsghdr { /* * List of reserved static generic netlink identifiers: */ -#define GENL_ID_GENERATE 0 #define GENL_ID_CTRL NLMSG_MIN_TYPE #define GENL_ID_VFS_DQUOT (NLMSG_MIN_TYPE + 1) #define GENL_ID_PMCRAID (NLMSG_MIN_TYPE + 2) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..d7a1a9461a10 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -42,7 +42,6 @@ static int family_registered; struct kmem_cache *taskstats_cache; static struct genl_family family = { - .id = GENL_ID_GENERATE, .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, .maxattr = TASKSTATS_CMD_ATTR_MAX, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 64cb6acbe0a6..a03b0ed7e8dd 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -49,7 +49,6 @@ #include "translation-table.h" struct genl_family batadv_netlink_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, diff --git a/net/core/devlink.c b/net/core/devlink.c index d2fd736de6a2..3008d9c33875 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -342,7 +342,6 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, } static struct genl_family devlink_nl_family = { - .id = GENL_ID_GENERATE, .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, .maxattr = DEVLINK_ATTR_MAX, diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 72cfb0c61125..a5320dfcd978 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -60,7 +60,6 @@ struct dm_hw_stat_delta { }; static struct genl_family net_drop_monitor_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "NET_DM", .version = 2, diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index d4d1617f43a8..2ad039492bee 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -132,7 +132,6 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = { }; static struct genl_family hsr_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "HSR", .version = 1, diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index c8133c07ceee..19144158b696 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -29,7 +29,6 @@ static unsigned int ieee802154_seq_num; static DEFINE_SPINLOCK(ieee802154_seq_lock); struct genl_family nl802154_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = IEEE802154_NL_NAME, .version = 1, diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 21aabadccd0e..182299858f1d 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -34,7 +34,6 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, /* the netlink family */ static struct genl_family nl802154_fam = { - .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ .name = NL802154_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index cf50f7e2b012..e3fc527c5d37 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -623,7 +623,6 @@ static int 
fou_destroy(struct net *net, struct fou_cfg *cfg) } static struct genl_family fou_nl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index bf1f3b2b29d1..3da305127b32 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -743,7 +743,6 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, } static struct genl_family tcp_metrics_nl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index e604013dd814..0d57e27d1cdd 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -119,7 +119,6 @@ static const struct rhashtable_params rht_params = { }; static struct genl_family ila_nl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = ILA_GENL_NAME, .version = ILA_GENL_VERSION, diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c index e15c40e86660..f23b81aa91fe 100644 --- a/net/irda/irnetlink.c +++ b/net/irda/irnetlink.c @@ -25,7 +25,6 @@ static struct genl_family irda_nl_family = { - .id = GENL_ID_GENERATE, .name = IRDA_NL_NAME, .hdrsize = 0, .version = IRDA_NL_VERSION, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index bf3117771822..4fbf1f41ac52 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -32,7 +32,6 @@ static struct genl_family l2tp_nl_family = { - .id = GENL_ID_GENERATE, .name = L2TP_GENL_NAME, .version = L2TP_GENL_VERSION, .hdrsize = 0, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c3c809b2e712..ceed66cdd03e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2841,7 +2841,6 @@ static struct nf_sockopt_ops ip_vs_sockopts = { /* IPVS genetlink family */ static struct genl_family ip_vs_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 2ec93c5e77bb..152e503b8c5d 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -61,7 +61,6 @@ struct netlbl_domhsh_walk_arg { /* NetLabel Generic NETLINK CALIPSO family */ static struct genl_family netlbl_calipso_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_CALIPSO_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 7fd1104ba900..755b284e7ad4 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -60,7 +60,6 @@ struct netlbl_domhsh_walk_arg { /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_cipsov4_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_CIPSOV4_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index f85d0e07af2d..3b00f2368fcd 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -61,7 +61,6 @@ struct netlbl_domhsh_walk_arg { /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_mgmt_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_MGMT_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 4528cff9138b..c2ea8d1f653a 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ 
b/net/netlabel/netlabel_unlabeled.c @@ -124,7 +124,6 @@ static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 01291b7a27bb..f19ec969edee 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -349,8 +349,6 @@ static int genl_validate_ops(const struct genl_family *family) * * Registers the specified family after validating it first. Only one * family may be registered with the same family name or identifier. - * The family id may equal GENL_ID_GENERATE causing an unique id to - * be automatically generated and assigned. * * The family's ops array must already be assigned, you can use the * genl_register_family_with_ops() helper function. @@ -359,13 +357,7 @@ static int genl_validate_ops(const struct genl_family *family) */ int __genl_register_family(struct genl_family *family) { - int err = -EINVAL, i; - - if (family->id && family->id < GENL_MIN_ID) - goto errout; - - if (family->id > GENL_MAX_ID) - goto errout; + int err, i; err = genl_validate_ops(family); if (err) @@ -378,8 +370,27 @@ int __genl_register_family(struct genl_family *family) goto errout_locked; } - if (family->id == GENL_ID_GENERATE) { - u16 newid = genl_generate_id(); + if (family == &genl_ctrl) { + family->id = GENL_ID_CTRL; + } else { + u16 newid; + + /* this should be left zero in the struct */ + WARN_ON(family->id); + + /* + * Sadly, a few cases need to be special-cased + * due to them having previously abused the API + * and having used their family ID also as their + * multicast group ID, so we use reserved IDs + * for both to be sure we can do that mapping. 
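+ * Only the "pmcraid" and "VFS_DQUOT" names are mapped to
+ * their old reserved IDs; any other family falls through
+ * to genl_generate_id() below.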
+ */ + if (strcmp(family->name, "pmcraid") == 0) + newid = GENL_ID_PMCRAID; + else if (strcmp(family->name, "VFS_DQUOT") == 0) + newid = GENL_ID_VFS_DQUOT; + else + newid = genl_generate_id(); if (!newid) { err = -ENOMEM; @@ -387,9 +398,6 @@ int __genl_register_family(struct genl_family *family) } family->id = newid; - } else if (genl_family_find_byid(family->id)) { - err = -EEXIST; - goto errout_locked; } if (family->maxattr && !family->parallel_ops) { @@ -419,7 +427,6 @@ int __genl_register_family(struct genl_family *family) errout_locked: genl_unlock_all(); -errout: return err; } EXPORT_SYMBOL(__genl_register_family); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 79786bf62b88..c230403e066c 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -39,7 +39,6 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = { }; static struct genl_family nfc_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 194435aa1165..f9fef7dfba15 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -671,7 +671,6 @@ static const struct genl_ops dp_packet_genl_ops[] = { }; static struct genl_family dp_packet_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, @@ -1436,7 +1435,6 @@ static const struct genl_ops dp_flow_genl_ops[] = { }; static struct genl_family dp_flow_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, @@ -1822,7 +1820,6 @@ static const struct genl_ops dp_datapath_genl_ops[] = { }; static struct genl_family dp_datapath_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, @@ -2244,7 +2241,6 @@ static const struct genl_ops dp_vport_genl_ops[] = { }; struct genl_family dp_vport_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 4b94f3cfe3af..383b8fedabc7 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -136,7 +136,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { * so we have a separate genl handling for the new API. 
*/ struct genl_family tipc_genl_family = { - .id = GENL_ID_GENERATE, .name = TIPC_GENL_V2_NAME, .version = TIPC_GENL_V2_VERSION, .hdrsize = 0, diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 1fd464764765..f04428e4c8e5 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1216,7 +1216,6 @@ send: } static struct genl_family tipc_genl_compat_family = { - .id = GENL_ID_GENERATE, .name = TIPC_GENL_NAME, .version = TIPC_GENL_VERSION, .hdrsize = TIPC_GENL_HDRLEN, diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 3f816e2971ee..8ac83a41585f 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -573,7 +573,6 @@ size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); struct genl_family wimax_gnl_family = { - .id = GENL_ID_GENERATE, .name = "WiMAX", .version = WIMAX_GNL_VERSION, .hdrsize = 0, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7d8cb3330c86..714beafe05e0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -39,7 +39,6 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, /* the netlink family */ static struct genl_family nl80211_fam = { - .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ .name = NL80211_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ -- cgit v1.2.3 From 489111e5c25b93be80340c3113d71903d7c82136 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:03 +0200 Subject: genetlink: statically initialize families Instead of providing macros/inline functions to initialize the families, make all users initialize them statically and get rid of the macros. This reduces the kernel code size by about 1.6k on x86-64 (with allyesconfig). Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller --- drivers/acpi/event.c | 1 + drivers/net/gtp.c | 21 +++++++---- drivers/net/macsec.c | 21 +++++++---- drivers/net/team/team.c | 22 +++++++---- drivers/net/wireless/mac80211_hwsim.c | 26 +++++++------ drivers/scsi/pmcraid.c | 1 + drivers/target/target_core_user.c | 1 + drivers/thermal/thermal_core.c | 1 + fs/dlm/netlink.c | 15 +++++--- fs/quota/netlink.c | 1 + include/linux/drbd_genl.h | 2 +- include/linux/genl_magic_func.h | 28 ++++++++------ include/net/genetlink.h | 71 ++++++----------------------------- kernel/taskstats.c | 17 ++++++--- net/batman-adv/netlink.c | 25 +++++++----- net/core/devlink.c | 27 +++++++------ net/core/drop_monitor.c | 20 ++++++---- net/hsr/hsr_netlink.c | 22 +++++++---- net/ieee802154/netlink.c | 23 +++++++----- net/ieee802154/nl802154.c | 34 ++++++++--------- net/ipv4/fou.c | 22 ++++++----- net/ipv4/tcp_metrics.c | 22 ++++++----- net/ipv6/ila/ila_xlat.c | 24 +++++++----- net/irda/irnetlink.c | 19 ++++++---- net/l2tp/l2tp_netlink.c | 25 +++++++----- net/netfilter/ipvs/ip_vs_ctl.c | 22 ++++++----- net/netlabel/netlabel_calipso.c | 20 ++++++---- net/netlabel/netlabel_cipso_v4.c | 21 ++++++----- net/netlabel/netlabel_mgmt.c | 20 ++++++---- net/netlabel/netlabel_unlabeled.c | 20 ++++++---- net/netlink/genetlink.c | 35 +++++++++-------- net/nfc/netlink.c | 24 +++++++----- net/openvswitch/datapath.c | 4 ++ net/tipc/netlink.c | 22 ++++++----- net/tipc/netlink_compat.c | 20 +++++----- net/wimax/stack.c | 19 +++++----- net/wireless/nl80211.c | 33 ++++++++-------- 37 files changed, 414 insertions(+), 337 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index 8dfca3d53131..1ab12ad7d5ba 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -83,6 +83,7 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = { }; static struct genl_family acpi_event_genl_family = { + .module = THIS_MODULE, .name = ACPI_GENL_FAMILY_NAME, .version = ACPI_GENL_VERSION, .maxattr = ACPI_GENL_ATTR_MAX, diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index f66737ba1299..0604fd78f826 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1094,13 +1094,7 @@ static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info) return 0; } -static struct genl_family gtp_genl_family = { - .name = "gtp", - .version = 0, - .hdrsize = 0, - .maxattr = GTPA_MAX, - .netnsok = true, -}; +static struct genl_family gtp_genl_family; static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, u32 type, struct pdp_ctx *pctx) @@ -1296,6 +1290,17 @@ static const struct genl_ops gtp_genl_ops[] = { }, }; +static struct genl_family gtp_genl_family = { + .name = "gtp", + .version = 0, + .hdrsize = 0, + .maxattr = GTPA_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = gtp_genl_ops, + .n_ops = ARRAY_SIZE(gtp_genl_ops), +}; + static int __net_init gtp_net_init(struct net *net) { struct gtp_net *gn = net_generic(net, gtp_net_id); @@ -1335,7 +1340,7 @@ static int __init gtp_init(void) if (err < 0) goto error_out; - err = genl_register_family_with_ops(>p_genl_family, gtp_genl_ops); + err = genl_register_family(>p_genl_family); if (err < 0) goto unreg_rtnl_link; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index a5309b81a786..63ca7a3c77cf 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1421,13 +1421,7 @@ static void clear_tx_sa(struct macsec_tx_sa *tx_sa) macsec_txsa_put(tx_sa); } -static struct genl_family macsec_fam = { - .name = MACSEC_GENL_NAME, - .hdrsize = 0, - .version = 
MACSEC_GENL_VERSION, - .maxattr = MACSEC_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family macsec_fam; static struct net_device *get_dev_from_nl(struct net *net, struct nlattr **attrs) @@ -2654,6 +2648,17 @@ static const struct genl_ops macsec_genl_ops[] = { }, }; +static struct genl_family macsec_fam = { + .name = MACSEC_GENL_NAME, + .hdrsize = 0, + .version = MACSEC_GENL_VERSION, + .maxattr = MACSEC_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = macsec_genl_ops, + .n_ops = ARRAY_SIZE(macsec_genl_ops), +}; + static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -3461,7 +3466,7 @@ static int __init macsec_init(void) if (err) goto notifier; - err = genl_register_family_with_ops(&macsec_fam, macsec_genl_ops); + err = genl_register_family(&macsec_fam); if (err) goto rtnl; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 0b50205764ff..46bf7c1216c0 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2150,12 +2150,7 @@ static struct rtnl_link_ops team_link_ops __read_mostly = { * Generic netlink custom interface ***********************************/ -static struct genl_family team_nl_family = { - .name = TEAM_GENL_NAME, - .version = TEAM_GENL_VERSION, - .maxattr = TEAM_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family team_nl_family; static const struct nla_policy team_nl_policy[TEAM_ATTR_MAX + 1] = { [TEAM_ATTR_UNSPEC] = { .type = NLA_UNSPEC, }, @@ -2745,6 +2740,18 @@ static const struct genl_multicast_group team_nl_mcgrps[] = { { .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, }, }; +static struct genl_family team_nl_family = { + .name = TEAM_GENL_NAME, + .version = TEAM_GENL_VERSION, + .maxattr = TEAM_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = team_nl_ops, + .n_ops = ARRAY_SIZE(team_nl_ops), + .mcgrps = team_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(team_nl_mcgrps), +}; + static int team_nl_send_multicast(struct sk_buff *skb, struct team *team, u32 portid) { @@ -2768,8 +2775,7 @@ static int team_nl_send_event_port_get(struct team *team, static int team_nl_init(void) { - return genl_register_family_with_ops_groups(&team_nl_family, team_nl_ops, - team_nl_mcgrps); + return genl_register_family(&team_nl_family); } static void team_nl_fini(void) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 54b6cd62676e..5d4637e586e8 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -587,14 +587,8 @@ struct hwsim_radiotap_ack_hdr { __le16 rt_chbitmask; } __packed; -/* MAC80211_HWSIM netlinf family */ -static struct genl_family hwsim_genl_family = { - .hdrsize = 0, - .name = "MAC80211_HWSIM", - .version = 1, - .maxattr = HWSIM_ATTR_MAX, - .netnsok = true, -}; +/* MAC80211_HWSIM netlink family */ +static struct genl_family hwsim_genl_family; enum hwsim_multicast_groups { HWSIM_MCGRP_CONFIG, @@ -3234,6 +3228,18 @@ static const struct genl_ops hwsim_ops[] = { }, }; +static struct genl_family hwsim_genl_family = { + .name = "MAC80211_HWSIM", + .version = 1, + .maxattr = HWSIM_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = hwsim_ops, + .n_ops = ARRAY_SIZE(hwsim_ops), + .mcgrps = hwsim_mcgrps, + .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps), +}; + static void destroy_radio(struct work_struct *work) { struct mac80211_hwsim_data *data = @@ -3287,9 +3293,7 @@ static int hwsim_init_netlink(void) printk(KERN_INFO "mac80211_hwsim: initializing netlink\n"); - rc = 
genl_register_family_with_ops_groups(&hwsim_genl_family, - hwsim_ops, - hwsim_mcgrps); + rc = genl_register_family(&hwsim_genl_family); if (rc) goto failure; diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index cc50eb87b28a..c0ab7bb8c3ce 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -1369,6 +1369,7 @@ static struct genl_multicast_group pmcraid_mcgrps[] = { }; static struct genl_family pmcraid_event_family = { + .module = THIS_MODULE, .name = "pmcraid", .version = 1, .maxattr = PMCRAID_AEN_ATTR_MAX, diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 313a0ef3cda7..3483372f5562 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -148,6 +148,7 @@ static const struct genl_multicast_group tcmu_mcgrps[] = { /* Our generic netlink family */ static struct genl_family tcmu_genl_family = { + .module = THIS_MODULE, .hdrsize = 0, .name = "TCM-USER", .version = 1, diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 68d7503f6417..93b6caab2d9f 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -2164,6 +2164,7 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = { }; static struct genl_family thermal_event_genl_family = { + .module = THIS_MODULE, .name = THERMAL_GENL_FAMILY_NAME, .version = THERMAL_GENL_VERSION, .maxattr = THERMAL_GENL_ATTR_MAX, diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 00d226956264..04042d69573c 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -16,10 +16,7 @@ static uint32_t dlm_nl_seqnum; static uint32_t listener_nlportid; -static struct genl_family family = { - .name = DLM_GENL_NAME, - .version = DLM_GENL_VERSION, -}; +static struct genl_family family; static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) { @@ -75,9 +72,17 @@ static struct genl_ops dlm_nl_ops[] = { }, }; +static struct genl_family family = { + .name = DLM_GENL_NAME, + .version = DLM_GENL_VERSION, + .ops = dlm_nl_ops, + .n_ops = ARRAY_SIZE(dlm_nl_ops), + .module = THIS_MODULE, +}; + int __init dlm_netlink_init(void) { - return genl_register_family_with_ops(&family, dlm_nl_ops); + return genl_register_family(&family); } void dlm_netlink_exit(void) diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 3965a5cdfaa2..9457c7b0dfa2 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -13,6 +13,7 @@ static const struct genl_multicast_group quota_mcgrps[] = { /* Netlink family structure for quota */ static struct genl_family quota_genl_family = { + .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index c934d3a96b5e..2896f93808ae 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -67,7 +67,7 @@ * genl_magic_func.h * generates an entry in the static genl_ops array, * and static register/unregister functions to - * genl_register_family_with_ops(). + * genl_register_family(). 
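+ * The ops and multicast group arrays are now referenced
+ * from the family struct itself rather than being passed
+ * to the registration call.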
* * flags and handler: * GENL_op_init( .doit = x, .dumpit = y, .flags = something) diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 7c070c1fe457..40c2e39362c8 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -259,15 +259,7 @@ static struct genl_ops ZZZ_genl_ops[] __read_mostly = { * {{{2 */ #define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) -static struct genl_family ZZZ_genl_family __read_mostly = { - .name = __stringify(GENL_MAGIC_FAMILY), - .version = GENL_MAGIC_VERSION, -#ifdef GENL_MAGIC_FAMILY_HDRSZ - .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), -#endif - .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, -}; - +static struct genl_family ZZZ_genl_family; /* * Magic: define multicast groups * Magic: define multicast group registration helper @@ -301,11 +293,23 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ #undef GENL_mc_group #define GENL_mc_group(group) +static struct genl_family ZZZ_genl_family __read_mostly = { + .name = __stringify(GENL_MAGIC_FAMILY), + .version = GENL_MAGIC_VERSION, +#ifdef GENL_MAGIC_FAMILY_HDRSZ + .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), +#endif + .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, + .ops = ZZZ_genl_ops, + .n_ops = ARRAY_SIZE(ZZZ_genl_ops), + .mcgrps = ZZZ_genl_mcgrps, + .n_mcgrps = ARRAY_SIZE(ZZZ_genl_mcgrps), + .module = THIS_MODULE, +}; + int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void) { - return genl_register_family_with_ops_groups(&ZZZ_genl_family, \ - ZZZ_genl_ops, \ - ZZZ_genl_mcgrps); + return genl_register_family(&ZZZ_genl_family); } void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 43a5c3975a2f..2298b50cee34 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -39,13 +39,14 @@ struct genl_info; * Note that unbind() will not be called symmetrically if the * generic netlink family is removed while there are still open * sockets. 
- * @attrbuf: buffer to store parsed attributes - * @family_list: family list - * @mcgrps: multicast groups used by this family (private) - * @n_mcgrps: number of multicast groups (private) + * @attrbuf: buffer to store parsed attributes (private) + * @family_list: family list (private) + * @mcgrps: multicast groups used by this family + * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family - * @ops: the operations supported by this family (private) - * @n_ops: number of operations supported by this family (private) + * (private) + * @ops: the operations supported by this family + * @n_ops: number of operations supported by this family */ struct genl_family { unsigned int id; /* private */ @@ -64,10 +65,10 @@ struct genl_family { int (*mcast_bind)(struct net *net, int group); void (*mcast_unbind)(struct net *net, int group); struct nlattr ** attrbuf; /* private */ - const struct genl_ops * ops; /* private */ - const struct genl_multicast_group *mcgrps; /* private */ - unsigned int n_ops; /* private */ - unsigned int n_mcgrps; /* private */ + const struct genl_ops * ops; + const struct genl_multicast_group *mcgrps; + unsigned int n_ops; + unsigned int n_mcgrps; unsigned int mcgrp_offset; /* private */ struct list_head family_list; /* private */ struct module *module; @@ -132,55 +133,7 @@ struct genl_ops { u8 flags; }; -int __genl_register_family(struct genl_family *family); - -static inline int genl_register_family(struct genl_family *family) -{ - family->module = THIS_MODULE; - return __genl_register_family(family); -} - -/** - * genl_register_family_with_ops - register a generic netlink family with ops - * @family: generic netlink family - * @ops: operations to be registered - * @n_ops: number of elements to register - * - * Registers the specified family and operations from the specified table. - * Only one family may be registered with the same family name or identifier. - * - * Either a doit or dumpit callback must be specified for every registered - * operation or the function will fail. Only one operation structure per - * command identifier may be registered. - * - * See include/net/genetlink.h for more documenation on the operations - * structure. - * - * Return 0 on success or a negative error code. 
- */ -static inline int -_genl_register_family_with_ops_grps(struct genl_family *family, - const struct genl_ops *ops, size_t n_ops, - const struct genl_multicast_group *mcgrps, - size_t n_mcgrps) -{ - family->module = THIS_MODULE; - family->ops = ops; - family->n_ops = n_ops; - family->mcgrps = mcgrps; - family->n_mcgrps = n_mcgrps; - return __genl_register_family(family); -} - -#define genl_register_family_with_ops(family, ops) \ - _genl_register_family_with_ops_grps((family), \ - (ops), ARRAY_SIZE(ops), \ - NULL, 0) -#define genl_register_family_with_ops_groups(family, ops, grps) \ - _genl_register_family_with_ops_grps((family), \ - (ops), ARRAY_SIZE(ops), \ - (grps), ARRAY_SIZE(grps)) - +int genl_register_family(struct genl_family *family); int genl_unregister_family(struct genl_family *family); void genl_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d7a1a9461a10..4075ece592f2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -41,11 +41,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum); static int family_registered; struct kmem_cache *taskstats_cache; -static struct genl_family family = { - .name = TASKSTATS_GENL_NAME, - .version = TASKSTATS_GENL_VERSION, - .maxattr = TASKSTATS_CMD_ATTR_MAX, -}; +static struct genl_family family; static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, @@ -650,6 +646,15 @@ static const struct genl_ops taskstats_ops[] = { }, }; +static struct genl_family family = { + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, + .module = THIS_MODULE, + .ops = taskstats_ops, + .n_ops = ARRAY_SIZE(taskstats_ops), +}; + /* Needed early in initialization */ void __init taskstats_init_early(void) { @@ -666,7 +671,7 @@ static int __init taskstats_init(void) { int rc; - rc = genl_register_family_with_ops(&family, taskstats_ops); + rc = genl_register_family(&family); if (rc) return rc; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a03b0ed7e8dd..e28cec34a016 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -48,13 +48,7 @@ #include "tp_meter.h" #include "translation-table.h" -struct genl_family batadv_netlink_family = { - .hdrsize = 0, - .name = BATADV_NL_NAME, - .version = 1, - .maxattr = BATADV_ATTR_MAX, - .netnsok = true, -}; +struct genl_family batadv_netlink_family; /* multicast groups */ enum batadv_netlink_multicast_groups { @@ -609,6 +603,19 @@ static struct genl_ops batadv_netlink_ops[] = { }; +struct genl_family batadv_netlink_family = { + .hdrsize = 0, + .name = BATADV_NL_NAME, + .version = 1, + .maxattr = BATADV_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = batadv_netlink_ops, + .n_ops = ARRAY_SIZE(batadv_netlink_ops), + .mcgrps = batadv_netlink_mcgrps, + .n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps), +}; + /** * batadv_netlink_register - register batadv genl netlink family */ @@ -616,9 +623,7 @@ void __init batadv_netlink_register(void) { int ret; - ret = genl_register_family_with_ops_groups(&batadv_netlink_family, - batadv_netlink_ops, - batadv_netlink_mcgrps); + ret = genl_register_family(&batadv_netlink_family); if (ret) pr_warn("unable to register netlink family"); } diff --git a/net/core/devlink.c b/net/core/devlink.c index 3008d9c33875..063da8091aef 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -341,14 +341,7 @@ static 
void devlink_nl_post_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); } -static struct genl_family devlink_nl_family = { - .name = DEVLINK_GENL_NAME, - .version = DEVLINK_GENL_VERSION, - .maxattr = DEVLINK_ATTR_MAX, - .netnsok = true, - .pre_doit = devlink_nl_pre_doit, - .post_doit = devlink_nl_post_doit, -}; +static struct genl_family devlink_nl_family; enum devlink_multicast_groups { DEVLINK_MCGRP_CONFIG, @@ -1619,6 +1612,20 @@ static const struct genl_ops devlink_nl_ops[] = { }, }; +static struct genl_family devlink_nl_family = { + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .netnsok = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, + .module = THIS_MODULE, + .ops = devlink_nl_ops, + .n_ops = ARRAY_SIZE(devlink_nl_ops), + .mcgrps = devlink_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), +}; + /** * devlink_alloc - Allocate new devlink instance resources * @@ -1841,9 +1848,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister); static int __init devlink_module_init(void) { - return genl_register_family_with_ops_groups(&devlink_nl_family, - devlink_nl_ops, - devlink_nl_mcgrps); + return genl_register_family(&devlink_nl_family); } static void __exit devlink_module_exit(void) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index a5320dfcd978..80c002794ff6 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -59,11 +59,7 @@ struct dm_hw_stat_delta { unsigned long last_drop_val; }; -static struct genl_family net_drop_monitor_family = { - .hdrsize = 0, - .name = "NET_DM", - .version = 2, -}; +static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); @@ -350,6 +346,17 @@ static const struct genl_ops dropmon_ops[] = { }, }; +static struct genl_family net_drop_monitor_family = { + .hdrsize = 0, + .name = "NET_DM", + .version = 2, + .module = THIS_MODULE, + .ops = dropmon_ops, + .n_ops = ARRAY_SIZE(dropmon_ops), + .mcgrps = dropmon_mcgrps, + .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), +}; + static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; @@ -366,8 +373,7 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family_with_ops_groups(&net_drop_monitor_family, - dropmon_ops, dropmon_mcgrps); + rc = genl_register_family(&net_drop_monitor_family); if (rc) { pr_err("Could not create drop monitor netlink family\n"); return rc; diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 2ad039492bee..aab34c7f6f89 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -131,12 +131,7 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = { [HSR_A_IF2_SEQ] = { .type = NLA_U16 }, }; -static struct genl_family hsr_genl_family = { - .hdrsize = 0, - .name = "HSR", - .version = 1, - .maxattr = HSR_A_MAX, -}; +static struct genl_family hsr_genl_family; static const struct genl_multicast_group hsr_mcgrps[] = { { .name = "hsr-network", }, @@ -466,6 +461,18 @@ static const struct genl_ops hsr_ops[] = { }, }; +static struct genl_family hsr_genl_family = { + .hdrsize = 0, + .name = "HSR", + .version = 1, + .maxattr = HSR_A_MAX, + .module = THIS_MODULE, + .ops = hsr_ops, + .n_ops = ARRAY_SIZE(hsr_ops), + .mcgrps = hsr_mcgrps, + .n_mcgrps = ARRAY_SIZE(hsr_mcgrps), +}; + int __init hsr_netlink_init(void) { int rc; @@ -474,8 +481,7 @@ int __init hsr_netlink_init(void) if (rc) goto fail_rtnl_link_register; - rc = 
genl_register_family_with_ops_groups(&hsr_genl_family, hsr_ops, - hsr_mcgrps); + rc = genl_register_family(&hsr_genl_family); if (rc) goto fail_genl_register_family; diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 19144158b696..08e62470bac2 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -28,13 +28,6 @@ static unsigned int ieee802154_seq_num; static DEFINE_SPINLOCK(ieee802154_seq_lock); -struct genl_family nl802154_family = { - .hdrsize = 0, - .name = IEEE802154_NL_NAME, - .version = 1, - .maxattr = IEEE802154_ATTR_MAX, -}; - /* Requests to userspace */ struct sk_buff *ieee802154_nl_create(int flags, u8 req) { @@ -138,11 +131,21 @@ static const struct genl_multicast_group ieee802154_mcgrps[] = { [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, }, }; +struct genl_family nl802154_family = { + .hdrsize = 0, + .name = IEEE802154_NL_NAME, + .version = 1, + .maxattr = IEEE802154_ATTR_MAX, + .module = THIS_MODULE, + .ops = ieee8021154_ops, + .n_ops = ARRAY_SIZE(ieee8021154_ops), + .mcgrps = ieee802154_mcgrps, + .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps), +}; + int __init ieee802154_nl_init(void) { - return genl_register_family_with_ops_groups(&nl802154_family, - ieee8021154_ops, - ieee802154_mcgrps); + return genl_register_family(&nl802154_family); } void ieee802154_nl_exit(void) diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 182299858f1d..f7e75578aedd 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -26,22 +26,8 @@ #include "rdev-ops.h" #include "core.h" -static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info); - -static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info); - /* the netlink family */ -static struct genl_family nl802154_fam = { - .name = NL802154_GENL_NAME, /* have users key off the name instead */ - .hdrsize = 0, /* no private header */ - .version = 1, /* no particular meaning now */ - .maxattr = NL802154_ATTR_MAX, - .netnsok = true, - .pre_doit = nl802154_pre_doit, - .post_doit = nl802154_post_doit, -}; +static struct genl_family nl802154_fam; /* multicast groups */ enum nl802154_multicast_groups { @@ -2476,11 +2462,25 @@ static const struct genl_ops nl802154_ops[] = { #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; +static struct genl_family nl802154_fam = { + .name = NL802154_GENL_NAME, /* have users key off the name instead */ + .hdrsize = 0, /* no private header */ + .version = 1, /* no particular meaning now */ + .maxattr = NL802154_ATTR_MAX, + .netnsok = true, + .pre_doit = nl802154_pre_doit, + .post_doit = nl802154_post_doit, + .module = THIS_MODULE, + .ops = nl802154_ops, + .n_ops = ARRAY_SIZE(nl802154_ops), + .mcgrps = nl802154_mcgrps, + .n_mcgrps = ARRAY_SIZE(nl802154_mcgrps), +}; + /* initialisation/exit functions */ int nl802154_init(void) { - return genl_register_family_with_ops_groups(&nl802154_fam, nl802154_ops, - nl802154_mcgrps); + return genl_register_family(&nl802154_fam); } void nl802154_exit(void) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index e3fc527c5d37..5b5226a2434f 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -622,13 +622,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) return err; } -static struct genl_family fou_nl_family = { - .hdrsize = 0, - .name = FOU_GENL_NAME, - .version = FOU_GENL_VERSION, - .maxattr = FOU_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family fou_nl_family; static const struct 
nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_PORT] = { .type = NLA_U16, }, @@ -830,6 +824,17 @@ static const struct genl_ops fou_nl_ops[] = { }, }; +static struct genl_family fou_nl_family = { + .hdrsize = 0, + .name = FOU_GENL_NAME, + .version = FOU_GENL_VERSION, + .maxattr = FOU_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = fou_nl_ops, + .n_ops = ARRAY_SIZE(fou_nl_ops), +}; + size_t fou_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct udphdr); @@ -1085,8 +1090,7 @@ static int __init fou_init(void) if (ret) goto exit; - ret = genl_register_family_with_ops(&fou_nl_family, - fou_nl_ops); + ret = genl_register_family(&fou_nl_family); if (ret < 0) goto unregister; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 3da305127b32..bba3c72c4a39 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -742,13 +742,7 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, rcu_read_unlock(); } -static struct genl_family tcp_metrics_nl_family = { - .hdrsize = 0, - .name = TCP_METRICS_GENL_NAME, - .version = TCP_METRICS_GENL_VERSION, - .maxattr = TCP_METRICS_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family tcp_metrics_nl_family; static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, @@ -1115,6 +1109,17 @@ static const struct genl_ops tcp_metrics_nl_ops[] = { }, }; +static struct genl_family tcp_metrics_nl_family = { + .hdrsize = 0, + .name = TCP_METRICS_GENL_NAME, + .version = TCP_METRICS_GENL_VERSION, + .maxattr = TCP_METRICS_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = tcp_metrics_nl_ops, + .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops), +}; + static unsigned int tcpmhash_entries; static int __init set_tcpmhash_entries(char *str) { @@ -1178,8 +1183,7 @@ void __init tcp_metrics_init(void) if (ret < 0) panic("Could not allocate the tcp_metrics hash table\n"); - ret = genl_register_family_with_ops(&tcp_metrics_nl_family, - tcp_metrics_nl_ops); + ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) panic("Could not register tcp_metrics generic netlink\n"); } diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 0d57e27d1cdd..97f7b0cc4675 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -118,14 +118,7 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = ila_cmpfn, }; -static struct genl_family ila_nl_family = { - .hdrsize = 0, - .name = ILA_GENL_NAME, - .version = ILA_GENL_VERSION, - .maxattr = ILA_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, -}; +static struct genl_family ila_nl_family; static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, @@ -560,6 +553,18 @@ static const struct genl_ops ila_nl_ops[] = { }, }; +static struct genl_family ila_nl_family = { + .hdrsize = 0, + .name = ILA_GENL_NAME, + .version = ILA_GENL_VERSION, + .maxattr = ILA_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .ops = ila_nl_ops, + .n_ops = ARRAY_SIZE(ila_nl_ops), +}; + #define ILA_HASH_TABLE_SIZE 1024 static __net_init int ila_init_net(struct net *net) @@ -630,8 +635,7 @@ int ila_xlat_init(void) if (ret) goto exit; - ret = genl_register_family_with_ops(&ila_nl_family, - ila_nl_ops); + ret = genl_register_family(&ila_nl_family); if (ret < 0) goto unregister; diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c index f23b81aa91fe..07877347c2f7 100644 --- a/net/irda/irnetlink.c +++ 
b/net/irda/irnetlink.c @@ -24,12 +24,7 @@ -static struct genl_family irda_nl_family = { - .name = IRDA_NL_NAME, - .hdrsize = 0, - .version = IRDA_NL_VERSION, - .maxattr = IRDA_NL_CMD_MAX, -}; +static struct genl_family irda_nl_family; static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info) { @@ -146,9 +141,19 @@ static const struct genl_ops irda_nl_ops[] = { }; +static struct genl_family irda_nl_family = { + .name = IRDA_NL_NAME, + .hdrsize = 0, + .version = IRDA_NL_VERSION, + .maxattr = IRDA_NL_CMD_MAX, + .module = THIS_MODULE, + .ops = irda_nl_ops, + .n_ops = ARRAY_SIZE(irda_nl_ops), +}; + int irda_nl_register(void) { - return genl_register_family_with_ops(&irda_nl_family, irda_nl_ops); + return genl_register_family(&irda_nl_family); } void irda_nl_unregister(void) diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 4fbf1f41ac52..e4e8c0769a6b 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -31,13 +31,7 @@ #include "l2tp_core.h" -static struct genl_family l2tp_nl_family = { - .name = L2TP_GENL_NAME, - .version = L2TP_GENL_VERSION, - .hdrsize = 0, - .maxattr = L2TP_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family l2tp_nl_family; static const struct genl_multicast_group l2tp_multicast_group[] = { { @@ -976,6 +970,19 @@ static const struct genl_ops l2tp_nl_ops[] = { }, }; +static struct genl_family l2tp_nl_family = { + .name = L2TP_GENL_NAME, + .version = L2TP_GENL_VERSION, + .hdrsize = 0, + .maxattr = L2TP_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = l2tp_nl_ops, + .n_ops = ARRAY_SIZE(l2tp_nl_ops), + .mcgrps = l2tp_multicast_group, + .n_mcgrps = ARRAY_SIZE(l2tp_multicast_group), +}; + int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops) { int ret; @@ -1012,9 +1019,7 @@ EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops); static int l2tp_nl_init(void) { pr_info("L2TP netlink interface\n"); - return genl_register_family_with_ops_groups(&l2tp_nl_family, - l2tp_nl_ops, - l2tp_multicast_group); + return genl_register_family(&l2tp_nl_family); } static void l2tp_nl_cleanup(void) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ceed66cdd03e..ea3e8aed063f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2840,13 +2840,7 @@ static struct nf_sockopt_ops ip_vs_sockopts = { */ /* IPVS genetlink family */ -static struct genl_family ip_vs_genl_family = { - .hdrsize = 0, - .name = IPVS_GENL_NAME, - .version = IPVS_GENL_VERSION, - .maxattr = IPVS_CMD_MAX, - .netnsok = true, /* Make ipvsadm to work on netns */ -}; +static struct genl_family ip_vs_genl_family; /* Policy used for first-level command attributes */ static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { @@ -3871,10 +3865,20 @@ static const struct genl_ops ip_vs_genl_ops[] = { }, }; +static struct genl_family ip_vs_genl_family = { + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, + .netnsok = true, /* Make ipvsadm to work on netns */ + .module = THIS_MODULE, + .ops = ip_vs_genl_ops, + .n_ops = ARRAY_SIZE(ip_vs_genl_ops), +}; + static int __init ip_vs_genl_register(void) { - return genl_register_family_with_ops(&ip_vs_genl_family, - ip_vs_genl_ops); + return genl_register_family(&ip_vs_genl_family); } static void ip_vs_genl_unregister(void) diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 152e503b8c5d..ca7c9c411a5c 100644 --- 
a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -60,12 +60,7 @@ struct netlbl_domhsh_walk_arg { }; /* NetLabel Generic NETLINK CALIPSO family */ -static struct genl_family netlbl_calipso_gnl_family = { - .hdrsize = 0, - .name = NETLBL_NLTYPE_CALIPSO_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_CALIPSO_A_MAX, -}; +static struct genl_family netlbl_calipso_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = { @@ -354,6 +349,16 @@ static const struct genl_ops netlbl_calipso_ops[] = { }, }; +static struct genl_family netlbl_calipso_gnl_family = { + .hdrsize = 0, + .name = NETLBL_NLTYPE_CALIPSO_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_CALIPSO_A_MAX, + .module = THIS_MODULE, + .ops = netlbl_calipso_ops, + .n_ops = ARRAY_SIZE(netlbl_calipso_ops), +}; + /* NetLabel Generic NETLINK Protocol Functions */ @@ -367,8 +372,7 @@ static const struct genl_ops netlbl_calipso_ops[] = { */ int __init netlbl_calipso_genl_init(void) { - return genl_register_family_with_ops(&netlbl_calipso_gnl_family, - netlbl_calipso_ops); + return genl_register_family(&netlbl_calipso_gnl_family); } static const struct netlbl_calipso_ops *calipso_ops; diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 755b284e7ad4..a665eae91245 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -59,13 +59,7 @@ struct netlbl_domhsh_walk_arg { }; /* NetLabel Generic NETLINK CIPSOv4 family */ -static struct genl_family netlbl_cipsov4_gnl_family = { - .hdrsize = 0, - .name = NETLBL_NLTYPE_CIPSOV4_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_CIPSOV4_A_MAX, -}; - +static struct genl_family netlbl_cipsov4_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = { [NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 }, @@ -766,6 +760,16 @@ static const struct genl_ops netlbl_cipsov4_ops[] = { }, }; +static struct genl_family netlbl_cipsov4_gnl_family = { + .hdrsize = 0, + .name = NETLBL_NLTYPE_CIPSOV4_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_CIPSOV4_A_MAX, + .module = THIS_MODULE, + .ops = netlbl_cipsov4_ops, + .n_ops = ARRAY_SIZE(netlbl_cipsov4_ops), +}; + /* * NetLabel Generic NETLINK Protocol Functions */ @@ -780,6 +784,5 @@ static const struct genl_ops netlbl_cipsov4_ops[] = { */ int __init netlbl_cipsov4_genl_init(void) { - return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family, - netlbl_cipsov4_ops); + return genl_register_family(&netlbl_cipsov4_gnl_family); } diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 3b00f2368fcd..ecfe8eb149db 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -60,12 +60,7 @@ struct netlbl_domhsh_walk_arg { }; /* NetLabel Generic NETLINK CIPSOv4 family */ -static struct genl_family netlbl_mgmt_gnl_family = { - .hdrsize = 0, - .name = NETLBL_NLTYPE_MGMT_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_MGMT_A_MAX, -}; +static struct genl_family netlbl_mgmt_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { @@ -833,6 +828,16 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = { }, }; +static struct genl_family netlbl_mgmt_gnl_family = { + .hdrsize = 0, + .name = NETLBL_NLTYPE_MGMT_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_MGMT_A_MAX, + .module = 
THIS_MODULE, + .ops = netlbl_mgmt_genl_ops, + .n_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops), +}; + /* * NetLabel Generic NETLINK Protocol Functions */ @@ -847,6 +852,5 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = { */ int __init netlbl_mgmt_genl_init(void) { - return genl_register_family_with_ops(&netlbl_mgmt_gnl_family, - netlbl_mgmt_genl_ops); + return genl_register_family(&netlbl_mgmt_gnl_family); } diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index c2ea8d1f653a..5dbbad41114f 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -123,12 +123,7 @@ static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def; static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ -static struct genl_family netlbl_unlabel_gnl_family = { - .hdrsize = 0, - .name = NETLBL_NLTYPE_UNLABELED_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_UNLABEL_A_MAX, -}; +static struct genl_family netlbl_unlabel_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { @@ -1377,6 +1372,16 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = { }, }; +static struct genl_family netlbl_unlabel_gnl_family = { + .hdrsize = 0, + .name = NETLBL_NLTYPE_UNLABELED_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_UNLABEL_A_MAX, + .module = THIS_MODULE, + .ops = netlbl_unlabel_genl_ops, + .n_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops), +}; + /* * NetLabel Generic NETLINK Protocol Functions */ @@ -1391,8 +1396,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = { */ int __init netlbl_unlabel_genl_init(void) { - return genl_register_family_with_ops(&netlbl_unlabel_gnl_family, - netlbl_unlabel_genl_ops); + return genl_register_family(&netlbl_unlabel_gnl_family); } /* diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f19ec969edee..ca582ee4ae05 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -344,18 +344,18 @@ static int genl_validate_ops(const struct genl_family *family) } /** - * __genl_register_family - register a generic netlink family + * genl_register_family - register a generic netlink family * @family: generic netlink family * * Registers the specified family after validating it first. Only one * family may be registered with the same family name or identifier. * - * The family's ops array must already be assigned, you can use the - * genl_register_family_with_ops() helper function. + * The family's ops, multicast groups and module pointer must already + * be assigned. * * Return 0 on success or a negative error code. 
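+ *
+ * A minimal sketch of the resulting usage pattern (the foo_* names
+ * below are purely illustrative, not an existing family):
+ *
+ *	static const struct genl_ops foo_ops[] = {
+ *		{ .cmd = FOO_CMD_GET, .doit = foo_get_doit },
+ *	};
+ *
+ *	static struct genl_family foo_family = {
+ *		.name		= "foo",
+ *		.version	= 1,
+ *		.maxattr	= FOO_ATTR_MAX,
+ *		.module		= THIS_MODULE,
+ *		.ops		= foo_ops,
+ *		.n_ops		= ARRAY_SIZE(foo_ops),
+ *	};
+ *
+ *	static int __init foo_init(void)
+ *	{
+ *		return genl_register_family(&foo_family);
+ *	}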
*/ -int __genl_register_family(struct genl_family *family) +int genl_register_family(struct genl_family *family) { int err, i; @@ -429,7 +429,7 @@ errout_locked: genl_unlock_all(); return err; } -EXPORT_SYMBOL(__genl_register_family); +EXPORT_SYMBOL(genl_register_family); /** * genl_unregister_family - unregister generic netlink family @@ -452,7 +452,6 @@ int genl_unregister_family(struct genl_family *family) genl_unregister_mc_groups(family); list_del(&rc->family_list); - family->n_ops = 0; up_write(&cb_lock); wait_event(genl_sk_destructing_waitq, atomic_read(&genl_sk_destructing_cnt) == 0); @@ -681,13 +680,7 @@ static void genl_rcv(struct sk_buff *skb) * Controller **************************************************************************/ -static struct genl_family genl_ctrl = { - .id = GENL_ID_CTRL, - .name = "nlctrl", - .version = 0x2, - .maxattr = CTRL_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family genl_ctrl; static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) @@ -997,6 +990,19 @@ static const struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; +static struct genl_family genl_ctrl = { + .module = THIS_MODULE, + .ops = genl_ctrl_ops, + .n_ops = ARRAY_SIZE(genl_ctrl_ops), + .mcgrps = genl_ctrl_groups, + .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups), + .id = GENL_ID_CTRL, + .name = "nlctrl", + .version = 0x2, + .maxattr = CTRL_ATTR_MAX, + .netnsok = true, +}; + static int genl_bind(struct net *net, int group) { int i, err = -ENOENT; @@ -1086,8 +1092,7 @@ static int __init genl_init(void) for (i = 0; i < GENL_FAM_TAB_SIZE; i++) INIT_LIST_HEAD(&family_ht[i]); - err = genl_register_family_with_ops_groups(&genl_ctrl, genl_ctrl_ops, - genl_ctrl_groups); + err = genl_register_family(&genl_ctrl); if (err < 0) goto problem; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index c230403e066c..450b1e5144cc 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -38,13 +38,7 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = { { .name = NFC_GENL_MCAST_EVENT_NAME, }, }; -static struct genl_family nfc_genl_family = { - .hdrsize = 0, - .name = NFC_GENL_NAME, - .version = NFC_GENL_VERSION, - .maxattr = NFC_ATTR_MAX, -}; - +static struct genl_family nfc_genl_family; static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, @@ -1752,6 +1746,18 @@ static const struct genl_ops nfc_genl_ops[] = { }, }; +static struct genl_family nfc_genl_family = { + .hdrsize = 0, + .name = NFC_GENL_NAME, + .version = NFC_GENL_VERSION, + .maxattr = NFC_ATTR_MAX, + .module = THIS_MODULE, + .ops = nfc_genl_ops, + .n_ops = ARRAY_SIZE(nfc_genl_ops), + .mcgrps = nfc_genl_mcgrps, + .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps), +}; + struct urelease_work { struct work_struct w; @@ -1837,9 +1843,7 @@ int __init nfc_genl_init(void) { int rc; - rc = genl_register_family_with_ops_groups(&nfc_genl_family, - nfc_genl_ops, - nfc_genl_mcgrps); + rc = genl_register_family(&nfc_genl_family); if (rc) return rc; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f9fef7dfba15..ad6a111a0014 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -679,6 +679,7 @@ static struct genl_family dp_packet_genl_family = { .parallel_ops = true, .ops = dp_packet_genl_ops, .n_ops = ARRAY_SIZE(dp_packet_genl_ops), + .module = THIS_MODULE, }; static void get_dp_stats(const struct datapath *dp, struct 
ovs_dp_stats *stats, @@ -1445,6 +1446,7 @@ static struct genl_family dp_flow_genl_family = { .n_ops = ARRAY_SIZE(dp_flow_genl_ops), .mcgrps = &ovs_dp_flow_multicast_group, .n_mcgrps = 1, + .module = THIS_MODULE, }; static size_t ovs_dp_cmd_msg_size(void) @@ -1830,6 +1832,7 @@ static struct genl_family dp_datapath_genl_family = { .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), .mcgrps = &ovs_dp_datapath_multicast_group, .n_mcgrps = 1, + .module = THIS_MODULE, }; /* Called with ovs_mutex or RCU read lock. */ @@ -2251,6 +2254,7 @@ struct genl_family dp_vport_genl_family = { .n_ops = ARRAY_SIZE(dp_vport_genl_ops), .mcgrps = &ovs_dp_vport_multicast_group, .n_mcgrps = 1, + .module = THIS_MODULE, }; static struct genl_family * const dp_genl_families[] = { diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 383b8fedabc7..74a405bf107b 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -135,14 +135,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { /* Users of the legacy API (tipc-config) can't handle that we add operations, * so we have a separate genl handling for the new API. */ -struct genl_family tipc_genl_family = { - .name = TIPC_GENL_V2_NAME, - .version = TIPC_GENL_V2_VERSION, - .hdrsize = 0, - .maxattr = TIPC_NLA_MAX, - .netnsok = true, -}; - static const struct genl_ops tipc_genl_v2_ops[] = { { .cmd = TIPC_NL_BEARER_DISABLE, @@ -257,6 +249,17 @@ static const struct genl_ops tipc_genl_v2_ops[] = { #endif }; +struct genl_family tipc_genl_family = { + .name = TIPC_GENL_V2_NAME, + .version = TIPC_GENL_V2_VERSION, + .hdrsize = 0, + .maxattr = TIPC_NLA_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = tipc_genl_v2_ops, + .n_ops = ARRAY_SIZE(tipc_genl_v2_ops), +}; + int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) { u32 maxattr = tipc_genl_family.maxattr; @@ -272,8 +275,7 @@ int tipc_netlink_start(void) { int res; - res = genl_register_family_with_ops(&tipc_genl_family, - tipc_genl_v2_ops); + res = genl_register_family(&tipc_genl_family); if (res) { pr_err("Failed to register netlink interface\n"); return res; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index f04428e4c8e5..07b19931e458 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1215,27 +1215,29 @@ send: return err; } +static struct genl_ops tipc_genl_compat_ops[] = { + { + .cmd = TIPC_GENL_CMD, + .doit = tipc_nl_compat_recv, + }, +}; + static struct genl_family tipc_genl_compat_family = { .name = TIPC_GENL_NAME, .version = TIPC_GENL_VERSION, .hdrsize = TIPC_GENL_HDRLEN, .maxattr = 0, .netnsok = true, -}; - -static struct genl_ops tipc_genl_compat_ops[] = { - { - .cmd = TIPC_GENL_CMD, - .doit = tipc_nl_compat_recv, - }, + .module = THIS_MODULE, + .ops = tipc_genl_compat_ops, + .n_ops = ARRAY_SIZE(tipc_genl_compat_ops), }; int tipc_netlink_compat_start(void) { int res; - res = genl_register_family_with_ops(&tipc_genl_compat_family, - tipc_genl_compat_ops); + res = genl_register_family(&tipc_genl_compat_family); if (res) { pr_err("Failed to register legacy compat interface\n"); return res; diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 8ac83a41585f..587e1627681f 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -572,15 +572,20 @@ struct d_level D_LEVEL[] = { size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); +static const struct genl_multicast_group wimax_gnl_mcgrps[] = { + { .name = "msg", }, +}; + struct genl_family wimax_gnl_family = { .name = "WiMAX", .version = WIMAX_GNL_VERSION, .hdrsize = 0, .maxattr = 
WIMAX_GNL_ATTR_MAX, -}; - -static const struct genl_multicast_group wimax_gnl_mcgrps[] = { - { .name = "msg", }, + .module = THIS_MODULE, + .ops = wimax_gnl_ops, + .n_ops = ARRAY_SIZE(wimax_gnl_ops), + .mcgrps = wimax_gnl_mcgrps, + .n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps), }; @@ -595,11 +600,7 @@ int __init wimax_subsys_init(void) d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params, "wimax.debug"); - snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name), - "WiMAX"); - result = genl_register_family_with_ops_groups(&wimax_gnl_family, - wimax_gnl_ops, - wimax_gnl_mcgrps); + result = genl_register_family(&wimax_gnl_family); if (unlikely(result < 0)) { pr_err("cannot register generic netlink family: %d\n", result); goto error_register_family; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 714beafe05e0..8e5ca3c47593 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -32,21 +32,8 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, struct cfg80211_crypto_settings *settings, int cipher_limit); -static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info); -static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info); - /* the netlink family */ -static struct genl_family nl80211_fam = { - .name = NL80211_GENL_NAME, /* have users key off the name instead */ - .hdrsize = 0, /* no private header */ - .version = 1, /* no particular meaning now */ - .maxattr = NL80211_ATTR_MAX, - .netnsok = true, - .pre_doit = nl80211_pre_doit, - .post_doit = nl80211_post_doit, -}; +static struct genl_family nl80211_fam; /* multicast groups */ enum nl80211_multicast_groups { @@ -12599,6 +12586,21 @@ static const struct genl_ops nl80211_ops[] = { }, }; +static struct genl_family nl80211_fam = { + .name = NL80211_GENL_NAME, /* have users key off the name instead */ + .hdrsize = 0, /* no private header */ + .version = 1, /* no particular meaning now */ + .maxattr = NL80211_ATTR_MAX, + .netnsok = true, + .pre_doit = nl80211_pre_doit, + .post_doit = nl80211_post_doit, + .module = THIS_MODULE, + .ops = nl80211_ops, + .n_ops = ARRAY_SIZE(nl80211_ops), + .mcgrps = nl80211_mcgrps, + .n_mcgrps = ARRAY_SIZE(nl80211_mcgrps), +}; + /* notification functions */ void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, @@ -14565,8 +14567,7 @@ int nl80211_init(void) { int err; - err = genl_register_family_with_ops_groups(&nl80211_fam, nl80211_ops, - nl80211_mcgrps); + err = genl_register_family(&nl80211_fam); if (err) return err; -- cgit v1.2.3 From 56989f6d8568c21257dcec0f5e644d5570ba3281 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:05 +0200 Subject: genetlink: mark families as __ro_after_init Now genl_register_family() is the only thing (other than the users themselves, perhaps, but I didn't find any doing that) writing to the family struct. In all families that I found, genl_register_family() is only called from __init functions (some indirectly, in which case I've added __init annotations to clarify things), so all can actually be marked __ro_after_init. This protects the data structure from accidental corruption.
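For reference, the registration pattern this series converges on can be sketched roughly as follows. This is a minimal illustration with a hypothetical "demo" family; DEMO_CMD_GET, DEMO_ATTR_MAX and demo_cmd_get_doit() are placeholders, not symbols from any of the converted files:

/* ops array first, so the family can reference it via .ops/.n_ops */
static const struct genl_ops demo_genl_ops[] = {
	{
		.cmd  = DEMO_CMD_GET,		/* hypothetical command */
		.doit = demo_cmd_get_doit,	/* hypothetical handler */
	},
};

static struct genl_family demo_genl_family __ro_after_init = {
	.name		= "demo",
	.version	= 1,
	.maxattr	= DEMO_ATTR_MAX,	/* hypothetical */
	.module		= THIS_MODULE,
	.ops		= demo_genl_ops,
	.n_ops		= ARRAY_SIZE(demo_genl_ops),
};

static int __init demo_genl_init(void)
{
	return genl_register_family(&demo_genl_family);
}

Since .ops/.n_ops must reference a complete array, the patches above move each family definition below its ops array; registering only from __init context is what makes the __ro_after_init annotation safe.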
Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- drivers/acpi/event.c | 4 ++-- drivers/net/gtp.c | 2 +- drivers/net/macsec.c | 2 +- drivers/net/team/team.c | 4 ++-- drivers/net/wireless/mac80211_hwsim.c | 4 ++-- drivers/scsi/pmcraid.c | 4 ++-- drivers/target/target_core_user.c | 2 +- drivers/thermal/thermal_core.c | 4 ++-- fs/dlm/netlink.c | 2 +- fs/quota/netlink.c | 2 +- include/linux/genl_magic_func.h | 2 +- kernel/taskstats.c | 2 +- net/batman-adv/netlink.c | 2 +- net/core/devlink.c | 2 +- net/core/drop_monitor.c | 2 +- net/hsr/hsr_netlink.c | 2 +- net/ieee802154/netlink.c | 2 +- net/ieee802154/nl802154.c | 4 ++-- net/ipv4/fou.c | 2 +- net/ipv4/tcp_metrics.c | 2 +- net/ipv6/ila/ila_xlat.c | 4 ++-- net/irda/irnetlink.c | 4 ++-- net/l2tp/l2tp_netlink.c | 4 ++-- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- net/netlabel/netlabel_calipso.c | 2 +- net/netlabel/netlabel_cipso_v4.c | 2 +- net/netlabel/netlabel_mgmt.c | 2 +- net/netlabel/netlabel_unlabeled.c | 2 +- net/netlink/genetlink.c | 2 +- net/nfc/netlink.c | 2 +- net/openvswitch/datapath.c | 10 +++++----- net/tipc/netlink.c | 4 ++-- net/tipc/netlink_compat.c | 4 ++-- net/wimax/stack.c | 2 +- net/wireless/nl80211.c | 4 ++-- 35 files changed, 51 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index 1ab12ad7d5ba..7fceb3b4691b 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -82,7 +82,7 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = { { .name = ACPI_GENL_MCAST_GROUP_NAME, }, }; -static struct genl_family acpi_event_genl_family = { +static struct genl_family acpi_event_genl_family __ro_after_init = { .module = THIS_MODULE, .name = ACPI_GENL_FAMILY_NAME, .version = ACPI_GENL_VERSION, @@ -144,7 +144,7 @@ int acpi_bus_generate_netlink_event(const char *device_class, EXPORT_SYMBOL(acpi_bus_generate_netlink_event); -static int acpi_event_genetlink_init(void) +static int __init acpi_event_genetlink_init(void) { return genl_register_family(&acpi_event_genl_family); } diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 0604fd78f826..719d19f35673 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1290,7 +1290,7 @@ static const struct genl_ops gtp_genl_ops[] = { }, }; -static struct genl_family gtp_genl_family = { +static struct genl_family gtp_genl_family __ro_after_init = { .name = "gtp", .version = 0, .hdrsize = 0, diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 63ca7a3c77cf..0a715ab9d9cc 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -2648,7 +2648,7 @@ static const struct genl_ops macsec_genl_ops[] = { }, }; -static struct genl_family macsec_fam = { +static struct genl_family macsec_fam __ro_after_init = { .name = MACSEC_GENL_NAME, .hdrsize = 0, .version = MACSEC_GENL_VERSION, diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 46bf7c1216c0..bdc58567d10e 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2740,7 +2740,7 @@ static const struct genl_multicast_group team_nl_mcgrps[] = { { .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, }, }; -static struct genl_family team_nl_family = { +static struct genl_family team_nl_family __ro_after_init = { .name = TEAM_GENL_NAME, .version = TEAM_GENL_VERSION, .maxattr = TEAM_ATTR_MAX, @@ -2773,7 +2773,7 @@ static int team_nl_send_event_port_get(struct team *team, port); } -static int team_nl_init(void) +static int __init team_nl_init(void) { return genl_register_family(&team_nl_family); } diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index
5d4637e586e8..220e9dc8ccf8 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3228,7 +3228,7 @@ static const struct genl_ops hwsim_ops[] = { }, }; -static struct genl_family hwsim_genl_family = { +static struct genl_family hwsim_genl_family __ro_after_init = { .name = "MAC80211_HWSIM", .version = 1, .maxattr = HWSIM_ATTR_MAX, @@ -3287,7 +3287,7 @@ static struct notifier_block hwsim_netlink_notifier = { .notifier_call = mac80211_hwsim_netlink_notify, }; -static int hwsim_init_netlink(void) +static int __init hwsim_init_netlink(void) { int rc; diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index c0ab7bb8c3ce..845affa112f7 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -1368,7 +1368,7 @@ static struct genl_multicast_group pmcraid_mcgrps[] = { { .name = "events", /* not really used - see ID discussion below */ }, }; -static struct genl_family pmcraid_event_family = { +static struct genl_family pmcraid_event_family __ro_after_init = { .module = THIS_MODULE, .name = "pmcraid", .version = 1, @@ -1384,7 +1384,7 @@ static struct genl_family pmcraid_event_family = { * 0 if the pmcraid_event_family is successfully registered * with netlink generic, non-zero otherwise */ -static int pmcraid_netlink_init(void) +static int __init pmcraid_netlink_init(void) { int result; diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 3483372f5562..0f173bf7dbac 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -147,7 +147,7 @@ static const struct genl_multicast_group tcmu_mcgrps[] = { }; /* Our generic netlink family */ -static struct genl_family tcmu_genl_family = { +static struct genl_family tcmu_genl_family __ro_after_init = { .module = THIS_MODULE, .hdrsize = 0, .name = "TCM-USER", diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 93b6caab2d9f..911fd964c742 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -2163,7 +2163,7 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = { { .name = THERMAL_GENL_MCAST_GROUP_NAME, }, }; -static struct genl_family thermal_event_genl_family = { +static struct genl_family thermal_event_genl_family __ro_after_init = { .module = THIS_MODULE, .name = THERMAL_GENL_FAMILY_NAME, .version = THERMAL_GENL_VERSION, @@ -2235,7 +2235,7 @@ int thermal_generate_netlink_event(struct thermal_zone_device *tz, } EXPORT_SYMBOL_GPL(thermal_generate_netlink_event); -static int genetlink_init(void) +static int __init genetlink_init(void) { return genl_register_family(&thermal_event_genl_family); } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 04042d69573c..0643ae44f342 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -72,7 +72,7 @@ static struct genl_ops dlm_nl_ops[] = { }, }; -static struct genl_family family = { +static struct genl_family family __ro_after_init = { .name = DLM_GENL_NAME, .version = DLM_GENL_VERSION, .ops = dlm_nl_ops, diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 9457c7b0dfa2..e99b1a72d9a7 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -12,7 +12,7 @@ static const struct genl_multicast_group quota_mcgrps[] = { }; /* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { +static struct genl_family quota_genl_family __ro_after_init = { .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 
40c2e39362c8..377257d8f7e3 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -293,7 +293,7 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ #undef GENL_mc_group #define GENL_mc_group(group) -static struct genl_family ZZZ_genl_family __read_mostly = { +static struct genl_family ZZZ_genl_family __ro_after_init = { .name = __stringify(GENL_MAGIC_FAMILY), .version = GENL_MAGIC_VERSION, #ifdef GENL_MAGIC_FAMILY_HDRSZ diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4075ece592f2..9b7f838511ce 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -646,7 +646,7 @@ static const struct genl_ops taskstats_ops[] = { }, }; -static struct genl_family family = { +static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, .maxattr = TASKSTATS_CMD_ATTR_MAX, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index e28cec34a016..005012ba9b48 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -603,7 +603,7 @@ static struct genl_ops batadv_netlink_ops[] = { }; -struct genl_family batadv_netlink_family = { +struct genl_family batadv_netlink_family __ro_after_init = { .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, diff --git a/net/core/devlink.c b/net/core/devlink.c index 063da8091aef..c14f8b661db9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1612,7 +1612,7 @@ static const struct genl_ops devlink_nl_ops[] = { }, }; -static struct genl_family devlink_nl_family = { +static struct genl_family devlink_nl_family __ro_after_init = { .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, .maxattr = DEVLINK_ATTR_MAX, diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 80c002794ff6..8e0c0635ee97 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -346,7 +346,7 @@ static const struct genl_ops dropmon_ops[] = { }, }; -static struct genl_family net_drop_monitor_family = { +static struct genl_family net_drop_monitor_family __ro_after_init = { .hdrsize = 0, .name = "NET_DM", .version = 2, diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index aab34c7f6f89..1ab30e7d3f99 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -461,7 +461,7 @@ static const struct genl_ops hsr_ops[] = { }, }; -static struct genl_family hsr_genl_family = { +static struct genl_family hsr_genl_family __ro_after_init = { .hdrsize = 0, .name = "HSR", .version = 1, diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 08e62470bac2..6bde9e5a5503 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -131,7 +131,7 @@ static const struct genl_multicast_group ieee802154_mcgrps[] = { [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, }, }; -struct genl_family nl802154_family = { +struct genl_family nl802154_family __ro_after_init = { .hdrsize = 0, .name = IEEE802154_NL_NAME, .version = 1, diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index f7e75578aedd..fc60cd061f39 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -2462,7 +2462,7 @@ static const struct genl_ops nl802154_ops[] = { #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; -static struct genl_family nl802154_fam = { +static struct genl_family nl802154_fam __ro_after_init = { .name = NL802154_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ @@ -2478,7 +2478,7 @@ static 
struct genl_family nl802154_fam = { }; /* initialisation/exit functions */ -int nl802154_init(void) +int __init nl802154_init(void) { return genl_register_family(&nl802154_fam); } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 5b5226a2434f..6cb57bb8692d 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -824,7 +824,7 @@ static const struct genl_ops fou_nl_ops[] = { }, }; -static struct genl_family fou_nl_family = { +static struct genl_family fou_nl_family __ro_after_init = { .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index bba3c72c4a39..d46f4d5b1c62 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1109,7 +1109,7 @@ static const struct genl_ops tcp_metrics_nl_ops[] = { }, }; -static struct genl_family tcp_metrics_nl_family = { +static struct genl_family tcp_metrics_nl_family __ro_after_init = { .hdrsize = 0, .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 97f7b0cc4675..628ae6d85b59 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -553,7 +553,7 @@ static const struct genl_ops ila_nl_ops[] = { }, }; -static struct genl_family ila_nl_family = { +static struct genl_family ila_nl_family __ro_after_init = { .hdrsize = 0, .name = ILA_GENL_NAME, .version = ILA_GENL_VERSION, @@ -627,7 +627,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) return 0; } -int ila_xlat_init(void) +int __init ila_xlat_init(void) { int ret; diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c index 07877347c2f7..7fc340e574cf 100644 --- a/net/irda/irnetlink.c +++ b/net/irda/irnetlink.c @@ -141,7 +141,7 @@ static const struct genl_ops irda_nl_ops[] = { }; -static struct genl_family irda_nl_family = { +static struct genl_family irda_nl_family __ro_after_init = { .name = IRDA_NL_NAME, .hdrsize = 0, .version = IRDA_NL_VERSION, @@ -151,7 +151,7 @@ static struct genl_family irda_nl_family = { .n_ops = ARRAY_SIZE(irda_nl_ops), }; -int irda_nl_register(void) +int __init irda_nl_register(void) { return genl_register_family(&irda_nl_family); } diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index e4e8c0769a6b..59aa2d204e4a 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -970,7 +970,7 @@ static const struct genl_ops l2tp_nl_ops[] = { }, }; -static struct genl_family l2tp_nl_family = { +static struct genl_family l2tp_nl_family __ro_after_init = { .name = L2TP_GENL_NAME, .version = L2TP_GENL_VERSION, .hdrsize = 0, @@ -1016,7 +1016,7 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type) } EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops); -static int l2tp_nl_init(void) +static int __init l2tp_nl_init(void) { pr_info("L2TP netlink interface\n"); return genl_register_family(&l2tp_nl_family); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ea3e8aed063f..6b85ded4f91d 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3865,7 +3865,7 @@ static const struct genl_ops ip_vs_genl_ops[] = { }, }; -static struct genl_family ip_vs_genl_family = { +static struct genl_family ip_vs_genl_family __ro_after_init = { .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index ca7c9c411a5c..d177dd066504 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -349,7 +349,7 @@ static const struct 
genl_ops netlbl_calipso_ops[] = { }, }; -static struct genl_family netlbl_calipso_gnl_family = { +static struct genl_family netlbl_calipso_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_CALIPSO_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index a665eae91245..4149d3e63589 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -760,7 +760,7 @@ static const struct genl_ops netlbl_cipsov4_ops[] = { }, }; -static struct genl_family netlbl_cipsov4_gnl_family = { +static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_CIPSOV4_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index ecfe8eb149db..21e0095b1d14 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -828,7 +828,7 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = { }, }; -static struct genl_family netlbl_mgmt_gnl_family = { +static struct genl_family netlbl_mgmt_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_MGMT_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 5dbbad41114f..22dc1b9d6362 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1372,7 +1372,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = { }, }; -static struct genl_family netlbl_unlabel_gnl_family = { +static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 85659921e7b2..df0cbcddda2c 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -936,7 +936,7 @@ static const struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; -static struct genl_family genl_ctrl = { +static struct genl_family genl_ctrl __ro_after_init = { .module = THIS_MODULE, .ops = genl_ctrl_ops, .n_ops = ARRAY_SIZE(genl_ctrl_ops), diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 450b1e5144cc..03f3d5c7beb8 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1746,7 +1746,7 @@ static const struct genl_ops nfc_genl_ops[] = { }, }; -static struct genl_family nfc_genl_family = { +static struct genl_family nfc_genl_family __ro_after_init = { .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ad6a111a0014..fa8760176b7d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -670,7 +670,7 @@ static const struct genl_ops dp_packet_genl_ops[] = { } }; -static struct genl_family dp_packet_genl_family = { +static struct genl_family dp_packet_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, @@ -1435,7 +1435,7 @@ static const struct genl_ops dp_flow_genl_ops[] = { }, }; -static struct genl_family dp_flow_genl_family = { +static struct genl_family dp_flow_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, @@ -1821,7 +1821,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = { }, }; -static struct genl_family dp_datapath_genl_family = { +static struct genl_family dp_datapath_genl_family __ro_after_init = { .hdrsize = 
sizeof(struct ovs_header), .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, @@ -2243,7 +2243,7 @@ static const struct genl_ops dp_vport_genl_ops[] = { }, }; -struct genl_family dp_vport_genl_family = { +struct genl_family dp_vport_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, @@ -2272,7 +2272,7 @@ static void dp_unregister_genl(int n_families) genl_unregister_family(dp_genl_families[i]); } -static int dp_register_genl(void) +static int __init dp_register_genl(void) { int err; int i; diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 74a405bf107b..26ca8dd64ded 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -249,7 +249,7 @@ static const struct genl_ops tipc_genl_v2_ops[] = { #endif }; -struct genl_family tipc_genl_family = { +struct genl_family tipc_genl_family __ro_after_init = { .name = TIPC_GENL_V2_NAME, .version = TIPC_GENL_V2_VERSION, .hdrsize = 0, @@ -271,7 +271,7 @@ int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy); } -int tipc_netlink_start(void) +int __init tipc_netlink_start(void) { int res; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 07b19931e458..e1ae8a8a2b8e 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1222,7 +1222,7 @@ static struct genl_ops tipc_genl_compat_ops[] = { }, }; -static struct genl_family tipc_genl_compat_family = { +static struct genl_family tipc_genl_compat_family __ro_after_init = { .name = TIPC_GENL_NAME, .version = TIPC_GENL_VERSION, .hdrsize = TIPC_GENL_HDRLEN, @@ -1233,7 +1233,7 @@ static struct genl_family tipc_genl_compat_family = { .n_ops = ARRAY_SIZE(tipc_genl_compat_ops), }; -int tipc_netlink_compat_start(void) +int __init tipc_netlink_compat_start(void) { int res; diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 587e1627681f..5db731512014 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -576,7 +576,7 @@ static const struct genl_multicast_group wimax_gnl_mcgrps[] = { { .name = "msg", }, }; -struct genl_family wimax_gnl_family = { +struct genl_family wimax_gnl_family __ro_after_init = { .name = "WiMAX", .version = WIMAX_GNL_VERSION, .hdrsize = 0, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8e5ca3c47593..271707dacfea 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12586,7 +12586,7 @@ static const struct genl_ops nl80211_ops[] = { }, }; -static struct genl_family nl80211_fam = { +static struct genl_family nl80211_fam __ro_after_init = { .name = NL80211_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ @@ -14563,7 +14563,7 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev) /* initialisation/exit functions */ -int nl80211_init(void) +int __init nl80211_init(void) { int err; -- cgit v1.2.3 From ef295ecf090d3e86e5b742fc6ab34f1122a43773 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2016 08:48:16 -0600 Subject: block: better op and flags encoding Now that we don't need the common flags to overflow outside the range of a 32-bit type we can encode them the same way for both the bio and request fields. This in addition allows us to place the operation first (and make some room for more ops while we're at it) and to stop having to shift around the operation values. 
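The resulting layout can be sketched as follows; a standalone illustration under simplified assumptions (only two ops, and REQ_SYNC placed at the first bit above the op field, whereas the real enum req_flag_bits assigns the failfast bits first):

#include <stdio.h>

#define REQ_OP_BITS	8
#define REQ_OP_MASK	((1U << REQ_OP_BITS) - 1)

enum { REQ_OP_READ, REQ_OP_WRITE };		/* subset of enum req_opf */
#define REQ_SYNC	(1U << REQ_OP_BITS)	/* simplified bit position */

/* what the new bio_op()/req_op() accessors reduce to */
static unsigned int op_of(unsigned int opf)
{
	return opf & REQ_OP_MASK;
}

/* mirrors the new op_is_sync() helper: reads are implicitly sync */
static int op_is_sync(unsigned int opf)
{
	return op_of(opf) == REQ_OP_READ || (opf & REQ_SYNC) != 0;
}

int main(void)
{
	/* one 32-bit value carries both the operation and its flags */
	unsigned int opf = REQ_OP_WRITE | REQ_SYNC;

	printf("op=%u sync=%d\n", op_of(opf), op_is_sync(opf));
	return 0;
}

With the op and flags sharing one value, bi_opf and cmd_flags can simply be assigned to each other, which is why the patch below can delete req_set_op()/req_set_op_attrs() and reduce bio_set_op_attrs() to a plain OR marked obsolete.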
In addition this allows passing around only one value in the block layer instead of two (and eventually also in the file systems, but we can do that later) and thus clean up a lot of code. Last but not least this allows decreasing the size of the cmd_flags field in struct request to 32-bits. Various functions passing this value could also be updated, but I'd like to avoid the churn for now. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 +- block/blk-core.c | 60 ++++++++++-------------------- block/blk-flush.c | 2 +- block/blk-lib.c | 2 +- block/blk-map.c | 2 + block/blk-mq.c | 28 ++++++-------- block/cfq-iosched.c | 66 ++++++++++++++++----------------- block/elevator.c | 4 +- drivers/md/dm-crypt.c | 2 +- drivers/scsi/sd.c | 3 +- fs/btrfs/inode.c | 5 +-- fs/buffer.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/gfs2/lops.c | 2 +- include/linux/blk-cgroup.h | 11 +++--- include/linux/blk_types.h | 83 +++++++++++++++++++----------------------- include/linux/blkdev.h | 26 +------------ include/linux/blktrace_api.h | 2 +- include/linux/dm-io.h | 2 +- include/linux/elevator.h | 4 +- include/trace/events/bcache.h | 12 ++---- include/trace/events/block.h | 31 ++++++---------- kernel/trace/blktrace.c | 14 +++---- 23 files changed, 148 insertions(+), 221 deletions(-) (limited to 'kernel') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 6acea160298c..01ddeaf64b0f 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -553,8 +553,8 @@ struct request { struct request_list *rl; } -See the rq_flag_bits definitions for an explanation of the various flags -available. Some bits are used by the block layer or i/o scheduler. +See the req_ops and req_flag_bits definitions for an explanation of the various +flags available. Some bits are used by the block layer or i/o scheduler. The behaviour of the various sector counts are almost the same as before, except that since we have multi-segment bios, current_nr_sectors refers diff --git a/block/blk-core.c b/block/blk-core.c index fd416651a676..0bfaa54d3e9f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1056,8 +1056,7 @@ static struct io_context *rq_ioc(struct bio *bio) /** * __get_request - get a free request * @rl: request list to allocate from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1068,23 +1067,22 @@ static struct io_context *rq_ioc(struct bio *bio) * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*.
*/ -static struct request *__get_request(struct request_list *rl, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *__get_request(struct request_list *rl, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { struct request_queue *q = rl->q; struct request *rq; struct elevator_type *et = q->elevator->type; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); int may_queue; req_flags_t rq_flags = RQF_ALLOCED; if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); - may_queue = elv_may_queue(q, op, op_flags); + may_queue = elv_may_queue(q, op); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; @@ -1154,7 +1152,7 @@ static struct request *__get_request(struct request_list *rl, int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; rq->rq_flags = rq_flags; /* init elvpriv */ @@ -1232,8 +1230,7 @@ rq_starved: /** * get_request - get a free request * @q: request_queue to allocate request from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1244,18 +1241,17 @@ rq_starved: * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ -static struct request *get_request(struct request_queue *q, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *get_request(struct request_queue *q, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); struct request_list *rl; struct request *rq; rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, op_flags, bio, gfp_mask); + rq = __get_request(rl, op, bio, gfp_mask); if (!IS_ERR(rq)) return rq; @@ -1297,7 +1293,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); - rq = get_request(q, rw, 0, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); return rq; @@ -1446,7 +1442,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) */ if (rq_flags & RQF_ALLOCED) { struct request_list *rl = blk_rq_rl(req); - bool sync = rw_is_sync(req_op(req), req->cmd_flags); + bool sync = op_is_sync(req->cmd_flags); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); @@ -1652,8 +1648,6 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; - - req->cmd_flags |= bio->bi_opf & REQ_COMMON_MASK; if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; @@ -1665,9 +1659,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { - const bool sync = !!(bio->bi_opf & REQ_SYNC); struct blk_plug *plug; - int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT; + int el_ret, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; @@ -1722,24 +1715,11 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - /* - * This sync check and mask will be re-done in init_request_from_bio(), - * but we need to set it earlier to expose the sync flag to the - * rq 
allocator and io schedulers. - */ - if (sync) - rw_flags |= REQ_SYNC; - - /* - * Add in META/PRIO flags, if set, before we get to the IO scheduler - */ - rw_flags |= (bio->bi_opf & (REQ_META | REQ_PRIO)); - /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { bio->bi_error = PTR_ERR(req); bio_endio(bio); @@ -2946,8 +2926,6 @@ EXPORT_SYMBOL_GPL(__blk_end_request_err); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { - req_set_op(rq, bio_op(bio)); - if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); @@ -3031,8 +3009,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - req_set_op_attrs(dst, req_op(src), - (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE); + dst->cmd_flags = src->cmd_flags | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -3537,8 +3514,11 @@ EXPORT_SYMBOL(blk_set_runtime_active); int __init blk_dev_init(void) { - BUILD_BUG_ON(__REQ_NR_BITS > 8 * + BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * FIELD_SIZEOF(struct request, cmd_flags)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * + FIELD_SIZEOF(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", diff --git a/block/blk-flush.c b/block/blk-flush.c index 3990b9cfbda5..95f1d4d357df 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH); + flush_rq->cmd_flags = REQ_OP_FLUSH | WRITE_FLUSH; flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; diff --git a/block/blk-lib.c b/block/blk-lib.c index 46fe9248410d..18abda862915 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int granularity; - enum req_op op; + unsigned int op; int alignment; sector_t bs_mask; diff --git a/block/blk-map.c b/block/blk-map.c index 2c5ae5fef473..0173a72a8aa9 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -16,6 +16,8 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) { if (!rq->bio) { + rq->cmd_flags &= REQ_OP_MASK; + rq->cmd_flags |= (bio->bi_opf & REQ_OP_MASK); blk_rq_bio_prep(rq->q, rq, bio); } else { if (!ll_back_merge_fn(rq->q, rq, bio)) diff --git a/block/blk-mq.c b/block/blk-mq.c index b49c6658eb05..2da1a0ee3318 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -139,14 +139,13 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) EXPORT_SYMBOL(blk_mq_can_queue); static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, int op, - unsigned int op_flags) + struct request *rq, unsigned int op) { INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = q; rq->mq_ctx = ctx; - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; if (blk_queue_io_stat(q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch 
atomic flags, it needs atomic ops against the timer */ @@ -183,11 +182,11 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->end_io_data = NULL; rq->next_rq = NULL; - ctx->rq_dispatched[rw_is_sync(op, op_flags)]++; + ctx->rq_dispatched[op_is_sync(op)]++; } static struct request * -__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) +__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op) { struct request *rq; unsigned int tag; @@ -202,7 +201,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) } rq->tag = tag; - blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags); + blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } @@ -225,7 +224,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); blk_mq_put_ctx(ctx); if (!rq) { @@ -277,7 +276,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); if (!rq) { ret = -EWOULDBLOCK; goto out_queue_exit; @@ -1196,19 +1195,14 @@ static struct request *blk_mq_map_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct request *rq; - int op = bio_data_dir(bio); - int op_flags = 0; blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, ctx->cpu); - if (rw_is_sync(bio_op(bio), bio->bi_opf)) - op_flags |= REQ_SYNC; - - trace_block_getrq(q, bio, op); + trace_block_getrq(q, bio, bio->bi_opf); blk_mq_set_alloc_data(data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(data, op, op_flags); + rq = __blk_mq_alloc_request(data, bio->bi_opf); data->hctx->queued++; return rq; @@ -1256,7 +1250,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) */ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_mq_alloc_data data; struct request *rq; @@ -1350,7 +1344,7 @@ done: */ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_plug *plug; unsigned int request_count = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5e24d880306c..c96186adaa66 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -667,10 +667,10 @@ static inline void cfqg_put(struct cfq_group *cfqg) } while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, - int op_flags) + struct cfq_group *curr_cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.queued, op, 1); cfqg_stats_end_empty_time(&cfqg->stats); cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); } @@ -684,30 +684,29 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, #endif } -static inline void 
cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, -1); + blkg_rwstat_add(&cfqg->stats.queued, op, -1); } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.merged, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.merged, op, 1); } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { struct cfqg_stats *stats = &cfqg->stats; unsigned long long now = sched_clock(); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, op_flags, - now - io_start_time); + blkg_rwstat_add(&stats->service_time, op, now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, op_flags, + blkg_rwstat_add(&stats->wait_time, op, io_start_time - start_time); } @@ -786,16 +785,16 @@ static inline void cfqg_put(struct cfq_group *cfqg) { } #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, int op_flags) { } + struct cfq_group *curr_cfqg, unsigned int op) { } static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, uint64_t time, unsigned long unaccounted_time) { } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) { } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) { } +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { } +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) { } + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -2474,10 +2473,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); cfq_add_rq_rb(rq); cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, - req_op(rq), rq->cmd_flags); + rq->cmd_flags); } static struct request * @@ -2530,7 +2529,7 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); if (rq->cmd_flags & REQ_PRIO) { WARN_ON(!cfqq->prio_pending); cfqq->prio_pending--; @@ -2565,7 +2564,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf); + cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf); } static void @@ -2588,7 +2587,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq 
= rq; cfq_remove_request(next); - cfqg_stats_update_io_merged(RQ_CFQG(rq), req_op(next), next->cmd_flags); + cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); cfqq = RQ_CFQQ(next); /* @@ -4142,7 +4141,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, req_op(rq), + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, rq->cmd_flags); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -4240,8 +4239,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), - rq_io_start_time_ns(rq), req_op(rq), - rq->cmd_flags); + rq_io_start_time_ns(rq), rq->cmd_flags); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -4319,14 +4317,14 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -static void cfqq_boost_on_prio(struct cfq_queue *cfqq, int op_flags) +static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) { /* * If REQ_PRIO is set, boost class and prio level, if it's below * BE/NORM. If prio is not set, restore the potentially boosted * class/prio level. */ - if (!(op_flags & REQ_PRIO)) { + if (!(op & REQ_PRIO)) { cfqq->ioprio_class = cfqq->org_ioprio_class; cfqq->ioprio = cfqq->org_ioprio; } else { @@ -4347,7 +4345,7 @@ static inline int __cfq_may_queue(struct cfq_queue *cfqq) return ELV_MQUEUE_MAY; } -static int cfq_may_queue(struct request_queue *q, int op, int op_flags) +static int cfq_may_queue(struct request_queue *q, unsigned int op) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -4364,10 +4362,10 @@ static int cfq_may_queue(struct request_queue *q, int op, int op_flags) if (!cic) return ELV_MQUEUE_MAY; - cfqq = cic_to_cfqq(cic, rw_is_sync(op, op_flags)); + cfqq = cic_to_cfqq(cic, op_is_sync(op)); if (cfqq) { cfq_init_prio_data(cfqq, cic); - cfqq_boost_on_prio(cfqq, op_flags); + cfqq_boost_on_prio(cfqq, op); return __cfq_may_queue(cfqq); } diff --git a/block/elevator.c b/block/elevator.c index ac80f89a0842..a18a5db274e4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -714,12 +714,12 @@ void elv_put_request(struct request_queue *q, struct request *rq) e->type->ops.elevator_put_req_fn(rq); } -int elv_may_queue(struct request_queue *q, int op, int op_flags) +int elv_may_queue(struct request_queue *q, unsigned int op) { struct elevator_queue *e = q->elevator; if (e->type->ops.elevator_may_queue_fn) - return e->type->ops.elevator_may_queue_fn(q, op, op_flags); + return e->type->ops.elevator_may_queue_fn(q, op); return ELV_MQUEUE_MAY; } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a2768835d394..68a9eb4f3f36 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1135,7 +1135,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; - bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio)); + clone->bi_opf = io->base_bio->bi_opf; } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index cef1f78031d4..65738b0aad36 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1031,8 +1031,7 @@ static int 
sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) } else if (rq_data_dir(rq) == READ) { SCpnt->cmnd[0] = READ_6; } else { - scmd_printk(KERN_ERR, SCpnt, "Unknown command %llu,%llx\n", - req_op(rq), (unsigned long long) rq->cmd_flags); + scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq)); goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2b790bda7998..9a377079af26 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8427,7 +8427,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8465,8 +8465,7 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), - bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; diff --git a/fs/buffer.c b/fs/buffer.c index b205a629001d..a29335867e30 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3118,7 +3118,7 @@ EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) * @op: whether to %READ or %WRITE - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8de18a168a..2cf4f7f09e32 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -688,7 +688,7 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int op; /* contains REQ_OP_ */ - int op_flags; /* rq_flag_bits */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 49d5a1b61b06..b1f9144b42c7 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -231,7 +231,7 @@ static void gfs2_end_log_write(struct bio *bio) * gfs2_log_flush_bio - Submit any pending log bio * @sdp: The superblock * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3bf5d33800ab..ddaf28d0988f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -581,15 +581,14 @@ static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat - * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The * caller is responsible for synchronizing calls to this function. 
*/ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int op, int op_flags, uint64_t val) + unsigned int op, uint64_t val) { struct percpu_counter *cnt; @@ -600,7 +599,7 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); - if (op_flags & REQ_SYNC) + if (op & REQ_SYNC) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; @@ -705,9 +704,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, if (!throtl) { blkg = blkg ?: q->root_blkg; - blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_opf, + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_opf, 1); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } rcu_read_unlock(); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ec69a8fe3b29..dca972d67548 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -88,24 +88,6 @@ struct bio { struct bio_vec bi_inline_vecs[0]; }; -#define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) -#define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) -#define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) - -#define bio_set_op_attrs(bio, op, op_flags) do { \ - if (__builtin_constant_p(op)) \ - BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \ - else \ - WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \ - if (__builtin_constant_p(op_flags)) \ - BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - else \ - WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - (bio)->bi_opf = bio_flags(bio); \ - (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \ - (bio)->bi_opf |= (op_flags); \ -} while (0) - #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) /* @@ -147,26 +129,40 @@ struct bio { #endif /* CONFIG_BLOCK */ /* - * Request flags. For use in the cmd_flags field of struct request, and in - * bi_opf of struct bio. Note that some flags are only valid in either one. + * Operations and flags common to the bio and request structures. + * We use 8 bits for encoding the operation, and the remaining 24 for flags. 
*/ -enum rq_flag_bits { - /* common flags */ - __REQ_FAILFAST_DEV, /* no driver retries of device errors */ +#define REQ_OP_BITS 8 +#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1) +#define REQ_FLAG_BITS 24 + +enum req_opf { + REQ_OP_READ, + REQ_OP_WRITE, + REQ_OP_DISCARD, /* request to discard sectors */ + REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ + REQ_OP_WRITE_SAME, /* write same block many times */ + REQ_OP_FLUSH, /* request for cache flush */ + REQ_OP_ZONE_REPORT, /* Get zone information */ + REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ + + REQ_OP_LAST, +}; + +enum req_flag_bits { + __REQ_FAILFAST_DEV = /* no driver retries of device errors */ + REQ_OP_BITS, __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ - __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ - __REQ_NOMERGE, /* don't touch this for merging */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ - __REQ_NR_BITS, /* stops here */ }; @@ -176,37 +172,32 @@ enum rq_flag_bits { #define REQ_SYNC (1ULL << __REQ_SYNC) #define REQ_META (1ULL << __REQ_META) #define REQ_PRIO (1ULL << __REQ_PRIO) +#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) #define REQ_NOIDLE (1ULL << __REQ_NOIDLE) #define REQ_INTEGRITY (1ULL << __REQ_INTEGRITY) +#define REQ_FUA (1ULL << __REQ_FUA) +#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -#define REQ_COMMON_MASK \ - (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_RAHEAD) -#define REQ_CLONE_MASK REQ_COMMON_MASK -/* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) -#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) -#define REQ_FUA (1ULL << __REQ_FUA) -#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) -#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define bio_op(bio) \ + ((bio)->bi_opf & REQ_OP_MASK) +#define req_op(req) \ + ((req)->cmd_flags & REQ_OP_MASK) -enum req_op { - REQ_OP_READ, - REQ_OP_WRITE, - REQ_OP_DISCARD, /* request to discard sectors */ - REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ - REQ_OP_WRITE_SAME, /* write same block many times */ - REQ_OP_FLUSH, /* request for cache flush */ - REQ_OP_ZONE_REPORT, /* Get zone information */ - REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ -}; +/* obsolete, don't use in new code */ +#define bio_set_op_attrs(bio, op, op_flags) \ + ((bio)->bi_opf |= (op | op_flags)) -#define REQ_OP_BITS 3 +static inline bool op_is_sync(unsigned int op) +{ + return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC); +} typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b4415feac679..8396da2bb698 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -142,7 +142,7 @@ struct request { int cpu; unsigned cmd_type; - u64 cmd_flags; + unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; unsigned long atomic_flags; @@ -244,20 +244,6 @@ struct request { struct request *next_rq; }; -#define 
REQ_OP_SHIFT (8 * sizeof(u64) - REQ_OP_BITS) -#define req_op(req) ((req)->cmd_flags >> REQ_OP_SHIFT) - -#define req_set_op(req, op) do { \ - WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (req)->cmd_flags &= ((1ULL << REQ_OP_SHIFT) - 1); \ - (req)->cmd_flags |= ((u64) (op) << REQ_OP_SHIFT); \ -} while (0) - -#define req_set_op_attrs(req, op, flags) do { \ - req_set_op(req, op); \ - (req)->cmd_flags |= flags; \ -} while (0) - static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; @@ -741,17 +727,9 @@ static inline unsigned int blk_queue_zone_size(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } -/* - * We regard a request as sync, if either a read or a sync write - */ -static inline bool rw_is_sync(int op, unsigned int rw_flags) -{ - return op == REQ_OP_READ || (rw_flags & REQ_SYNC); -} - static inline bool rq_is_sync(struct request *rq) { - return rw_is_sync(req_op(rq), rq->cmd_flags); + return op_is_sync(rq->cmd_flags); } static inline bool blk_rl_full(struct request_list *rl, bool sync) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index cceb72f9e29f..e417f080219a 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -118,7 +118,7 @@ static inline int blk_cmd_buf_len(struct request *rq) } extern void blk_dump_cmd(char *buf, struct request *rq); -extern void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes); +extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); #endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h index b91b023deffb..a52c6580cc9a 100644 --- a/include/linux/dm-io.h +++ b/include/linux/dm-io.h @@ -58,7 +58,7 @@ struct dm_io_notify { struct dm_io_client; struct dm_io_request { int bi_op; /* REQ_OP */ - int bi_op_flags; /* rq_flag_bits */ + int bi_op_flags; /* req_flag_bits */ struct dm_io_memory mem; /* Memory to use for io */ struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */ struct dm_io_client *client; /* Client memory handler */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index e7f358d2e5fc..f219c9aed360 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -30,7 +30,7 @@ typedef int (elevator_dispatch_fn) (struct request_queue *, int); typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *); typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); -typedef int (elevator_may_queue_fn) (struct request_queue *, int, int); +typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int); typedef void (elevator_init_icq_fn) (struct io_cq *); typedef void (elevator_exit_icq_fn) (struct io_cq *); @@ -139,7 +139,7 @@ extern struct request *elv_former_request(struct request_queue *, struct request extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); extern void elv_unregister_queue(struct request_queue *q); -extern int elv_may_queue(struct request_queue *, int, int); +extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, struct bio *bio, gfp_t gfp_mask); diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 
d336b890e31f..df3e9ae5ad8d 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -27,8 +27,7 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->sector = bio->bi_iter.bi_sector; __entry->orig_sector = bio->bi_iter.bi_sector - 16; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", @@ -102,8 +101,7 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u", @@ -138,8 +136,7 @@ TRACE_EVENT(bcache_read, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); __entry->cache_hit = hit; __entry->bypass = bypass; ), @@ -170,8 +167,7 @@ TRACE_EVENT(bcache_write, __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); __entry->writeback = writeback; __entry->bypass = bypass; ), diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 8f3a163b8166..3e02e3a25413 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -84,8 +84,7 @@ DECLARE_EVENT_CLASS(block_rq_with_error, 0 : blk_rq_sectors(rq); __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); ), @@ -163,7 +162,7 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector = nr_bytes >> 9; __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, nr_bytes); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); blk_dump_cmd(__get_str(cmd), rq); ), @@ -199,8 +198,7 @@ DECLARE_EVENT_CLASS(block_rq, __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 
blk_rq_bytes(rq) : 0; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -274,8 +272,7 @@ TRACE_EVENT(block_bio_bounce, bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -313,8 +310,7 @@ TRACE_EVENT(block_bio_complete, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = error; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u [%d]", @@ -341,8 +337,7 @@ DECLARE_EVENT_CLASS(block_bio_merge, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -409,8 +404,7 @@ TRACE_EVENT(block_bio_queue, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -438,7 +432,7 @@ DECLARE_EVENT_CLASS(block_get_rq, __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? bio_sectors(bio) : 0; - blk_fill_rwbs(__entry->rwbs, bio ? bio_op(bio) : 0, + blk_fill_rwbs(__entry->rwbs, bio ? 
bio->bi_opf : 0, __entry->nr_sector); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -573,8 +567,7 @@ TRACE_EVENT(block_split, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -617,8 +610,7 @@ TRACE_EVENT(block_bio_remap, __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", @@ -664,8 +656,7 @@ TRACE_EVENT(block_rq_remap, __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index dbafc5df03f3..95cecbf67f5c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1777,14 +1777,14 @@ void blk_dump_cmd(char *buf, struct request *rq) } } -void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) +void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; - if (rw & REQ_PREFLUSH) + if (op & REQ_PREFLUSH) rwbs[i++] = 'F'; - switch (op) { + switch (op & REQ_OP_MASK) { case REQ_OP_WRITE: case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; @@ -1806,13 +1806,13 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) rwbs[i++] = 'N'; } - if (rw & REQ_FUA) + if (op & REQ_FUA) rwbs[i++] = 'F'; - if (rw & REQ_RAHEAD) + if (op & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_SYNC) + if (op & REQ_SYNC) rwbs[i++] = 'S'; - if (rw & REQ_META) + if (op & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; -- cgit v1.2.3 From ebb676daa1a340ccef25eb769aefc09b79c01f8a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 27 Oct 2016 11:23:51 +0200 Subject: bpf: Print function name in addition to function id The verifier currently prints raw function ids when printing CALL instructions or when complaining: 5: (85) call 23 unknown func 23 print a meaningful function name instead: 5: (85) call bpf_redirect#23 unknown func bpf_redirect#23 Moves the function documentation to a single comment and renames all helper names in the list to conform to the bpf_ prefix notation so they can be grepped in the kernel source. Signed-off-by: Thomas Graf Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/uapi/linux/bpf.h | 574 ++++++++++++++++++++++++----------------------- kernel/bpf/verifier.c | 35 ++- 2 files changed, 316 insertions(+), 293 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 374ef582ae18..e2f38e0091b6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,297 +143,301 @@ union bpf_attr { }; } __attribute__((aligned(8))); +/* BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(&map, &key) + * Return: Map value or NULL + * + * int bpf_map_update_elem(&map, &key, &value, flags) + * Return: 0 on success or negative error + * + * int bpf_map_delete_elem(&map, &key) + * Return: 0 on success or negative error + * + * int bpf_probe_read(void *dst, int size, void *src) + * Return: 0 on success or negative error + * + * u64 bpf_ktime_get_ns(void) + * Return: current ktime + * + * int bpf_trace_printk(const char *fmt, int fmt_size, ...) + * Return: length of buffer written or negative error + * + * u32 bpf_prandom_u32(void) + * Return: random value + * + * u32 bpf_raw_smp_processor_id(void) + * Return: SMP processor ID + * + * int bpf_skb_store_bytes(skb, offset, from, len, flags) + * store bytes into packet + * @skb: pointer to skb + * @offset: offset within packet from skb->mac_header + * @from: pointer where to copy bytes from + * @len: number of bytes to store into packet + * @flags: bit 0 - if true, recompute skb->csum + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_l3_csum_replace(skb, offset, from, to, flags) + * recompute IP checksum + * @skb: pointer to skb + * @offset: offset within packet where IP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_l4_csum_replace(skb, offset, from, to, flags) + * recompute TCP/UDP checksum + * @skb: pointer to skb + * @offset: offset within packet where TCP/UDP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * bit 4 - is pseudo header + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_tail_call(ctx, prog_array_map, index) + * jump into another BPF program + * @ctx: context pointer passed to next program + * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY + * @index: index inside array that selects specific program to run + * Return: 0 on success or negative error + * + * int bpf_clone_redirect(skb, ifindex, flags) + * redirect to another netdev + * @skb: pointer to skb + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: 0 on success or negative error + * + * u64 bpf_get_current_pid_tgid(void) + * Return: current->tgid << 32 | current->pid + * + * u64 bpf_get_current_uid_gid(void) + * Return: current_gid << 32 | current_uid + * + * int bpf_get_current_comm(char *buf, int size_of_buf) + * stores current->comm into buf + * Return: 0 on success or negative error + * + * u32 bpf_get_cgroup_classid(skb) + * retrieve a proc's classid + * @skb: pointer to skb + * Return: classid if != 0 + * + * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) + * Return: 0 on success or negative error + * + * int bpf_skb_vlan_pop(skb) + * Return: 0 on success or negative error + * + * int bpf_skb_get_tunnel_key(skb, 
key, size, flags) + * int bpf_skb_set_tunnel_key(skb, key, size, flags) + * retrieve or populate tunnel metadata + * @skb: pointer to skb + * @key: pointer to 'struct bpf_tunnel_key' + * @size: size of 'struct bpf_tunnel_key' + * @flags: room for future extensions + * Return: 0 on success or negative error + * + * u64 bpf_perf_event_read(&map, index) + * Return: Number events read or error code + * + * int bpf_redirect(ifindex, flags) + * redirect to another netdev + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: TC_ACT_REDIRECT + * + * u32 bpf_get_route_realm(skb) + * retrieve a dst's tclassid + * @skb: pointer to skb + * Return: realm if != 0 + * + * int bpf_perf_event_output(ctx, map, index, data, size) + * output perf raw sample + * @ctx: struct pt_regs* + * @map: pointer to perf_event_array map + * @index: index of event in the map + * @data: data on stack to be output as raw data + * @size: size of data + * Return: 0 on success or negative error + * + * int bpf_get_stackid(ctx, map, flags) + * walk user or kernel stack and return id + * @ctx: struct pt_regs* + * @map: pointer to stack_trace map + * @flags: bits 0-7 - numer of stack frames to skip + * bit 8 - collect user stack instead of kernel + * bit 9 - compare stacks by hash only + * bit 10 - if two different stacks hash into the same stackid + * discard old + * other bits - reserved + * Return: >= 0 stackid on success or negative error + * + * s64 bpf_csum_diff(from, from_size, to, to_size, seed) + * calculate csum diff + * @from: raw from buffer + * @from_size: length of from buffer + * @to: raw to buffer + * @to_size: length of to buffer + * @seed: optional seed + * Return: csum result or negative error code + * + * int bpf_skb_get_tunnel_opt(skb, opt, size) + * retrieve tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: option size + * + * int bpf_skb_set_tunnel_opt(skb, opt, size) + * populate tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: 0 on success or negative error + * + * int bpf_skb_change_proto(skb, proto, flags) + * Change protocol of the skb. Currently supported is v4 -> v6, + * v6 -> v4 transitions. The helper will also resize the skb. eBPF + * program is expected to fill the new headers via skb_store_bytes + * and lX_csum_replace. + * @skb: pointer to skb + * @proto: new skb->protocol type + * @flags: reserved + * Return: 0 on success or negative error + * + * int bpf_skb_change_type(skb, type) + * Change packet type of skb. + * @skb: pointer to skb + * @type: new skb->pkt_type type + * Return: 0 on success or negative error + * + * int bpf_skb_under_cgroup(skb, map, index) + * Check cgroup2 membership of skb + * @skb: pointer to skb + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 skb failed the cgroup2 descendant test + * == 1 skb succeeded the cgroup2 descendant test + * < 0 error + * + * u32 bpf_get_hash_recalc(skb) + * Retrieve and possibly recalculate skb->hash. 
+ * @skb: pointer to skb + * Return: hash + * + * u64 bpf_get_current_task(void) + * Returns current task_struct + * Return: current + * + * int bpf_probe_write_user(void *dst, void *src, int len) + * safely attempt to write to a location + * @dst: destination address in userspace + * @src: source address on stack + * @len: number of bytes to copy + * Return: 0 on success or negative error + * + * int bpf_current_task_under_cgroup(map, index) + * Check cgroup2 membership of current task + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 current failed the cgroup2 descendant test + * == 1 current succeeded the cgroup2 descendant test + * < 0 error + * + * int bpf_skb_change_tail(skb, len, flags) + * The helper will resize the skb to the given new size, to be used f.e. + * with control messages. + * @skb: pointer to skb + * @len: new skb length + * @flags: reserved + * Return: 0 on success or negative error + * + * int bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the skb is non-linear + * and not all of len are part of the linear section. Only needed for + * read/write with direct packet access. + * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + * + * s64 bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. + * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + * + * void bpf_set_hash_invalid(skb) + * Invalidate current skb->hash. + * @skb: pointer to skb + * + * int bpf_get_numa_node_id() + * Return: Id of current NUMA node. + */ +#define __BPF_FUNC_MAPPER(FN) \ + FN(unspec), \ + FN(map_lookup_elem), \ + FN(map_update_elem), \ + FN(map_delete_elem), \ + FN(probe_read), \ + FN(ktime_get_ns), \ + FN(trace_printk), \ + FN(get_prandom_u32), \ + FN(get_smp_processor_id), \ + FN(skb_store_bytes), \ + FN(l3_csum_replace), \ + FN(l4_csum_replace), \ + FN(tail_call), \ + FN(clone_redirect), \ + FN(get_current_pid_tgid), \ + FN(get_current_uid_gid), \ + FN(get_current_comm), \ + FN(get_cgroup_classid), \ + FN(skb_vlan_push), \ + FN(skb_vlan_pop), \ + FN(skb_get_tunnel_key), \ + FN(skb_set_tunnel_key), \ + FN(perf_event_read), \ + FN(redirect), \ + FN(get_route_realm), \ + FN(perf_event_output), \ + FN(skb_load_bytes), \ + FN(get_stackid), \ + FN(csum_diff), \ + FN(skb_get_tunnel_opt), \ + FN(skb_set_tunnel_opt), \ + FN(skb_change_proto), \ + FN(skb_change_type), \ + FN(skb_under_cgroup), \ + FN(get_hash_recalc), \ + FN(get_current_task), \ + FN(probe_write_user), \ + FN(current_task_under_cgroup), \ + FN(skb_change_tail), \ + FN(skb_pull_data), \ + FN(csum_update), \ + FN(set_hash_invalid), \ + FN(get_numa_node_id), + /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call */ +#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x enum bpf_func_id { - BPF_FUNC_unspec, - BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ - BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ - BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ - BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ - BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ - BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) 
*/ - BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ - BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ - - /** - * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet - * @skb: pointer to skb - * @offset: offset within packet from skb->mac_header - * @from: pointer where to copy bytes from - * @len: number of bytes to store into packet - * @flags: bit 0 - if true, recompute skb->csum - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_skb_store_bytes, - - /** - * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum - * @skb: pointer to skb - * @offset: offset within packet where IP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_l3_csum_replace, - - /** - * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum - * @skb: pointer to skb - * @offset: offset within packet where TCP/UDP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * bit 4 - is pseudo header - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_l4_csum_replace, - - /** - * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program - * @ctx: context pointer passed to next program - * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: index inside array that selects specific program to run - * Return: 0 on success - */ - BPF_FUNC_tail_call, - - /** - * bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev - * @skb: pointer to skb - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_clone_redirect, - - /** - * u64 bpf_get_current_pid_tgid(void) - * Return: current->tgid << 32 | current->pid - */ - BPF_FUNC_get_current_pid_tgid, - - /** - * u64 bpf_get_current_uid_gid(void) - * Return: current_gid << 32 | current_uid - */ - BPF_FUNC_get_current_uid_gid, - - /** - * bpf_get_current_comm(char *buf, int size_of_buf) - * stores current->comm into buf - * Return: 0 on success - */ - BPF_FUNC_get_current_comm, - - /** - * bpf_get_cgroup_classid(skb) - retrieve a proc's classid - * @skb: pointer to skb - * Return: classid if != 0 - */ - BPF_FUNC_get_cgroup_classid, - BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */ - BPF_FUNC_skb_vlan_pop, /* bpf_skb_vlan_pop(skb) */ - - /** - * bpf_skb_[gs]et_tunnel_key(skb, key, size, flags) - * retrieve or populate tunnel metadata - * @skb: pointer to skb - * @key: pointer to 'struct bpf_tunnel_key' - * @size: size of 'struct bpf_tunnel_key' - * @flags: room for future extensions - * Retrun: 0 on success - */ - BPF_FUNC_skb_get_tunnel_key, - BPF_FUNC_skb_set_tunnel_key, - BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */ - /** - * bpf_redirect(ifindex, flags) - redirect to another netdev - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT - */ - BPF_FUNC_redirect, - - /** - * bpf_get_route_realm(skb) - retrieve a dst's tclassid - * @skb: pointer to skb - * Return: realm if != 0 - */ - BPF_FUNC_get_route_realm, - - /** - * bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample - * @ctx: struct pt_regs* 
- * @map: pointer to perf_event_array map - * @index: index of event in the map - * @data: data on stack to be output as raw data - * @size: size of data - * Return: 0 on success - */ - BPF_FUNC_perf_event_output, - BPF_FUNC_skb_load_bytes, - - /** - * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id - * @ctx: struct pt_regs* - * @map: pointer to stack_trace map - * @flags: bits 0-7 - numer of stack frames to skip - * bit 8 - collect user stack instead of kernel - * bit 9 - compare stacks by hash only - * bit 10 - if two different stacks hash into the same stackid - * discard old - * other bits - reserved - * Return: >= 0 stackid on success or negative error - */ - BPF_FUNC_get_stackid, - - /** - * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff - * @from: raw from buffer - * @from_size: length of from buffer - * @to: raw to buffer - * @to_size: length of to buffer - * @seed: optional seed - * Return: csum result - */ - BPF_FUNC_csum_diff, - - /** - * bpf_skb_[gs]et_tunnel_opt(skb, opt, size) - * retrieve or populate tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: 0 on success for set, option size for get - */ - BPF_FUNC_skb_get_tunnel_opt, - BPF_FUNC_skb_set_tunnel_opt, - - /** - * bpf_skb_change_proto(skb, proto, flags) - * Change protocol of the skb. Currently supported is - * v4 -> v6, v6 -> v4 transitions. The helper will also - * resize the skb. eBPF program is expected to fill the - * new headers via skb_store_bytes and lX_csum_replace. - * @skb: pointer to skb - * @proto: new skb->protocol type - * @flags: reserved - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_change_proto, - - /** - * bpf_skb_change_type(skb, type) - * Change packet type of skb. - * @skb: pointer to skb - * @type: new skb->pkt_type type - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_change_type, - - /** - * bpf_skb_under_cgroup(skb, map, index) - Check cgroup2 membership of skb - * @skb: pointer to skb - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 skb failed the cgroup2 descendant test - * == 1 skb succeeded the cgroup2 descendant test - * < 0 error - */ - BPF_FUNC_skb_under_cgroup, - - /** - * bpf_get_hash_recalc(skb) - * Retrieve and possibly recalculate skb->hash. - * @skb: pointer to skb - * Return: hash - */ - BPF_FUNC_get_hash_recalc, - - /** - * u64 bpf_get_current_task(void) - * Returns current task_struct - * Return: current - */ - BPF_FUNC_get_current_task, - - /** - * bpf_probe_write_user(void *dst, void *src, int len) - * safely attempt to write to a location - * @dst: destination address in userspace - * @src: source address on stack - * @len: number of bytes to copy - * Return: 0 on success or negative error - */ - BPF_FUNC_probe_write_user, - - /** - * bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership of current task - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 current failed the cgroup2 descendant test - * == 1 current succeeded the cgroup2 descendant test - * < 0 error - */ - BPF_FUNC_current_task_under_cgroup, - - /** - * bpf_skb_change_tail(skb, len, flags) - * The helper will resize the skb to the given new size, - * to be used f.e. with control messages. 
- * @skb: pointer to skb - * @len: new skb length - * @flags: reserved - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_change_tail, - - /** - * bpf_skb_pull_data(skb, len) - * The helper will pull in non-linear data in case the - * skb is non-linear and not all of len are part of the - * linear section. Only needed for read/write with direct - * packet access. - * @skb: pointer to skb - * @len: len to make read/writeable - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_pull_data, - - /** - * bpf_csum_update(skb, csum) - * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. - * @skb: pointer to skb - * @csum: csum to add - * Return: csum on success or negative error - */ - BPF_FUNC_csum_update, - - /** - * bpf_set_hash_invalid(skb) - * Invalidate current skb>hash. - * @skb: pointer to skb - */ - BPF_FUNC_set_hash_invalid, - - /** - * bpf_get_numa_node_id() - * Returns the id of the current NUMA node. - */ - BPF_FUNC_get_numa_node_id, - + __BPF_FUNC_MAPPER(__BPF_ENUM_FN) __BPF_FUNC_MAX_ID, }; +#undef __BPF_ENUM_FN /* All flags used by eBPF helper functions, placed here. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 846d7ceaf202..900257578934 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19,6 +19,7 @@ #include #include #include +#include /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. @@ -190,6 +191,22 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +static const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + static void print_verifier_state(struct bpf_verifier_state *state) { struct bpf_reg_state *reg; @@ -354,7 +371,8 @@ static void print_bpf_insn(struct bpf_insn *insn) u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %d\n", insn->code, insn->imm); + verbose("(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose("(%02x) goto pc%+d\n", insn->code, insn->off); @@ -1114,8 +1132,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %d\n", - map->map_type, func_id); + verbose("cannot pass map_type %d into func %s#%d\n", + map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1172,7 +1190,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %d\n", func_id); + verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1180,7 +1198,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %d\n", func_id); + verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1200,7 +1218,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %d\n", func_id); + verbose("kernel subsystem misconfigured 
func %s#%d\n", + func_id_name(func_id), func_id); return err; } @@ -1256,8 +1275,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; } else { - verbose("unknown return type %d of func %d\n", - fn->ret_type, func_id); + verbose("unknown return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } -- cgit v1.2.3 From 0f98621bef5d2b7ad41f6595899660af344f5016 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 29 Oct 2016 02:30:46 +0200 Subject: bpf, inode: add support for symlinks and fix mtime/ctime While commit bb35a6ef7da4 ("bpf, inode: allow for rename and link ops") added support for hard links that can be used for prog and map nodes, this work adds simple symlink support, which can be used f.e. for directories also when unprivileged and works with cmdline tooling that understands S_IFLNK anyway. Since the switch in e27f4a942a0e ("bpf: Use mount_nodev not mount_ns to mount the bpf filesystem"), there can be various mount instances with mount_nodev() and thus hierarchy can be flattened to facilitate object sharing. Thus, we can also keep bpf tooling working by repointing paths. Most of the functionality can be used from vfs library operations. The symlink is stored in the inode itself, that is in i_link, which is sufficient in our case as opposed to storing it in the page cache. While at it, I noticed that bpf_mkdir() and bpf_mkobj() don't update the directory's mtime and ctime, so add a common helper for it called bpf_dentry_finalize() that takes care of it for all cases now. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 45 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1ed8473ec537..2565809fbb34 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -87,6 +87,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, switch (mode & S_IFMT) { case S_IFDIR: case S_IFREG: + case S_IFLNK: break; default: return ERR_PTR(-EINVAL); @@ -119,6 +120,16 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) return 0; } +static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + d_instantiate(dentry, inode); + dget(dentry); + + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; +} + static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -133,9 +144,7 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(inode); inc_nlink(dir); - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -151,9 +160,7 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, inode->i_op = iops; inode->i_private = dentry->d_fsdata; - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -181,13 +188,37 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); + return simple_lookup(dir, dentry, flags); } +static int bpf_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); + struct inode *inode; + + if (!link) + return -ENOMEM; + + inode =
bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); + if (IS_ERR(inode)) { + kfree(link); + return PTR_ERR(inode); + } + + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = link; + + bpf_dentry_finalize(dentry, inode, dir); + return 0; +} + static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, + .symlink = bpf_symlink, .rmdir = simple_rmdir, .rename = simple_rename, .link = simple_link, @@ -324,6 +355,8 @@ static void bpf_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); } -- cgit v1.2.3 From 405c0759712f57b680f66aee9c55cd06ad1cbdef Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 31 Oct 2016 08:11:43 -0700 Subject: fork: Add task stack refcounting sanity check and prevent premature task stack freeing If something goes wrong with task stack refcounting and a stack refcount hits zero too early, warn and leak it rather than potentially freeing it early (and silently). Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f29119c783a9680a4b4656e751b6123917ace94b.1477926663.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..997ac1d584f7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,9 @@ static void account_kernel_stack(struct task_struct *tsk, int account) static void release_task_stack(struct task_struct *tsk) { + if (WARN_ON(tsk->state != TASK_DEAD)) + return; /* Better to leak the stack than to free prematurely */ + account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); @@ -1862,6 +1865,7 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + p->state = TASK_DEAD; put_task_stack(p); free_task(p); fork_out: -- cgit v1.2.3 From 70fd76140a6cb63262bd47b68d57b42e889c10ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:10 -0600 Subject: block,fs: use REQ_* flags directly Remove the WRITE_* and READ_SYNC wrappers, and just use the flags directly. Where applicable this also drops usage of the bio_set_op_attrs wrapper. 
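For reference, the substitutions in the hunks below all follow the same mapping from the old wrappers (whose fs.h definitions this patch removes) to bare REQ_* flags. This table is an illustrative summary drawn from the replacements themselves, not a reproduction of the old wrapper definitions:

	WRITE_SYNC      -> REQ_SYNC
	WRITE_ODIRECT   -> REQ_SYNC | REQ_IDLE
	WRITE_FLUSH     -> REQ_PREFLUSH
	WRITE_FUA       -> REQ_FUA
	WRITE_FLUSH_FUA -> REQ_PREFLUSH | REQ_FUA
	READ_SYNC       -> 0 (reads are already treated as synchronous by op_is_sync())
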
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 ++-- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/xen-blkback/blkback.c | 10 ++++---- drivers/md/bcache/btree.c | 4 ++-- drivers/md/bcache/debug.c | 4 ++-- drivers/md/bcache/request.c | 2 +- drivers/md/bcache/super.c | 4 ++-- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-log.c | 2 +- drivers/md/dm-raid1.c | 4 ++-- drivers/md/dm-snap-persistent.c | 4 ++-- drivers/md/dm.c | 2 +- drivers/md/md.c | 4 ++-- drivers/md/raid5-cache.c | 4 ++-- drivers/md/raid5.c | 2 +- drivers/nvme/target/io-cmd.c | 4 ++-- drivers/target/target_core_iblock.c | 8 +++---- fs/btrfs/disk-io.c | 6 ++--- fs/btrfs/extent_io.c | 16 ++++++------- fs/btrfs/inode.c | 6 ++--- fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 2 +- fs/buffer.c | 8 +++---- fs/direct-io.c | 2 +- fs/ext4/mmp.c | 6 ++--- fs/ext4/page-io.c | 2 +- fs/ext4/super.c | 2 +- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 16 ++++++------- fs/f2fs/gc.c | 6 ++--- fs/f2fs/inline.c | 2 +- fs/f2fs/node.c | 4 ++-- fs/f2fs/segment.c | 8 +++---- fs/f2fs/super.c | 2 +- fs/gfs2/log.c | 4 ++-- fs/gfs2/meta_io.c | 6 ++--- fs/gfs2/ops_fstype.c | 2 +- fs/hfsplus/super.c | 4 ++-- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 9 +++---- fs/jbd2/journal.c | 15 ++++++------ fs/jbd2/revoke.c | 2 +- fs/jfs/jfs_logmgr.c | 4 ++-- fs/mpage.c | 6 ++--- fs/nilfs2/super.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/reiserfs/journal.c | 6 +++-- fs/xfs/xfs_aops.c | 11 +++++---- fs/xfs/xfs_buf.c | 2 +- include/linux/fs.h | 47 ------------------------------------- include/trace/events/f2fs.h | 10 ++++---- kernel/power/swap.c | 19 +++++++-------- 53 files changed, 133 insertions(+), 182 deletions(-) (limited to 'kernel') diff --git a/block/blk-flush.c b/block/blk-flush.c index 95f1d4d357df..d35beca18481 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - flush_rq->cmd_flags = REQ_OP_FLUSH | WRITE_FLUSH; + flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; @@ -486,7 +486,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio = bio_alloc(gfp_mask, 0); bio->bi_bdev = bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 942384f34e22..a89538cb3eaa 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1266,7 +1266,7 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont bio->bi_bdev = device->ldev->backing_bdev; bio->bi_private = octx; bio->bi_end_io = one_flush_endio; - bio_set_op_attrs(bio, REQ_OP_FLUSH, WRITE_FLUSH); + bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; device->flush_jif = jiffies; set_bit(FLUSH_PENDING, &device->flags); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 4a80ee752597..726c32e35db9 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1253,14 +1253,14 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, case BLKIF_OP_WRITE: ring->st_wr_req++; operation = REQ_OP_WRITE; - operation_flags = WRITE_ODIRECT; + operation_flags = REQ_SYNC | REQ_IDLE; break; case 
BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: ring->st_f_req++; operation = REQ_OP_WRITE; - operation_flags = WRITE_FLUSH; + operation_flags = REQ_PREFLUSH; break; default: operation = 0; /* make gcc happy */ @@ -1272,7 +1272,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, nseg = req->operation == BLKIF_OP_INDIRECT ? req->u.indirect.nr_segments : req->u.rw.nr_segments; - if (unlikely(nseg == 0 && operation_flags != WRITE_FLUSH) || + if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) || unlikely((req->operation != BLKIF_OP_INDIRECT) && (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) || unlikely((req->operation == BLKIF_OP_INDIRECT) && @@ -1334,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, } /* Wait on all outstanding I/O's and once that has been completed - * issue the WRITE_FLUSH. + * issue the flush. */ if (drain) xen_blk_drain_io(pending_req->ring); @@ -1380,7 +1380,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, /* This will be hit if the operation was a flush or discard. */ if (!bio) { - BUG_ON(operation_flags != WRITE_FLUSH); + BUG_ON(operation_flags != REQ_PREFLUSH); bio = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81d3db40cd7b..6fdd8e252760 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -297,7 +297,7 @@ static void bch_btree_node_read(struct btree *b) bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; bio->bi_end_io = btree_node_read_endio; bio->bi_private = &cl; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); + bio->bi_opf = REQ_OP_READ | REQ_META; bch_bio_map(bio, b->keys.set[0].data); @@ -393,7 +393,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); - bio_set_op_attrs(b->bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA); + b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; bch_bio_map(b->bio, i); /* diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 333a1e5f6ae6..1c9130ae0073 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -52,7 +52,7 @@ void bch_btree_verify(struct btree *b) bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); + bio->bi_opf = REQ_OP_READ | REQ_META; bch_bio_map(bio, sorted); submit_bio_wait(bio); @@ -113,7 +113,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_clone(bio, GFP_NOIO); if (!check) return; - bio_set_op_attrs(check, REQ_OP_READ, READ_SYNC); + check->bi_opf = REQ_OP_READ; if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e8a2b693c928..0d99b5f4b3e6 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -923,7 +923,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) flush->bi_bdev = bio->bi_bdev; flush->bi_end_io = request_endio; flush->bi_private = cl; - bio_set_op_attrs(flush, REQ_OP_WRITE, WRITE_FLUSH); + flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; closure_bio_submit(flush, cl); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 849ad441cd76..988edf928466 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -381,7 +381,7 @@ 
static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); - uuid_io(c, REQ_OP_READ, READ_SYNC, k, cl); + uuid_io(c, REQ_OP_READ, 0, k, cl); if (j->version < BCACHE_JSET_VERSION_UUIDv1) { struct uuid_entry_v0 *u0 = (void *) c->uuids; @@ -600,7 +600,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) ca->prio_last_buckets[bucket_nr] = bucket; bucket_nr++; - prio_io(ca, bucket, REQ_OP_READ, READ_SYNC); + prio_io(ca, bucket, REQ_OP_READ, 0); if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 125aedc3875f..b3ba142e59a4 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1316,7 +1316,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) { struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = WRITE_FLUSH, + .bi_op_flags = REQ_PREFLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = c->dm_io, diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 07fc1ad42ec5..33e71ea6cc14 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -308,7 +308,7 @@ static int flush_header(struct log_c *lc) }; lc->io_req.bi_op = REQ_OP_WRITE; - lc->io_req.bi_op_flags = WRITE_FLUSH; + lc->io_req.bi_op_flags = REQ_PREFLUSH; return dm_io(&lc->io_req, 1, &null_location, NULL); } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index bdf1606f67bc..1a176d7c8b90 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti) struct mirror *m; struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = WRITE_FLUSH, + .bi_op_flags = REQ_PREFLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = ms->io_client, @@ -657,7 +657,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct mirror *m; struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = bio->bi_opf & WRITE_FLUSH_FUA, + .bi_op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH), .mem.type = DM_IO_BIO, .mem.ptr.bio = bio, .notify.fn = write_callback, diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index b8cf956b577b..b93476c3ba3f 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -741,7 +741,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, /* * Commit exceptions to disk. 
*/ - if (ps->valid && area_io(ps, REQ_OP_WRITE, WRITE_FLUSH_FUA)) + if (ps->valid && area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA)) ps->valid = 0; /* @@ -818,7 +818,7 @@ static int persistent_commit_merge(struct dm_exception_store *store, for (i = 0; i < nr_merged; i++) clear_exception(ps, ps->current_committed - 1 - i); - r = area_io(ps, REQ_OP_WRITE, WRITE_FLUSH_FUA); + r = area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA); if (r < 0) return r; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 147af9536d0c..b2abfa41af3e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1527,7 +1527,7 @@ static struct mapped_device *alloc_dev(int minor) bio_init(&md->flush_bio); md->flush_bio.bi_bdev = md->bdev; - bio_set_op_attrs(&md->flush_bio, REQ_OP_WRITE, WRITE_FLUSH); + md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; dm_stats_init(&md->stats); diff --git a/drivers/md/md.c b/drivers/md/md.c index eac84d8ff724..b69ec7da4bae 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -394,7 +394,7 @@ static void submit_flushes(struct work_struct *ws) bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; - bio_set_op_attrs(bi, REQ_OP_WRITE, WRITE_FLUSH); + bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; atomic_inc(&mddev->flush_pending); submit_bio(bi); rcu_read_lock(); @@ -743,7 +743,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA; atomic_inc(&mddev->pending_writes); submit_bio(bio); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 1b1ab4a1d132..28d015c6fffe 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -685,7 +685,7 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) bio_reset(&log->flush_bio); log->flush_bio.bi_bdev = log->rdev->bdev; log->flush_bio.bi_end_io = r5l_log_flush_endio; - bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH); + log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(&log->flush_bio); } @@ -1053,7 +1053,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, mb->checksum = cpu_to_le32(crc); if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, - WRITE_FUA, false)) { + REQ_FUA, false)) { __free_page(page); return -EIO; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 92ac251e91e6..70acdd379e44 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -913,7 +913,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { op = REQ_OP_WRITE; if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) - op_flags = WRITE_FUA; + op_flags = REQ_FUA; if (test_bit(R5_Discard, &sh->dev[i].flags)) op = REQ_OP_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 4a96c2049b7b..c2784cfc5e29 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -58,7 +58,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) if (req->cmd->rw.opcode == nvme_cmd_write) { op = REQ_OP_WRITE; - op_flags = WRITE_ODIRECT; + op_flags = REQ_SYNC | REQ_IDLE; if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) op_flags |= REQ_FUA; } else { @@ -109,7 +109,7 @@ static void nvmet_execute_flush(struct nvmet_req *req) bio->bi_bdev = req->ns->bdev; bio->bi_private 
= req; bio->bi_end_io = nvmet_bio_done; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(bio); } diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 372d744315f3..d316ed537d59 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -388,7 +388,7 @@ iblock_execute_sync_cache(struct se_cmd *cmd) bio = bio_alloc(GFP_KERNEL, 0); bio->bi_end_io = iblock_end_io_flush; bio->bi_bdev = ib_dev->ibd_bd; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; if (!immed) bio->bi_private = cmd; submit_bio(bio); @@ -686,15 +686,15 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, struct iblock_dev *ib_dev = IBLOCK_DEV(dev); struct request_queue *q = bdev_get_queue(ib_dev->ibd_bd); /* - * Force writethrough using WRITE_FUA if a volatile write cache + * Force writethrough using REQ_FUA if a volatile write cache * is not enabled, or if initiator set the Force Unit Access bit. */ op = REQ_OP_WRITE; if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) { if (cmd->se_cmd_flags & SCF_FUA) - op_flags = WRITE_FUA; + op_flags = REQ_FUA; else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) - op_flags = WRITE_FUA; + op_flags = REQ_FUA; } } else { op = REQ_OP_READ; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c8454a8e35f2..fe10afd51e02 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3485,9 +3485,9 @@ static int write_dev_supers(struct btrfs_device *device, * to go down lazy. */ if (i == 0) - ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh); else - ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); if (ret) errors++; } @@ -3551,7 +3551,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 66a755150056..ff87bff7bdb6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -127,7 +127,7 @@ struct extent_page_data { */ unsigned int extent_locked:1; - /* tells the submit_bio code to use a WRITE_SYNC */ + /* tells the submit_bio code to use REQ_SYNC */ unsigned int sync_io:1; }; @@ -2047,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(bio)) { @@ -2388,7 +2388,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct inode *inode = page->mapping->host; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct bio *bio; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2404,9 +2404,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, } if (failed_bio->bi_vcnt > 1) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + read_mode |= REQ_FAILFAST_DEV; phy_offset >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, @@ -3484,7 
+3482,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; + write_flags = REQ_SYNC; trace___extent_writepage(page, inode, wbc); @@ -3729,7 +3727,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, unsigned long i, num_pages; unsigned long bio_flags = 0; unsigned long start, end; - int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META; + int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -4076,7 +4074,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) int ret; bio_set_op_attrs(epd->bio, REQ_OP_WRITE, - epd->sync_io ? WRITE_SYNC : 0); + epd->sync_io ? REQ_SYNC : 0); ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9a377079af26..c8eb82a416b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7917,7 +7917,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec; struct bio *bio; int isector; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7936,9 +7936,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, if ((failed_bio->bi_vcnt > 1) || (failed_bio->bi_io_vec->bv_len > BTRFS_I(inode)->root->sectorsize)) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; isector >>= inode->i_sb->s_blocksize_bits; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fffb9ab8526e..ff3078234d94 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -4440,7 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { leave_with_eio: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index deda46cf1292..0d7d635d8bfb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6023,7 +6023,7 @@ static void btrfs_end_bio(struct bio *bio) else btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_READ_ERRS); - if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH) + if (bio->bi_opf & REQ_PREFLUSH) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_FLUSH_ERRS); btrfs_dev_stat_print_on_error(dev); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 09ed29c67848..f137ffe6654c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -62,7 +62,7 @@ struct btrfs_device { int running_pending; /* regular prio bios */ struct btrfs_pending_bios pending_bios; - /* WRITE_SYNC bios */ + /* sync bios */ struct btrfs_pending_bios pending_sync_bios; struct block_device *bdev; diff --git a/fs/buffer.c b/fs/buffer.c index a29335867e30..bc7c2bb30a9b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -753,7 +753,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * still in flight on potentially older * contents. */ - write_dirty_buffer(bh, WRITE_SYNC); + write_dirty_buffer(bh, REQ_SYNC); /* * Kick off IO for the previous mapping. Note @@ -1684,7 +1684,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * * prevents this contention from occurring. 
* * If block_write_full_page() is called with wbc->sync_mode == - * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ int __block_write_full_page(struct inode *inode, struct page *page, @@ -1697,7 +1697,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -3210,7 +3210,7 @@ EXPORT_SYMBOL(__sync_dirty_buffer); int sync_dirty_buffer(struct buffer_head *bh) { - return __sync_dirty_buffer(bh, WRITE_SYNC); + return __sync_dirty_buffer(bh, REQ_SYNC); } EXPORT_SYMBOL(sync_dirty_buffer); diff --git a/fs/direct-io.c b/fs/direct-io.c index fb9aa16a7727..a5138c564019 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1209,7 +1209,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->inode = inode; if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; - dio->op_flags = WRITE_ODIRECT; + dio->op_flags = REQ_SYNC | REQ_IDLE; } else { dio->op = REQ_OP_READ; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d89754ef1aab..eb9835638680 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -35,7 +35,7 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) } /* - * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * Write the MMP block using REQ_SYNC to try to get the block on-disk * faster. */ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) @@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { ret = -EIO; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0094923e5ebf..e0b3b54cdef3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -340,7 +340,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0; + REQ_SYNC : 0; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6db81fbcbaa6..f31eb286af90 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4553,7 +4553,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) unlock_buffer(sbh); if (sync) { error = __sync_dirty_buffer(sbh, - test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); + test_opt(sb, BARRIER) ? 
REQ_FUA : REQ_SYNC); if (error) return error; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7e9b504bd8b2..d935c06a84f0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -65,7 +65,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = READ_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, @@ -160,7 +160,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, + .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ae194fd2fdb..b80bf10603d7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -198,11 +198,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - if (test_opt(sbi, NOBARRIER)) - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META | - REQ_PRIO; + io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -483,7 +481,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, 0, false); if (IS_ERR(page)) return page; @@ -509,7 +507,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, 0, for_write); if (IS_ERR(page)) return page; @@ -1251,7 +1249,7 @@ static int f2fs_write_data_page(struct page *page, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? 
REQ_SYNC : 0, .page = page, .encrypted_page = NULL, }; @@ -1663,7 +1661,7 @@ repeat: err = PTR_ERR(bio); goto fail; } - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 93985c64d8a8..9eb11b2244ea 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -550,7 +550,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_READ, - .op_flags = READ_SYNC, + .op_flags = 0, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -625,7 +625,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) f2fs_wait_on_page_writeback(dn.node_page, NODE, true); fio.op = REQ_OP_WRITE; - fio.op_flags = WRITE_SYNC; + fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -663,7 +663,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC, + .op_flags = REQ_SYNC, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 5f1a67f756af..2e7f54c191b4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -111,7 +111,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .sbi = F2FS_I_SB(dn->inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 01177ecdeab8..932f3f8bb57b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1134,7 +1134,7 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, 0); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1570,7 +1570,7 @@ static int f2fs_write_node_page(struct page *page, .sbi = sbi, .type = NODE, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? 
REQ_SYNC : 0, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fc886f008449..f1b4a1775ebe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -259,7 +259,7 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -420,7 +420,7 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); llist_for_each_entry_safe(cmd, next, @@ -454,7 +454,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) atomic_inc(&fcc->submit_flush); bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); atomic_dec(&fcc->submit_flush); bio_put(bio); @@ -1515,7 +1515,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6132b4ce4e4c..2cac6bb86080 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1238,7 +1238,7 @@ static int __f2fs_commit_super(struct buffer_head *bh, unlock_buffer(bh); /* it's rare case, we can do fua all the time */ - return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e58ccef09c91..27c00a16def0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int op_flags = WRITE_FLUSH_FUA | REQ_META; + int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); @@ -682,7 +682,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); - op_flags = WRITE_SYNC | REQ_META | REQ_PRIO; + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; } sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 373639a59782..e562b1191c9c 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -38,7 +38,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb struct buffer_head *bh, *head; int nr_underway = 0; int write_flags = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + (wbc->sync_mode == WB_SYNC_ALL ? 
REQ_SYNC : 0); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -285,7 +285,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } } - gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num); + gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; @@ -453,7 +453,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ff72ac6439c8..a34308df927f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -246,7 +246,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_META); submit_bio(bio); wait_on_page_locked(page); bio_put(bio); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 11854dd84572..67aedf4c2e7c 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error = error2; if (!write_backup) @@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error2 = error; out: diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 684996c8a3a4..4055f51617ef 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -186,7 +186,7 @@ __flush_batch(journal_t *journal, int *batch_count) blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC); blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 31f8ca046639..8c514367ba5a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -155,9 +155,10 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh); + ret = submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh); else - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); *cbh = bh; return ret; @@ -402,7 +403,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_SYNC); + REQ_SYNC); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); @@ -717,7 +718,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); } cond_resched(); stats.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 927da4956a89..8ed971eeab44 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -913,7 +913,7 @@ int __jbd2_update_log_tail(journal_t 
*journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); if (ret) goto out; @@ -1306,7 +1306,7 @@ static int journal_reset(journal_t *journal) /* Lock here to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); /* - * Update log tail information. We use WRITE_FUA since new + * Update log tail information. We use REQ_FUA since new * transaction will start reusing journal space and so we * must make sure information about current log tail is on * disk before that. @@ -1314,7 +1314,7 @@ static int journal_reset(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_FUA); + REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1454,7 +1454,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_FUA); + jbd2_write_superblock(journal, REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1720,7 +1720,8 @@ int jbd2_journal_destroy(journal_t *journal) ++journal->j_transaction_sequence; write_unlock(&journal->j_state_lock); - jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); + jbd2_mark_journal_empty(journal, + REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1979,7 +1980,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2025,7 +2026,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... 
*/ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 91171dc352cb..cfc38b552118 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -648,7 +648,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, WRITE_SYNC); + write_dirty_buffer(descriptor, REQ_SYNC); } #endif diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a21ea8b3e5fa..bb1da1feafeb 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2002,7 +2002,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; @@ -2146,7 +2146,7 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; /* check if journaling to disk has been disabled */ if (log->no_integrity) { diff --git a/fs/mpage.c b/fs/mpage.c index d2413af0823a..f35e2819d0c6 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -705,7 +705,7 @@ mpage_writepages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } } @@ -726,7 +726,7 @@ int mpage_writepage(struct page *page, get_block_t get_block, int ret = __mpage_writepage(page, wbc, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? 
- WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } return ret; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c95d369e90aa..12eeae62a2b1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -189,7 +189,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag) set_buffer_dirty(nilfs->ns_sbh[0]); if (nilfs_test_opt(nilfs, BARRIER)) { err = __sync_dirty_buffer(nilfs->ns_sbh[0], - WRITE_SYNC | WRITE_FLUSH_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } else { err = sync_dirty_buffer(nilfs->ns_sbh[0]); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 636abcbd4650..52eef16edb01 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -627,7 +627,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index bc2dde2423c2..aa40c242f1db 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1111,7 +1111,8 @@ static int flush_commit_list(struct super_block *s, mark_buffer_dirty(jl->j_commit_bh) ; depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) - __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(jl->j_commit_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock_nested(s, depth); @@ -1269,7 +1270,8 @@ static int _update_journal_header_block(struct super_block *sb, depth = reiserfs_write_unlock_nested(sb); if (reiserfs_barrier_flush(sb)) - __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(journal->j_header_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(journal->j_header_bh); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3e57a56cf829..594e02c485b2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -495,8 +495,10 @@ xfs_submit_ioend( ioend->io_bio->bi_private = ioend; ioend->io_bio->bi_end_io = xfs_end_bio; - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE; + if (wbc->sync_mode == WB_SYNC_ALL) + ioend->io_bio->bi_opf |= REQ_SYNC; + /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -567,8 +569,9 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE; + if (wbc->sync_mode == WB_SYNC_ALL) + ioend->io_bio->bi_opf |= REQ_SYNC; submit_bio(ioend->io_bio); ioend->io_bio = new; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b5b9bffe3520..33c435f3316c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1304,7 +1304,7 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; if (bp->b_flags & XBF_SYNCIO) - op_flags = WRITE_SYNC; + op_flags = REQ_SYNC; if (bp->b_flags & XBF_FUA) op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) diff --git a/include/linux/fs.h b/include/linux/fs.h index 46a74209917f..7a1b78ab7c15 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -151,58 +151,11 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, */ #define CHECK_IOVEC_ONLY -1 -/* - * The below are the various read and write flags that we support. Some of - * them include behavioral modifiers that send information down to the - * block layer and IO scheduler. They should be used along with a req_op. - * Terminology: - * - * The block layer uses device plugging to defer IO a little bit, in - * the hope that we will see more IO very shortly. This increases - * coalescing of adjacent IO and thus reduces the number of IOs we - * have to send to the device. It also allows for better queuing, - * if the IO isn't mergeable. If the caller is going to be waiting - * for the IO, then he must ensure that the device is unplugged so - * that the IO is dispatched to the driver. - * - * All IO is handled async in Linux. This is fine for background - * writes, but for reads or writes that someone waits for completion - * on, we want to notify the block layer and IO scheduler so that they - * know about it. That allows them to make better scheduling - * decisions. So when the below references 'sync' and 'async', it - * is referencing this priority hint. - * - * With that in mind, the available types are: - * - * READ A normal read operation. Device will be plugged. - * READ_SYNC A synchronous read. Device is not plugged, caller can - * immediately wait on this read without caring about - * unplugging. - * WRITE A normal async write. Device will be plugged. - * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down - * the hint that someone will be waiting on this IO - * shortly. The write equivalent of READ_SYNC. - * WRITE_ODIRECT Special case write for O_DIRECT only. - * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. - * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on - * non-volatile media on completion. - * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded - * by a cache flush and data is guaranteed to be on - * non-volatile media on completion. - * - */ #define RW_MASK REQ_OP_WRITE #define READ REQ_OP_READ #define WRITE REQ_OP_WRITE -#define READ_SYNC 0 -#define WRITE_SYNC REQ_SYNC -#define WRITE_ODIRECT (REQ_SYNC | REQ_IDLE) -#define WRITE_FLUSH REQ_PREFLUSH -#define WRITE_FUA REQ_FUA -#define WRITE_FLUSH_FUA (REQ_PREFLUSH | REQ_FUA) - /* * Attribute flags. These should be or-ed together to figure out what * has been changed! 
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index a9d34424450d..5da2c829a718 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -55,7 +55,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) -#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | WRITE_FLUSH_FUA)) +#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | REQ_PREFLUSH | REQ_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) #define show_bio_type(op_flags) show_bio_op_flags(op_flags), \ @@ -65,11 +65,9 @@ TRACE_DEFINE_ENUM(CP_DISCARD); __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \ { 0, "WRITE" }, \ { REQ_RAHEAD, "READAHEAD" }, \ - { READ_SYNC, "READ_SYNC" }, \ - { WRITE_SYNC, "WRITE_SYNC" }, \ - { WRITE_FLUSH, "WRITE_FLUSH" }, \ - { WRITE_FUA, "WRITE_FUA" }, \ - { WRITE_FLUSH_FUA, "WRITE_FLUSH_FUA" }) + { REQ_SYNC, "REQ_SYNC" }, \ + { REQ_PREFLUSH, "REQ_PREFLUSH" }, \ + { REQ_FUA, "REQ_FUA" }) #define show_bio_extra(type) \ __print_symbolic(F2FS_BIO_EXTRA_MASK(type), \ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a3b1e617bcdc..32e0c232efba 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -307,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { @@ -317,7 +317,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -397,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) } else { src = buf; } - return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb); + return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -1000,8 +1000,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, - tmp->map, NULL); + error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -1025,7 +1024,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb); + error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1534,7 +1533,7 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, READ_SYNC, + error = hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (error) @@ -1543,7 +1542,7 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { @@ -1588,11 +1587,11 @@ int swsusp_unmark(void) 
{ int error; - hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { -- cgit v1.2.3 From 285fdfc5d9959a2104021b6bbdec39b8c26e99ef Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Tue, 20 Sep 2016 19:39:47 +0200 Subject: seccomp: Fix documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix struct seccomp_filter and seccomp_run_filters() signatures. Signed-off-by: Mickaël Salaün Cc: Andy Lutomirski Cc: James Morris Cc: Kees Cook Cc: Will Drewry Signed-off-by: Kees Cook --- kernel/seccomp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0db7c8a2afe2..494cba230ca0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -41,8 +41,7 @@ * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter - * @len: the number of instructions in the program - * @insnsi: the BPF program instructions to evaluate + * @prog: the BPF program to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -168,8 +167,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) } /** - * seccomp_run_filters - evaluates all seccomp filters against @syscall - * @syscall: number of the current system call + * seccomp_run_filters - evaluates all seccomp filters against @sd + * @sd: optional seccomp data to be passed to filters * * Returns valid seccomp BPF response codes. */ -- cgit v1.2.3 From ceb75787bc75d0a7b88519ab8a68067ac690f55a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 1 Nov 2016 11:49:56 +0100 Subject: PM / sleep: fix device reference leak in test_suspend Make sure to drop the reference taken by class_find_device() after opening the RTC device. Fixes: 77437fd4e61f (pm: boot time suspend selftest) Signed-off-by: Johan Hovold Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend_test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -203,8 +203,10 @@ static int __init test_suspend(void) /* RTCs have initialized by now too ... can we use one? */ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); - if (dev) + if (dev) { rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } if (!rtc) { printk(warn_no_rtc); return 0; -- cgit v1.2.3 From 382005027fedc50b28d40ae64ef1461cca38953e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 2 Nov 2016 19:50:29 +0900 Subject: sched/core: Fix oops in sched_show_task() When CONFIG_THREAD_INFO_IN_TASK=y, it is possible that an exited thread remains in the task list after its stack pointer was already set to NULL. Therefore, thread_saved_pc() and stack_not_used() in sched_show_task() will trigger NULL pointer dereference if an attempt to dump such thread's traces (e.g. SysRq-t, khungtaskd) is made. 
Since show_stack() in sched_show_task() calls try_get_task_stack() and sched_show_task() is called from interrupt context, calling try_get_task_stack() from sched_show_task() will be safe as well. Signed-off-by: Tetsuo Handa Acked-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@alien8.de Cc: brgerst@gmail.com Cc: jann@thejh.net Cc: keescook@chromium.org Cc: linux-api@vger.kernel.org Cc: tycho.andersen@canonical.com Link: http://lkml.kernel.org/r/201611021950.FEJ34368.HFFJOOMLtQOVSF@I-love.SAKURA.ne.jp Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 42d4027f9e26..9abf66b6c271 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5192,6 +5192,8 @@ void sched_show_task(struct task_struct *p) int ppid; unsigned long state = p->state; + if (!try_get_task_stack(p)) + return; if (state) state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, @@ -5221,6 +5223,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); show_stack(p, NULL); + put_task_stack(p); } void show_state_filter(unsigned long state_filter) -- cgit v1.2.3 From 8243d5597793b5e85143c9a935e1b971c59740a9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 1 Nov 2016 17:47:18 -0600 Subject: sched/core: Remove pointless printout in sched_show_task() In sched_show_task() we print out a useless hex number, not even a symbol, and there's a big question mark whether this even makes sense anyway; I suspect we should just remove it all. Signed-off-by: Linus Torvalds Acked-by: Andy Lutomirski Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: bp@alien8.de Cc: brgerst@gmail.com Cc: jann@thejh.net Cc: keescook@chromium.org Cc: linux-api@vger.kernel.org Cc: tycho.andersen@canonical.com Link: http://lkml.kernel.org/r/CA+55aFzphURPFzAvU4z6Moy7ZmimcwPuUdYU8bj9z0J+S8X1rw@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9abf66b6c271..154fd689fe02 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5198,17 +5198,8 @@ void sched_show_task(struct task_struct *p) state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else if (state == TASK_RUNNING) printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif -- cgit v1.2.3 From 243d52126184b072a18fe2130ce0008f8aa3a340 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 3 Nov 2016 09:42:36 -0700 Subject: taskstats: fix the length of cgroupstats_cmd_get_policy cgroupstats_cmd_get_policy is [CGROUPSTATS_CMD_ATTR_MAX+1] and taskstats_cmd_get_policy is [TASKSTATS_CMD_ATTR_MAX+1], but their family.maxattr is TASKSTATS_CMD_ATTR_MAX. CGROUPSTATS_CMD_ATTR_MAX is less than TASKSTATS_CMD_ATTR_MAX, so we could end up with an out-of-bounds access. Change cgroupstats_cmd_get_policy to TASKSTATS_CMD_ATTR_MAX+1; this is safe because the rest are initialized to 0's. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- kernel/taskstats.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..cbb387a265db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { +/* + * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. + * Make sure they are always aligned. + */ +static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From 483bed2b0ddd12ec33fc9407e0c6e1088e77a97c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Nov 2016 00:01:19 +0100 Subject: bpf: fix htab map destruction when extra reserve is in use Commit a6ed3ea65d98 ("bpf: restore behavior of bpf_map_update_elem") added an extra per-cpu reserve to the hash table map to restore the old behaviour from pre-prealloc times. When non-prealloc is in use for a map, the problem is that once a hash table extra element has been linked into the hash table and the hash table is destroyed due to its refcount dropping to zero, htab_map_free() -> delete_all_elements() will walk the whole hash table and drop all elements via htab_elem_free(). The element from the extra reserve is then first fed to the wrong backend allocator and eventually freed twice. Fixes: a6ed3ea65d98 ("bpf: restore behavior of bpf_map_update_elem") Reported-by: Dmitry Vyukov Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 570eeca7bdfa..ad1bc67aff1b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab_elem_free(htab, l); + if (l->state != HTAB_EXTRA_ELEM_USED) + htab_elem_free(htab, l); } } } -- cgit v1.2.3 From 20b2b24f91f70e7d3f0918c077546cb21bd73a87 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Nov 2016 00:56:31 +0100 Subject: bpf: fix map not being uncharged during map creation failure In map_create(), we first find and create the map, then once that has succeeded, we charge it to the user's RLIMIT_MEMLOCK, and then fetch a new anon fd through anon_inode_getfd(). The problem is that once the latter fails, e.g. due to the RLIMIT_NOFILE limit, we only destroy the map via map->ops->map_free(), but without uncharging the previously locked memory first. That means that the user_struct allocation is leaked and the accounted RLIMIT_MEMLOCK memory is never released. Make the label names in the fix consistent with bpf_prog_load(). Fixes: aaac3ba95e4c ("bpf: charge user for creation of BPF maps and programs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962447a5..237f3d6a7ddc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -194,7 +194,7 @@ static int map_create(union bpf_attr *attr) err = bpf_map_charge_memlock(map); if (err) - goto free_map; + goto free_map_nouncharge; err = bpf_map_new_fd(map); if (err < 0) @@ -204,6 +204,8 @@ static int map_create(union bpf_attr *attr) return err; free_map: + bpf_map_uncharge_memlock(map); +free_map_nouncharge: map->ops->map_free(map); return err; } -- cgit v1.2.3 From 7ee7e87dfb158e79019ea1d5ea1b0e6f2bc93ee4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Nov 2016 19:57:00 +0100 Subject: genirq: Use irq type from irqdata instead of irqdesc The type flags in the irq descriptor are there for historical reasons and are only updated via irq_modify_status() or irq_set_type(). Both functions also update the type flags in irqdata. __setup_irq() is the only leftover user of the type flags in the irq descriptor. If __setup_irq() is called with empty irq type flags, then the type flags are retrieved from irqdata. If an interrupt is shared, then the type flags are compared with the type flags stored in the irq descriptor. On x86 the ioapic does not have an irq_set_type() callback because the type is defined in the BIOS tables and cannot be changed. The type is stored in irqdata at setup time without updating the type data in the irq descriptor. As a result the comparison described above fails. There is no point in updating the irq descriptor flags because the only relevant storage is irqdata. Use the type flags from irqdata for both retrieval and comparison in __setup_irq() instead. Aside from that, the printout in case of non-matching type flags has the old and new type flags arguments flipped. Fix that as well. For correctness' sake the flags stored in the irq descriptor should be removed, but this is beyond the scope of this bugfix and will be done in a later patch. Fixes: 4b357daed698 ("genirq: Look-up trigger type if not specified by caller") Reported-and-tested-by: Mika Westerberg Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jon Hunter Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1611072020360.3501@nanos Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9c4d30483264..6b669593e7eb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1341,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; - unsigned int omsk = irq_settings_get_trigger_mask(desc); + unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + irq, omsk, nmsk); } *old_ptr = new; -- cgit v1.2.3 From 212bd846223c718b6577d4df16fd8d05a55ad914 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2016 17:15:02 -0800 Subject: genirq/affinity: Handle pre/post vectors in irq_calc_affinity_vectors() Only calculate the affinity for the main I/O vectors, and skip the pre or post vectors specified by struct irq_affinity.
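[Editor's note: a worked example of the new accounting, inserted between the paragraphs of the commit message above. The snippet is an illustrative sketch, not part of the patch; example_calc() and the vector counts are made-up assumptions, while irq_calc_affinity_vectors() and struct irq_affinity are the interfaces this series introduces.]

#include <linux/interrupt.h>

static int example_calc(void)
{
	/*
	 * Reserve the first and last vector (e.g. for an admin queue);
	 * only the remaining vectors are spread across CPUs.
	 */
	static const struct irq_affinity affd = {
		.pre_vectors	= 1,
		.post_vectors	= 1,
	};

	/* On an 8-CPU machine this returns min(8, 16 - 2) + 2 == 10 */
	return irq_calc_affinity_vectors(16, &affd);
}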
Also remove the irq_affinity cpumask argument that has never been used. If we ever need it in the future we can pass it through struct irq_affinity. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Jens Axboe Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Link: http://lkml.kernel.org/r/1478654107-7384-3-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- drivers/pci/msi.c | 8 ++++---- include/linux/interrupt.h | 4 ++-- kernel/irq/affinity.c | 24 ++++++++++-------------- 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index ad70507cfb56..dad2da7cf80e 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1061,6 +1061,7 @@ EXPORT_SYMBOL(pci_msi_enabled); static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, unsigned int flags) { + static const struct irq_affinity default_affd; bool affinity = flags & PCI_IRQ_AFFINITY; int nvec; int rc; @@ -1091,8 +1092,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, for (;;) { if (affinity) { - nvec = irq_calc_affinity_vectors(dev->irq_affinity, - nvec); + nvec = irq_calc_affinity_vectors(nvec, &default_affd); if (nvec < minvec) return -ENOSPC; } @@ -1132,6 +1132,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, int maxvec, unsigned int flags) { + static const struct irq_affinity default_affd; bool affinity = flags & PCI_IRQ_AFFINITY; int rc, nvec = maxvec; @@ -1140,8 +1141,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, for (;;) { if (affinity) { - nvec = irq_calc_affinity_vectors(dev->irq_affinity, - nvec); + nvec = irq_calc_affinity_vectors(nvec, &default_affd); if (nvec < minvec) return -ENOSPC; } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6b5268688a81..9081f23bc0ff 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -291,7 +291,7 @@ extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec); -int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec); +int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ @@ -331,7 +331,7 @@ irq_create_affinity_masks(const struct cpumask *affinity, int nvec) } static inline int -irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) { return maxvec; } diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 17f51d63da56..8d9259727cb4 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -131,24 +131,20 @@ out: } /** - * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask - * @affinity: The affinity mask to spread. 
If NULL cpu_online_mask - * is used - * @maxvec: The maximum number of vectors available + * irq_calc_affinity_vectors - Calculate the optimal number of vectors + * @maxvec: The maximum number of vectors available + * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) { - int cpus, ret; + int resv = affd->pre_vectors + affd->post_vectors; + int vecs = maxvec - resv; + int cpus; /* Stabilize the cpumasks */ get_online_cpus(); - /* If the supplied affinity mask is NULL, use cpu online mask */ - if (!affinity) - affinity = cpu_online_mask; - - cpus = cpumask_weight(affinity); - ret = (cpus < maxvec) ? cpus : maxvec; - + cpus = cpumask_weight(cpu_online_mask); put_online_cpus(); - return ret; + + return min(cpus, vecs) + resv; } -- cgit v1.2.3 From 67c93c218dc5d1b45d547771f1fdb44a381e1faf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2016 17:15:03 -0800 Subject: genirq/affinity: Handle pre/post vectors in irq_create_affinity_masks() Only calculate the affinity for the main I/O vectors, and skip the pre or post vectors specified by struct irq_affinity. Also remove the irq_affinity cpumask argument that has never been used. If we ever need it in the future we can pass it through struct irq_affinity. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Bjorn Helgaas Acked-by: Jens Axboe Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Link: http://lkml.kernel.org/r/1478654107-7384-4-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- drivers/pci/msi.c | 6 ++++-- include/linux/interrupt.h | 4 ++-- kernel/irq/affinity.c | 46 +++++++++++++++++++++++++--------------------- 3 files changed, 31 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index dad2da7cf80e..f4a108b59336 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -553,12 +553,13 @@ error_attrs: static struct msi_desc * msi_setup_entry(struct pci_dev *dev, int nvec, bool affinity) { + static const struct irq_affinity default_affd; struct cpumask *masks = NULL; struct msi_desc *entry; u16 control; if (affinity) { - masks = irq_create_affinity_masks(dev->irq_affinity, nvec); + masks = irq_create_affinity_masks(nvec, &default_affd); if (!masks) pr_err("Unable to allocate affinity masks, ignoring\n"); } @@ -692,12 +693,13 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, struct msix_entry *entries, int nvec, bool affinity) { + static const struct irq_affinity default_affd; struct cpumask *curmsk, *masks = NULL; struct msi_desc *entry; int ret, i; if (affinity) { - masks = irq_create_affinity_masks(dev->irq_affinity, nvec); + masks = irq_create_affinity_masks(nvec, &default_affd); if (!masks) pr_err("Unable to allocate affinity masks, ignoring\n"); } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 9081f23bc0ff..53144e78a369 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -290,7 +290,7 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); -struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec); +struct cpumask *irq_create_affinity_masks(int nvec, const struct irq_affinity *affd); int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd); #else 
/* CONFIG_SMP */ @@ -325,7 +325,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) } static inline struct cpumask * -irq_create_affinity_masks(const struct cpumask *affinity, int nvec) +irq_create_affinity_masks(int nvec, const struct irq_affinity *affd) { return NULL; } diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 8d9259727cb4..17360bd9619b 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -51,16 +51,16 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) /** * irq_create_affinity_masks - Create affinity masks for multiqueue spreading - * @affinity: The affinity mask to spread. If NULL cpu_online_mask - * is used - * @nvecs: The number of vectors + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements * * Returns the masks pointer or NULL if allocation failed. */ -struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, - int nvec) +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; + int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; + int affv = nvecs - affd->pre_vectors - affd->post_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; cpumask_var_t nmsk; @@ -68,46 +68,46 @@ struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) goto out; + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, cpu_possible_mask); + /* Stabilize the cpumasks */ get_online_cpus(); - /* If the supplied affinity mask is NULL, use cpu online mask */ - if (!affinity) - affinity = cpu_online_mask; - - nodes = get_nodes_in_cpumask(affinity, &nodemsk); + nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); /* * If the number of nodes in the mask is less than or equal the * number of vectors we just spread the vectors across the nodes. 
*/ - if (nvec <= nodes) { + if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, cpumask_of_node(n)); - if (++curvec == nvec) + if (++curvec == affv) break; } - goto outonl; + goto done; } /* Spread the vectors per node */ - vecs_per_node = nvec / nodes; + vecs_per_node = affv / nodes; /* Account for rounding errors */ - extra_vecs = nvec - (nodes * vecs_per_node); + extra_vecs = affv - (nodes * vecs_per_node); for_each_node_mask(n, nodemsk) { int ncpus, v, vecs_to_assign = vecs_per_node; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, affinity, cpumask_of_node(n)); + cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); - for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { + for (v = 0; curvec < affv && v < vecs_to_assign; curvec++, v++) { cpus_per_vec = ncpus / vecs_to_assign; /* Account for extra vectors to compensate rounding errors */ @@ -119,12 +119,16 @@ struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= nvec) + if (curvec >= affv) break; } -outonl: +done: put_online_cpus(); + + /* Fill out vectors at the end that don't need affinity */ + for (; curvec < nvecs; curvec++) + cpumask_copy(masks + curvec, cpu_possible_mask); out: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3 From 90b14889d2f9b29d7e5b4b2d36251c13ce3dd13f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 3 Nov 2016 15:49:58 +0100 Subject: kernel/printk: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161103145021.28528-3-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + kernel/printk/printk.c | 29 +++++++++++++---------------- 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 69b74fa0da60..4174083280d7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -31,6 +31,7 @@ enum cpuhp_state { CPUHP_S390_PFAULT_DEAD, CPUHP_BLK_MQ_DEAD, CPUHP_FS_BUFF_DEAD, + CPUHP_PRINTK_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index de08fc90baaf..4487ffcd42d5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2185,27 +2185,18 @@ void resume_console(void) /** * console_cpu_notify - print deferred console messages after CPU hotplug - * @self: notifier struct - * @action: CPU hotplug event - * @hcpu: unused + * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be spooled but will not show up on the console. This function is * called when a new CPU comes online (or fails to come up), and ensures * that any such output gets printed. 
*/ -static int console_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - case CPU_DOWN_FAILED: - case CPU_UP_CANCELED: - console_lock(); - console_unlock(); - } - return NOTIFY_OK; +static int console_cpu_notify(unsigned int cpu) +{ + console_lock(); + console_unlock(); + return 0; } /** @@ -2843,6 +2834,7 @@ EXPORT_SYMBOL(unregister_console); static int __init printk_late_init(void) { struct console *con; + int ret; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { @@ -2857,7 +2849,12 @@ static int __init printk_late_init(void) unregister_console(con); } } - hotcpu_notifier(console_cpu_notify, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, + console_cpu_notify); + WARN_ON(ret < 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", + console_cpu_notify, NULL); + WARN_ON(ret < 0); return 0; } late_initcall(printk_late_init); -- cgit v1.2.3 From de464375daf0d10f04fa5add2e889f42328d2ade Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 8 Nov 2016 16:40:28 +0100 Subject: bpf: Remove unused but set variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the unused but set variables min_set and max_set in adjust_reg_min_max_vals to fix the following warning when building with 'W=1': kernel/bpf/verifier.c:1483:7: warning: variable ‘min_set’ set but not used [-Wunused-but-set-variable] There is no warning about max_set being unused, but since it is only used in the assignment of min_set it can be removed as well. They were introduced in commit 484611357c19 ("bpf: allow access into map value arrays") but seem to have never been used. Cc: Josef Bacik Signed-off-by: Tobias Klauser Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 900257578934..89f787ca47ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1499,7 +1499,6 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, { struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; - bool min_set = false, max_set = false; u8 opcode = BPF_OP(insn->code); dst_reg = ®s[insn->dst_reg]; @@ -1522,7 +1521,6 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, } else if (insn->imm < BPF_REGISTER_MAX_RANGE && (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { min_val = max_val = insn->imm; - min_set = max_set = true; } /* We don't know anything about what was done to this register, mark it -- cgit v1.2.3 From 83f06168ef15da5dc735c7ea14fae67609ed9538 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Tue, 8 Nov 2016 00:02:07 -0800 Subject: locking/lockdep: Remove unused parameter from the add_lock_to_list() function The 'class' parameter is not used, remove it. 
Signed-off-by: Tahsin Erdogan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1478592127-4376-1-git-send-email-tahsin@google.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..e74b438c850b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -840,9 +840,9 @@ static struct lock_list *alloc_list_entry(void) /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, - int distance, struct stack_trace *trace) +static int add_lock_to_list(struct lock_class *this, struct list_head *head, + unsigned long ip, int distance, + struct stack_trace *trace) { struct lock_list *entry; /* @@ -1869,14 +1869,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + ret = add_lock_to_list(hlock_class(next), &hlock_class(prev)->locks_after, next->acquire_ip, distance, &trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + ret = add_lock_to_list(hlock_class(prev), &hlock_class(next)->locks_before, next->acquire_ip, distance, &trace); if (!ret) -- cgit v1.2.3 From 9846d50df3def5bff9d8a408a958722e79bcaa10 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 8 Nov 2016 11:15:23 +0100 Subject: sched/deadline: Fix typo in a comment In the comment: /* * The task might have changed its scheduling policy to something * different than SCHED_DEADLINE (through switched_fromd_dl()). */ s/fromd/from/ Signed-off-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5408b3b3f9ee197a7b7f10fb834341100a4f2c88.1478599881.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 37e2449186c4..c61b461248a3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -586,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) /* * The task might have changed its scheduling policy to something - * different than SCHED_DEADLINE (through switched_fromd_dl()). + * different than SCHED_DEADLINE (through switched_from_dl()). */ if (!dl_task(p)) { __dl_clear_params(p); -- cgit v1.2.3 From c6c7d83b9c9e6a8b3e6d84c820ac61fbffc9e396 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 10 Nov 2016 10:46:26 -0800 Subject: Revert "console: don't prefer first registered if DT specifies stdout-path" This reverts commit 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path"). The reverted commit changes existing behavior on which many ARM boards rely. Many ARM small-board computers, such as the Raspberry Pi, have both a video output and a serial console. Depending on whether the user is using the device as a regular computer or as a headless device, we need to have the console on either one or the other.
Many users rely on the kernel behavior of the console being present on both outputs. Before the reverted commit, the console setup with no console= kernel arguments on an ARM board which sets stdout-path in DT would look like this: [root@localhost ~]# cat /proc/consoles ttyS0 -W- (EC p a) 4:64 tty0 -WU (E p ) 4:1 Whereas after the reverted commit, it looks like this: [root@localhost ~]# cat /proc/consoles ttyS0 -W- (EC p a) 4:64 This commit reverts commit 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path"), restoring the original behavior. Fixes: 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path") Link: http://lkml.kernel.org/r/20161104121135.4780-2-hdegoede@redhat.com Signed-off-by: Hans de Goede Cc: Paul Burton Cc: Rob Herring Cc: Frank Rowand Cc: Thorsten Leemhuis Cc: Greg Kroah-Hartman Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/base.c | 2 -- include/linux/console.h | 6 ------ kernel/printk/printk.c | 13 +------------ 3 files changed, 1 insertion(+), 20 deletions(-) (limited to 'kernel') diff --git a/drivers/of/base.c b/drivers/of/base.c index d687e6de24a0..a0bccb54a9bd 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -2077,8 +2077,6 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)) name = of_get_property(of_aliases, "stdout", NULL); if (name) of_stdout = of_find_node_opts_by_path(name, &of_stdout_options); - if (of_stdout) - console_set_by_of(); } if (!of_aliases) diff --git a/include/linux/console.h b/include/linux/console.h index 3672809234a7..d530c4627e54 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -173,12 +173,6 @@ static inline void console_sysfs_notify(void) #endif extern bool console_suspend_enabled; -#ifdef CONFIG_OF -extern void console_set_by_of(void); -#else -static inline void console_set_by_of(void) {} -#endif - /* Suspend and resume console messages over PM events */ extern void suspend_console(void); extern void resume_console(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index de08fc90baaf..5028f4fd504a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,17 +253,6 @@ static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); -#ifdef CONFIG_OF -static bool of_specified_console; - -void console_set_by_of(void) -{ - of_specified_console = true; -} -#else -# define of_specified_console false -#endif - /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -2657,7 +2646,7 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0 && !of_specified_console) { + if (preferred_console < 0) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || -- cgit v1.2.3 From c540594f864bb4645573c2c0a304919fabb3d7ea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Nov 2016 22:02:34 +0100 Subject: bpf, mlx4: fix prog refcount in mlx4_en_try_alloc_resources error path Commit 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings scheme") added a bug in that the prog's reference count is not dropped in the error path when mlx4_en_try_alloc_resources() is failing from mlx4_xdp_set(). We previously took bpf_prog_add(prog, priv->rx_ring_num - 1), which we need to release again.
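For illustration, the same take-extra-references/undo-on-error pattern in a minimal userspace sketch (illustrative names and C11 atomics; this is not the mlx4/bpf code itself):

----
#include <assert.h>
#include <stdatomic.h>

struct prog { atomic_int refcnt; };

/* Take n extra references up front, one per ring that will hold the prog. */
static void prog_add(struct prog *p, int n)
{
	atomic_fetch_add(&p->refcnt, n);
}

/* Undo a previous prog_add() on an error path. Another holder must remain. */
static void prog_sub(struct prog *p, int n)
{
	int old = atomic_fetch_sub(&p->refcnt, n);
	assert(old - n > 0);
}

static int attach(struct prog *p, int nr_rings, int alloc_ok)
{
	prog_add(p, nr_rings - 1);          /* caller already holds one ref */
	if (!alloc_ok) {
		prog_sub(p, nr_rings - 1);  /* drop only the refs we took */
		return -1;                  /* caller's own ref is put later */
	}
	return 0;
}
----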
Earlier in the call path, dev_change_xdp_fd() itself holds a reference to the prog as well (hence the '- 1' in the bpf_prog_add()), so a simple atomic_sub() is safe to use here. When an error is propagated, bpf_prog_put() is eventually called from dev_change_xdp_fd(). Fixes: 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings scheme") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 5 ++++- include/linux/bpf.h | 5 +++++ kernel/bpf/syscall.c | 11 +++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 0f6225c042be..9bf7320107b0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2747,8 +2747,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) } err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof); - if (err) + if (err) { + if (prog) + bpf_prog_sub(prog, priv->rx_ring_num - 1); goto unlock_out; + } if (priv->port_up) { port_up = 1; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index edcd96ded8aa..01c1487277b2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -234,6 +234,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i); +void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); @@ -303,6 +304,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) return ERR_PTR(-EOPNOTSUPP); } +static inline void bpf_prog_sub(struct bpf_prog *prog, int i) +{ +} + static inline void bpf_prog_put(struct bpf_prog *prog) { } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962447a5..23eb2050f15e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -680,6 +680,17 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) } EXPORT_SYMBOL_GPL(bpf_prog_add); +void bpf_prog_sub(struct bpf_prog *prog, int i) +{ + /* Only to be used for undoing previous bpf_prog_add() in some + * error path. We still know that another entity in our call + * path holds a reference to the program, thus atomic_sub() can + * be safely used in such cases! + */ + WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); +} +EXPORT_SYMBOL_GPL(bpf_prog_sub); + struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) { return bpf_prog_add(prog, 1); -- cgit v1.2.3 From f5c9f9c72395c3291c2e35c905dedae2b98475a4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 14 Nov 2016 09:31:52 -0800 Subject: Revert "printk: make reading the kernel log flush pending lines" This reverts commit bfd8d3f23b51018388be0411ccbc2d56277fe294. It turns out that this flushes things much too aggressively, and causes lines to break up when the system logger races with new continuation lines being printed. There's a pending patch to make printk() flushing much more straightforward, but it's too invasive for 4.9, so in the meantime let's just not make the system message logging flush continuation lines. They'll be flushed by the final newline anyway.
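The race is easiest to picture with an ordinary line-buffered stream; a rough userspace analogy (stdio stands in for printk's cont buffer here, none of this is printk code):

----
#include <stdio.h>

int main(void)
{
	/* line-buffered, like the cont buffer: flushed by a newline */
	setvbuf(stdout, NULL, _IOLBF, 0);
	printf("part one,");	/* continuation data stays buffered... */
	/* a reader forcing a flush at this point would split the line */
	printf(" part two\n");	/* ...the final newline emits it whole */
	return 0;
}
----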
Suggested-by: Petr Mladek Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5028f4fd504a..f7a55e9ff2f7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -783,8 +783,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return ret; } -static void cont_flush(void); - static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -800,7 +798,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -863,7 +860,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) return -ESPIPE; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); switch (whence) { case SEEK_SET: /* the first record */ @@ -902,7 +898,6 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) @@ -1289,7 +1284,6 @@ static int syslog_print(char __user *buf, int size) size_t skip; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1349,7 +1343,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return -ENOMEM; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (buf) { u64 next_seq; u64 seq; @@ -1511,7 +1504,6 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -3028,7 +3020,6 @@ void kmsg_dump(enum kmsg_dump_reason reason) dumper->active = true; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; @@ -3119,7 +3110,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, bool ret; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -3162,7 +3152,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; -- cgit v1.2.3 From b8f2ed538477d9ab803c6458f497df1b1c6cf4ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Aug 2016 06:51:47 -0700 Subject: rcu: Tighten up __call_rcu() rcu_head alignment check Commit 720abae3d68ae ("rcu: force alignment on struct callback_head/rcu_head") forced the rcu_head (AKA callback_head) structure's alignment to pointer size, that is, to 4-byte boundaries on 32-bit systems and to 8-byte boundaries on 64-bit systems. This commit therefore checks for this same alignment in __call_rcu(), which used to contain a looser check for two-byte alignment. Signed-off-by: Paul E. McKenney Tested-by: Geert Uytterhoeven Acked-by: Kirill A. 
Shutemov Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 69a5611a7e7c..37e4f7d2be0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3121,7 +3121,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ + /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ WRITE_ONCE(head->func, rcu_leak_callback); -- cgit v1.2.3 From 5403d367a746cfc51676e3b39be600d94f587867 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Sep 2016 05:04:24 -0700 Subject: rcu: Remove obsolete rcu_check_callbacks() header comment In the deep past, rcu_check_callbacks() was only invoked if rcu_pending() returned true. Which was fine, but these days rcu_check_callbacks() is invoked unconditionally. This commit therefore removes the obsolete sentence from the header comment. Reported-by: Michalis Kokologiannakis Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 37e4f7d2be0c..d52780024f9f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2828,8 +2828,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * Also schedule RCU core processing. * * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. If rcu_pending returns - * false, there is no point in invoking rcu_check_callbacks(). + * invoked from the scheduling-clock interrupt. */ void rcu_check_callbacks(int user) { -- cgit v1.2.3 From 1f21b50b7785b26aff2a0770d8a674921514ff0c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Sep 2016 05:15:53 -0700 Subject: rcu: Remove obsolete comment from __call_rcu() The __call_rcu() comment about opportunistically noting grace period beginnings and endings is obsolete. RCU still does such opportunistic noting, but in __call_rcu_core() rather than __call_rcu(), and there already is an appropriate comment in __call_rcu_core(). This commit therefore removes the obsolete comment. Reported-by: Michalis Kokologiannakis Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d52780024f9f..865af187498e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3131,13 +3131,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, } head->func = func; head->next = NULL; - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); -- cgit v1.2.3 From d0af39e89ec59fe7c92c4bcbc2d652ea4c0ee644 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Oct 2016 18:26:04 -0700 Subject: torture: Trace long read-side delays Although rcutorture will occasionally do a 50-millisecond grace-period delay, these delays are quite rare. And rightly so, because otherwise the read rate would be quite low. 
This means that it can be important to identify whether or not a given run contained a long-delay read. This commit therefore inserts a trace_rcu_torture_read() event to flag runs containing long delays. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 5 ++++- kernel/rcu/rcutorture.c | 11 ++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d3e756539d44..9d4f9b3a2b7b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -698,7 +698,10 @@ TRACE_EVENT(rcu_batch_end, /* * Tracepoint for rcutorture readers. The first argument is the name * of the RCU flavor from rcutorture's viewpoint and the second argument - * is the callback address. + * is the callback address. The third argument is the start time in + * seconds, and the last two arguments are the grace period numbers + * at the beginning and end of the read, respectively. Note that the + * callback address can be NULL. */ TRACE_EVENT(rcu_torture_read, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bf08fee53dc7..87c51225ceec 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -289,15 +289,24 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct torture_random_state *rrsp) { + unsigned long started; + unsigned long completed; const unsigned long shortdelay_us = 200; const unsigned long longdelay_ms = 50; + unsigned long long ts; /* We want a short delay sometimes to make a reader delay the grace * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { + started = cur_ops->completed(); + ts = rcu_trace_clock_local(); mdelay(longdelay_ms); + completed = cur_ops->completed(); + do_trace_rcu_torture_read(cur_ops->name, NULL, ts, + started, completed); + } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT -- cgit v1.2.3 From 0742ac3e2f9f4b8a3a394a270d8685078837662b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Oct 2016 06:09:59 -0700 Subject: rcu: Make expedited grace periods recheck dyntick idle state Expedited grace periods check dyntick-idle state, and avoid sending IPIs to idle CPUs, including those running guest OSes, and, on NOHZ_FULL kernels, nohz_full CPUs. However, the kernel has been observed checking a CPU while it was non-idle, but sending the IPI after it has gone idle. This commit therefore rechecks idle state immediately before sending the IPI, refraining from IPIing CPUs that have since gone idle. Reported-by: Rik van Riel Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e99a5234d9ed..fe98dd24adf8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -404,6 +404,7 @@ struct rcu_data { atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ + int exp_dynticks_snap; /* Double-check need for IPI. */ /* 7) Callback offloading.
*/ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 24343eb87b58..d3053e99fdb6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -358,8 +358,10 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + rdp->exp_dynticks_snap = + atomic_add_return(0, &rdtp->dynticks); if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || + !(rdp->exp_dynticks_snap & 0x1) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -377,9 +379,17 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + if (!(mask_ofl_ipi & mask)) continue; retry_ipi: + if (atomic_add_return(0, &rdtp->dynticks) != + rdp->exp_dynticks_snap) { + mask_ofl_test |= mask; + continue; + } ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; -- cgit v1.2.3 From aa3e0bf1aa76579c6c51a2c1116a630aa716379d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Mar 2016 10:43:23 -0700 Subject: rcu: Don't kick unless grace period or request The current code can result in spurious kicks when there are no grace periods in progress and no grace-period-related requests. This is sort of OK for a diagnostic aid, but the resulting ftrace-dump messages in dmesg are annoying. This commit therefore avoids spurious kicks in the common case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 865af187498e..96c52e43f7ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1304,7 +1304,8 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) if (!rcu_kick_kthreads) return; j = READ_ONCE(rsp->jiffies_kick_kthreads); - if (time_after(jiffies, j) && rsp->gp_kthread) { + if (time_after(jiffies, j) && rsp->gp_kthread && + (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); rcu_ftrace_dump(DUMP_ALL); wake_up_process(rsp->gp_kthread); -- cgit v1.2.3 From 8443075eacb51df8539916c4170d2fdfe7c81433 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 10 Nov 2016 01:39:49 -0500 Subject: audit: tame initialization warning len_abuf in audit_log_execve_info Tame the initialization warning for len_abuf in audit_log_execve_info(), even though the code introduced by commit 43761473c254 ("audit: fix a double fetch in audit_log_single_execve_arg()") doesn't presently contain a bug. Using UNINITIALIZED_VAR instead may mask future bugs.
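As a reduced illustration of the class of warning being tamed (hypothetical code, not the audit function): with -Wmaybe-uninitialized the compiler cannot always prove that a variable assigned only conditionally is set before use, and an explicit initializer is the transparent fix, while uninitialized_var() would silence the warning even if a real bug appeared later.

----
/* Hypothetical reduced case, compiled with: gcc -O2 -Wmaybe-uninitialized */
static long demo(const long *lens, int n)
{
	long len_abuf = 0;	/* explicit zero tames the warning;
				 * uninitialized_var() would hide it even if
				 * a path really did skip the assignment */
	int i;

	for (i = 0; i < n; i++) {
		if (lens[i] > 0)
			len_abuf = lens[i] * 2;	/* only conditionally set */
	}
	return len_abuf;
}
----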
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5abf1dc1f91c..8c434318ec8d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1000,7 +1000,7 @@ static void audit_log_execve_info(struct audit_context *context, long len_rem; long len_full; long len_buf; - long len_abuf; + long len_abuf = 0; long len_tmp; bool require_data; bool encode; -- cgit v1.2.3 From 535e7b4b5ef220be374b895684f274872aebd0f8 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sun, 13 Nov 2016 19:44:03 +0100 Subject: bpf: Use u64_to_user_ptr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the custom u64_to_ptr() function with the u64_to_user_ptr() macro. Signed-off-by: Mickaël Salaün Cc: Alexei Starovoitov Cc: Arnd Bergmann Cc: Daniel Borkmann Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 23eb2050f15e..cdc06546401b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -17,6 +17,7 @@ #include #include #include +#include DEFINE_PER_CPU(int, bpf_prog_active); @@ -252,12 +253,6 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } -/* helper to convert user pointers passed inside __aligned_u64 fields */ -static void __user *u64_to_ptr(__u64 val) -{ - return (void __user *) (unsigned long) val; -} - int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -268,8 +263,8 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) static int map_lookup_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; @@ -342,8 +337,8 @@ err_put: static int map_update_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; @@ -420,7 +415,7 @@ err_put: static int map_delete_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); + void __user *ukey = u64_to_user_ptr(attr->key); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; @@ -464,8 +459,8 @@ err_put: static int map_get_next_key(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *unext_key = u64_to_ptr(attr->next_key); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *unext_key = u64_to_user_ptr(attr->next_key); int ufd = attr->map_fd; struct bpf_map *map; void *key, *next_key; @@ -741,7 +736,7 @@ static int bpf_prog_load(union bpf_attr *attr) return -EINVAL; /* copy eBPF program license from user space */ - if (strncpy_from_user(license, u64_to_ptr(attr->license), + if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; @@ -771,7 +766,7 @@ static int bpf_prog_load(union bpf_attr *attr) prog->len = attr->insn_cnt; err = -EFAULT; - if 
(copy_from_user(prog->insns, u64_to_ptr(attr->insns), + if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), prog->len * sizeof(struct bpf_insn)) != 0) goto free_prog; @@ -822,7 +817,7 @@ static int bpf_obj_pin(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ)) return -EINVAL; - return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); + return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) @@ -830,7 +825,7 @@ static int bpf_obj_get(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) return -EINVAL; - return bpf_obj_get_user(u64_to_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); } SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) -- cgit v1.2.3 From 977c1f9c8c022d0173181766b34a0db3705265a4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Nov 2016 15:14:20 -0800 Subject: ftrace: Ignore FTRACE_FL_DISABLED while walking dyn_ftrace records ftrace_shutdown() checks for sanity of ftrace records and if dyn_ftrace->flags is not zero, it will warn. It can happen that 'flags' are set to FTRACE_FL_DISABLED at this point, since some module was loaded, but before ftrace_module_enable() cleared the flags for this module. In other words, module.c is doing: ftrace_module_init(mod); // calls ftrace_update_code() that sets flags=FTRACE_FL_DISABLED ... // here ftrace_shutdown() is called that warns, since err = prepare_coming_module(mod); // didn't have a chance to clear FTRACE_FL_DISABLED Fix it by ignoring disabled records. It's similar to what __ftrace_hash_rec_update() is already doing. Link: http://lkml.kernel.org/r/1478560460-3818619-1-git-send-email-ast@fb.com Cc: stable@vger.kernel.org Fixes: b7ffffbb46f2 "ftrace: Add infrastructure for delayed enabling of module functions" Signed-off-by: Alexei Starovoitov Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2050a7652a86..326498baab83 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2763,7 +2763,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) struct dyn_ftrace *rec; do_for_each_ftrace_rec(pg, rec) { - if (FTRACE_WARN_ON_ONCE(rec->flags)) + if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED)) pr_warn(" %pS flags:%lx\n", (void *)rec->ip, rec->flags); } while_for_each_ftrace_rec(); -- cgit v1.2.3 From 546fece4eae871f033925ccf0ff2b740725ae915 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 14 Nov 2016 16:31:49 -0500 Subject: ftrace: Add more checks for FTRACE_FL_DISABLED in processing ip records When a module is first loaded and its function ip records are added to the ftrace list of functions to modify, they are set to DISABLED, as their text is still in a read-only state. When the module is fully loaded, and can be updated, the flag is cleared, and if there are any functions that should be tracing them, they are updated at that moment. But there are several locations that do record accounting and should ignore records that are marked as disabled, or they can cause issues. Alexei already fixed one location, but others need to be addressed.
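The recurring pattern these fixes add, as a hedged standalone sketch (names simplified from the ftrace code):

----
#define FL_DISABLED (1UL << 0)	/* stand-in for FTRACE_FL_DISABLED */

struct rec {
	unsigned long flags;
	unsigned long ip;
};

static void walk_records(struct rec *recs, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		/* Module still loading: its text is not yet writable,
		 * so matching/accounting must skip the record. */
		if (recs[i].flags & FL_DISABLED)
			continue;
		/* ... match, update or account the record here ... */
	}
}
----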
Cc: stable@vger.kernel.org Fixes: b7ffffbb46f2 "ftrace: Add infrastructure for delayed enabling of module functions" Reported-by: Alexei Starovoitov Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 326498baab83..da87b3cba5b3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1862,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, /* Update rec->flags */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + /* We need to update only differences of filter_hash */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -1884,6 +1888,10 @@ rollback: /* Roll back what we did above */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (rec == end) goto err_out; @@ -2397,6 +2405,10 @@ void __weak ftrace_replace_code(int enable) return; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec); @@ -3598,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) goto out_unlock; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { ret = enter_record(hash, rec, clear_filter); if (ret < 0) { @@ -3793,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (!ftrace_match_record(rec, &func_g, NULL, 0)) continue; @@ -4685,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, NULL, 0)) { /* if it is in the array */ exists = false; -- cgit v1.2.3 From 60f1d5e3bac44b598f67d36062da96c095d2b700 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 5 Oct 2016 20:58:15 +0900 Subject: ftrace: Support full glob matching Use glob_match() to support flexible glob wildcards (*,?) and character classes ([) for ftrace. Since the full glob matching is slower than the current partial matching routines (*pat, pat*, *pat*), this leaves those routines in place and just adds MATCH_GLOB for complex glob expressions. e.g.
---- [root@localhost tracing]# echo 'sched*group' > set_ftrace_filter [root@localhost tracing]# cat set_ftrace_filter sched_free_group sched_change_group sched_create_group sched_online_group sched_destroy_group sched_offline_group [root@localhost tracing]# echo '[Ss]y[Ss]_*' > set_ftrace_filter [root@localhost tracing]# head set_ftrace_filter sys_arch_prctl sys_rt_sigreturn sys_ioperm SyS_iopl sys_modify_ldt SyS_mmap SyS_set_thread_area SyS_get_thread_area SyS_set_tid_address sys_fork ---- Link: http://lkml.kernel.org/r/147566869501.29136.6462645009894738056.stgit@devbox Acked-by: Namhyung Kim Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- Documentation/trace/events.txt | 9 +++------ Documentation/trace/ftrace.txt | 9 +++------ kernel/trace/Kconfig | 2 ++ kernel/trace/ftrace.c | 4 ++++ kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events_filter.c | 17 ++++++++++++++++- 7 files changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index 08d74d75150d..2cc08d4a326e 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -189,16 +189,13 @@ And for string fields they are: ==, !=, ~ -The glob (~) only accepts a wild card character (*) at the start and or -end of the string. For example: +The glob (~) accepts a wild card character (*,?) and character classes +([). For example: prev_comm ~ "*sh" prev_comm ~ "sh*" prev_comm ~ "*sh*" - -But does not allow for it to be within the string: - - prev_comm ~ "ba*sh" <-- is invalid + prev_comm ~ "ba*sh" 5.2 Setting filters ------------------- diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 185c39fea2a0..1bc66c1db0cb 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -2218,16 +2218,13 @@ hrtimer_interrupt sys_nanosleep -Perhaps this is not enough. The filters also allow simple wild -cards. Only the following are currently available +Perhaps this is not enough. The filters also allow glob(7) matching. <match>* - will match functions that begin with <match> *<match> - will match functions that end with <match> *<match>* - will match functions that have <match> in it - -These are the only wild cards which are supported. - - <match>*<match> will not work. + <match1>*<match2> - will match functions that begin with + <match1> and end with <match2> Note: It is better to use quotes to enclose the wild cards, otherwise the shell may expand the parameters into names diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2a96b063d659..d5038005eb5d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -70,6 +70,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -133,6 +134,7 @@ config FUNCTION_TRACER select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER + select GLOB help Enable the kernel to trace every kernel function.
This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index da87b3cba5b3..356bb70d071e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3511,6 +3511,10 @@ static int ftrace_match(char *str, struct ftrace_glob *g) memcmp(str + slen - g->len, g->search, g->len) == 0) matched = 1; break; + case MATCH_GLOB: + if (glob_match(g->search, str)) + matched = 1; + break; } return matched; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8696ce6bf2f6..d904516dfdab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4065,7 +4065,7 @@ static const char readme_msg[] = "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" "\t\t\t functions\n" - "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t accepts: func_full_name or glob-matching-pattern\n" "\t modules: Can select a group via module\n" "\t Format: :mod:<module-name>\n" "\t example: echo :mod:ext3 > set_ftrace_filter\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fd24b1f9ac43..4b7918902ab8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,6 +15,7 @@ #include #include #include +#include <linux/glob.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ #endif @@ -1257,6 +1258,7 @@ enum regex_type { MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY, + MATCH_GLOB, }; struct regex { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9daa9b3bc6d9..e1c7e2cdc240 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -344,6 +344,12 @@ static int regex_match_end(char *str, struct regex *r, int len) return 0; } +static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused) +{ + if (glob_match(r->pattern, str)) + return 1; + return 0; +} /** * filter_parse_regex - parse a basic regex * @buff: the raw regex @@ -380,14 +386,20 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) if (!i) { *search = buff + 1; type = MATCH_END_ONLY; - } else { + } else if (i == len - 1) { if (type == MATCH_END_ONLY) type = MATCH_MIDDLE_ONLY; else type = MATCH_FRONT_ONLY; buff[i] = 0; break; + } else { /* pattern continues, use full glob */ + type = MATCH_GLOB; + break; } + } else if (strchr("[?\\", buff[i])) { + type = MATCH_GLOB; + break; } } @@ -420,6 +432,9 @@ static void filter_build_regex(struct filter_pred *pred) case MATCH_END_ONLY: r->match = regex_match_end; break; + case MATCH_GLOB: + r->match = regex_match_glob; + break; } pred->not ^= not; -- cgit v1.2.3 From fdf5b679864b8a1165712164b8f17debeb6b10b6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 25 Oct 2016 16:14:28 -0400 Subject: tracing: Optimise comparison filters and fix binary and for 64 bit Currently, when the filter logic for comparisons (like greater-than and less-than) is used, the comparisons all share the same function, and a switch statement is used to jump to the comparison type to perform. This is done in the extreme hot path of the tracing code, and it does not take much more space to create a unique comparison function to perform each type of comparison and remove the switch statement. Also, a bug was found where the binary-and operation for 64 bits could fail if the resulting bits were greater than 32 bits, because the result was passed into a 32 bit variable. This was fixed when adding the separate binary-and function.
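The 64-bit binary-and truncation is easy to reproduce in isolation; a minimal standalone demo (not the trace filter code itself):

----
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t addr = 0x100000000ULL;	/* only bit 32 set */
	uint64_t val  = 0x100000000ULL;

	int broken = (addr & val);	/* truncated to 32 bits: 0, "no match" */
	int fixed  = !!(addr & val);	/* normalize before narrowing: 1 */

	printf("broken=%d fixed=%d\n", broken, fixed);
	return 0;
}
----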
Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 80 +++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e1c7e2cdc240..1ba7a6b86f55 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -145,34 +145,50 @@ struct pred_stack { /* If not of not match is equal to not of not, then it is a match */ #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event) \ +static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - case OP_BAND: \ - match = (*addr & val); \ - break; \ - default: \ - break; \ - } \ - \ + int match = (*addr < val); \ return !!match == !pred->not; \ -} +} \ +static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr <= val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr > val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr >= val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = !!(*addr & val); \ + return match == !pred->not; \ +} \ +static const filter_pred_fn_t pred_funcs_##type[] = { \ + filter_pred_LT_##type, \ + filter_pred_LE_##type, \ + filter_pred_GT_##type, \ + filter_pred_GE_##type, \ + filter_pred_BAND_##type, \ +}; + +#define PRED_FUNC_START OP_LT #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ @@ -982,33 +998,33 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, if (op == OP_EQ || op == OP_NE) fn = filter_pred_64; else if (field_is_signed) - fn = filter_pred_s64; + fn = pred_funcs_s64[op - PRED_FUNC_START]; else - fn = filter_pred_u64; + fn = pred_funcs_u64[op - PRED_FUNC_START]; break; case 4: if (op == OP_EQ || op == OP_NE) fn = filter_pred_32; else if (field_is_signed) - fn = filter_pred_s32; + fn = pred_funcs_s32[op - PRED_FUNC_START]; else - fn = filter_pred_u32; + fn = pred_funcs_u32[op - PRED_FUNC_START]; break; case 2: if (op == OP_EQ || op == OP_NE) fn = filter_pred_16; else if (field_is_signed) - fn = filter_pred_s16; + fn = pred_funcs_s16[op - PRED_FUNC_START]; else - fn = filter_pred_u16; + fn = pred_funcs_u16[op - PRED_FUNC_START]; break; case 1: if (op == OP_EQ || op == OP_NE) fn = filter_pred_8; else if (field_is_signed) - fn = filter_pred_s8; + fn = pred_funcs_s8[op - PRED_FUNC_START]; else - fn = filter_pred_u8; + fn = pred_funcs_u8[op - PRED_FUNC_START]; break; } -- cgit v1.2.3 From 3f303fbccfc423a28765df6e1be0428fdf1aac59 
Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Oct 2016 15:58:03 -0400 Subject: tracing/filter: Define op as the enum that it is The trace_events_filter.c filter logic can be a bit complex. I copy this into a userspace program where I can debug it more easily. One issue is that the op is defined in most places as an int instead of as an enum, and gdb just gives the value when debugging. Having the actual op name shown in gdb is more useful. This has no functionality change, but helps in debugging when the file is debugged in user space. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1ba7a6b86f55..59a411ff60c7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -108,12 +108,12 @@ static char *err_text[] = { }; struct opstack_op { - int op; + enum filter_op_ids op; struct list_head list; }; struct postfix_elt { - int op; + enum filter_op_ids op; char *operand; struct list_head list; }; @@ -977,7 +977,7 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_legal_op(struct ftrace_event_field *field, int op) +static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op) { if (is_string_field(field) && (op != OP_EQ && op != OP_NE && op != OP_GLOB)) @@ -988,8 +988,8 @@ static bool is_legal_op(struct ftrace_event_field *field, int op) return true; } -static filter_pred_fn_t select_comparison_fn(int op, int field_size, - int field_is_signed) +static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) { filter_pred_fn_t fn = NULL; @@ -1197,7 +1197,8 @@ static inline int append_operand_char(struct filter_parse_state *ps, char c) return 0; } -static int filter_opstack_push(struct filter_parse_state *ps, int op) +static int filter_opstack_push(struct filter_parse_state *ps, + enum filter_op_ids op) { struct opstack_op *opstack_op; @@ -1231,7 +1232,7 @@ static int filter_opstack_top(struct filter_parse_state *ps) static int filter_opstack_pop(struct filter_parse_state *ps) { struct opstack_op *opstack_op; - int op; + enum filter_op_ids op; if (filter_opstack_empty(ps)) return OP_NONE; @@ -1276,7 +1277,7 @@ static int postfix_append_operand(struct filter_parse_state *ps, char *operand) return 0; } -static int postfix_append_op(struct filter_parse_state *ps, int op) +static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op) { struct postfix_elt *elt; @@ -1306,8 +1307,8 @@ static void postfix_clear(struct filter_parse_state *ps) static int filter_parse(struct filter_parse_state *ps) { + enum filter_op_ids op, top_op; int in_string = 0; - int op, top_op; char ch; while ((ch = infix_next(ps))) { @@ -1398,7 +1399,8 @@ parse_operand: static struct filter_pred *create_pred(struct filter_parse_state *ps, struct trace_event_call *call, - int op, char *operand1, char *operand2) + enum filter_op_ids op, + char *operand1, char *operand2) { struct ftrace_event_field *field; static struct filter_pred pred; -- cgit v1.2.3 From 8d414bd2f77ce858f6b9d119c63b9ce29cf0b75d Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Mon, 14 Nov 2016 11:19:13 +0800 Subject: tracing: Allow wakeup_dl tracer to be used by instances Allow the wakeup_dl tracer to be used by instances, like the wakeup and wakeup_rt tracers.
Link: http://lkml.kernel.org/r/1479093553-31264-1-git-send-email-zhouchengming1@huawei.com Signed-off-by: Zhou Chengming Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9d4399b553a3..1bf2324dc682 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -790,6 +790,7 @@ static struct tracer wakeup_dl_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; -- cgit v1.2.3 From 981ee2d444408fc55b9390d6a4a54a6697513611 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 15 Nov 2016 03:06:50 +0100 Subject: sched/cputime, powerpc: Remove cputime_to_scaled() Currently cputime_to_scaled() just returns its argument on all implementations, so we don't need to call this function. Signed-off-by: Stanislaw Gruszka Signed-off-by: Frederic Weisbecker Reviewed-by: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Neuling Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1479175612-14718-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/cputime.h | 7 ------- include/asm-generic/cputime_jiffies.h | 1 - include/asm-generic/cputime_nsecs.h | 1 - kernel/sched/cputime.c | 26 ++++++++++++-------------- 4 files changed, 12 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 9f5dcf73b608..aa2e6a34b872 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -52,13 +52,6 @@ static inline unsigned long cputime_to_jiffies(const cputime_t ct) return mulhdu((__force u64) ct, __cputime_jiffies_factor); } -/* Estimate the scaled cputime by scaling the real cputime based on - * the last scaled to real ratio */ -static inline cputime_t cputime_to_scaled(const cputime_t ct) -{ - return ct; -} - static inline cputime_t jiffies_to_cputime(const unsigned long jif) { u64 ct; diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h index fe386fc6e85e..6bb8cd45f53b 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h @@ -7,7 +7,6 @@ typedef unsigned long __nocast cputime_t; #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) -#define cputime_to_scaled(__ct) (__ct) #define jiffies_to_cputime(__hz) (__force cputime_t)(__hz) typedef u64 __nocast cputime64_t; diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index a84e28e0c634..4e3b18e559b1 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -34,7 +34,6 @@ typedef u64 __nocast cputime64_t; */ #define cputime_to_jiffies(__ct) \ cputime_div(__ct, NSEC_PER_SEC / HZ) -#define cputime_to_scaled(__ct) (__ct) #define jiffies_to_cputime(__jif) \ (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) #define cputime64_to_jiffies64(__ct) \ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5ebee3164e64..3229c7244fdd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -390,7 +390,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { u64 cputime =
(__force u64) cputime_one_jiffy * ticks; - cputime_t scaled, other; + cputime_t other; /* * When returning from idle, many ticks can get accounted at @@ -403,7 +403,6 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, if (other >= cputime) return; cputime -= other; - scaled = cputime_to_scaled(cputime); if (this_cpu_ksoftirqd() == p) { /* @@ -411,15 +410,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime, scaled); + account_user_time(p, cputime, cputime); } else if (p == rq->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime, scaled); + account_guest_time(p, cputime, cputime); } else { - __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); + __account_system_time(p, cputime, cputime, CPUTIME_SYSTEM); } } @@ -502,7 +501,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t cputime, scaled, steal; + cputime_t cputime, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -520,12 +519,11 @@ void account_process_tick(struct task_struct *p, int user_tick) return; cputime -= steal; - scaled = cputime_to_scaled(cputime); if (user_tick) - account_user_time(p, cputime, scaled); + account_user_time(p, cputime, cputime); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime, cputime); else account_idle_time(cputime); } @@ -746,7 +744,7 @@ static void __vtime_account_system(struct task_struct *tsk) { cputime_t delta_cpu = get_vtime_delta(tsk); - account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); + account_system_time(tsk, irq_count(), delta_cpu, delta_cpu); } void vtime_account_system(struct task_struct *tsk) @@ -767,7 +765,7 @@ void vtime_account_user(struct task_struct *tsk) tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) { delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + account_user_time(tsk, delta_cpu, delta_cpu); } write_seqcount_end(&tsk->vtime_seqcount); } @@ -940,8 +938,8 @@ void task_cputime_scaled(struct task_struct *t, fetch_task_cputime(t, utimescaled, stimescaled, &t->utimescaled, &t->stimescaled, &udelta, &sdelta); if (utimescaled) - *utimescaled += cputime_to_scaled(udelta); + *utimescaled += udelta; if (stimescaled) - *stimescaled += cputime_to_scaled(sdelta); + *stimescaled += sdelta; } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From 40565b5aedd6d0ca88b7dfd3859d709d2f6f8cf9 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 15 Nov 2016 03:06:51 +0100 Subject: sched/cputime, powerpc, s390: Make scaled cputime arch specific Only s390 and powerpc have hardware facilities that allow measuring cputimes scaled by frequency. On all other architectures utimescaled/stimescaled are equal to utime/stime (however they are accounted separately). Remove {u,s}timescaled accounting on all architectures except powerpc and s390, where those values are explicitly accounted in the proper places.
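A hedged sketch of the resulting shape (reduced struct; field names follow the patch, everything else is illustrative):

----
/* Reduced sketch of the task-struct layout after this change. */
struct task_times {
	unsigned long utime, stime;		/* always present */
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	unsigned long utimescaled, stimescaled;	/* powerpc/s390 only */
#endif
};

static void get_scaled(struct task_times *t,
		       unsigned long *ut, unsigned long *st)
{
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	*ut = t->utimescaled;		/* hardware-scaled values */
	*st = t->stimescaled;
#else
	*ut = t->utime;			/* fall back to raw cputime */
	*st = t->stime;
#endif
}
----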
Signed-off-by: Stanislaw Gruszka Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161031162143.GB12646@redhat.com Signed-off-by: Ingo Molnar --- arch/Kconfig | 3 +++ arch/ia64/kernel/time.c | 4 +-- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/time.c | 6 +++-- arch/s390/Kconfig | 1 + arch/s390/kernel/vtime.c | 9 ++++--- include/linux/kernel_stat.h | 4 +-- include/linux/sched.h | 23 ++++++++++++----- kernel/fork.c | 2 ++ kernel/sched/cputime.c | 61 +++++++++++---------------------------------- 10 files changed, 53 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 659bdd079277..abab6590f08f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -512,6 +512,9 @@ config HAVE_CONTEXT_TRACKING config HAVE_VIRT_CPU_ACCOUNTING bool +config ARCH_HAS_SCALED_CPUTIME + bool + config HAVE_VIRT_CPU_ACCOUNTING_GEN bool default y if 64BIT diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 6f892b94e906..021f44ab4bfb 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -68,7 +68,7 @@ void vtime_account_user(struct task_struct *tsk) if (ti->ac_utime) { delta_utime = cycle_to_cputime(ti->ac_utime); - account_user_time(tsk, delta_utime, delta_utime); + account_user_time(tsk, delta_utime); ti->ac_utime = 0; } } @@ -112,7 +112,7 @@ void vtime_account_system(struct task_struct *tsk) { cputime_t delta = vtime_delta(tsk); - account_system_time(tsk, 0, delta, delta); + account_system_time(tsk, 0, delta); } EXPORT_SYMBOL_GPL(vtime_account_system); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 65fba4c34cd7..c7f120aaa98f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -160,6 +160,7 @@ config PPC select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS select GENERIC_CPU_AUTOPROBE select HAVE_VIRT_CPU_ACCOUNTING + select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select HAVE_ARCH_HARDENED_USERCOPY select HAVE_KERNEL_GZIP diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 81051986739c..be9751f1cb2a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -358,7 +358,8 @@ void vtime_account_system(struct task_struct *tsk) unsigned long delta, sys_scaled, stolen; delta = vtime_delta(tsk, &sys_scaled, &stolen); - account_system_time(tsk, 0, delta, sys_scaled); + account_system_time(tsk, 0, delta); + tsk->stimescaled += sys_scaled; if (stolen) account_steal_time(stolen); } @@ -391,7 +392,8 @@ void vtime_account_user(struct task_struct *tsk) acct->user_time = 0; acct->user_time_scaled = 0; acct->utime_sspurr = 0; - account_user_time(tsk, utime, utimescaled); + account_user_time(tsk, utime); + tsk->utimescaled += utimescaled; } #ifdef CONFIG_PPC32 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 426481d4cc86..028f97be5bae 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -171,6 +171,7 @@ config S390 select SYSCTL_EXCEPTION_TRACE select TTY select VIRT_CPU_ACCOUNTING + select ARCH_HAS_SCALED_CPUTIME select VIRT_TO_BUS select HAVE_NMI diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 856e30d8463f..1bd5dde2d5a9 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -137,8 +137,10 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) user_scaled = (user_scaled * mult) / div; system_scaled = 
(system_scaled * mult) / div; } - account_user_time(tsk, user, user_scaled); - account_system_time(tsk, hardirq_offset, system, system_scaled); + account_user_time(tsk, user); + tsk->utimescaled += user_scaled; + account_system_time(tsk, hardirq_offset, system); + tsk->stimescaled += system_scaled; steal = S390_lowcore.steal_timer; if ((s64) steal > 0) { @@ -202,7 +204,8 @@ void vtime_account_irq_enter(struct task_struct *tsk) system_scaled = (system_scaled * mult) / div; } - account_system_time(tsk, 0, system, system_scaled); + account_system_time(tsk, 0, system); + tsk->stimescaled += system_scaled; virt_timer_forward(system); } diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44fda64ad434..00f776816aa3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -78,8 +78,8 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) return kstat_cpu(cpu).irqs_sum; } -extern void account_user_time(struct task_struct *, cputime_t, cputime_t); -extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); +extern void account_user_time(struct task_struct *, cputime_t); +extern void account_system_time(struct task_struct *, int, cputime_t); extern void account_steal_time(cputime_t); extern void account_idle_time(cputime_t); diff --git a/include/linux/sched.h b/include/linux/sched.h index 3762fe4e3a80..f72e81395dac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1647,7 +1647,10 @@ struct task_struct { int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - cputime_t utime, stime, utimescaled, stimescaled; + cputime_t utime, stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + cputime_t utimescaled, stimescaled; +#endif cputime_t gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -2240,8 +2243,6 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime); -extern void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, cputime_t *stimescaled); extern cputime_t task_gtime(struct task_struct *t); #else static inline void task_cputime(struct task_struct *t, @@ -2253,6 +2254,13 @@ static inline void task_cputime(struct task_struct *t, *stime = t->stime; } +static inline cputime_t task_gtime(struct task_struct *t) +{ + return t->gtime; +} +#endif + +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME static inline void task_cputime_scaled(struct task_struct *t, cputime_t *utimescaled, cputime_t *stimescaled) @@ -2262,12 +2270,15 @@ static inline void task_cputime_scaled(struct task_struct *t, if (stimescaled) *stimescaled = t->stimescaled; } - -static inline cputime_t task_gtime(struct task_struct *t) +#else +static inline void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, + cputime_t *stimescaled) { - return t->gtime; + task_cputime(t, utimescaled, stimescaled); } #endif + extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); diff --git a/kernel/fork.c b/kernel/fork.c index 997ac1d584f7..600e93b5e539 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1551,7 +1551,9 @@ static __latent_entropy struct task_struct *copy_process( init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = 
p->stimescaled = 0; +#endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3229c7244fdd..ba55ebf77f9a 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -128,16 +128,13 @@ static inline void task_group_account_field(struct task_struct *p, int index, * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +void account_user_time(struct task_struct *p, cputime_t cputime) { int index; /* Add user time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; @@ -153,16 +150,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime, * Account guest cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +static void account_guest_time(struct task_struct *p, cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); p->gtime += cputime; @@ -180,16 +174,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, * Account system cpu time to a process and desired cpustat field * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated + * @index: pointer to cpustat field that has to be updated */ static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) +void __account_system_time(struct task_struct *p, cputime_t cputime, int index) { /* Add system time to process. */ p->stime += cputime; - p->stimescaled += cputime_scaled; account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -204,15 +195,14 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) + cputime_t cputime) { int index; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); + account_guest_time(p, cputime); return; } @@ -223,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, cputime_scaled, index); + __account_system_time(p, cputime, index); } /* @@ -410,15 +400,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. 
*/ - __account_system_time(p, cputime, cputime, CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime, cputime); + account_user_time(p, cputime); } else if (p == rq->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime, cputime); + account_guest_time(p, cputime); } else { - __account_system_time(p, cputime, cputime, CPUTIME_SYSTEM); + __account_system_time(p, cputime, CPUTIME_SYSTEM); } } @@ -521,9 +511,9 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime -= steal; if (user_tick) - account_user_time(p, cputime, cputime); + account_user_time(p, cputime); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime, cputime); + account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); } @@ -744,7 +734,7 @@ static void __vtime_account_system(struct task_struct *tsk) { cputime_t delta_cpu = get_vtime_delta(tsk); - account_system_time(tsk, irq_count(), delta_cpu, delta_cpu); + account_system_time(tsk, irq_count(), delta_cpu); } void vtime_account_system(struct task_struct *tsk) @@ -765,7 +755,7 @@ void vtime_account_user(struct task_struct *tsk) tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) { delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, delta_cpu, delta_cpu); + account_user_time(tsk, delta_cpu); } write_seqcount_end(&tsk->vtime_seqcount); } @@ -921,25 +911,4 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) if (stime) *stime += sdelta; } - -void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, cputime_t *stimescaled) - { - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utimescaled) - *utimescaled = t->utimescaled; - if (stimescaled) - *stimescaled = t->stimescaled; - return; - } - - fetch_task_cputime(t, utimescaled, stimescaled, - &t->utimescaled, &t->stimescaled, &udelta, &sdelta); - if (utimescaled) - *utimescaled += udelta; - if (stimescaled) - *stimescaled += sdelta; -} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From 353c50ebe329daaf2c94dc41c1c481cbba2a31fd Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 15 Nov 2016 03:06:52 +0100 Subject: sched/cputime: Simplify task_cputime() Since fetch_task_cputime() now has no users other than task_cputime(), its code can be folded directly into task_cputime(). Moreover, since only 2 of the 17 task_cputime() calls use a NULL argument, we can pass dummy variables at those call sites and remove the NULL checks from task_cputime(). Also remove the NULL checks from task_cputime_scaled().
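For illustration, the caller-side pattern after this change looks like the sketch below (a hypothetical caller, not taken from the patch): a caller that needs only one of the two values passes a dummy variable instead of NULL, exactly as the apm_32.c and posix-cpu-timers.c hunks in the diff do.

/* Hypothetical caller: only utime is wanted, so stime is a dummy. */
static cputime_t get_utime_only(struct task_struct *p)
{
	cputime_t utime, stime;	/* stime is the dummy variable */

	task_cputime(p, &utime, &stime);
	return utime;
}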
Signed-off-by: Stanislaw Gruszka Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1479175612-14718-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 4 +-- include/linux/sched.h | 12 +++------ kernel/sched/cputime.c | 57 +++++++++++------------------------------- kernel/time/posix-cpu-timers.c | 4 +-- 4 files changed, 23 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c7364bd633e1..d90749b883f5 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev, static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ static unsigned int last_stime; /* = 0 */ - cputime_t stime; + cputime_t stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; recalc: - task_cputime(current, NULL, &stime); + task_cputime(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { diff --git a/include/linux/sched.h b/include/linux/sched.h index f72e81395dac..fe3ce46cfd03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2248,10 +2248,8 @@ extern cputime_t task_gtime(struct task_struct *t); static inline void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { - if (utime) - *utime = t->utime; - if (stime) - *stime = t->stime; + *utime = t->utime; + *stime = t->stime; } static inline cputime_t task_gtime(struct task_struct *t) @@ -2265,10 +2263,8 @@ static inline void task_cputime_scaled(struct task_struct *t, cputime_t *utimescaled, cputime_t *stimescaled) { - if (utimescaled) - *utimescaled = t->utimescaled; - if (stimescaled) - *stimescaled = t->stimescaled; + *utimescaled = t->utimescaled; + *stimescaled = t->stimescaled; } #else static inline void task_cputime_scaled(struct task_struct *t, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ba55ebf77f9a..7700a9cba335 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -851,29 +851,25 @@ cputime_t task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -static void -fetch_task_cputime(struct task_struct *t, - cputime_t *u_dst, cputime_t *s_dst, - cputime_t *u_src, cputime_t *s_src, - cputime_t *udelta, cputime_t *sdelta) +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { + cputime_t delta; unsigned int seq; - unsigned long long delta; - do { - *udelta = 0; - *sdelta = 0; + if (!vtime_accounting_enabled()) { + *utime = t->utime; + *stime = t->stime; + return; + } + do { seq = read_seqcount_begin(&t->vtime_seqcount); - if (u_dst) - *u_dst = *u_src; - if (s_dst) - *s_dst = *s_src; + *utime = t->utime; + *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_INACTIVE || - is_idle_task(t)) + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; delta = vtime_delta(t); @@ -882,33 +878,10 @@ fetch_task_cputime(struct task_struct *t, * Task runs either in user or kernel space, add pending nohz time to * the right place. 
*/ - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { - *udelta = delta; - } else { - if (t->vtime_snap_whence == VTIME_SYS) - *sdelta = delta; - } + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) + *utime += delta; + else if (t->vtime_snap_whence == VTIME_SYS) + *stime += delta; } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } - - -void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) -{ - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utime) - *utime = t->utime; - if (stime) - *stime = t->stime; - return; - } - - fetch_task_cputime(t, utime, stime, &t->utime, - &t->stime, &udelta, &sdelta); - if (utime) - *utime += udelta; - if (stime) - *stime += sdelta; -} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 39008d78927a..e887ffc8eef3 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -133,9 +133,9 @@ static inline unsigned long long prof_ticks(struct task_struct *p) } static inline unsigned long long virt_ticks(struct task_struct *p) { - cputime_t utime; + cputime_t utime, stime; - task_cputime(p, &utime, NULL); + task_cputime(p, &utime, &stime); return cputime_to_expires(utime); } -- cgit v1.2.3 From 864c2357ca898c6171fe5284f5ecc795c8ce27a8 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Tue, 1 Nov 2016 11:52:58 -0700 Subject: perf/core: Do not set cpuctx->cgrp for unscheduled cgroups Commit: db4a835601b7 ("perf/core: Set cgroup in CPU contexts for new cgroup events") failed to verify that event->cgrp is actually the scheduled cgroup in a CPU before setting cpuctx->cgrp. This patch fixes that. Now that there are different paths for scheduled and unscheduled cgroups, add a warning to catch when cpuctx->cgrp is still set after the last cgroup event has been unscheduled. To verify the bug: # Create 2 cgroups. mkdir /dev/cgroups/devices/g1 mkdir /dev/cgroups/devices/g2 # launch a task, bind it to a cpu and move it to g1 CPU=2 while :; do : ; done & P=$! taskset -pc $CPU $P echo $P > /dev/cgroups/devices/g1/tasks # monitor g2 (it runs no tasks) and observe output perf stat -e cycles -I 1000 -C $CPU -G g2 # time counts unit events 1.000091408 7,579,527 cycles g2 2.000350111 cycles g2 3.000589181 cycles g2 4.000771428 cycles g2 # note the first line, which shows that a task ran in g2 despite # g2 having no tasks. This is because cpuctx->cgrp was wrongly # set when the context of the new event was installed. # After applying the fix we obtain the right output: perf stat -e cycles -I 1000 -C $CPU -G g2 # time counts unit events 1.000119615 cycles g2 2.000389430 cycles g2 3.000590962 cycles g2 Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Nilay Vaish Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Link: http://lkml.kernel.org/r/1478026378-86083-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e292132efac..ff230bb4a02e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU.
*/ cpuctx = __get_cpu_context(ctx); + + /* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */ + if (perf_cgroup_from_task(current, ctx) != event->cgrp) { + /* + * We are removing the last cpu event in this context. + * If that event is not active in this cpu, cpuctx->cgrp + * should've been cleared by perf_cgroup_switch. + */ + WARN_ON_ONCE(!add && cpuctx->cgrp); + return; + } cpuctx->cgrp = add ? event->cgrp : NULL; } -- cgit v1.2.3 From 3a08c2fd763450a927d1130de078d6f9e74944fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:06 -0800 Subject: bpf: LRU List Introduce bpf_lru_list, which will provide LRU capability to the bpf_htab in a later patch.

* General Thoughts:
1. Target use case: reads are more frequent than updates (i.e. bpf_lookup_elem() is called more often than bpf_update_elem()). If a bpf_prog does a bpf_lookup_elem() first and then an in-place update, it still counts as a read operation as far as the LRU list is concerned.
2. It may be useful to think of it as an LRU cache.
3. Optimize the read case: 3.1 No lock in the read case. 3.2 LRU maintenance is only done during bpf_update_elem().
4. If there is a percpu LRU list, it loses the system-wide LRU property. A completely isolated percpu LRU list has the best performance, but its memory utilization is not ideal considering the workload may be imbalanced.
5. Hence, this patch starts the LRU implementation with a global LRU list, with operations batched up before the global LRU list is accessed. Since an LRU cache sees far more reads than updates or inserts (#read >> #update/#insert), this works well.
6. There is a local list (for each cpu) named 'struct bpf_lru_locallist'. This local list is not used to maintain the LRU ordering. Instead, it batches enough operations before the lock of the global LRU list is acquired. More details on this later.
7. A later patch allows a percpu LRU list, selected by a map attribute, for scalability reasons and for use cases that need to prepare for the worst (and pathological) case, like a DoS attack. The percpu LRU lists are completely isolated from each other, and the LRU nodes (including free nodes) cannot be moved across lists. The following description is for the global LRU list but is mostly applicable to the percpu LRU list also.

* Global LRU List:
1. It has three sub-lists: active-list, inactive-list and free-list.
2. The two-list idea, active and inactive, is borrowed from the page cache.
3. All nodes are pre-allocated and all sit on the free-list (of the global LRU list) at the beginning. The pre-allocation reasoning is similar to the existing BPF_MAP_TYPE_HASH. However, opting out of preallocation (BPF_F_NO_PREALLOC) is not supported in the LRU map.

* Active/Inactive List (of the global LRU list):
1. The active list, as its name says, maintains the active set of nodes. We can think of it as the working set of more frequently accessed nodes. The access frequency is approximated by a ref-bit, which is set during bpf_lookup_elem().
2. The inactive list, as its name also says, maintains a less active set of nodes. They are the candidates to be removed from the bpf_htab when we are running out of free nodes.
3. The ordering of these two lists acts as a rough clock. The tail of the inactive list holds the older nodes, which should be released first when the bpf_htab needs a free element.

* Rotating the Active/Inactive List (of the global LRU list):
1. It is the basic operation that maintains the LRU property of the global list.
2. The active list is only rotated when the inactive list is running low. This idea is similar to the current page cache. "Inactive running low" is currently defined as "# of inactive < # of active".
3. The active list rotation always starts from the tail. It moves nodes without the ref-bit set to the head of the inactive list, and moves nodes with the ref-bit set back to the head of the active list, clearing their ref-bit.
4. The inactive rotation is pretty simple. It walks the inactive list and moves a node back to the head of the active list if its ref-bit is set; the ref-bit is cleared after the move. If a node does not have the ref-bit set, it is left where it is, because it is already in the inactive list.

* Shrinking the Inactive List (of the global LRU list):
1. Shrinking is the operation that obtains free nodes when the bpf_htab is full.
2. It usually only shrinks the inactive list to get free nodes.
3. During shrinking, it walks the inactive list from the tail and deletes the nodes without the ref-bit set from the bpf_htab.
4. If no free node is found after step (3), it forcefully takes one node from the tail of the inactive or active list. "Forcefully" means that it ignores the ref-bit.

* Local List:
1. Each CPU has a 'struct bpf_lru_locallist'. The purpose is to batch enough operations before acquiring the lock of the global LRU list.
2. A local list has two sub-lists, free-list and pending-list.
3. During bpf_update_elem(), it first tries to get a node from the free-list of the current CPU's local list.
4. If the local free-list is empty, it acquires nodes from the global LRU list. The global LRU list can satisfy the request either from its global free-list or by shrinking the global inactive list. Since the global LRU list lock has been acquired anyway, it grabs at most LOCAL_FREE_TARGET elements for the local free-list.
5. When a new element is added to the bpf_htab, it first sits on the pending-list of the local list. The pending-list is flushed to the global LRU list the next time free nodes need to be acquired from the global list.

* Lock Consideration: The LRU list has a lock (lru_lock). Each bucket of the htab has a lock (buck_lock). If both locks need to be held together, the lock order is always lru_lock -> buck_lock, and this only happens in the bpf_lru_list.c logic. In hashtab.c, the two locks are never held together (one lock is always released before the other is acquired).
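For illustration only (not part of the original commit message), the update-side flow above reduces to the following sketch; 'struct my_elem' and 'my_alloc_elem' are hypothetical users of the API introduced below:

/* Hypothetical element embedding a bpf_lru_node, as the bpf_htab will
 * do in a later patch; bpf_lru_init() is assumed to have been called
 * with hash_offset == offsetof(struct my_elem, hash).
 */
struct my_elem {
	struct bpf_lru_node lru_node;
	u32 hash;
};

static struct my_elem *my_alloc_elem(struct bpf_lru *lru, u32 hash)
{
	/* bpf_lru_pop_free() tries, in order: the current CPU's local
	 * free-list; a batched refill of up to LOCAL_FREE_TARGET nodes
	 * from the global list (flushing the local pending-list and
	 * rotating/shrinking the global lists as needed); and finally
	 * stealing from another CPU's local lists.
	 */
	struct bpf_lru_node *node = bpf_lru_pop_free(lru, hash);

	return node ? container_of(node, struct my_elem, lru_node) : NULL;
}

Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S.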
Miller --- kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_lru_list.c | 567 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/bpf_lru_list.h | 80 +++++++ 3 files changed, 648 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_lru_list.c create mode 100644 kernel/bpf/bpf_lru_list.h (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index eed911d091da..c4d89d6e2058 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c new file mode 100644 index 000000000000..73f67094f93a --- /dev/null +++ b/kernel/bpf/bpf_lru_list.c @@ -0,0 +1,567 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include "bpf_lru_list.h" + +#define LOCAL_FREE_TARGET (128) +#define LOCAL_NR_SCANS LOCAL_FREE_TARGET + +/* Helpers to get the local list index */ +#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) +#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) +#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) +#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) + +static int get_next_cpu(int cpu) +{ + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_possible_mask); + return cpu; +} + +/* Local list helpers */ +static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_FREE_LIST_IDX]; +} + +static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; +} + +/* bpf_lru_node helpers */ +static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) +{ + return node->ref; +} + +static void bpf_lru_list_count_inc(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]++; +} + +static void bpf_lru_list_count_dec(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]--; +} + +static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, + struct bpf_lru_node *node, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + /* If the removing node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also.
+ */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + bpf_lru_list_count_dec(l, node->type); + + node->type = tgt_free_type; + list_move(&node->list, free_list); +} + +/* Move nodes from local list to the LRU list */ +static void __bpf_lru_node_move_in(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + node->ref = 0; + list_move(&node->list, &l->lists[tgt_type]); +} + +/* Move nodes between or within active and inactive list (like + * active to inactive, inactive to active or tail of active back to + * the head of active). + */ +static void __bpf_lru_node_move(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + if (node->type != tgt_type) { + bpf_lru_list_count_dec(l, node->type); + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + } + node->ref = 0; + + /* If the moving node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also. + */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + list_move(&node->list, &l->lists[tgt_type]); +} + +static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) +{ + return l->counts[BPF_LRU_LIST_T_INACTIVE] < + l->counts[BPF_LRU_LIST_T_ACTIVE]; +} + +/* Rotate the active list: + * 1. Start from tail + * 2. If the node has the ref bit set, it will be rotated + * back to the head of active list with the ref bit cleared. + * Give this node one more chance to survive in the active list. + * 3. If the ref bit is not set, move it to the head of the + * inactive list. + * 4. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int i = 0; + + first_node = list_first_entry(active, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, active, list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + + if (++i == lru->nr_scans || node == first_node) + break; + } +} + +/* Rotate the inactive list. It starts from the next_inactive_rotation + * 1. If the node has ref bit set, it will be moved to the head + * of active list with the ref bit cleared. + * 2. If the node does not have ref bit set, it will leave it + * at its current location (i.e. do nothing) so that it can + * be considered during the next inactive_shrink. + * 3. 
It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct list_head *cur, *next, *last; + struct bpf_lru_node *node; + unsigned int i = 0; + + if (list_empty(inactive)) + return; + + last = l->next_inactive_rotation->next; + if (last == inactive) + last = last->next; + + cur = l->next_inactive_rotation; + while (i < lru->nr_scans) { + if (cur == inactive) { + cur = cur->prev; + continue; + } + + node = list_entry(cur, struct bpf_lru_node, list); + next = cur->prev; + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + if (cur == last) + break; + cur = next; + i++; + } + + l->next_inactive_rotation = next; +} + +/* Shrink the inactive list. It starts from the tail of the + * inactive list and only move the nodes without the ref bit + * set to the designated free list. + */ +static unsigned int +__bpf_lru_list_shrink_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int nshrinked = 0; + unsigned int i = 0; + + first_node = list_first_entry(inactive, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { + if (bpf_lru_node_is_ref(node)) { + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + } else if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + if (++nshrinked == tgt_nshrink) + break; + } + + if (++i == lru->nr_scans) + break; + } + + return nshrinked; +} + +/* 1. Rotate the active list (if needed) + * 2. Always rotate the inactive list + */ +static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) +{ + if (bpf_lru_list_inactive_low(l)) + __bpf_lru_list_rotate_active(lru, l); + + __bpf_lru_list_rotate_inactive(lru, l); +} + +/* Calls __bpf_lru_list_shrink_inactive() to shrink some + * ref-bit-cleared nodes and move them to the designated + * free list. + * + * If it cannot get a free node after calling + * __bpf_lru_list_shrink_inactive(). It will just remove + * one node from either inactive or active list without + * honoring the ref-bit. It prefers inactive list to active + * list in this situation. 
+ */ +static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) + +{ + struct bpf_lru_node *node, *tmp_node; + struct list_head *force_shrink_list; + unsigned int nshrinked; + + nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, + free_list, tgt_free_type); + if (nshrinked) + return nshrinked; + + /* Do a force shrink by ignoring the reference bit */ + if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) + force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + else + force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + + list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, + list) { + if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + return 1; + } + } + + return 0; +} + +/* Flush the nodes from the local pending list to the LRU list */ +static void __local_list_flush(struct bpf_lru_list *l, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node, *tmp_node; + + list_for_each_entry_safe_reverse(node, tmp_node, + local_pending_list(loc_l), list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move_in(l, node, + BPF_LRU_LIST_T_INACTIVE); + } +} + +static void bpf_lru_list_push_free(struct bpf_lru_list *l, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + raw_spin_lock_irqsave(&l->lock, flags); + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + struct bpf_lru_node *node, *tmp_node; + unsigned int nfree = 0; + + raw_spin_lock(&l->lock); + + __local_list_flush(l, loc_l); + + __bpf_lru_list_rotate(lru, l); + + list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], + list) { + __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + if (++nfree == LOCAL_FREE_TARGET) + break; + } + + if (nfree < LOCAL_FREE_TARGET) + __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + + raw_spin_unlock(&l->lock); +} + +static void __local_list_add_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l, + int cpu, + struct bpf_lru_node *node, + u32 hash) +{ + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->cpu = cpu; + node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + node->ref = 0; + list_add(&node->list, local_pending_list(loc_l)); +} + +struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + + node = list_first_entry_or_null(local_free_list(loc_l), + struct bpf_lru_node, + list); + if (node) + list_del(&node->list); + + return node; +} + +struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + bool force = false; + +ignore_ref: + /* Get from the tail (i.e. older element) of the pending list. 
*/ + list_for_each_entry_reverse(node, local_pending_list(loc_l), + list) { + if ((!bpf_lru_node_is_ref(node) || force) && + lru->del_from_htab(lru->del_arg, node)) { + list_del(&node->list); + return node; + } + } + + if (!force) { + force = true; + goto ignore_ref; + } + + return NULL; +} + +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +{ + struct bpf_lru_locallist *loc_l, *steal_loc_l; + struct bpf_common_lru *clru = &lru->common_lru; + struct bpf_lru_node *node; + int steal, first_steal; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + loc_l = per_cpu_ptr(clru->local_list, cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + node = __local_list_pop_free(loc_l); + if (!node) { + bpf_lru_list_pop_free_to_local(lru, loc_l); + node = __local_list_pop_free(loc_l); + } + + if (node) + __local_list_add_pending(lru, loc_l, cpu, node, hash); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + + if (node) + return node; + + /* No free nodes found from the local free list and + * the global LRU list. + * + * Steal from the local free/pending list of the + * current CPU and remote CPU in RR. It starts + * with the loc_l->next_steal CPU. + */ + + first_steal = loc_l->next_steal; + steal = first_steal; + do { + steal_loc_l = per_cpu_ptr(clru->local_list, steal); + + raw_spin_lock_irqsave(&steal_loc_l->lock, flags); + + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + + raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + + steal = get_next_cpu(steal); + } while (!node && steal != first_steal); + + loc_l->next_steal = steal; + + if (node) { + raw_spin_lock_irqsave(&loc_l->lock, flags); + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + } + + return node; +} + +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + return; + + if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + goto check_lru_list; + } + + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + node->ref = 0; + list_move(&node->list, local_free_list(loc_l)); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + return; + } + +check_lru_list: + bpf_lru_list_push_free(&lru->common_lru.lru_list, node); +} + +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + u32 i; + + for (i = 0; i < nr_elems; i++) { + struct bpf_lru_node *node; + + node = (struct bpf_lru_node *)(buf + node_offset); + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + buf += elem_size; + } +} + +static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) + INIT_LIST_HEAD(&loc_l->lists[i]); + + loc_l->next_steal = cpu; + + raw_spin_lock_init(&loc_l->lock); +} + +static void bpf_lru_list_init(struct bpf_lru_list *l) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LIST_T; i++) + INIT_LIST_HEAD(&l->lists[i]); + + for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) + 
l->counts[i] = 0; + + l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + + raw_spin_lock_init(&l->lock); +} + +int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, + del_from_htab_func del_from_htab, void *del_arg) +{ + int cpu; + struct bpf_common_lru *clru = &lru->common_lru; + + clru->local_list = alloc_percpu(struct bpf_lru_locallist); + if (!clru->local_list) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + bpf_lru_locallist_init(loc_l, cpu); + } + + bpf_lru_list_init(&clru->lru_list); + lru->nr_scans = LOCAL_NR_SCANS; + + lru->del_from_htab = del_from_htab; + lru->del_arg = del_arg; + lru->hash_offset = hash_offset; + + return 0; +} + +void bpf_lru_destroy(struct bpf_lru *lru) +{ + free_percpu(lru->common_lru.local_list); +} diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h new file mode 100644 index 000000000000..aaa2445ed1ca --- /dev/null +++ b/kernel/bpf/bpf_lru_list.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __BPF_LRU_LIST_H_ +#define __BPF_LRU_LIST_H_ + +#include <linux/list.h> +#include <linux/spinlock_types.h> + +#define NR_BPF_LRU_LIST_T (3) +#define NR_BPF_LRU_LIST_COUNT (2) +#define NR_BPF_LRU_LOCAL_LIST_T (2) +#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T + +enum bpf_lru_list_type { + BPF_LRU_LIST_T_ACTIVE, + BPF_LRU_LIST_T_INACTIVE, + BPF_LRU_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_PENDING, +}; + +struct bpf_lru_node { + struct list_head list; + u16 cpu; + u8 type; + u8 ref; +}; + +struct bpf_lru_list { + struct list_head lists[NR_BPF_LRU_LIST_T]; + unsigned int counts[NR_BPF_LRU_LIST_COUNT]; + /* The next inactive list rotation starts from here */ + struct list_head *next_inactive_rotation; + + raw_spinlock_t lock ____cacheline_aligned_in_smp; +}; + +struct bpf_lru_locallist { + struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + u16 next_steal; + raw_spinlock_t lock; +}; + +struct bpf_common_lru { + struct bpf_lru_list lru_list; + struct bpf_lru_locallist __percpu *local_list; +}; + +typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); + +struct bpf_lru { + struct bpf_common_lru common_lru; + del_from_htab_func del_from_htab; + void *del_arg; + unsigned int hash_offset; + unsigned int nr_scans; +}; + +static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) +{ + /* ref is an approximation of access frequency. It does not + * have to be very accurate. Hence, no protection is used. + */ + node->ref = 1; +} + +int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, + del_from_htab_func del_from_htab, void *delete_arg); +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems); +void bpf_lru_destroy(struct bpf_lru *lru); +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); +void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); + +#endif -- cgit v1.2.3 From 961578b63474d13ad0e2f615fcc2901c5197dda6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:07 -0800 Subject: bpf: Add percpu LRU list Instead of having a common LRU list, this patch allows a percpu LRU list, which can be selected by specifying a map attribute.
The map attribute will be added in a later patch. While the common use case for LRU is #reads >> #updates, a percpu LRU list allows a bpf prog to absorb an unusual number of updates in pathological cases (e.g. an external-traffic-facing machine that could be under attack). Each percpu LRU list is isolated from the others, and the LRU nodes (including free nodes) cannot be moved across different LRU lists. Here is the update performance comparison between the common LRU list and the percpu LRU list (the test code is in the last patch): [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/bpf_lru_list.c | 162 +++++++++++++++++++++++++++++++++++++++++----- kernel/bpf/bpf_lru_list.h | 8 ++- 2 files changed, 151 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 73f67094f93a..bfebff010ba9 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,6 +13,9 @@ #define LOCAL_FREE_TARGET (128) #define LOCAL_NR_SCANS LOCAL_FREE_TARGET +#define PERCPU_FREE_TARGET (16) +#define PERCPU_NR_SCANS PERCPU_FREE_TARGET + /* Helpers to get the local list index */ #define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) #define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) @@ -396,7 +399,40 @@ ignore_ref: return NULL; } -struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct list_head *free_list; + struct bpf_lru_node *node = NULL; + struct bpf_lru_list *l; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + l = per_cpu_ptr(lru->percpu_lru, cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_list_rotate(lru, l); + + free_list = &l->lists[BPF_LRU_LIST_T_FREE]; + if (list_empty(free_list)) + __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, + BPF_LRU_LIST_T_FREE); + + if (!list_empty(free_list)) { + node = list_first_entry(free_list, struct bpf_lru_node, list); + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->ref = 0; + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + } + + raw_spin_unlock_irqrestore(&l->lock, flags); + + return node; +} + +static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, + u32 hash) { struct bpf_lru_locallist *loc_l, *steal_loc_l; struct bpf_common_lru *clru = &lru->common_lru; @@ -458,7 +494,16 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) return node; } -void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +{ + if (lru->percpu) + return bpf_percpu_lru_pop_free(lru, hash); + else + return bpf_common_lru_pop_free(lru, hash); +} + +static void bpf_common_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) { unsigned long flags; @@ -490,8 +535,31 @@ check_lru_list: bpf_lru_list_push_free(&lru->common_lru.lru_list, node); } -void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static
void bpf_percpu_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + struct bpf_lru_list *l; + unsigned long flags; + + l = per_cpu_ptr(lru->percpu_lru, node->cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +{ + if (lru->percpu) + bpf_percpu_lru_push_free(lru, node); + else + bpf_common_lru_push_free(lru, node); +} + +void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; @@ -507,6 +575,47 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, } } +void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + u32 i, pcpu_entries; + int cpu; + struct bpf_lru_list *l; + + pcpu_entries = nr_elems / num_possible_cpus(); + + i = 0; + + for_each_possible_cpu(cpu) { + struct bpf_lru_node *node; + + l = per_cpu_ptr(lru->percpu_lru, cpu); +again: + node = (struct bpf_lru_node *)(buf + node_offset); + node->cpu = cpu; + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } +} + +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + if (lru->percpu) + bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); + else + bpf_common_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); +} + static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { int i; @@ -534,26 +643,42 @@ static void bpf_lru_list_init(struct bpf_lru_list *l) raw_spin_lock_init(&l->lock); } -int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *del_arg) { int cpu; - struct bpf_common_lru *clru = &lru->common_lru; - clru->local_list = alloc_percpu(struct bpf_lru_locallist); - if (!clru->local_list) - return -ENOMEM; + if (percpu) { + lru->percpu_lru = alloc_percpu(struct bpf_lru_list); + if (!lru->percpu_lru) + return -ENOMEM; - for_each_possible_cpu(cpu) { - struct bpf_lru_locallist *loc_l; + for_each_possible_cpu(cpu) { + struct bpf_lru_list *l; - loc_l = per_cpu_ptr(clru->local_list, cpu); - bpf_lru_locallist_init(loc_l, cpu); - } + l = per_cpu_ptr(lru->percpu_lru, cpu); + bpf_lru_list_init(l); + } + lru->nr_scans = PERCPU_NR_SCANS; + } else { + struct bpf_common_lru *clru = &lru->common_lru; - bpf_lru_list_init(&clru->lru_list); - lru->nr_scans = LOCAL_NR_SCANS; + clru->local_list = alloc_percpu(struct bpf_lru_locallist); + if (!clru->local_list) + return -ENOMEM; + for_each_possible_cpu(cpu) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + bpf_lru_locallist_init(loc_l, cpu); + } + + bpf_lru_list_init(&clru->lru_list); + lru->nr_scans = LOCAL_NR_SCANS; + } + + lru->percpu = percpu; lru->del_from_htab = del_from_htab; lru->del_arg = del_arg; lru->hash_offset = hash_offset; @@ -563,5 +688,8 @@ int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, void bpf_lru_destroy(struct bpf_lru *lru) { - free_percpu(lru->common_lru.local_list); + if (lru->percpu) + free_percpu(lru->percpu_lru); + else + free_percpu(lru->common_lru.local_list); } diff --git 
a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index aaa2445ed1ca..5c35a98d02bf 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -53,11 +53,15 @@ struct bpf_common_lru { typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); struct bpf_lru { - struct bpf_common_lru common_lru; + union { + struct bpf_common_lru common_lru; + struct bpf_lru_list __percpu *percpu_lru; + }; del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; unsigned int nr_scans; + bool percpu; }; static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) @@ -68,7 +72,7 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) node->ref = 1; } -int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *delete_arg); void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems); -- cgit v1.2.3 From fd91de7b3c69a7f108b92521e1115df3e058af55 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:08 -0800 Subject: bpf: Refactor codes handling percpu map Refactor the codes that populate the value of a htab_elem in a BPF_MAP_TYPE_PERCPU_HASH typed bpf_map. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ad1bc67aff1b..b478d80f9771 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -420,6 +420,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } } +static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + u32 size = round_up(htab->map.value_size, 8); + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, @@ -479,18 +497,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - if (!onallcpus) { - /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); - } else { - int off = 0, cpu; + pcpu_copy_value(htab, pptr, value, onallcpus); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); } else { @@ -606,22 +614,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); - u32 size = htab->map.value_size; - /* per-cpu hash map can update value in-place */ - if (!onallcpus) { - memcpy(this_cpu_ptr(pptr), value, size); - } else { - int off = 0, cpu; - - size = round_up(size, 8); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus, false); -- cgit v1.2.3 From 
29ba732acbeece1e34c68483d1ec1f3720fa1bb3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:09 -0800 Subject: bpf: Add BPF_MAP_TYPE_LRU_HASH Provide a LRU version of the existing BPF_MAP_TYPE_HASH. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 ++ kernel/bpf/hashtab.c | 266 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 260 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2f38e0091b6..ed8c6799fb14 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -85,6 +85,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, }; enum bpf_prog_type { @@ -106,6 +107,13 @@ enum bpf_prog_type { #define BPF_EXIST 2 /* update existing element */ #define BPF_F_NO_PREALLOC (1U << 0) +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ +#define BPF_F_NO_COMMON_LRU (1U << 1) union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b478d80f9771..4a9e71a7c41f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -15,6 +15,7 @@ #include #include #include "percpu_freelist.h" +#include "bpf_lru_list.h" struct bucket { struct hlist_head head; @@ -25,7 +26,10 @@ struct bpf_htab { struct bpf_map map; struct bucket *buckets; void *elems; - struct pcpu_freelist freelist; + union { + struct pcpu_freelist freelist; + struct bpf_lru lru; + }; void __percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ @@ -48,11 +52,19 @@ struct htab_elem { union { struct rcu_head rcu; enum extra_elem_state state; + struct bpf_lru_node lru_node; }; u32 hash; char key[0] __aligned(8); }; +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); + +static bool htab_is_lru(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH; +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -87,7 +99,22 @@ free_elems: vfree(htab->elems); } -static int prealloc_elems_and_freelist(struct bpf_htab *htab) +static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, + u32 hash) +{ + struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); + struct htab_elem *l; + + if (node) { + l = container_of(node, struct htab_elem, lru_node); + memcpy(l->key, key, htab->map.key_size); + return l; + } + + return NULL; +} + +static int prealloc_init(struct bpf_htab *htab) { int err = -ENOMEM, i; @@ -110,12 +137,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) } skip_percpu_elems: - err = pcpu_freelist_init(&htab->freelist); + if (htab_is_lru(htab)) + err = bpf_lru_init(&htab->lru, + htab->map.map_flags & BPF_F_NO_COMMON_LRU, + offsetof(struct htab_elem, hash) - + offsetof(struct htab_elem, lru_node), + htab_lru_map_delete_node, + htab); + else + err = pcpu_freelist_init(&htab->freelist); + if (err) goto free_elems; - pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, - htab->map.max_entries); + if (htab_is_lru(htab)) + bpf_lru_populate(&htab->lru, htab->elems, + offsetof(struct 
htab_elem, lru_node), + htab->elem_size, htab->map.max_entries); + else + pcpu_freelist_populate(&htab->freelist, htab->elems, + htab->elem_size, htab->map.max_entries); + return 0; free_elems: @@ -123,6 +165,16 @@ free_elems: return err; } +static void prealloc_destroy(struct bpf_htab *htab) +{ + htab_free_elems(htab); + + if (htab_is_lru(htab)) + bpf_lru_destroy(&htab->lru); + else + pcpu_freelist_destroy(&htab->freelist); +} + static int alloc_extra_elems(struct bpf_htab *htab) { void __percpu *pptr; @@ -144,14 +196,34 @@ static int alloc_extra_elems(struct bpf_htab *htab) static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; + bool lru = attr->map_type == BPF_MAP_TYPE_LRU_HASH; + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; u64 cost; - if (attr->map_flags & ~BPF_F_NO_PREALLOC) + if (lru && !capable(CAP_SYS_ADMIN)) + /* LRU implementation is much complicated than other + * maps. Hence, limit to CAP_SYS_ADMIN for now. + */ + return ERR_PTR(-EPERM); + + if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); + if (!lru && percpu_lru) + return ERR_PTR(-EINVAL); + + if (lru && !prealloc) + return ERR_PTR(-ENOTSUPP); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -171,6 +243,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size == 0) goto free_htab; + if (percpu_lru) { + /* ensure each CPU's lru list has >=1 elements. + * since we are at it, make each lru list has the same + * number of elements. + */ + htab->map.max_entries = roundup(attr->max_entries, + num_possible_cpus()); + if (htab->map.max_entries < attr->max_entries) + htab->map.max_entries = rounddown(attr->max_entries, + num_possible_cpus()); + } + /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); @@ -241,14 +325,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu) { + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ err = alloc_extra_elems(htab); if (err) goto free_buckets; } - if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { - err = prealloc_elems_and_freelist(htab); + if (prealloc) { + err = prealloc_init(htab); if (err) goto free_extra_elems; } @@ -323,6 +410,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return l->key + round_up(map->key_size, 8); + } + + return NULL; +} + +/* It is called from the bpf_lru_list when the LRU needs to delete + * older elements from the htab. 
+ */ +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) +{ + struct bpf_htab *htab = (struct bpf_htab *)arg; + struct htab_elem *l, *tgt_l; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + + tgt_l = container_of(node, struct htab_elem, lru_node); + b = __select_bucket(htab, tgt_l->hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l == tgt_l) { + hlist_del_rcu(&l->hash_node); + break; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + + return l == tgt_l; +} + /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -579,6 +706,70 @@ err: return ret; } +static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old = NULL; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because getting free nodes from LRU may need + * to remove older elements from htab and this removal + * operation will need a bucket lock. + */ + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + bpf_lru_node_set_ref(&l_new->lru_node); + hlist_del_rcu(&l_old->hash_node); + } + ret = 0; + +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + + if (ret) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + else if (l_old) + bpf_lru_push_free(&htab->lru, &l_old->lru_node); + + return ret; +} + static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) @@ -671,6 +862,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } +static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + ret = 0; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l) + bpf_lru_push_free(&htab->lru, &l->lru_node); + return ret; +} + static void delete_all_elements(struct bpf_htab *htab) { int i; @@ -703,12 +927,11 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. 
*/ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + if (htab->map.map_flags & BPF_F_NO_PREALLOC) delete_all_elements(htab); - } else { - htab_free_elems(htab); - pcpu_freelist_destroy(&htab->freelist); - } + else + prealloc_destroy(htab); + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); @@ -728,6 +951,20 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +static const struct bpf_map_ops htab_lru_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_map_lookup_elem, + .map_update_elem = htab_lru_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_type __read_mostly = { + .ops = &htab_lru_ops, + .type = BPF_MAP_TYPE_LRU_HASH, +}; + /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -798,6 +1035,7 @@ static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); + bpf_register_map_type(&htab_lru_type); return 0; } late_initcall(register_htab_map); -- cgit v1.2.3 From 8f8449384ec364ba2a654f11f94e754e4ff719e0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:10 -0800 Subject: bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH Provide a LRU version of the existing BPF_MAP_TYPE_PERCPU_HASH Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 +- kernel/bpf/hashtab.c | 129 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/bpf/syscall.c | 8 ++- 3 files changed, 131 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ed8c6799fb14..7d9b2832c280 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -86,6 +86,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, }; enum bpf_prog_type { @@ -108,7 +109,7 @@ enum bpf_prog_type { #define BPF_F_NO_PREALLOC (1U << 0) /* Instead of having one common LRU list in the - * BPF_MAP_TYPE_LRU_HASH map, use a percpu LRU list + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. 
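For illustration only (a hedged sketch, not part of this patch): from userspace, a map using per-CPU LRU lists would be created through the bpf(2) syscall with the new flag. The attribute names follow the uapi above; key/value sizes are arbitrary and error handling is omitted.

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int create_percpu_lru_map(void)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.map_type    = BPF_MAP_TYPE_LRU_HASH;
            attr.key_size    = sizeof(__u32);
            attr.value_size  = sizeof(__u64);
            attr.max_entries = 1024;  /* rounded to a multiple of the CPU count */
            attr.map_flags   = BPF_F_NO_COMMON_LRU;

            return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }

As htab_map_alloc() above shows, BPF_F_NO_COMMON_LRU causes max_entries to be rounded to a multiple of num_possible_cpus() so that every per-CPU LRU list holds the same number of elements.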
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4a9e71a7c41f..34debc1a9641 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -62,7 +62,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); static bool htab_is_lru(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH; + return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + +static bool htab_is_percpu(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, @@ -85,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab) { int i; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto free_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -122,7 +129,7 @@ static int prealloc_init(struct bpf_htab *htab) if (!htab->elems) return -ENOMEM; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto skip_percpu_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -195,8 +202,10 @@ static int alloc_extra_elems(struct bpf_htab *htab) /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { - bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; - bool lru = attr->map_type == BPF_MAP_TYPE_LRU_HASH; + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has @@ -823,12 +832,84 @@ err: return ret; } +static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because LRU's elem alloc may need + * to remove older elem from htab and this removal + * operation will need a bucket lock. 
+ */ + if (map_flags != BPF_EXIST) { + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + } + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + bpf_lru_node_set_ref(&l_old->lru_node); + + /* per-cpu hash map can update value in-place */ + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + value, onallcpus); + hlist_add_head_rcu(&l_new->hash_node, head); + l_new = NULL; + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l_new) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + return ret; +} + static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } +static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, + false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -976,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -993,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + if (htab_is_lru(htab)) + bpf_lru_node_set_ref(&l->lru_node); pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, @@ -1008,10 +1104,16 @@ out: int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int ret; rcu_read_lock(); - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + if (htab_is_lru(htab)) + ret = __htab_lru_percpu_map_update_elem(map, key, value, + map_flags, true); + else + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, + true); rcu_read_unlock(); return ret; @@ -1031,11 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = { .type = BPF_MAP_TYPE_PERCPU_HASH, }; +static const struct bpf_map_ops htab_lru_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_update_elem = htab_lru_percpu_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { + .ops = &htab_lru_percpu_ops, + .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); bpf_register_map_type(&htab_lru_type); + bpf_register_map_type(&htab_lru_percpu_type); return 0; } 
late_initcall(register_htab_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 233e3ac836a6..ce1b7de7d72c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -292,6 +292,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -302,7 +303,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -366,6 +368,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -385,7 +388,8 @@ static int map_update_elem(union bpf_attr *attr) */ preempt_disable(); __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); -- cgit v1.2.3 From fa32e8557b470f5ff90babc6cbacc61535a81a0f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jul 2016 15:25:08 -0400 Subject: tracing: Add new trace_marker_raw A new file is created: /sys/kernel/debug/tracing/trace_marker_raw This allows applications to create data structures and write the binary data directly into it, and then read the trace data out from trace_pipe_raw into the same type of data structure. This saves on converting numbers into ASCII that would be required by trace_marker. Suggested-by: Olof Johansson Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 6 ++ kernel/trace/trace.c | 165 +++++++++++++++++++++++++++++++++-------- kernel/trace/trace.h | 2 + kernel/trace/trace_entries.h | 15 ++++ kernel/trace/trace_output.c | 30 ++++++++ 5 files changed, 187 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 1bc66c1db0cb..6c374c5fe400 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -396,6 +396,12 @@ of ftrace. Here is a list of some of the key files: trace_fd = open("trace_marker", WR_ONLY); + trace_marker_raw: + + This is similar to trace_marker above, but is meant for binary data + to be written to it, where a tool can be used to parse the data + from trace_pipe_raw. + uprobe_events: Add dynamic tracepoints in programs.
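A hedged usage sketch (not part of the patch): the record layout below is hypothetical; the interface only requires that the data starts with an unsigned int tag and stays within the size limit checked in tracing_mark_raw_write().

    #include <fcntl.h>
    #include <unistd.h>

    struct my_raw_event {
            unsigned int id;        /* mandatory leading tag */
            int payload;            /* arbitrary binary data */
    };

    int main(void)
    {
            struct my_raw_event ev = { .id = 0x10, .payload = 42 };
            int fd = open("/sys/kernel/debug/tracing/trace_marker_raw", O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, &ev, sizeof(ev));
            close(fd);
            return 0;
    }

The id written here is what the TRACE_RAW_DATA output format below prints back, letting a tool pair each record read from trace_pipe_raw with the structure that encoded it.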
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d904516dfdab..57069e7f369c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4054,6 +4054,7 @@ static const char readme_msg[] = " x86-tsc: TSC cycle counter\n" #endif "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" @@ -5514,35 +5515,15 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static ssize_t -tracing_mark_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static inline int lock_user_pages(const char __user *ubuf, size_t cnt, + struct page **pages, void **map_page, + int *offset) { unsigned long addr = (unsigned long)ubuf; - struct trace_array *tr = filp->private_data; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct print_entry *entry; - unsigned long irq_flags; - struct page *pages[2]; - void *map_page[2]; int nr_pages = 1; - ssize_t written; - int offset; - int size; - int len; int ret; int i; - if (tracing_disabled) - return -EINVAL; - - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) - return -EINVAL; - - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; - /* * Userspace is injecting traces into the kernel trace buffer. * We want to be as non intrusive as possible. @@ -5557,26 +5538,70 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, * pages directly. We then write the data directly into the * ring buffer. */ - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); /* check if we cross pages */ if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) nr_pages = 2; - offset = addr & (PAGE_SIZE - 1); + *offset = addr & (PAGE_SIZE - 1); addr &= PAGE_MASK; ret = get_user_pages_fast(addr, nr_pages, 0, pages); if (ret < nr_pages) { while (--ret >= 0) put_page(pages[ret]); - written = -EFAULT; - goto out; + return -EFAULT; } for (i = 0; i < nr_pages; i++) map_page[i] = kmap_atomic(pages[i]); + return nr_pages; +} + +static inline void unlock_user_pages(struct page **pages, + void **map_page, int nr_pages) +{ + int i; + + for (i = nr_pages - 1; i >= 0; i--) { + kunmap_atomic(map_page[i]); + put_page(pages[i]); + } +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + void *map_page[2]; + int nr_pages = 1; + ssize_t written; + int offset; + int size; + int len; + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + + nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, &offset); + if (nr_pages < 0) + return nr_pages; + local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ buffer = tr->trace_buffer.buffer; @@ -5611,11 +5636,79 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, *fpos += written; out_unlock: - for (i = nr_pages - 1; i >= 0; i--) { - kunmap_atomic(map_page[i]); - put_page(pages[i]); + unlock_user_pages(pages, map_page, nr_pages); + + return written; +} + +/* Limit it for now to 3K 
(including tag) */ +#define RAW_DATA_MAX_SIZE (1024*3) + +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct raw_data_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + void *map_page[2]; + int nr_pages = 1; + ssize_t written; + int offset; + int size; + int len; + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + + nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, &offset); + if (nr_pages < 0) + return nr_pages; + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt; + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, + irq_flags, preempt_count()); + if (!event) { + /* Ring buffer disabled, return as if not open for write */ + written = -EBADF; + goto out_unlock; } - out: + + entry = ring_buffer_event_data(event); + + if (nr_pages == 2) { + len = PAGE_SIZE - offset; + memcpy(&entry->id, map_page[0] + offset, len); + memcpy(((char *)&entry->id) + len, map_page[1], cnt - len); + } else + memcpy(&entry->id, map_page[0] + offset, cnt); + + __buffer_unlock_commit(buffer, event); + + written = cnt; + + *fpos += written; + + out_unlock: + unlock_user_pages(pages, map_page, nr_pages); + return written; } @@ -5945,6 +6038,13 @@ static const struct file_operations tracing_mark_fops = { .release = tracing_release_generic_tr, }; +static const struct file_operations tracing_mark_raw_fops = { + .open = tracing_open_generic_tr, + .write = tracing_mark_raw_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, @@ -7214,6 +7314,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); + trace_create_file("trace_marker_raw", 0220, d_tracer, + tr, &tracing_mark_raw_fops); + trace_create_file("trace_clock", 0644, d_tracer, tr, &trace_clock_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4b7918902ab8..9294f8606ade 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -40,6 +40,7 @@ enum trace_type { TRACE_BLK, TRACE_BPUTS, TRACE_HWLAT, + TRACE_RAW_DATA, __TRACE_LAST_TYPE, }; @@ -331,6 +332,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ + IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index d1cc37e78f99..eb7396b7e7c3 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -244,6 +244,21 @@ FTRACE_ENTRY(print, print_entry, FILTER_OTHER ); +FTRACE_ENTRY(raw_data, raw_data_entry, + + TRACE_RAW_DATA, + + F_STRUCT( + __field( unsigned int, id ) + __dynamic_array( char, buf ) + ), + + F_printk("id:%04x %08x", + __entry->id, 
(int)__entry->buf[0]), + + FILTER_OTHER +); + FTRACE_ENTRY(bputs, bputs_entry, TRACE_BPUTS, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3fc20422c166..5d33a7352919 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1288,6 +1288,35 @@ static struct trace_event trace_print_event = { .funcs = &trace_print_funcs, }; +static enum print_line_t trace_raw_data(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct raw_data_entry *field; + int i; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "# %x buf:", field->id); + + for (i = 0; i < iter->ent_size - offsetof(struct raw_data_entry, buf); i++) + trace_seq_printf(&iter->seq, " %02x", + (unsigned char)field->buf[i]); + + trace_seq_putc(&iter->seq, '\n'); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions trace_raw_data_funcs = { + .trace = trace_raw_data, + .raw = trace_raw_data, +}; + +static struct trace_event trace_raw_data_event = { + .type = TRACE_RAW_DATA, + .funcs = &trace_raw_data_funcs, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, @@ -1299,6 +1328,7 @@ static struct trace_event *events[] __initdata = { &trace_bprint_event, &trace_print_event, &trace_hwlat_event, + &trace_raw_data_event, NULL }; -- cgit v1.2.3 From d032ae8921ea792c1e6b2abb44022b2403f651f6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 15 Nov 2016 12:31:20 -0800 Subject: ftrace: Provide API to use global filtering for ftrace ops Currently the global_ops filtering hash is not available to outside users registering for function tracing. Provide an API for those users to be able to choose global filtering. This is in preparation for pstore's ftrace feature to be able to use the global filters. 
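A hedged sketch of the intended call sequence; the callback and ops below are hypothetical stand-ins for a user such as pstore, not actual kernel code.

    static void my_trace_func(unsigned long ip, unsigned long parent_ip,
                              struct ftrace_ops *op, struct pt_regs *regs)
    {
            /* record ip, e.g. into a persistent buffer */
    }

    static struct ftrace_ops my_ops = {
            .func = my_trace_func,
    };

    static int __init my_tracer_init(void)
    {
            /* must run before registration, while the ops are uninitialized */
            ftrace_ops_set_global_filter(&my_ops);
            return register_ftrace_function(&my_ops);
    }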
Suggested-by: Steven Rostedt Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Joel Fernandes Acked-by: Steven Rostedt Signed-off-by: Kees Cook --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b3d34d3e0e7e..d4a884db16a3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -398,6 +398,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, void ftrace_set_global_filter(unsigned char *buf, int len, int reset); void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); void ftrace_free_filter(struct ftrace_ops *ops); +void ftrace_ops_set_global_filter(struct ftrace_ops *ops); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); @@ -645,6 +646,7 @@ static inline unsigned long ftrace_location(unsigned long ip) #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_free_filter(ops) do { } while (0) +#define ftrace_ops_set_global_filter(ops) do { } while (0) static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return -ENODEV; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2050a7652a86..89d46e1c9302 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4239,6 +4239,23 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); +/** + * ftrace_ops_set_global_filter - setup ops to use global filters + * @ops - the ops which will use the global filters + * + * ftrace users who need global function trace filtering should call this. + * It can set the global filter only if ops were not initialized before. + */ +void ftrace_ops_set_global_filter(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_INITIALIZED) + return; + + ftrace_ops_init(ops); + ops->func_hash = &global_ops.local_hash; +} +EXPORT_SYMBOL_GPL(ftrace_ops_set_global_filter); + static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) -- cgit v1.2.3 From 74ba181e61c6accf9066d6980f44588de2f854f6 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 11 Nov 2016 00:10:08 -0500 Subject: timer: Move sys_alarm from timer.c to itimer.c Move the only user of alarm_setitimer to itimer.c where it is defined. This allows for making alarm_setitimer static, and dropping it from the build when __ARCH_WANT_SYS_ALARM is not defined. 
Signed-off-by: Nicolas Pitre Acked-by: John Stultz Cc: Paul Bolle Cc: linux-kbuild@vger.kernel.org Cc: netdev@vger.kernel.org Cc: Richard Cochran Cc: Josh Triplett Cc: Michal Marek Cc: Edward Cree Link: http://lkml.kernel.org/r/1478841010-28605-5-git-send-email-nicolas.pitre@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/time.h | 2 -- kernel/time/itimer.c | 15 ++++++++++++++- kernel/time/timer.c | 13 ------------- 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index 4cea09d94208..23f0f5ce3090 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -172,8 +172,6 @@ extern int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue); extern int do_getitimer(int which, struct itimerval *value); -extern unsigned int alarm_setitimer(unsigned int seconds); - extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags); struct tms; diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 1d5c7204ddc9..2b9f45bc955d 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -238,6 +238,8 @@ again: return 0; } +#ifdef __ARCH_WANT_SYS_ALARM + /** * alarm_setitimer - set alarm in seconds * @@ -250,7 +252,7 @@ again: * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid * negative timeval settings which would cause immediate expiry. */ -unsigned int alarm_setitimer(unsigned int seconds) +static unsigned int alarm_setitimer(unsigned int seconds) { struct itimerval it_new, it_old; @@ -275,6 +277,17 @@ unsigned int alarm_setitimer(unsigned int seconds) return it_old.it_value.tv_sec; } +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, struct itimerval __user *, ovalue) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 88aab86a4594..42d27aa242b9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1662,19 +1662,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -#ifdef __ARCH_WANT_SYS_ALARM - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. - */ -SYSCALL_DEFINE1(alarm, unsigned int, seconds) -{ - return alarm_setitimer(seconds); -} - -#endif - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); -- cgit v1.2.3 From 53d3eaa31508222e445b489f3c3ac4c63542a4ef Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 11 Nov 2016 00:10:09 -0500 Subject: posix_cpu_timers: Move the add_device_randomness() call to a proper place There is no logical relation between add_device_randomness() and posix_cpu_timers_exit(). Let's move the former to where the latter is called. This way, when posix-cpu-timers.c is compiled out, there is no need to worry about losing a call to add_device_randomness().
Signed-off-by: Nicolas Pitre Acked-by: John Stultz Cc: Paul Bolle Cc: linux-kbuild@vger.kernel.org Cc: netdev@vger.kernel.org Cc: Richard Cochran Cc: Josh Triplett Cc: Michal Marek Cc: Edward Cree Link: http://lkml.kernel.org/r/1478841010-28605-6-git-send-email-nicolas.pitre@linaro.org Signed-off-by: Thomas Gleixner --- kernel/exit.c | 4 ++++ kernel/time/posix-cpu-timers.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9d68c45ebbe3..d16bcdd89dbe 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,7 @@ #include #include #include +#include <linux/random.h> #include #include @@ -116,6 +117,9 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); + /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 39008d78927a..e582f20f47a4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -9,7 +9,6 @@ #include #include #include -#include <linux/random.h> #include #include @@ -447,10 +446,7 @@ static void cleanup_timers(struct list_head *head) */ void posix_cpu_timers_exit(struct task_struct *tsk) { - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers); - } void posix_cpu_timers_exit_group(struct task_struct *tsk) { -- cgit v1.2.3 From baa73d9e478ff32d62f3f9422822b59dd9a95a21 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 11 Nov 2016 00:10:10 -0500 Subject: posix-timers: Make them configurable Some embedded systems have no use for them. This removes about 25KB from the kernel binary size when configured out. Corresponding syscalls are routed to a stub that logs the attempt to use those syscalls, which should be enough of a clue if they were disabled without proper consideration. They are: timer_create, timer_gettime, timer_getoverrun, timer_settime, timer_delete, clock_adjtime, setitimer, getitimer, alarm. The clock_settime, clock_gettime, clock_getres and clock_nanosleep syscalls are replaced by simple wrappers compatible with CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_BOOTTIME only, which should cover the vast majority of use cases with very little code.
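For illustration (hypothetical userspace, not part of the patch): on a kernel built with CONFIG_POSIX_TIMERS=n, one of the stubbed syscalls simply fails with ENOSYS and the kernel logs the attempt.

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            timer_t id;
            struct sigevent sev = {
                    .sigev_notify = SIGEV_SIGNAL,
                    .sigev_signo  = SIGALRM,
            };

            if (timer_create(CLOCK_MONOTONIC, &sev, &id) && errno == ENOSYS)
                    fprintf(stderr, "POSIX timers not compiled in\n");
            return 0;
    }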
Signed-off-by: Nicolas Pitre Acked-by: Richard Cochran Acked-by: Thomas Gleixner Acked-by: John Stultz Reviewed-by: Josh Triplett Cc: Paul Bolle Cc: linux-kbuild@vger.kernel.org Cc: netdev@vger.kernel.org Cc: Michal Marek Cc: Edward Cree Link: http://lkml.kernel.org/r/1478841010-28605-7-git-send-email-nicolas.pitre@linaro.org Signed-off-by: Thomas Gleixner --- arch/alpha/kernel/osf_sys.c | 8 +++ drivers/char/Kconfig | 1 + drivers/ptp/Kconfig | 2 +- fs/exec.c | 2 + init/Kconfig | 17 ++++++ kernel/compat.c | 8 +++ kernel/exit.c | 11 +++- kernel/fork.c | 2 + kernel/signal.c | 6 +++ kernel/sys.c | 3 +- kernel/time/Makefile | 10 +++- kernel/time/alarmtimer.c | 6 ++- kernel/time/posix-stubs.c | 123 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/timer.c | 3 +- security/selinux/hooks.c | 11 ++-- 15 files changed, 200 insertions(+), 13 deletions(-) create mode 100644 kernel/time/posix-stubs.c (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index ffb93f499c83..56e427c7aa3c 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1029,11 +1029,16 @@ SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv, return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } +asmlinkage long sys_ni_posix_timers(void); + SYSCALL_DEFINE2(osf_getitimer, int, which, struct itimerval32 __user *, it) { struct itimerval kit; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + error = do_getitimer(which, &kit); if (!error && put_it32(it, &kit)) error = -EFAULT; @@ -1047,6 +1052,9 @@ SYSCALL_DEFINE3(osf_setitimer, int, which, struct itimerval32 __user *, in, struct itimerval kin, kout; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + if (in) { if (get_it32(&kin, in)) return -EFAULT; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index dcc09739a54e..45ba878ae025 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -542,6 +542,7 @@ config HANGCHECK_TIMER config MMTIMER tristate "MMTIMER Memory mapped RTC for SGI Altix" depends on IA64_GENERIC || IA64_SGI_SN2 + depends on POSIX_TIMERS default y help The mmtimer device allows direct userspace access to the diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 0f7492f8ea23..bdce33291161 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -6,7 +6,7 @@ menu "PTP clock support" config PTP_1588_CLOCK tristate "PTP clock support" - depends on NET + depends on NET && POSIX_TIMERS select PPS select NET_PTP_CLASSIFY help diff --git a/fs/exec.c b/fs/exec.c index 4e497b9ee71e..923c57d96899 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1169,8 +1169,10 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; +#ifdef CONFIG_POSIX_TIMERS exit_itimers(sig); flush_itimer_signals(); +#endif if (atomic_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; diff --git a/init/Kconfig b/init/Kconfig index 34407f15e6d3..456e0b891238 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1445,6 +1445,23 @@ config SYSCTL_SYSCALL If unsure say N here. +config POSIX_TIMERS + bool "Posix Clocks & timers" if EXPERT + default y + help + This includes native support for POSIX timers to the kernel. + Some embedded systems have no use for them and therefore they + can be configured out to reduce the size of the kernel image. 
+ + When this option is disabled, the following syscalls won't be + available: timer_create, timer_gettime: timer_getoverrun, + timer_settime, timer_delete, clock_adjtime, getitimer, + setitimer, alarm. Furthermore, the clock_settime, clock_gettime, + clock_getres and clock_nanosleep syscalls will be limited to + CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_BOOTTIME only. + + If unsure say y. + config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT default y diff --git a/kernel/compat.c b/kernel/compat.c index 333d364be29d..b3a047f208a7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -307,12 +307,17 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o, __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); } +asmlinkage long sys_ni_posix_timers(void); + COMPAT_SYSCALL_DEFINE2(getitimer, int, which, struct compat_itimerval __user *, it) { struct itimerval kit; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + error = do_getitimer(which, &kit); if (!error && put_compat_itimerval(it, &kit)) error = -EFAULT; @@ -326,6 +331,9 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, struct itimerval kin, kout; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + if (in) { if (get_compat_itimerval(&kin, in)) return -EFAULT; diff --git a/kernel/exit.c b/kernel/exit.c index d16bcdd89dbe..684de019b674 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -92,11 +92,10 @@ static void __exit_signal(struct task_struct *tsk) lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); +#ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) { posix_cpu_timers_exit_group(tsk); - tty = sig->tty; - sig->tty = NULL; } else { /* * This can only happen if the caller is de_thread(). 
@@ -105,7 +104,13 @@ static void __exit_signal(struct task_struct *tsk) */ if (unlikely(has_group_leader_pid(tsk))) posix_cpu_timers_exit_group(tsk); + } +#endif + if (group_dead) { + tty = sig->tty; + sig->tty = NULL; + } else { /* * If there is any task waiting for the group exit * then notify it: @@ -803,8 +808,10 @@ void __noreturn do_exit(long code) acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { +#ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); +#endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..17da35fa77e7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1342,8 +1342,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); +#ifdef CONFIG_POSIX_TIMERS hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; +#endif task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); diff --git a/kernel/signal.c b/kernel/signal.c index 75761acc77cf..29a410780aa9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -427,6 +427,7 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +#ifdef CONFIG_POSIX_TIMERS static void __flush_itimer_signals(struct sigpending *pending) { sigset_t signal, retain; @@ -460,6 +461,7 @@ void flush_itimer_signals(void) __flush_itimer_signals(&tsk->signal->shared_pending); spin_unlock_irqrestore(&tsk->sighand->siglock, flags); } +#endif void ignore_signals(struct task_struct *t) { @@ -567,6 +569,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); +#ifdef CONFIG_POSIX_TIMERS /* * itimer signal ? 
* @@ -590,6 +593,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) hrtimer_restart(tmr); } } +#endif } recalc_sigpending(); @@ -611,6 +615,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } +#ifdef CONFIG_POSIX_TIMERS if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* * Release the siglock to ensure proper locking order @@ -622,6 +627,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) do_schedule_next_timer(info); spin_lock(&tsk->sighand->siglock); } +#endif return signr; } diff --git a/kernel/sys.c b/kernel/sys.c index 89d5be418157..78c9fb7dd680 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1416,7 +1416,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, * applications, so we live with it */ if (!retval && new_rlim && resource == RLIMIT_CPU && - new_rlim->rlim_cur != RLIM_INFINITY) + new_rlim->rlim_cur != RLIM_INFINITY && + IS_ENABLED(CONFIG_POSIX_TIMERS)) update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: read_unlock(&tasklist_lock); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 49eca0beed32..976840d29a71 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,12 @@ -obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o +obj-y += time.o timer.o hrtimer.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o alarmtimer.o + +ifeq ($(CONFIG_POSIX_TIMERS),y) + obj-y += posix-timers.o posix-cpu-timers.o posix-clock.o itimer.o +else + obj-y += posix-stubs.o +endif obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 12dd190634ab..a15caa3d1721 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -846,8 +846,10 @@ static int __init alarmtimer_init(void) alarmtimer_rtc_timer_init(); - posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); - posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); + posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + } /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c new file mode 100644 index 000000000000..cd6716e115e8 --- /dev/null +++ b/kernel/time/posix-stubs.c @@ -0,0 +1,123 @@ +/* + * Dummy stubs used when CONFIG_POSIX_TIMERS=n + * + * Created by: Nicolas Pitre, July 2016 + * Copyright: (C) 2016 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage long sys_ni_posix_timers(void) +{ + pr_err_once("process %d (%s) attempted a POSIX timer syscall " + "while CONFIG_POSIX_TIMERS is not set\n", + current->pid, current->comm); + return -ENOSYS; +} + +#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) + +SYS_NI(timer_create); +SYS_NI(timer_gettime); +SYS_NI(timer_getoverrun); +SYS_NI(timer_settime); +SYS_NI(timer_delete); +SYS_NI(clock_adjtime); +SYS_NI(getitimer); +SYS_NI(setitimer); +#ifdef __ARCH_WANT_SYS_ALARM +SYS_NI(alarm); +#endif + +/* + * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC + * as it is easy to remain compatible with little code. CLOCK_BOOTTIME + * is also included for convenience as at least systemd uses it. + */ + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct timespec __user *, tp) +{ + struct timespec new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) + return -EFAULT; + return do_sys_settimeofday(&new_tp, NULL); +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *,tp) +{ + struct timespec kernel_tp; + + switch (which_clock) { + case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; + case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; + case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; + default: return -EINVAL; + } + if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) +{ + struct timespec rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct timespec t; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + if (!timespec_valid(&t)) + return -EINVAL; + return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? 
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +long clock_nanosleep_restart(struct restart_block *restart_block) +{ + return hrtimer_nanosleep_restart(restart_block); +} +#endif diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 42d27aa242b9..e2892e454fe3 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1601,7 +1601,8 @@ void update_process_times(int user_tick) irq_work_tick(); #endif scheduler_tick(); - run_posix_cpu_timers(p); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(p); } /** diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 09fd6108e421..38b79d797aaf 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2525,7 +2525,8 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } task_unlock(current); - update_rlimit_cpu(current, rlimit(RLIMIT_CPU)); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + update_rlimit_cpu(current, rlimit(RLIMIT_CPU)); } } @@ -2555,9 +2556,11 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) */ rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { - memset(&itimer, 0, sizeof itimer); - for (i = 0; i < 3; i++) - do_setitimer(i, &itimer, NULL); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { + memset(&itimer, 0, sizeof itimer); + for (i = 0; i < 3; i++) + do_setitimer(i, &itimer, NULL); + } spin_lock_irq(&current->sighand->siglock); if (!fatal_signal_pending(current)) { flush_sigqueue(&current->pending); -- cgit v1.2.3 From bf0d31c05411f030e7df9f98b5be4d0c6fd5cd39 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Oct 2016 11:03:12 +0200 Subject: locking/core, stop_machine: Yield the CPU during stop machine() Some time ago the following commit: 57f2ffe14fd125c2 ("s390: remove diag 44 calls from cpu_relax()") ... stopped cpu_relax() on s390 yielding to the hypervisor. As it turns out this made stop_machine() run really slow on virtualized overcommitted systems. For example the kprobes test during bootup took several seconds instead of just running unnoticed with large guests. Therefore, yielding was reintroduced with commit: 4d92f50249eb ("s390: reintroduce diag 44 calls for cpu_relax()") ... but in fact the stop machine code seems to be the only place where this yielding was really necessary. This place is probably the most important one as it makes all but one guest CPU wait for one guest CPU. As we now have cpu_relax_yield(), we can use this in multi_cpu_stop(). For now let's only add it here. We can add it later in other places when necessary.
Signed-off-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Noam Camus Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Cc: linuxppc-dev@lists.ozlabs.org Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1477386195-32736-3-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ec9ab2f01489..1eb82661ecdb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -194,7 +194,7 @@ static int multi_cpu_stop(void *data) /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax(); + cpu_relax_yield(); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { -- cgit v1.2.3 From f2f09a4cee3507dba0e24b87ba2961a5c377d3a7 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Oct 2016 11:03:14 +0200 Subject: locking/core: Remove cpu_relax_lowlatency() users With the s390 special case of a yielding cpu_relax() implementation gone, we can now remove all users of cpu_relax_lowlatency() and replace them with cpu_relax(). Signed-off-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Noam Camus Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Cc: linuxppc-dev@lists.ozlabs.org Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1477386195-32736-5-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- drivers/gpu/drm/i915/i915_gem_request.c | 2 +- drivers/vhost/net.c | 4 ++-- kernel/locking/mcs_spinlock.h | 4 ++-- kernel/locking/mutex.c | 4 ++-- kernel/locking/osq_lock.c | 6 +++--- kernel/locking/qrwlock.c | 6 +++--- kernel/locking/rwsem-xadd.c | 4 ++-- lib/lockref.c | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 8832f8ec1583..383d13416442 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -723,7 +723,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req, if (busywait_stop(timeout_us, cpu)) break; - cpu_relax_lowlatency(); + cpu_relax(); } while (!need_resched()); return false; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 5dc128a8da83..5dc34653274a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -342,7 +342,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net, endtime = busy_clock() + vq->busyloop_timeout; while (vhost_can_busy_poll(vq->dev, endtime) && vhost_vq_avail_empty(vq->dev, vq)) - cpu_relax_lowlatency(); + cpu_relax(); preempt_enable(); r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), out_num, in_num, NULL, NULL); @@ -533,7 +533,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) while (vhost_can_busy_poll(&net->dev, endtime) && !sk_has_rx_data(sk) && vhost_vq_avail_empty(&net->dev, vq)) - cpu_relax_lowlatency(); + cpu_relax(); preempt_enable(); diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index c835270f0c2f..6a385aabcce7 100644 
--- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -28,7 +28,7 @@ struct mcs_spinlock { #define arch_mcs_spin_lock_contended(l) \ do { \ while (!(smp_load_acquire(l))) \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } while (0) #endif @@ -108,7 +108,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) return; /* Wait until the next pointer is set */ while (!(next = READ_ONCE(node->next))) - cpu_relax_lowlatency(); + cpu_relax(); } /* Pass lock to next waiter. */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 17a88e929e6a..a65e09a046ac 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -369,7 +369,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) break; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); @@ -492,7 +492,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax_lowlatency(); + cpu_relax(); } if (!waiter) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 05a37857ab55..4ea2710b9d6c 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -75,7 +75,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, break; } - cpu_relax_lowlatency(); + cpu_relax(); } return next; @@ -122,7 +122,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) if (need_resched()) goto unqueue; - cpu_relax_lowlatency(); + cpu_relax(); } return true; @@ -148,7 +148,7 @@ unqueue: if (smp_load_acquire(&node->locked)) return true; - cpu_relax_lowlatency(); + cpu_relax(); /* * Or we race against a concurrent unqueue()'s step-B, in which diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 19248ddf37ce..cc3ed0ccdfa2 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -54,7 +54,7 @@ static __always_inline void rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { - cpu_relax_lowlatency(); + cpu_relax(); cnts = atomic_read_acquire(&lock->cnts); } } @@ -130,7 +130,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; - cpu_relax_lowlatency(); + cpu_relax(); } /* When no more readers, set the locked flag */ @@ -141,7 +141,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) _QW_LOCKED) == _QW_WAITING)) break; - cpu_relax_lowlatency(); + cpu_relax(); } unlock: arch_spin_unlock(&lock->wait_lock); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2337b4bb2366..2fa2e2e64950 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -368,7 +368,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) return false; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); out: @@ -423,7 +423,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. 
*/ - cpu_relax_lowlatency(); + cpu_relax(); } osq_unlock(&sem->osq); done: diff --git a/lib/lockref.c b/lib/lockref.c index 5a92189ad711..c4bfcb8836cd 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -20,7 +20,7 @@ if (likely(old.lock_count == prev.lock_count)) { \ SUCCESS; \ } \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } \ } while (0) -- cgit v1.2.3 From 527b0a76f41d062381adbb55c8eb61e32cb0bfc9 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 11 Nov 2016 15:27:49 +0100 Subject: sched/cpuacct: Avoid %lld seq_printf warning For s390 kernel builds I keep getting this warning: kernel/sched/cpuacct.c: In function 'cpuacct_stats_show': kernel/sched/cpuacct.c:298:25: warning: format '%lld' expects argument of type 'long long int', but argument 4 has type 'clock_t {aka long int}' [-Wformat=] seq_printf(sf, "%s %lld\n", Silence the warning by adding an explicit cast. Signed-off-by: Martin Schwidefsky Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161111142749.6545-1-schwidefsky@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index bc0b309c3f19..9add206b5608 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[stat], - cputime64_to_clock_t(val[stat])); + (long long)cputime64_to_clock_t(val[stat])); } return 0; -- cgit v1.2.3 From 104cb16d9eb684f071d5bf3aa87c0d01af259b7c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:07 +0100 Subject: sched/fair: Compute task/cpu utilization at wake-up correctly At task wake-up load-tracking isn't updated until the task is enqueued. The task's own view of its utilization contribution may therefore not be aligned with its contribution to the cfs_rq load-tracking which may have been updated in the meantime. Basically, the task's own utilization hasn't yet accounted for the sleep decay, while the cfs_rq may have (partially). Estimating the cfs_rq utilization in case the task is migrated at wake-up as task_rq(p)->cfs.avg.util_avg - p->se.avg.util_avg is therefore incorrect as the two load-tracking signals aren't time synchronized (different last update). To solve this problem, this patch synchronizes the task utilization with its previous rq before the task utilization is used in the wake-up path. Currently the update/synchronization is done _after_ the task has been placed by select_task_rq_fair(). The synchronization is done without having to take the rq lock using the existing mechanism used in remove_entity_load_avg(). 
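Distilled into a hedged sketch (the helper name is invented; the real logic is split across sync_entity_load_avg(), cpu_util_wake() and wake_cap() in the diff below):

    /* Illustrative only: p's utilization may be estimated and subtracted
     * from its previous CPU's only after p's signal has been decayed up
     * to that CPU's last load-tracking update. */
    static unsigned long cpu_util_without(int cpu, struct task_struct *p)
    {
            sync_entity_load_avg(&p->se);   /* apply the sleep decay */
            return max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
    }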
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3cf446c53043..b05d691bbda8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3198,6 +3198,19 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) } #endif +/* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). @@ -3205,7 +3218,6 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3217,9 +3229,7 @@ void remove_entity_load_avg(struct sched_entity *se) * calls this. */ - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -5582,6 +5592,24 @@ static inline int task_util(struct task_struct *p) return p->se.avg.util_avg; } +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + /* * Disable WAKE_AFFINE in the case where task @p doesn't fit in the * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. @@ -5600,6 +5628,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) if (max_cap - min_cap < max_cap >> 3) return 0; + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + return min_cap * 1024 < task_util(p) * capacity_margin; } -- cgit v1.2.3 From 6a0b19c0f39a7a7b7fb77d3867a733136ff059a3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:08 +0100 Subject: sched/fair: Consider spare capacity in find_idlest_group() In low-utilization scenarios comparing relative loads in find_idlest_group() doesn't always lead to an optimal choice. Systems with groups containing different numbers of cpus and/or cpus of different compute capacity are significantly better off when considering spare capacity rather than relative load in those scenarios.
In addition to existing load based search an alternative spare capacity based candidate sched_group is found and selected instead if sufficient spare capacity exists. If not, existing behaviour is preserved. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b05d691bbda8..1ad37064c0c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5202,6 +5202,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return 1; } +static inline int task_util(struct task_struct *p); +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5211,7 +5219,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *most_spare_sg = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5219,7 +5229,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -5231,8 +5241,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -5242,6 +5256,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + spare_cap = capacity_spare_wake(i, p); + + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -5249,12 +5268,33 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_spare = max_spare_cap; + } else { + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. 
+ */ + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; -- cgit v1.2.3 From bf475ce0a3dd75b5d1df6c6c14ae25168caa15ac Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:09 +0100 Subject: sched/fair: Add per-CPU min capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all CPUs in the sched_group. Unless it is divided by the group_weight to get the average capacity per CPU, it hides differences in CPU capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers CPUs of different capacities. Instead, by extending struct sched_group_capacity to indicate min per-CPU capacity in the group a suitable group for a given task utilization can more easily be found such that CPUs with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- kernel/sched/fair.c | 17 ++++++++++++----- kernel/sched/sched.h | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3cfa0dd5b34..6bf1fd3514d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5708,7 +5708,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -6185,6 +6185,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. 
*/ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1ad37064c0c2..faf8f18616e6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6909,13 +6909,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6928,6 +6929,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -6952,11 +6954,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + min_capacity = min(capacity, min_capacity); } } else { /* @@ -6966,12 +6969,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = min_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 055f935d4421..345c1ccaba34 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -892,7 +892,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ -- cgit v1.2.3 From 9e0994c0a1c1f82c705f1f66388e1bcffcee8bb9 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:10 +0100 Subject: sched/fair: Avoid pulling tasks from non-overloaded higher capacity groups For asymmetric CPU capacity systems it is counter-productive for throughput if low capacity CPUs are pulling tasks from non-overloaded CPUs with higher capacity. The assumption is that higher CPU capacity is preferred over running alone in a group with lower CPU capacity. This patch rejects higher CPU capacity groups with one or less task per CPU as potential busiest group which could otherwise lead to a series of failing load-balancing attempts leading to a force-migration. 
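The rejection rule can be sketched as a standalone check (a model only: capacity_margin matches the scheduler's 1280, and struct group stands in for the sg_lb_stats/sched_group_capacity fields used below):

#include <stdbool.h>
#include <stdio.h>

#define CAPACITY_MARGIN 1280	/* ~20%, as in the scheduler */

struct group { unsigned long min_capacity; unsigned int nr_running, weight; };

/* Mirrors group_smaller_cpu_capacity() from the patch below. */
static bool smaller_cpu_capacity(struct group *sg, struct group *ref)
{
	return sg->min_capacity * CAPACITY_MARGIN < ref->min_capacity * 1024;
}

static bool reject_as_busiest(struct group *local, struct group *sg)
{
	/* At most one task per CPU and clearly higher capacity:
	 * pulling from it would only hurt throughput. */
	return sg->nr_running <= sg->weight &&
	       smaller_cpu_capacity(local, sg);
}

int main(void)
{
	struct group little = { .min_capacity = 446 };
	struct group big = { .min_capacity = 1024, .nr_running = 2, .weight = 2 };

	printf("%d\n", reject_as_busiest(&little, &big));	/* 1: leave it alone */
	big.nr_running = 4;					/* now overloaded */
	printf("%d\n", reject_as_busiest(&little, &big));	/* 0: fair game */
	return 0;
}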
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index faf8f18616e6..ee39bfda5ae5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7073,6 +7073,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->min_capacity * capacity_margin < + ref->sgc->min_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7176,6 +7187,20 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; -- cgit v1.2.3 From 893c5d2279041afeb593f1fa8edd9d02edf5b7cb Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:12 +0100 Subject: sched/fair: Fix incorrect comment for capacity_margin The comment for capacity_margin introduced in: 3273163c6775 ("sched/fair: Let asymmetric CPU configurations balance at wake-up") ... got its usage the wrong way round - fix it. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee39bfda5ae5..5e6c00ad2ac3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -109,7 +109,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; /* * The margin used when comparing utilization with CPU capacity: - * util * 1024 < capacity * margin + * util * margin < capacity * 1024 */ unsigned int capacity_margin = 1280; /* ~20% */ -- cgit v1.2.3 From df217913e72ec7e603d8b68cc4c70646cf7000db Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:42 +0100 Subject: sched/fair: Factorize attach/detach entity Factorize post_init_entity_util_avg() and part of attach_task_cfs_rq() in one function attach_entity_cfs_rq(). 
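Regarding the capacity_margin fix just above: the corrected inequality is easy to sanity-check in isolation (a throwaway userspace check, utilization numbers invented):

#include <stdbool.h>
#include <stdio.h>

#define CAPACITY_MARGIN 1280	/* ~20% */

/* A task "fits" a CPU if it leaves the margin free. */
static bool task_fits(unsigned long util, unsigned long capacity)
{
	return util * CAPACITY_MARGIN < capacity * 1024;
}

int main(void)
{
	printf("%d\n", task_fits(300, 446));	/* 1: fits a little CPU */
	printf("%d\n", task_fits(400, 446));	/* 0: needs a bigger CPU */
	return 0;
}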
Create symmetric detach_entity_cfs_rq() function. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e6c00ad2ac3..0731affbe81f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -701,9 +701,7 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -735,7 +733,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; - u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -763,14 +760,12 @@ void post_init_entity_util_avg(struct sched_entity *se) * such that the next switched_to_fair() has the * expected state. */ - se->avg.last_update_time = now; + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); return; } } - update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); } #else /* !CONFIG_SMP */ @@ -8783,30 +8778,19 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } -static void detach_task_cfs_rq(struct task_struct *p) +static void detach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } - /* Catch up with the cfs_rq and remove our load when we leave */ update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } -static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -8818,10 +8802,35 @@ static void attach_task_cfs_rq(struct task_struct *p) se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - /* Synchronize task with its cfs_rq */ + /* Synchronize entity with its cfs_rq */ update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. 
+ */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } + + detach_entity_cfs_rq(se); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; -- cgit v1.2.3 From 9c2791f936ef5fd04a118b5c284f2c9a95f4a647 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:43 +0100 Subject: sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list Fix the insertion of cfs_rq in rq->leaf_cfs_rq_list to ensure that a child will always be called before its parent. The hierarchical order of the shares update list was introduced by commit: 67e86250f8ea ("sched: Introduce hierarchal order on shares update list") With the current implementation a child can still be put after its parent. Let's take the example of:

root
  \
   b
  / \
 c   d*
     |
     e*

with root -> b -> c already enqueued but not d -> e, so the leaf_cfs_rq_list looks like:

  head -> c -> b -> root -> tail

The branch d -> e will be added the first time they are enqueued, starting with e then d. When e is added, its parent is not already on the list, so e is put at the tail:

  head -> c -> b -> root -> e -> tail

Then d is added at the head because its parent is already on the list:

  head -> d -> c -> b -> root -> e -> tail

e is not placed at the right position and will be called last, whereas it should be called first. Because it follows the bottom-up enqueue sequence, we are sure that we will finish by adding either a cfs_rq without a parent or a cfs_rq whose parent is already on the list. We can use this event to detect when we have finished adding a new branch. For the others, whose parents are not already added, we have to ensure that they will be added after their children, which were inserted in the steps before, and after any potential parents that are already in the list. The easiest way is to put the cfs_rq just after the last inserted one and to keep track of it until the branch is fully added. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 1 + 3 files changed, 49 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6bf1fd3514d5..dc64bd71ed2b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7604,6 +7604,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get?
* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0731affbe81f..4a67026a2424 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -283,19 +283,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new beg + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 345c1ccaba34..36f30e0aa266 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -623,6 +623,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* -- cgit v1.2.3 From d31b1a66cbe0931733583ad9d9e8c6cfd710907d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:44 +0100 Subject: sched/fair: Factorize PELT update Every time we modify load/utilization of sched_entity, we start to sync it with its cfs_rq. This update is done in different ways: - when attaching/detaching a sched_entity, we update cfs_rq and then we sync the entity with the cfs_rq. - when enqueueing/dequeuing the sched_entity, we update both sched_entity and cfs_rq metrics to now. Use update_load_avg() everytime we have to update and sync cfs_rq and sched_entity before changing the state of a sched_enity. 
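The flag encoding is simple enough to model standalone (flag values are taken from the patch; the printed steps merely stand in for the real PELT arithmetic):

#include <stdio.h>

#define UPDATE_TG	0x1
#define SKIP_AGE_LOAD	0x2

static void update_load_avg(const char *site, int flags)
{
	if (!(flags & SKIP_AGE_LOAD))
		printf("%s: age entity load to now\n", site);
	printf("%s: sync cfs_rq load\n", site);
	if (flags & UPDATE_TG)
		printf("%s: update tg load avg\n", site);
}

int main(void)
{
	update_load_avg("enqueue", UPDATE_TG);
	/* attach path skips aging when ATTACH_AGE_LOAD is off */
	update_load_avg("attach", SKIP_AGE_LOAD);
	return 0;
}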
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 76 ++++++++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4a67026a2424..d707ad037b31 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3092,8 +3092,14 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return decayed || removed_load; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -3104,11 +3110,13 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); } @@ -3122,26 +3130,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - * - * Or we're fresh through post_init_entity_util_avg(). - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. 
- */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; @@ -3161,9 +3149,6 @@ skip_aging: */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); @@ -3178,34 +3163,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3289,7 +3260,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return 0; } -static inline void update_load_avg(struct sched_entity *se, int not_used) +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1) { cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); } @@ -3434,6 +3408,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3508,6 +3483,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); @@ -3595,7 +3571,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3713,7 +3689,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. 
*/ - update_load_avg(curr, 1); + update_load_avg(curr, UPDATE_TG); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4610,7 +4586,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4669,7 +4645,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -8821,10 +8797,9 @@ static inline bool vruntime_normalized(struct task_struct *p) static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } @@ -8832,7 +8807,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8843,7 +8817,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) #endif /* Synchronize entity with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } -- cgit v1.2.3 From 09a43ace1f986b003c118fdf6ddf1fd685692d49 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:45 +0100 Subject: sched/fair: Propagate load during synchronous attach/detach When a task moves from/to a cfs_rq, we set a flag which is then used to propagate the change at parent level (sched_entity and cfs_rq) during next update. If the cfs_rq is throttled, the flag will stay pending until the cfs_rq is unthrottled. For propagating the utilization, we copy the utilization of group cfs_rq to the sched_entity. For propagating the load, we have to take into account the load of the whole task group in order to evaluate the load of the sched_entity. Similarly to what was done before the rewrite of PELT, we add a correction factor in case the task group's load is greater than its share so it will contribute the same load of a task of equal weight. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 1 + 2 files changed, 188 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d707ad037b31..8cf26fd7ce58 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2918,6 +2918,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. 
+ */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -2997,8 +3017,138 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } } + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load*/ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equals to tg->shares will have + * a load less or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level, can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares). + */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. 
+ */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) @@ -3105,6 +3255,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) u64 now = cfs_rq_clock_task(cfs_rq); struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq); + int decayed; /* * Track task load average for carrying it to new CPU after migrated, and @@ -3116,7 +3267,10 @@ static inline void update_load_avg(struct sched_entity *se, int flags) cfs_rq->curr == se, NULL); } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); + + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); } @@ -3135,6 +3289,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3154,6 +3309,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -8794,6 +8950,31 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -8802,6 +8983,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } static void attach_entity_cfs_rq(struct sched_entity *se) @@ -8820,6 +9002,7 @@ 
static void attach_entity_cfs_rq(struct sched_entity *se) update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } static void detach_task_cfs_rq(struct task_struct *p) @@ -8898,6 +9081,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 36f30e0aa266..d7e39317d688 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -404,6 +404,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT -- cgit v1.2.3 From 4e5160766fcc9f41bbd38bac11f92dce993644aa Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:46 +0100 Subject: sched/fair: Propagate asynchrous detach A task can be asynchronously detached from cfs_rq when migrating between CPUs. The load of the migrated task is then removed from source cfs_rq during its next update. We use this event to set propagation flag. During the load balance, we take advantage of the update of blocked load to propagate any pending changes. The propagation relies on patch: "sched: Fix hierarchical order in rq->leaf_cfs_rq_list" ... which orders children and parents, to ensure that it's done in one pass. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cf26fd7ce58..090a9bb51ab2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3219,6 +3219,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -3226,6 +3227,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -6872,6 +6874,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } -- cgit v1.2.3 From d03266910a533d874c01ef2ca8dc73009f2925fa Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:47 +0100 Subject: sched/fair: Fix task group initialization The moves of tasks are now propagated down to root and the utilization of cfs_rq reflects reality so it doesn't need to be estimated at init. 
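For reference, the underflow clamp these propagation patches rely on, the add_positive() helper added earlier in the series, can be verified in userspace; READ_ONCE()/WRITE_ONCE() are reduced to plain accesses so the snippet builds standalone:

#include <stdio.h>

#define add_positive(_ptr, _val) do {			\
	typeof(*(_ptr)) res, var = *(_ptr);		\
	typeof(_val) val = (_val);			\
							\
	res = var + val;				\
	if (val < 0 && res > var)	/* wrapped */	\
		res = 0;				\
	*(_ptr) = res;					\
} while (0)

int main(void)
{
	unsigned long load = 100;

	add_positive(&load, -250L);	/* would underflow: clamped */
	printf("%lu\n", load);		/* 0 */
	add_positive(&load, 300L);
	printf("%lu\n", load);		/* 300 */
	return 0;
}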
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 090a9bb51ab2..02605f2826a2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9198,7 +9198,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); + attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } -- cgit v1.2.3 From 2874aa2e467dbc0b4f7cb0ee5dc872e98e000a47 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Nov 2016 11:00:04 -0800 Subject: bpf: Fix compilation warning in __bpf_lru_list_rotate_inactive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc-6.2.1 gives the following warning: kernel/bpf/bpf_lru_list.c: In function ‘__bpf_lru_list_rotate_inactive.isra.3’: kernel/bpf/bpf_lru_list.c:201:28: warning: ‘next’ may be used uninitialized in this function [-Wmaybe-uninitialized] The "next" is currently initialized in the while() loop which must have >=1 iterations. This patch initializes next to get rid of the compiler warning. Fixes: 3a08c2fd7634 ("bpf: LRU List") Reported-by: David Miller Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/bpf_lru_list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index bfebff010ba9..89b7ef41c86b 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -170,7 +170,7 @@ static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - struct list_head *cur, *next, *last; + struct list_head *cur, *last, *next = inactive; struct bpf_lru_node *node; unsigned int i = 0; -- cgit v1.2.3 From bfe130773862bb3a02cdc4d4c2169f7f0210a46b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2016 10:12:58 +0100 Subject: genirq/affinity: Take reserved vectors into account when spreading irqs The recent addition of reserved vectors at the beginning or the end of the vector space did not take the reserved vectors at the beginning into account for the various loop exit conditions. As a consequence the last vectors of the spread area are not included into the spread algorithm and are treated like the reserved vectors at the end of the vector space and get the default affinity mask assigned. Sum up the affinity vectors and the reserved vectors at the beginning and use the sum as exit condition. 
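The corrected exit condition is easy to sanity-check with a toy model (not the kernel code; the vector counts are made up):

#include <stdio.h>

int main(void)
{
	int nvecs = 16, pre = 2, post = 2;
	int affv = nvecs - pre - post;	/* 12 vectors to spread */
	int last_affv = affv + pre;	/* first index past the spread area: 14 */
	int curvec;

	for (curvec = pre; curvec < last_affv; curvec++)
		printf("vector %2d: spread affinity\n", curvec);
	/* Comparing against affv (12) would wrongly leave vectors 12 and
	 * 13 with the default mask; 0-1 and 14-15 stay reserved. */
	return 0;
}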
[ tglx: Fixed all conditions instead of only one and massaged changelog ] Signed-off-by: Christoph Hellwig Link: http://lkml.kernel.org/r/1479201178-29604-2-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 17360bd9619b..49eb38d48816 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -61,6 +61,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; int affv = nvecs - affd->pre_vectors - affd->post_vectors; + int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; cpumask_var_t nmsk; @@ -87,7 +88,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, cpumask_of_node(n)); - if (++curvec == affv) + if (++curvec == last_affv) break; } goto done; @@ -107,7 +108,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); - for (v = 0; curvec < affv && v < vecs_to_assign; curvec++, v++) { + for (v = 0; curvec < last_affv && v < vecs_to_assign; + curvec++, v++) { cpus_per_vec = ncpus / vecs_to_assign; /* Account for extra vectors to compensate rounding errors */ @@ -119,7 +121,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= affv) + if (curvec >= last_affv) break; } -- cgit v1.2.3 From b6e5d5b947527558afac4aa0cdfa2ac586332e03 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Nov 2016 18:36:44 +0100 Subject: genirq/affinity: Use default affinity mask for reserved vectors The reserved vectors at the beginning and the end of the vector space get cpu_possible_mask assigned as their affinity mask. All other non-auto affine interrupts get the default irq affinity mask assigned. Using cpu_possible_mask breaks that rule. Treat them like any other interrupt and use irq_default_affinity as target mask. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig --- kernel/irq/affinity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 49eb38d48816..9be9bda7c1f9 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -75,7 +75,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, cpu_possible_mask); + cpumask_copy(masks + curvec, irq_default_affinity); /* Stabilize the cpumasks */ get_online_cpus(); @@ -130,7 +130,7 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) - cpumask_copy(masks + curvec, cpu_possible_mask); + cpumask_copy(masks + curvec, irq_default_affinity); out: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3 From f23cc643f9baec7f71f2b74692da3cf03abbbfda Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Nov 2016 15:45:36 -0500 Subject: bpf: fix range arithmetic for bpf map access I made some invalid assumptions with BPF_AND and BPF_MOD that could result in invalid accesses to bpf map entries. Fix this up by doing a few things 1) Kill BPF_MOD support. 
This doesn't actually get used by the compiler in real life and just adds extra complexity. 2) Fix the logic for BPF_AND, don't allow AND of negative numbers and set the minimum value to 0 for positive AND's. 3) Don't do operations on the ranges if they are set to the limits, as they are by definition undefined, and allowing arithmetic operations on those values could make them appear valid when they really aren't. This fixes the testcase provided by Jann as well as a few other theoretical problems. Reported-by: Jann Horn Signed-off-by: Josef Bacik Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 5 ++-- kernel/bpf/verifier.c | 70 +++++++++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7035b997aaa5..6aaf425cebc3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -14,7 +14,7 @@ * are obviously wrong for any sort of memory access. */ #define BPF_REGISTER_MAX_RANGE (1024 * 1024 * 1024) -#define BPF_REGISTER_MIN_RANGE -(1024 * 1024 * 1024) +#define BPF_REGISTER_MIN_RANGE -1 struct bpf_reg_state { enum bpf_reg_type type; @@ -22,7 +22,8 @@ struct bpf_reg_state { * Used to determine if any memory access using this register will * result in a bad access. */ - u64 min_value, max_value; + s64 min_value; + u64 max_value; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99a7e5b388f2..6a936159c6e0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -216,8 +216,8 @@ static void print_verifier_state(struct bpf_verifier_state *state) reg->map_ptr->key_size, reg->map_ptr->value_size); if (reg->min_value != BPF_REGISTER_MIN_RANGE) - verbose(",min_value=%llu", - (unsigned long long)reg->min_value); + verbose(",min_value=%lld", + (long long)reg->min_value); if (reg->max_value != BPF_REGISTER_MAX_RANGE) verbose(",max_value=%llu", (unsigned long long)reg->max_value); @@ -758,7 +758,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if ((s64)reg->min_value < 0) { + if (reg->min_value < 0) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -1468,7 +1468,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg) { if (reg->max_value > BPF_REGISTER_MAX_RANGE) reg->max_value = BPF_REGISTER_MAX_RANGE; - if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) + if (reg->min_value < BPF_REGISTER_MIN_RANGE || + reg->min_value > BPF_REGISTER_MAX_RANGE) reg->min_value = BPF_REGISTER_MIN_RANGE; } @@ -1476,7 +1477,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; - u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; + s64 min_val = BPF_REGISTER_MIN_RANGE; + u64 max_val = BPF_REGISTER_MAX_RANGE; bool min_set = false, max_set = false; u8 opcode = BPF_OP(insn->code); @@ -1512,22 +1514,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, return; } + /* If one of our values was at the end of our ranges then we can't just + * do our normal operations to the register, we need to set the values + * to the min/max since they are undefined. 
+ */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + switch (opcode) { case BPF_ADD: - dst_reg->min_value += min_val; - dst_reg->max_value += max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value += min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value += max_val; break; case BPF_SUB: - dst_reg->min_value -= min_val; - dst_reg->max_value -= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value -= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value -= max_val; break; case BPF_MUL: - dst_reg->min_value *= min_val; - dst_reg->max_value *= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value *= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value *= max_val; break; case BPF_AND: - /* & is special since it could end up with 0 bits set. */ - dst_reg->min_value &= min_val; + /* Disallow AND'ing of negative numbers, ain't nobody got time + * for that. Otherwise the minimum is 0 and the max is the max + * value we could AND against. + */ + if (min_val < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = 0; dst_reg->max_value = max_val; break; case BPF_LSH: @@ -1537,24 +1560,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, */ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else + else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value <<= min_val; if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - else + else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value <<= max_val; break; case BPF_RSH: - dst_reg->min_value >>= min_val; - dst_reg->max_value >>= max_val; - break; - case BPF_MOD: - /* % is special since it is an unsigned modulus, so the floor - * will always be 0. + /* RSH by a negative number is undefined, and the BPF_RSH is an + * unsigned shift, so make the appropriate casts. */ - dst_reg->min_value = 0; - dst_reg->max_value = max_val - 1; + if (min_val < 0 || dst_reg->min_value < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = + (u64)(dst_reg->min_value) >> min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value >>= max_val; break; default: reset_reg_range_values(regs, insn->dst_reg); -- cgit v1.2.3 From 8e2ddb03643eb9d0bc4926946d7ce0d308eef0a5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:20 +0530 Subject: cpufreq: schedutil: Avoid indented labels Switch to the more common practice of writing labels. Suggested-by: Peter Zijlstra Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 69e06898997d..8c4e1652e895 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -454,17 +454,17 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - out: +out: mutex_unlock(&global_tunables_lock); cpufreq_enable_fast_switch(policy); return 0; - fail: +fail: policy->governor_data = NULL; sugov_tunables_free(tunables); - free_sg_policy: +free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); -- cgit v1.2.3 From 4a71ce4348bb61740d411822357061f8bf870f4c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:21 +0530 Subject: cpufreq: schedutil: enable fast switch earlier The fast_switch_enabled flag will be used by both sugov_policy_alloc() and sugov_policy_free() with a later patch. Prepare for that by moving the calls to enable and disable it to the beginning of sugov_init() and end of sugov_exit(). Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8c4e1652e895..68f21bb6bd44 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -416,9 +416,13 @@ static int sugov_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; + cpufreq_enable_fast_switch(policy); + sg_policy = sugov_policy_alloc(policy); - if (!sg_policy) - return -ENOMEM; + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } mutex_lock(&global_tunables_lock); @@ -456,8 +460,6 @@ static int sugov_init(struct cpufreq_policy *policy) out: mutex_unlock(&global_tunables_lock); - - cpufreq_enable_fast_switch(policy); return 0; fail: @@ -468,6 +470,10 @@ free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + pr_err("initialization failed (error %d)\n", ret); return ret; } @@ -478,8 +484,6 @@ static void sugov_exit(struct cpufreq_policy *policy) struct sugov_tunables *tunables = sg_policy->tunables; unsigned int count; - cpufreq_disable_fast_switch(policy); - mutex_lock(&global_tunables_lock); count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); @@ -490,6 +494,7 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + cpufreq_disable_fast_switch(policy); } static int sugov_start(struct cpufreq_policy *policy) -- cgit v1.2.3 From 02a7b1ee3baa15a98b541d8cfd156bbe1a091c20 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:22 +0530 Subject: cpufreq: schedutil: move slow path from workqueue to SCHED_FIFO task If slow path frequency changes are conducted in a SCHED_OTHER context then they may be delayed for some amount of time, including indefinitely, when real time or deadline activity is taking place. Move the slow path to a real time kernel thread. In the future the thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set to 50 for now. 
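The mechanics of the switch correspond roughly to the following userspace analogue (pthreads standing in for kthread_worker; the sched_setscheduler() step is omitted since it needs privileges):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int pending;

/* Dedicated worker: give this thread SCHED_FIFO and RT activity
 * can no longer starve the slow-path frequency update. */
static void *worker_fn(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!pending)
		pthread_cond_wait(&cond, &lock);
	pending = 0;
	pthread_mutex_unlock(&lock);
	printf("slow-path frequency update ran\n");
	return NULL;
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, worker_fn, NULL);

	pthread_mutex_lock(&lock);	/* "kthread_queue_work()" */
	pending = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);

	pthread_join(worker, NULL);
	return 0;
}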
Hackbench results on ARM Exynos, dual core A15 platform for 10 iterations of $ hackbench -s 100 -l 100 -g 10 -f 20:

	Before    After
	---------------
	1.808     1.603
	1.847     1.251
	2.229     1.590
	1.952     1.600
	1.947     1.257
	1.925     1.627
	2.694     1.620
	1.258     1.621
	1.919     1.632
	1.250     1.240
	---------------
	Average:
	1.8829    1.5041

Based on initial work by Steve Muckle. Signed-off-by: Steve Muckle Signed-off-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 85 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 68f21bb6bd44..f165ba0f0766 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,11 +12,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include "sched.h" +#define SUGOV_KTHREAD_PRIORITY 50 + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -35,8 +38,10 @@ struct sugov_policy { /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; - struct work_struct work; + struct kthread_work work; struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; bool work_in_progress; bool need_freq_update; @@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_unlock(&sg_policy->update_lock); } -static void sugov_work(struct work_struct *work) +static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); @@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work) struct sugov_policy *sg_policy; sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - schedule_work_on(smp_processor_id(), &sg_policy->work); + + /* + * For Real Time and Deadline tasks, schedutil governor shoots the + * frequency to maximum. And special care must be taken to ensure that + * this kthread doesn't result in that. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() and before that schedutil + * rejects all other frequency scaling requests. + * + * Though there is a very rare case where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now.
+ */ + kthread_queue_work(&sg_policy->worker, &sg_policy->work); } /************************** sysfs interface ************************/ @@ -372,7 +391,6 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) sg_policy->policy = policy; init_irq_work(&sg_policy->irq_work, sugov_irq_work); - INIT_WORK(&sg_policy->work, sugov_work); mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; @@ -384,6 +402,51 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) kfree(sg_policy); } +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + kthread_init_work(&sg_policy->work, sugov_work); + kthread_init_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + kthread_flush_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); +} + static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) { struct sugov_tunables *tunables; @@ -424,12 +487,16 @@ static int sugov_init(struct cpufreq_policy *policy) goto disable_fast_switch; } + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; + mutex_lock(&global_tunables_lock); if (global_tunables) { if (WARN_ON(have_governor_per_policy())) { ret = -EINVAL; - goto free_sg_policy; + goto stop_kthread; } policy->governor_data = sg_policy; sg_policy->tunables = global_tunables; @@ -441,7 +508,7 @@ static int sugov_init(struct cpufreq_policy *policy) tunables = sugov_tunables_alloc(sg_policy); if (!tunables) { ret = -ENOMEM; - goto free_sg_policy; + goto stop_kthread; } tunables->rate_limit_us = LATENCY_MULTIPLIER; @@ -466,6 +533,9 @@ fail: policy->governor_data = NULL; sugov_tunables_free(tunables); +stop_kthread: + sugov_kthread_stop(sg_policy); + free_sg_policy: mutex_unlock(&global_tunables_lock); @@ -493,6 +563,7 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); + sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); } @@ -541,7 +612,7 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); irq_work_sync(&sg_policy->irq_work); - cancel_work_sync(&sg_policy->work); + kthread_cancel_work_sync(&sg_policy->work); } static void sugov_limits(struct cpufreq_policy *policy) -- cgit v1.2.3 From 21ef57297b15a49b0c4dd4e7135c1a08e9a29a1c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:23 +0530 Subject: cpufreq: schedutil: irq-work and mutex are only used in slow path Execute the irq-work specific initialization/exit code only when the fast path isn't available. 
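As background, the irq_work that is now set up only for the slow path follows the usual pattern; this is an illustrative sketch, not code from the patch:

	#include <linux/irq_work.h>

	static struct irq_work demo_irq_work;

	static void demo_irq_work_fn(struct irq_work *work)
	{
		/* runs in hard interrupt context shortly after being queued */
	}

	static void demo_setup(void)
	{
		/* one-time setup, needed only when the slow path can be taken */
		init_irq_work(&demo_irq_work, demo_irq_work_fn);
	}

	static void demo_trigger(void)
	{
		/* safe from contexts that must not sleep, e.g. a scheduler hook */
		irq_work_queue(&demo_irq_work);
	}

	static void demo_teardown(void)
	{
		/* wait for a queued callback to finish before freeing anything */
		irq_work_sync(&demo_irq_work);
	}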
Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f165ba0f0766..42a220e78f00 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -390,15 +390,12 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) return NULL; sg_policy->policy = policy; - init_irq_work(&sg_policy->irq_work, sugov_irq_work); - mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; } static void sugov_policy_free(struct sugov_policy *sg_policy) { - mutex_destroy(&sg_policy->work_lock); kfree(sg_policy); } @@ -432,6 +429,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) sg_policy->thread = thread; kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + wake_up_process(thread); return 0; @@ -445,6 +445,7 @@ static void sugov_kthread_stop(struct sugov_policy *sg_policy) kthread_flush_worker(&sg_policy->worker); kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); } static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) @@ -611,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); - irq_work_sync(&sg_policy->irq_work); - kthread_cancel_work_sync(&sg_policy->work); + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } } static void sugov_limits(struct cpufreq_policy *policy) -- cgit v1.2.3 From f97960fbddd98e1b7bfba469b8510b49a62b4bc5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 17 Nov 2016 17:31:55 +0100 Subject: kernel/printk: Block cpuhotplug callback when tasks are frozen The recent conversion of the console hotplug notifier to the state machine missed the fact that the notifier only operated on the non-frozen transitions. As a consequence the console_lock/unlock() pair is also invoked during suspend, which results in a lockdep warning. Restore the previous state by making the lock/unlock conditional on !tasks_frozen. Fixes: 90b14889d2f9 ("kernel/printk: Convert to hotplug state machine") Reported-and-tested-by: Borislav Petkov Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1611171729320.3645@nanos Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior --- kernel/printk/printk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4487ffcd42d5..37893da9bae8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2194,8 +2194,10 @@ void resume_console(void) */ static int console_cpu_notify(unsigned int cpu) { - console_lock(); - console_unlock(); + if (!cpuhp_tasks_frozen) { + console_lock(); + console_unlock(); + } return 0; } -- cgit v1.2.3 From c7d03a00b56fc23c3a01a8353789ad257363e281 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 17 Nov 2016 04:58:21 +0300 Subject: netns: make struct pernet_operations::id unsigned int Make struct pernet_operations::id unsigned. There are 2 reasons to do so: 1) This field is really an index into a zero-based array and thus is an unsigned entity. Using a negative value is an out-of-bounds access by definition.
2) On x86_64, unsigned 32-bit data which are mixed with pointers via array indexing or offsets added to or subtracted from pointers are preferred to signed 32-bit data. "int" being used as an array index needs to be sign-extended to 64-bit before being used:

	void f(long *p, int i)
	{
		g(p[i]);
	}

roughly translates to

	movsx	rsi, esi
	mov	rdi, [rsi+...]
	call	g

MOVSX is a 3-byte instruction which isn't necessary if the variable is unsigned, because x86_64 is zero-extending by default. Now, there is the net_generic() function which, you guessed it right, uses "int" as an array index:

	static inline void *net_generic(const struct net *net, int id)
	{
		...
		ptr = ng->ptr[id - 1];
		...
	}

And this function is used a lot, so those sign extensions add up. The patch snipes ~1730 bytes on an allyesconfig kernel (without all the junk messing with code generation): add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) Unfortunately some functions actually grow bigger. This is a seemingly random artefact of code generation, with the register allocator being used differently: gcc decides that some variable needs to live in the new r8+ registers and every access now requires a REX prefix, or it is shifted into r12, so the [r12+0] addressing mode has to be used, which is longer than [r8]. However, the overall balance is in the negative direction:

	add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
	function                            old     new   delta
	nfsd4_lock                         3886    3959     +73
	tipc_link_build_proto_msg          1096    1140     +44
	mac80211_hwsim_new_radio           2776    2808     +32
	tipc_mon_rcv                       1032    1058     +26
	svcauth_gss_legacy_init            1413    1429     +16
	tipc_bcbase_select_primary          379     392     +13
	nfsd4_exchange_id                  1247    1260     +13
	nfsd4_setclientid_confirm           782     793     +11
	...
	put_client_renew_locked             494     480     -14
	ip_set_sockfn_get                   730     716     -14
	geneve_sock_add                     829     813     -16
	nfsd4_sequence_done                 721     703     -18
	nlmclnt_lookup_host                 708     686     -22
	nfsd4_lockt                        1085    1063     -22
	nfs_get_client                     1077    1050     -27
	tcf_bpf_init                       1106    1076     -30
	nfsd4_encode_fattr                 5997    5930     -67
	Total: Before=154856051, After=154854321, chg -0.00%

Signed-off-by: Alexey Dobriyan Signed-off-by: David S.
Miller --- drivers/infiniband/core/cma.c | 2 +- drivers/net/bonding/bond_main.c | 2 +- drivers/net/geneve.c | 2 +- drivers/net/gtp.c | 2 +- drivers/net/ppp/ppp_generic.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- drivers/net/vxlan.c | 2 +- drivers/net/wireless/mac80211_hwsim.c | 2 +- fs/lockd/netns.h | 2 +- fs/lockd/svc.c | 2 +- fs/nfs/inode.c | 2 +- fs/nfs/netns.h | 2 +- fs/nfs_common/grace.c | 2 +- fs/nfsd/netns.h | 2 +- fs/nfsd/nfsctl.c | 2 +- include/net/bonding.h | 2 +- include/net/ip_tunnels.h | 6 +++--- include/net/net_namespace.h | 2 +- include/net/netfilter/nf_conntrack_l4proto.h | 2 +- include/net/netfilter/nf_conntrack_synproxy.h | 2 +- include/net/netns/generic.h | 2 +- kernel/audit.c | 2 +- net/8021q/vlan.c | 2 +- net/8021q/vlan.h | 2 +- net/bridge/br_netfilter_hooks.c | 2 +- net/caif/caif_dev.c | 2 +- net/core/net_namespace.c | 7 +++---- net/core/pktgen.c | 2 +- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_tunnel.c | 4 ++-- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 2 +- net/ipv6/xfrm6_tunnel.c | 2 +- net/key/af_key.c | 2 +- net/netfilter/ipset/ip_set_core.c | 2 +- net/netfilter/ipvs/ip_vs_core.c | 2 +- net/netfilter/nf_conntrack_proto_dccp.c | 2 +- net/netfilter/nf_conntrack_proto_gre.c | 2 +- net/netfilter/nf_conntrack_proto_sctp.c | 2 +- net/netfilter/nf_conntrack_proto_udplite.c | 2 +- net/netfilter/nf_synproxy_core.c | 2 +- net/netfilter/nfnetlink_log.c | 2 +- net/netfilter/nfnetlink_queue.c | 2 +- net/netfilter/xt_hashlimit.c | 2 +- net/netfilter/xt_recent.c | 2 +- net/openvswitch/datapath.c | 2 +- net/openvswitch/datapath.h | 2 +- net/phonet/pn_dev.c | 2 +- net/rds/tcp.c | 2 +- net/sched/act_bpf.c | 2 +- net/sched/act_connmark.c | 2 +- net/sched/act_csum.c | 2 +- net/sched/act_gact.c | 2 +- net/sched/act_ife.c | 2 +- net/sched/act_ipt.c | 4 ++-- net/sched/act_mirred.c | 2 +- net/sched/act_nat.c | 2 +- net/sched/act_pedit.c | 2 +- net/sched/act_police.c | 2 +- net/sched/act_simple.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/act_skbmod.c | 2 +- net/sched/act_tunnel_key.c | 2 +- net/sched/act_vlan.c | 2 +- net/sunrpc/netns.h | 2 +- net/sunrpc/sunrpc_syms.c | 2 +- net/tipc/core.c | 2 +- net/tipc/core.h | 2 +- 73 files changed, 80 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 89a6b0546804..c68f4fe001d7 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -116,7 +116,7 @@ static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; -static int cma_pernet_id; +static unsigned int cma_pernet_id; struct cma_pernet { struct idr tcp_ps; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 5708f17e4cdf..8029dd4912b6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -199,7 +199,7 @@ MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where " atomic_t netpoll_block_tx = ATOMIC_INIT(0); #endif -int bond_net_id __read_mostly; +unsigned int bond_net_id __read_mostly; static __be32 arp_target[BOND_MAX_ARP_TARGETS]; static int arp_ip_count; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 85a423a66478..90dc6b188607 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -43,7 +43,7 @@ struct geneve_net { struct list_head sock_list; }; -static int geneve_net_id; 
+static unsigned int geneve_net_id; union geneve_addr { struct sockaddr_in sin; diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 719d19f35673..98f10c216521 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -77,7 +77,7 @@ struct gtp_dev { struct hlist_head *addr_hash; }; -static int gtp_net_id __read_mostly; +static unsigned int gtp_net_id __read_mostly; struct gtp_net { struct list_head gtp_dev_list; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 5489c0ec1d9a..3d3b1f4339ef 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -204,7 +204,7 @@ static atomic_t ppp_unit_count = ATOMIC_INIT(0); static atomic_t channel_count = ATOMIC_INIT(0); /* per-net private data for this module */ -static int ppp_net_id __read_mostly; +static unsigned int ppp_net_id __read_mostly; struct ppp_net { /* units to ppp mapping */ struct idr units_idr; diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 4ddae8118c85..f017c72bb7fd 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -95,7 +95,7 @@ static const struct proto_ops pppoe_ops; static const struct ppp_channel_ops pppoe_chan_ops; /* per-net private data for this module */ -static int pppoe_net_id __read_mostly; +static unsigned int pppoe_net_id __read_mostly; struct pppoe_net { /* * we could use _single_ hash table for all diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0a3fd675408f..21e92be6e56c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -52,7 +52,7 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -static int vxlan_net_id; +static unsigned int vxlan_net_id; static struct rtnl_link_ops vxlan_link_ops; static const u8 all_zeros_mac[ETH_ALEN + 2]; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 8f366cc097e6..1293f8494985 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -250,7 +250,7 @@ static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c) cp->magic = 0; } -static int hwsim_net_id; +static unsigned int hwsim_net_id; static int hwsim_netgroup; diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5426189406c1..fb8cac88251a 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -15,6 +15,6 @@ struct lockd_net { struct list_head nsm_handles; }; -extern int lockd_net_id; +extern unsigned int lockd_net_id; #endif diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index fc4084ef4736..1c13dd80744f 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,7 +57,7 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; -int lockd_net_id; +unsigned int lockd_net_id; /* * These can be set at insmod time (useful for NFS as root filesystem), diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bf4ec5ecc97e..ce42dd00e4ee 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2015,7 +2015,7 @@ static void nfsiod_stop(void) destroy_workqueue(wq); } -int nfs_net_id; +unsigned int nfs_net_id; EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index fbce0d885d4c..5fbd2bde91ba 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -35,6 +35,6 @@ struct nfs_net { #endif }; -extern int nfs_net_id; +extern unsigned int nfs_net_id; #endif diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 
fd8c9a5bcac4..420d3a0ab258 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -9,7 +9,7 @@ #include #include -static int grace_net_id; +static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ee36efd5aece..3714231a9d0f 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -124,5 +124,5 @@ struct nfsd_net { /* Simple check to find out if a given net was properly initialized */ #define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) -extern int nfsd_net_id; +extern unsigned int nfsd_net_id; #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 36b2af931e06..2857e46d5cc5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1201,7 +1201,7 @@ static int create_proc_exports_entry(void) } #endif -int nfsd_net_id; +unsigned int nfsd_net_id; static __net_init int nfsd_init_net(struct net *net) { diff --git a/include/net/bonding.h b/include/net/bonding.h index f32f7ef8a23a..3c857778a6ca 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -681,7 +681,7 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) } /* exported from bond_main.c */ -extern int bond_net_id; +extern unsigned int bond_net_id; extern const struct bond_parm_tbl bond_lacp_tbl[]; extern const struct bond_parm_tbl xmit_hashtype_tbl[]; extern const struct bond_parm_tbl arp_validate_tbl[]; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 59557c07904b..e893fe43dd13 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -129,7 +129,7 @@ struct ip_tunnel { #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ - int ip_tnl_net_id; + unsigned int ip_tnl_net_id; struct gro_cells gro_cells; bool collect_md; bool ignore_df; @@ -248,7 +248,7 @@ void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); -int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, +int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); @@ -275,7 +275,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); -void ip_tunnel_setup(struct net_device *dev, int net_id); +void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fc4f757107df..d7149e93a60a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -291,7 +291,7 @@ struct pernet_operations { int (*init)(struct net *net); void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); - int *id; + unsigned int *id; size_t size; }; diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 2152b70626d5..e7b836590f0b 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -98,7 +98,7 @@ struct nf_conntrack_l4proto { const struct nla_policy *nla_policy; } ctnl_timeout; #endif - int *net_id; + unsigned 
int *net_id; /* Init l4proto pernet data */ int (*init_net)(struct net *net, u_int16_t proto); diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index e6937318546c..b0ca402c1f72 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -54,7 +54,7 @@ struct synproxy_net { struct synproxy_stats __percpu *stats; }; -extern int synproxy_net_id; +extern unsigned int synproxy_net_id; static inline struct synproxy_net *synproxy_pernet(struct net *net) { return net_generic(net, synproxy_net_id); diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index 70e158551704..d315786bcfd7 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -31,7 +31,7 @@ struct net_generic { void *ptr[0]; }; -static inline void *net_generic(const struct net *net, int id) +static inline void *net_generic(const struct net *net, unsigned int id) { struct net_generic *ng; void *ptr; diff --git a/kernel/audit.c b/kernel/audit.c index f1ca11613379..92c463d2d1c7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; -static int audit_net_id; +static unsigned int audit_net_id; /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index a79365574531..691f0ad7067d 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -44,7 +44,7 @@ /* Global VLAN variables */ -int vlan_net_id __read_mostly; +unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index cc1557978066..df8bd65dd370 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -159,7 +159,7 @@ void vlan_netlink_fini(void); extern struct rtnl_link_ops vlan_link_ops; -extern int vlan_net_id; +extern unsigned int vlan_net_id; struct proc_dir_entry; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 8155bd2a5138..83d937f4415e 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -46,7 +46,7 @@ #include #endif -static int brnf_net_id __read_mostly; +static unsigned int brnf_net_id __read_mostly; struct brnf_net { bool enabled; diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index d730a0f68f46..2d38b6e34203 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -52,7 +52,7 @@ struct caif_net { struct caif_device_entry_list caifdevs; }; -static int caif_net_id; +static unsigned int caif_net_id; static int q_high = 50; /* Percent */ struct cfcnfg *get_cfcnfg(struct net *net) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1309d78e2a64..35d37b196e67 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -55,7 +55,7 @@ static struct net_generic *net_alloc_generic(void) return ng; } -static int net_assign_generic(struct net *net, int id, void *data) +static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; @@ -122,8 +122,7 @@ out: static void ops_free(const struct pernet_operations *ops, struct net *net) { if (ops->id && ops->size) { - int id = *ops->id; - kfree(net_generic(net, id)); + kfree(net_generic(net, *ops->id)); } } @@ -881,7 +880,7 @@ again: } return error; } - max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); + 
max_gen_ptrs = max(max_gen_ptrs, *ops->id); } error = __register_pernet_operations(list, ops); if (error) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 306b8f0e03c1..8e69ce472236 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -413,7 +413,7 @@ struct pktgen_hdr { }; -static int pg_net_id __read_mostly; +static unsigned int pg_net_id __read_mostly; struct pktgen_net { struct net *net; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 576f705d8180..78fd62048335 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -113,8 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); -static int ipgre_net_id __read_mostly; -static int gre_tap_net_id __read_mostly; +static unsigned int ipgre_net_id __read_mostly; +static unsigned int gre_tap_net_id __read_mostly; static void ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 12a92e3349ed..823abaef006b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -994,7 +994,7 @@ int ip_tunnel_get_iflink(const struct net_device *dev) } EXPORT_SYMBOL(ip_tunnel_get_iflink); -int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, +int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); @@ -1196,7 +1196,7 @@ void ip_tunnel_uninit(struct net_device *dev) EXPORT_SYMBOL_GPL(ip_tunnel_uninit); /* Do least required initialization, rest of init is done in tunnel_init call */ -void ip_tunnel_setup(struct net_device *dev, int net_id) +void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) { struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->ip_tnl_net_id = net_id; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5d7944f394d9..8b14f1404c8f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -46,7 +46,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly; -static int vti_net_id __read_mostly; +static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c9392589c415..79489f017854 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -121,7 +121,7 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -static int ipip_net_id __read_mostly; +static unsigned int ipip_net_id __read_mostly; static int ipip_tunnel_init(struct net_device *dev); static struct rtnl_link_ops ipip_link_ops __read_mostly; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 4a9e6db9df8d..e6e206fa86c8 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -62,7 +62,7 @@ struct clusterip_config { static const struct file_operations clusterip_proc_fops; #endif -static int clusterip_net_id __read_mostly; +static unsigned int clusterip_net_id __read_mostly; struct clusterip_net { struct list_head configs; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 710bc79f9113..75b6108234dd 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -64,7 +64,7 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); #define 
IP6_GRE_HASH_SIZE_SHIFT 5 #define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT) -static int ip6gre_net_id __read_mostly; +static unsigned int ip6gre_net_id __read_mostly; struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 259e8507d2cd..d3c619eda051 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -83,7 +83,7 @@ static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; -static int ip6_tnl_net_id __read_mostly; +static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index af3f0e011265..c476bb8e9cdb 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -64,7 +64,7 @@ static int vti6_dev_init(struct net_device *dev); static void vti6_dev_setup(struct net_device *dev); static struct rtnl_link_ops vti6_link_ops __read_mostly; -static int vti6_net_id __read_mostly; +static unsigned int vti6_net_id __read_mostly; struct vti6_net { /* the vti6 tunnel fallback device */ struct net_device *fb_tnl_dev; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index dc7a3449ffc1..0355231162b8 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -76,7 +76,7 @@ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, __be32 *v4dst); static struct rtnl_link_ops sit_link_ops __read_mostly; -static int sit_net_id __read_mostly; +static unsigned int sit_net_id __read_mostly; struct sit_net { struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE]; diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index e1c0bbe7996c..d7b731a78d09 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -44,7 +44,7 @@ struct xfrm6_tunnel_net { u32 spi; }; -static int xfrm6_tunnel_net_id __read_mostly; +static unsigned int xfrm6_tunnel_net_id __read_mostly; static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) { return net_generic(net, xfrm6_tunnel_net_id); diff --git a/net/key/af_key.c b/net/key/af_key.c index f9c9ecb0cdd3..c6252ed42c1d 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -36,7 +36,7 @@ #define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) #define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) -static int pfkey_net_id __read_mostly; +static unsigned int pfkey_net_id __read_mostly; struct netns_pfkey { /* List of all pfkey sockets. 
*/ struct hlist_head table; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 23345d2d136a..c296f9b606d4 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -36,7 +36,7 @@ struct ip_set_net { bool is_destroyed; /* all sets are destroyed */ }; -static int ip_set_net_id __read_mostly; +static unsigned int ip_set_net_id __read_mostly; static inline struct ip_set_net *ip_set_pernet(struct net *net) { diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 2c1b498a7a27..db40050f8785 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -70,7 +70,7 @@ EXPORT_SYMBOL(ip_vs_get_debug_level); #endif EXPORT_SYMBOL(ip_vs_new_conn_out); -static int ip_vs_net_id __read_mostly; +static unsigned int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index ac8976964975..073b047314dc 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -385,7 +385,7 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = }; /* this module per-net specifics */ -static int dccp_net_id __read_mostly; +static unsigned int dccp_net_id __read_mostly; struct dccp_net { struct nf_proto_net pn; int dccp_loose; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index ff405c9183f1..87bb40a3feb5 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -53,7 +53,7 @@ static unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_REPLIED] = 180*HZ, }; -static int proto_gre_net_id __read_mostly; +static unsigned int proto_gre_net_id __read_mostly; struct netns_proto_gre { struct nf_proto_net nf; rwlock_t keymap_lock; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 17c0ade23fd8..d096c2d6b87b 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -144,7 +144,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { } }; -static int sctp_net_id __read_mostly; +static unsigned int sctp_net_id __read_mostly; struct sctp_net { struct nf_proto_net pn; unsigned int timeouts[SCTP_CONNTRACK_MAX]; diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 8cdb4b1bf933..7808604c70a2 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -35,7 +35,7 @@ static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = { [UDPLITE_CT_REPLIED] = 180*HZ, }; -static int udplite_net_id __read_mostly; +static unsigned int udplite_net_id __read_mostly; struct udplite_net { struct nf_proto_net pn; unsigned int timeouts[UDPLITE_CT_MAX]; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index c8a4a48bced9..7c6d1fbe38b9 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -24,7 +24,7 @@ #include #include -int synproxy_net_id; +unsigned int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); bool diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 7435505037b7..763cb4d54e8d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -80,7 +80,7 @@ struct nfulnl_instance { #define INSTANCE_BUCKETS 16 -static int 
nfnl_log_net_id __read_mostly; +static unsigned int nfnl_log_net_id __read_mostly; struct nfnl_log_net { spinlock_t instances_lock; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 1e33115b399f..be7627b80400 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -77,7 +77,7 @@ struct nfqnl_instance { typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); -static int nfnl_queue_net_id __read_mostly; +static unsigned int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 struct nfnl_queue_net { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index b89b688e9d01..10063408141d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -49,7 +49,7 @@ struct hashlimit_net { struct proc_dir_entry *ip6t_hashlimit; }; -static int hashlimit_net_id; +static unsigned int hashlimit_net_id; static inline struct hashlimit_net *hashlimit_pernet(struct net *net) { return net_generic(net, hashlimit_net_id); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index bf250000e084..1d89a4eaf841 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -95,7 +95,7 @@ struct recent_net { #endif }; -static int recent_net_id __read_mostly; +static unsigned int recent_net_id __read_mostly; static inline struct recent_net *recent_pernet(struct net *net) { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 1402f1be642d..2d4c4d3911c0 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -58,7 +58,7 @@ #include "vport-internal_dev.h" #include "vport-netdev.h" -int ovs_net_id __read_mostly; +unsigned int ovs_net_id __read_mostly; static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index ab85c1cae255..1c6e9377436d 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -144,7 +144,7 @@ struct ovs_net { bool xt_label; }; -extern int ovs_net_id; +extern unsigned int ovs_net_id; void ovs_lock(void); void ovs_unlock(void); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index a58680016472..2cb4c5dfad6f 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -44,7 +44,7 @@ struct phonet_net { struct phonet_routes routes; }; -static int phonet_net_id __read_mostly; +static unsigned int phonet_net_id __read_mostly; static struct phonet_net *phonet_pernet(struct net *net) { diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 3296a6ac583a..1a0399dea764 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -366,7 +366,7 @@ struct rds_transport rds_tcp_transport = { .t_mp_capable = 1, }; -static int rds_tcp_netid; +static unsigned int rds_tcp_netid; /* per-network namespace private data for this module */ struct rds_tcp_net { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 9ff06cfbcdec..1aa4ecf41baf 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -33,7 +33,7 @@ struct tcf_bpf_cfg { bool is_ebpf; }; -static int bpf_net_id; +static unsigned int bpf_net_id; static struct tc_action_ops act_bpf_ops; static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index eae07a2e774d..ab8062909962 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -30,7 +30,7 @@ #define CONNMARK_TAB_MASK 3 -static int connmark_net_id; +static unsigned int connmark_net_id; static struct 
tc_action_ops act_connmark_ops; static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index e0defcef376d..a0edd80a44db 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -42,7 +42,7 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; -static int csum_net_id; +static unsigned int csum_net_id; static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e0aa30f83c6c..e6c874a2b283 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -25,7 +25,7 @@ #define GACT_TAB_MASK 15 -static int gact_net_id; +static unsigned int gact_net_id; static struct tc_action_ops act_gact_ops; #ifdef CONFIG_GACT_PROB diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 95c463cbb9a6..80b848d3f096 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -35,7 +35,7 @@ #define IFE_TAB_MASK 15 -static int ife_net_id; +static unsigned int ife_net_id; static int max_metacnt = IFE_META_MAX + 1; static struct tc_action_ops act_ife_ops; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index ce7ea6c1c50d..992ef8d624f1 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -30,10 +30,10 @@ #define IPT_TAB_MASK 15 -static int ipt_net_id; +static unsigned int ipt_net_id; static struct tc_action_ops act_ipt_ops; -static int xt_net_id; +static unsigned int xt_net_id; static struct tc_action_ops act_xt_ops; static int ipt_init_target(struct xt_entry_target *t, char *table, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6073a1132725..b2d417b8f46c 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -70,7 +70,7 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; -static int mirred_net_id; +static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; static bool dev_is_mac_header_xmit(const struct net_device *dev) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 8e8b0cc30704..9b6aec665495 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -31,7 +31,7 @@ #define NAT_TAB_MASK 15 -static int nat_net_id; +static unsigned int nat_net_id; static struct tc_action_ops act_nat_ops; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b54d56d4959b..eda322045e75 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -25,7 +25,7 @@ #define PEDIT_TAB_MASK 15 -static int pedit_net_id; +static unsigned int pedit_net_id; static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { diff --git a/net/sched/act_police.c b/net/sched/act_police.c index d1bd248fe146..c990b73a6c85 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -55,7 +55,7 @@ struct tc_police_compat { /* Each policer is serialized by its individual spinlock */ -static int police_net_id; +static unsigned int police_net_id; static struct tc_action_ops act_police_ops; static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 289af6f9bb3b..823a73ad0c60 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -26,7 +26,7 @@ #define SIMP_TAB_MASK 7 -static int simp_net_id; +static unsigned int 
simp_net_id; static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 024f3a3afeff..06ccae3c12ee 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -29,7 +29,7 @@ #define SKBEDIT_TAB_MASK 15 -static int skbedit_net_id; +static unsigned int skbedit_net_id; static struct tc_action_ops act_skbedit_ops; static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index e7d96381c908..3b7074e23024 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -22,7 +22,7 @@ #define SKBMOD_TAB_MASK 15 -static int skbmod_net_id; +static unsigned int skbmod_net_id; static struct tc_action_ops act_skbmod_ops; #define MAX_EDIT_LEN ETH_HLEN diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index edc720f11687..7af712526f01 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -22,7 +22,7 @@ #define TUNNEL_KEY_TAB_MASK 15 -static int tunnel_key_net_id; +static unsigned int tunnel_key_net_id; static struct tc_action_ops act_tunnel_key_ops; static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index b57fcbcefea1..19e0dba305ce 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -21,7 +21,7 @@ #define VLAN_TAB_MASK 15 -static int vlan_net_id; +static unsigned int vlan_net_id; static struct tc_action_ops act_vlan_ops; static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index df5826876535..394ce523174c 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -34,7 +34,7 @@ struct sunrpc_net { struct proc_dir_entry *use_gssp_proc; }; -extern int sunrpc_net_id; +extern unsigned int sunrpc_net_id; int ip_map_cache_create(struct net *); void ip_map_cache_destroy(struct net *); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index ee5d3d253102..d1c330a7953a 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -24,7 +24,7 @@ #include "netns.h" -int sunrpc_net_id; +unsigned int sunrpc_net_id; EXPORT_SYMBOL_GPL(sunrpc_net_id); static __net_init int sunrpc_init_net(struct net *net) diff --git a/net/tipc/core.c b/net/tipc/core.c index 236b043a4156..0b982d048fb9 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -47,7 +47,7 @@ #include /* configurable TIPC parameters */ -int tipc_net_id __read_mostly; +unsigned int tipc_net_id __read_mostly; int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */ static int __net_init tipc_init_net(struct net *net) diff --git a/net/tipc/core.h b/net/tipc/core.h index a1845fb27d80..5cc5398be722 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -74,7 +74,7 @@ struct tipc_monitor; #define MAX_BEARERS 3 #define TIPC_DEF_MON_THRESHOLD 32 -extern int tipc_net_id __read_mostly; +extern unsigned int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; -- cgit v1.2.3 From e245d99e6cc4a0b904b87b46b4f60d46fb405987 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 2 Nov 2016 09:36:33 -0700 Subject: lockdep: Limit static allocations if PROVE_LOCKING_SMALL is defined Reduce the size of the data structures for lockdep entries by half if PROVE_LOCKING_SMALL is defined. This is used only for sparc. Signed-off-by: Babu Moger Acked-by: Sam Ravnborg Signed-off-by: David S.
Miller --- kernel/locking/lockdep_internals.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 51c4b24b6328..c2b88490d857 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -45,6 +45,14 @@ enum { #define LOCKF_USED_IN_IRQ_READ \ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +/* + * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * .data and .bss to fit in required 32MB limit for the kernel. With + * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * So, reduce the static allocations for lockdeps related structures so that + * everything fits in current required size limit. + */ +#ifdef CONFIG_PROVE_LOCKING_SMALL /* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. @@ -54,18 +62,24 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ +#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_STACK_TRACE_ENTRIES 262144UL +#else #define MAX_LOCKDEP_ENTRIES 32768UL #define MAX_LOCKDEP_CHAINS_BITS 16 -#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) - -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ #define MAX_STACK_TRACE_ENTRIES 524288UL +#endif + +#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -- cgit v1.2.3 From 833fc48d18ce3595990b405ae82a160b33028994 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 10 Nov 2016 01:41:14 -0500 Subject: audit: skip sessionid sentinel value when auto-incrementing The value (unsigned int)-1 is used as a sentinel to indicate the sessionID is unset. Skip this value when the session_id value wraps. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8c434318ec8d..d161b17ce8ce 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2025,8 +2025,11 @@ int audit_set_loginuid(kuid_t loginuid) goto out; /* are we setting or clearing? */ - if (uid_valid(loginuid)) + if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); + if (unlikely(sessionid == (unsigned int)-1)) + sessionid = (unsigned int)atomic_inc_return(&session_id); + } task->sessionid = sessionid; task->loginuid = loginuid; -- cgit v1.2.3 From c1e8f06d7a0eea232ce0767471e1b4756ccab70a Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Wed, 16 Nov 2016 16:14:33 -0500 Subject: audit: fix formatting of AUDIT_CONFIG_CHANGE events The AUDIT_CONFIG_CHANGE events sometimes use an op= field. The current code logs the value of the field with quotes. This field is documented not to be encoded, so it should not have quotes.
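The practical difference is only in the emitted record: audit_log_string() wraps its argument in double quotes, while a plain %s in audit_log_format() does not. An illustrative sketch of the two call styles, using remove_rule as the example value from the diff below:

	/* old: value ends up quoted in the record */
	audit_log_format(ab, " op=");
	audit_log_string(ab, action);		/* -> op="remove_rule" */

	/* new: plain %s, un-encoded as documented */
	audit_log_format(ab, " op=%s", action);	/* -> op=remove_rule */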
Signed-off-by: Steve Grubb Reviewed-by: Richard Guy Briggs [PM: reformatted commit description to make checkpatch.pl happy] Signed-off-by: Paul Moore --- kernel/audit_fsnotify.c | 5 ++--- kernel/audit_tree.c | 3 +-- kernel/audit_watch.c | 5 ++--- kernel/auditfilter.c | 3 +-- 4 files changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f84f8d06e1f6..f75154889aa9 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -130,10 +130,9 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=", + audit_log_format(ab, "auid=%u ses=%u op=%s", from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current)); - audit_log_string(ab, op); + audit_get_sessionid(current), op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, audit_mark->path); audit_log_key(ab, rule->filterkey); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 25772476fa4a..055f11b0a50f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -458,8 +458,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "op="); - audit_log_string(ab, "remove_rule"); + audit_log_format(ab, "op=remove_rule"); audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); audit_log_key(ab, rule->filterkey); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..686e068ec3da 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -242,10 +242,9 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=", + audit_log_format(ab, "auid=%u ses=%u op=%s", from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current)); - audit_log_string(ab, op); + audit_get_sessionid(current), op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, w->path); audit_log_key(ab, r->filterkey); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 85d9cac497e4..632e90d1005f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1074,8 +1074,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re return; audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); audit_log_task_context(ab); - audit_log_format(ab, " op="); - audit_log_string(ab, action); + audit_log_format(ab, " op=%s", action); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); -- cgit v1.2.3 From 194a6b5b9cb6b91a5f7d86984165a3bc55188599 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 17 Nov 2016 11:46:38 -0500 Subject: sched/wake_q: Rename WAKE_Q to DEFINE_WAKE_Q Currently the wake_q data structure is defined by the WAKE_Q() macro. This macro, however, looks like a function doing something as "wake" is a verb. Even checkpatch.pl was confused as it reported warnings like WARNING: Missing a blank line after declarations #548: FILE: kernel/futex.c:3665: + int ret; + WAKE_Q(wake_q); This patch renames the WAKE_Q() macro to DEFINE_WAKE_Q() which clarifies what the macro is doing and eliminates the checkpatch.pl warnings. 
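For context, a minimal sketch of how the renamed macro is used; the function and lock here are illustrative, not from the patch:

	#include <linux/sched.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(demo_lock);

	static void demo_release_and_wake(struct task_struct *t)
	{
		DEFINE_WAKE_Q(wake_q);	/* declares and initializes the queue head */

		spin_lock(&demo_lock);
		wake_q_add(&wake_q, t);	/* only queues the task, no wakeup yet */
		spin_unlock(&demo_lock);

		wake_up_q(&wake_q);	/* the actual wakeups, outside the lock */
	}

The naming change matters because, as the sketch shows, the macro is a declaration, not an action: the wakeup itself only happens at wake_up_q().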
Signed-off-by: Waiman Long Acked-by: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1479401198-1765-1-git-send-email-longman@redhat.com [ Resolved conflict and added missing rename. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- ipc/mqueue.c | 4 ++-- ipc/msg.c | 8 ++++---- kernel/futex.c | 8 ++++---- kernel/locking/mutex.c | 2 +- kernel/locking/rtmutex.c | 2 +- kernel/locking/rwsem-xadd.c | 10 +++++----- 7 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index c1aa3b02f6ac..dc37cbe2b13c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -989,7 +989,7 @@ enum cpu_idle_type { * already in a wake queue, the wakeup will happen soon and the second * waker can just skip it. * - * The WAKE_Q macro declares and initializes the list head. + * The DEFINE_WAKE_Q macro declares and initializes the list head. * wake_up_q() does NOT reinitialize the list; it's expected to be * called near the end of a function, where the fact that the queue is * not used again will be easy to see by inspection. @@ -1009,7 +1009,7 @@ struct wake_q_head { #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) -#define WAKE_Q(name) \ +#define DEFINE_WAKE_Q(name) \ struct wake_q_head name = { WAKE_Q_TAIL, &name.first } extern void wake_q_add(struct wake_q_head *head, diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 8cbd6e6894d5..7a2d8f0c8ae5 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -967,7 +967,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, struct timespec ts; struct posix_msg_tree_node *new_leaf = NULL; int ret = 0; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (u_abs_timeout) { int res = prepare_timeout(u_abs_timeout, &expires, &ts); @@ -1151,7 +1151,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, msg_ptr = wait.msg; } } else { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); msg_ptr = msg_get(info); diff --git a/ipc/msg.c b/ipc/msg.c index e12307d0c920..32e9bd837cde 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -235,7 +235,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); expunge_all(msq, -EIDRM, &wake_q); ss_wakeup(msq, &wake_q, true); @@ -397,7 +397,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, goto out_up; case IPC_SET: { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (msqid64.msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { @@ -634,7 +634,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, struct msg_msg *msg; int err; struct ipc_namespace *ns; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; @@ -850,7 +850,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl struct msg_queue *msq; struct ipc_namespace *ns; struct msg_msg *msg, *copy = NULL; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; diff --git a/kernel/futex.c b/kernel/futex.c index 2c4be467fecd..9246d9f593d1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1298,7 +1298,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); bool 
deboost; int ret = 0; @@ -1415,7 +1415,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; int ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (!bitset) return -EINVAL; @@ -1469,7 +1469,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); @@ -1708,7 +1708,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (requeue_pi) { /* diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a65e09a046ac..c0731685603f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -840,7 +840,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne { struct task_struct *next = NULL; unsigned long owner, flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1ec0f48962b3..d8d7a153b711 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1382,7 +1382,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, struct wake_q_head *wqh)) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2fa2e2e64950..263e7449282a 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -225,7 +225,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; @@ -461,7 +461,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); @@ -495,7 +495,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* @@ -571,7 +571,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. 
@@ -625,7 +625,7 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); -- cgit v1.2.3 From e96271f3ed7e702fa36dd0605c0c5b5f065af816 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 18 Nov 2016 13:38:43 +0200 Subject: perf/core: Fix address filter parser The token table passed into match_token() must be null-terminated, which it currently is not in perf's address filter string parser, as caught by Vince's perf_fuzzer and KASAN. It doesn't blow up otherwise only because of the alignment padding of the table to the next element in the .rodata, which is pure luck. Fix this by adding a null terminator to the token table. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: stable@vger.kernel.org # v4.7+ Fixes: 375637bc524 ("perf/core: Introduce address range filtering") Link: http://lkml.kernel.org/r/877f81f264.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ff230bb4a02e..6ee1febdf6ff 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8029,6 +8029,7 @@ restart: * if is not specified, the range is treated as a single address. */ enum { + IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, @@ -8052,6 +8053,7 @@ static const match_table_t if_tokens = { { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, + { IF_ACT_NONE, NULL }, }; /* -- cgit v1.2.3 From 97bc402db7821259f6a722cb38e060aa9b35b6e8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 19 Nov 2016 01:45:00 +0100 Subject: bpf, mlx5: fix mlx5e_create_rq taking reference on prog In mlx5e_create_rq(), when creating a new queue, we call bpf_prog_add() but without checking the return value. bpf_prog_add() can fail since 92117d8443bc ("bpf: fix refcnt overflow"), so we really must check it. Take the reference right when we assign it to the rq from priv->xdp_prog, and just drop the reference on the error path. Destruction in mlx5e_destroy_rq() looks good, though. Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support") Signed-off-by: Daniel Borkmann Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 +++++++++---- kernel/bpf/syscall.c | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bd0732d5d219..54bae797b338 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -513,7 +513,13 @@ static int mlx5e_create_rq(struct mlx5e_channel *c, rq->channel = c; rq->ix = c->ix; rq->priv = c->priv; - rq->xdp_prog = priv->xdp_prog; + + rq->xdp_prog = priv->xdp_prog ? 
bpf_prog_inc(priv->xdp_prog) : NULL; + if (IS_ERR(rq->xdp_prog)) { + err = PTR_ERR(rq->xdp_prog); + rq->xdp_prog = NULL; + goto err_rq_wq_destroy; + } rq->buff.map_dir = DMA_FROM_DEVICE; if (rq->xdp_prog) @@ -590,12 +596,11 @@ static int mlx5e_create_rq(struct mlx5e_channel *c, rq->page_cache.head = 0; rq->page_cache.tail = 0; - if (rq->xdp_prog) - bpf_prog_add(rq->xdp_prog, 1); - return 0; err_rq_wq_destroy: + if (rq->xdp_prog) + bpf_prog_put(rq->xdp_prog); mlx5_wq_destroy(&rq->wq_ctrl); return err; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ce1b7de7d72c..eb15498b8d55 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -696,6 +696,7 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) { return bpf_prog_add(prog, 1); } +EXPORT_SYMBOL_GPL(bpf_prog_inc); static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { -- cgit v1.2.3 From 4e201566402c878a225d4425df8a4a664c6f251e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 22 Nov 2016 09:21:16 +0000 Subject: genirq/msi: Drop artificial PCI dependency The generic MSI layer doesn't have any PCI ties anymore, and the build hack should have been removed some time ago. Fixes: d9109698be6e ("genirq: Introduce msi_domain_alloc/free_irqs()") Signed-off-by: Marc Zyngier Link: http://lkml.kernel.org/r/1479806476-20801-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 8a3e872798f3..ee230063f033 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,9 +14,7 @@ #include #include #include - -/* Temparory solution for building, will be removed later */ -#include +#include /** * alloc_msi_entry - Allocate an initialize msi_entry -- cgit v1.2.3 From 18f649ef344127ef6de23a5a4272dbe2fdb73dde Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Nov 2016 19:46:09 +0100 Subject: sched/autogroup: Fix autogroup_move_group() to never skip sched_move_task() The PF_EXITING check in task_wants_autogroup() is no longer needed. Remove it, but see the next patch. However the comment is correct in that autogroup_move_group() must always change task_group() for every thread so the sysctl_ check is very wrong; we can race with cgroups and even sys_setsid() is not safe because a task running with task_group() == ag->tg must participate in refcounting: int main(void) { int sctl = open("/proc/sys/kernel/sched_autogroup_enabled", O_WRONLY); assert(sctl > 0); if (fork()) { wait(NULL); // destroy the child's ag/tg pause(); } assert(pwrite(sctl, "1\n", 2, 0) == 2); assert(setsid() > 0); if (fork()) pause(); kill(getppid(), SIGKILL); sleep(1); // The child has gone, the grandchild runs with kref == 1 assert(pwrite(sctl, "0\n", 2, 0) == 2); assert(setsid() > 0); // runs with the freed ag/tg for (;;) sleep(1); return 0; } crashes the kernel. It doesn't really need sleep(1), it doesn't matter if autogroup_move_group() actually frees the task_group or this happens later. 
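For readability, here is the reproducer quoted above as a standalone program; only the includes were added, the logic is unchanged:

	#include <assert.h>
	#include <fcntl.h>
	#include <signal.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		int sctl = open("/proc/sys/kernel/sched_autogroup_enabled", O_WRONLY);

		assert(sctl > 0);

		if (fork()) {
			wait(NULL);	// destroy the child's ag/tg
			pause();
		}

		assert(pwrite(sctl, "1\n", 2, 0) == 2);
		assert(setsid() > 0);
		if (fork())
			pause();

		kill(getppid(), SIGKILL);
		sleep(1);

		// The child has gone, the grandchild runs with kref == 1
		assert(pwrite(sctl, "0\n", 2, 0) == 2);
		assert(setsid() > 0);	// runs with the freed ag/tg
		for (;;)
			sleep(1);

		return 0;
	}
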
Reported-by: Vern Lovejoy Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hartsjc@redhat.com Cc: vbendel@redhat.com Link: http://lkml.kernel.org/r/20161114184609.GA15965@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index a5d966cb8891..ad2b19ad6ca0 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). */ - if (p->flags & PF_EXITING) - return false; - return true; } @@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. + */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } -- cgit v1.2.3 From 8e5bfa8c1f8471aa4a2d30be631ef2b50e10abaf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Nov 2016 19:46:12 +0100 Subject: sched/autogroup: Do not use autogroup->tg in zombie threads Exactly because for_each_thread() in autogroup_move_group() can't see it and update its ->sched_task_group before _put() and possibly free(). So the exiting task needs another sched_move_task() before exit_notify() and we need to re-introduce the PF_EXITING (or similar) check removed by the previous change for another reason. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hartsjc@redhat.com Cc: vbendel@redhat.com Cc: vlovejoy@redhat.com Link: http://lkml.kernel.org/r/20161114184612.GA15968@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/exit.c | 1 + kernel/sched/auto_group.c | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..e9c009dc3a4a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2567,6 +2567,7 @@ extern void sched_autogroup_create_attach(struct task_struct *p); extern void sched_autogroup_detach(struct task_struct *p); extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); +extern void sched_autogroup_exit_task(struct task_struct *p); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); @@ -2576,6 +2577,7 @@ static inline void sched_autogroup_create_attach(struct task_struct *p) { } static inline void sched_autogroup_detach(struct task_struct *p) { } static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } +static inline void sched_autogroup_exit_task(struct task_struct *p) { } #endif extern int yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/exit.c b/kernel/exit.c index 9d68c45ebbe3..3076f3089919 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -836,6 +836,7 @@ void __noreturn do_exit(long code) */ perf_event_exit_task(tsk); + sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index ad2b19ad6ca0..f1c8fd566246 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -115,10 +115,26 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) * If we race with autogroup_move_group() the caller can use the old * value of signal->autogroup but in this case sched_move_task() will * be called again before autogroup_kref_put(). + * + * However, there is no way sched_autogroup_exit_task() could tell us + * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. */ + if (p->flags & PF_EXITING) + return false; + return true; } +void sched_autogroup_exit_task(struct task_struct *p) +{ + /* + * We are going to call exit_notify() and autogroup_move_group() can't + * see this thread after that: we can no longer use signal->autogroup. + * See the PF_EXITING check in task_wants_autogroup(). + */ + sched_move_task(p); +} + static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -142,6 +158,9 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * In the latter case for_each_thread() can not miss a migrating thread, * cpu_cgroup_attach() must not be possible after cgroup_exit() and it * can't be removed from thread list, we hold ->siglock. + * + * If an exiting thread was already removed from thread list we rely on + * sched_autogroup_exit_task(). 
*/ for_each_thread(p, t) sched_move_task(t); -- cgit v1.2.3 From 5aff60a191e579ae00ae5ca6ce16c13b687bc8a3 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 2 Nov 2016 05:08:29 -0400 Subject: locking/osq: Break out of spin-wait busy waiting loop for a preempted vCPU in osq_lock() An over-committed guest with more vCPUs than pCPUs has a heavy overload in osq_lock(). This is because if vCPU-A holds the osq lock and yields out, vCPU-B ends up waiting for per_cpu node->locked to be set. IOW, vCPU-B waits for vCPU-A to run and unlock the osq lock. Use the new vcpu_is_preempted(cpu) interface to detect if a vCPU is currently running or not, and break out of the spin-loop if so. test case: $ perf record -a perf bench sched messaging -g 400 -p && perf report before patch: 18.09% sched-messaging [kernel.vmlinux] [k] osq_lock 12.28% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner 5.27% sched-messaging [kernel.vmlinux] [k] mutex_unlock 3.89% sched-messaging [kernel.vmlinux] [k] wait_consider_task 3.64% sched-messaging [kernel.vmlinux] [k] _raw_write_lock_irq 3.41% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner.is 2.49% sched-messaging [kernel.vmlinux] [k] system_call after patch: 20.68% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner 8.45% sched-messaging [kernel.vmlinux] [k] mutex_unlock 4.12% sched-messaging [kernel.vmlinux] [k] system_call 3.01% sched-messaging [kernel.vmlinux] [k] system_call_common 2.83% sched-messaging [kernel.vmlinux] [k] copypage_power7 2.64% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner 2.00% sched-messaging [kernel.vmlinux] [k] osq_lock Suggested-by: Boqun Feng Tested-by: Juergen Gross Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Acked-by: Christian Borntraeger Acked-by: Paolo Bonzini Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: bsingharora@gmail.com Cc: dave@stgolabs.net Cc: kernellwp@gmail.com Cc: konrad.wilk@oracle.com Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: rkrcmar@redhat.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: xen-devel-request@lists.xenproject.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1478077718-37424-3-git-send-email-xinhui.pan@linux.vnet.ibm.com [ Translated to English. ] Signed-off-by: Ingo Molnar --- kernel/locking/osq_lock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 4ea2710b9d6c..a3167941093b 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr) return cpu_nr + 1; } +static inline int node_cpu(struct optimistic_spin_node *node) +{ + return node->cpu - 1; +} + static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) { int cpu_nr = encoded_cpu_val - 1; @@ -118,8 +123,10 @@ bool osq_lock(struct optimistic_spin_queue *lock) while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. 
+ * Use vcpu_is_preempted() to avoid waiting for a preempted + * lock holder: */ - if (need_resched()) + if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) goto unqueue; cpu_relax(); -- cgit v1.2.3 From 05ffc951392df57edecc2519327b169210c3df75 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 2 Nov 2016 05:08:30 -0400 Subject: locking/mutex: Break out of expensive busy-loop on {mutex,rwsem}_spin_on_owner() when owner vCPU is preempted An over-committed guest with more vCPUs than pCPUs suffers heavy overhead in the two {mutex,rwsem}_spin_on_owner() loops. This is caused by the lock-holder preemption issue. Break out of the loop if the owner's vCPU is preempted, i.e. if vcpu_is_preempted(cpu) returns true. test-case: perf record -a perf bench sched messaging -g 400 -p && perf report before patch: 20.68% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner 8.45% sched-messaging [kernel.vmlinux] [k] mutex_unlock 4.12% sched-messaging [kernel.vmlinux] [k] system_call 3.01% sched-messaging [kernel.vmlinux] [k] system_call_common 2.83% sched-messaging [kernel.vmlinux] [k] copypage_power7 2.64% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner 2.00% sched-messaging [kernel.vmlinux] [k] osq_lock after patch: 9.99% sched-messaging [kernel.vmlinux] [k] mutex_unlock 5.28% sched-messaging [unknown] [H] 0xc0000000000768e0 4.27% sched-messaging [kernel.vmlinux] [k] __copy_tofrom_user_power7 3.77% sched-messaging [kernel.vmlinux] [k] copypage_power7 3.24% sched-messaging [kernel.vmlinux] [k] _raw_write_lock_irq 3.02% sched-messaging [kernel.vmlinux] [k] system_call 2.69% sched-messaging [kernel.vmlinux] [k] wait_consider_task Tested-by: Juergen Gross Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Acked-by: Christian Borntraeger Acked-by: Paolo Bonzini Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: boqun.feng@gmail.com Cc: bsingharora@gmail.com Cc: dave@stgolabs.net Cc: kernellwp@gmail.com Cc: konrad.wilk@oracle.com Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: rkrcmar@redhat.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: xen-devel-request@lists.xenproject.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1478077718-37424-4-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 13 +++++++++++-- kernel/locking/rwsem-xadd.c | 14 +++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c0731685603f..9b349619f431 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -364,7 +364,11 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) */ barrier(); - if (!owner->on_cpu || need_resched()) { + /* + * Use vcpu_is_preempted to detect lock holder preemption issue. 
+ */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { ret = false; break; } @@ -389,8 +393,13 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) rcu_read_lock(); owner = __mutex_owner(lock); + + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ if (owner) - retval = owner->on_cpu; + retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); rcu_read_unlock(); /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 263e7449282a..631506004f9e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) goto done; } - ret = owner->on_cpu; + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ + ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); done: rcu_read_unlock(); return ret; @@ -362,8 +366,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) */ barrier(); - /* abort spinning when need_resched or owner is not running */ - if (!owner->on_cpu || need_resched()) { + /* + * abort spinning when need_resched or owner is not running or + * owner's cpu is preempted. + */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { rcu_read_unlock(); return false; } -- cgit v1.2.3 From bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 13 Oct 2016 21:23:16 -0500 Subject: mm: Add a user_ns owner to mm_struct and fix ptrace permission checks During exec dumpable is cleared if the file that is being executed is not readable by the user executing the file. A bug in ptrace_may_access allows reading the file if the executable happens to enter into a subordinate user namespace (aka clone(CLONE_NEWUSER), unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER)). This problem is fixed with only necessary userspace breakage by adding a user namespace owner to mm_struct, captured at the time of exec, so it is clear in which user namespace CAP_SYS_PTRACE must be present to be able to safely give read permission to the executable. The function ptrace_may_access is modified to verify that the ptracer has CAP_SYS_PTRACE in task->mm->user_ns instead of task->cred->user_ns. This ensures that if the task changes its cred into a subordinate user namespace it does not become ptraceable. The function ptrace_attach is modified to only set PT_PTRACE_CAP when CAP_SYS_PTRACE is held over task->mm->user_ns. The intent of PT_PTRACE_CAP is to be a flag to note that whatever permission changes the task might go through the tracer has sufficient permissions for it not to be an issue. task->cred->user_ns is always the same as, or a descendant of, mm->user_ns, which guarantees that having CAP_SYS_PTRACE over mm->user_ns is the worst case for the task's credentials. To prevent regressions, mm->dumpable and mm->user_ns are not considered when a task has no mm, as simply failing ptrace_may_access would cause regressions in privileged applications attempting to read things such as /proc//stat. Cc: stable@vger.kernel.org Acked-by: Kees Cook Tested-by: Cyrill Gorcunov Fixes: 8409cca70561 ("userns: allow ptrace from non-init user namespaces") Signed-off-by: "Eric W. 
Biederman" --- include/linux/mm_types.h | 1 + kernel/fork.c | 9 ++++++--- kernel/ptrace.c | 26 +++++++++++--------------- mm/init-mm.c | 2 ++ 4 files changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4a8acedf4b7d..08d947fc4c59 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -473,6 +473,7 @@ struct mm_struct { */ struct task_struct __rcu *owner; #endif + struct user_namespace *user_ns; /* store ref to file /proc//exe symlink points to */ struct file __rcu *exe_file; diff --git a/kernel/fork.c b/kernel/fork.c index 997ac1d584f7..ba8a01564985 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -745,7 +745,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + struct user_namespace *user_ns) { mm->mmap = NULL; mm->mm_rb = RB_ROOT; @@ -785,6 +786,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) if (init_new_context(p, mm)) goto fail_nocontext; + mm->user_ns = get_user_ns(user_ns); return mm; fail_nocontext: @@ -830,7 +832,7 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - return mm_init(mm, current); + return mm_init(mm, current, current_user_ns()); } /* @@ -845,6 +847,7 @@ void __mmdrop(struct mm_struct *mm) destroy_context(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); + put_user_ns(mm->user_ns); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1126,7 +1129,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); - if (!mm_init(mm, tsk)) + if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; err = dup_mmap(mm, oldmm); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e6474f7272ec..282821557183 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -220,7 +220,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - int dumpable = 0; + struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; @@ -271,16 +271,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); - smp_rmb(); - if (task->mm) - dumpable = get_dumpable(task->mm); - rcu_read_lock(); - if (dumpable != SUID_DUMP_USER && - !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); + mm = task->mm; + if (mm && + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptrace_has_cap(mm->user_ns, mode))) + return -EPERM; return security_ptrace_access_check(task, mode); } @@ -331,6 +326,11 @@ static int ptrace_attach(struct task_struct *task, long request, task_lock(task); retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + if (!retval) { + struct mm_struct *mm = task->mm; + if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE)) + flags |= PT_PTRACE_CAP; + } task_unlock(task); if (retval) goto unlock_creds; @@ -344,10 +344,6 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - rcu_read_lock(); - if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) - flags |= PT_PTRACE_CAP; - rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); diff --git a/mm/init-mm.c b/mm/init-mm.c index a56a851908d2..975e49f00f34 
100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -21,5 +22,6 @@ struct mm_struct init_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) }; -- cgit v1.2.3 From 64b875f7ac8a5d60a4e191479299e931ee949b67 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Nov 2016 18:48:07 -0600 Subject: ptrace: Capture the ptracer's creds not PT_PTRACE_CAP When the flag PT_PTRACE_CAP was added, the PTRACE_TRACEME path was overlooked. This can result in incorrect behavior when an application like strace traces an exec of a setuid executable. Further, PT_PTRACE_CAP does not have enough information for making good security decisions as it does not report which user namespace the capability is in. This has already allowed one mistake through insufficient granularity. I found this issue when I was testing another corner case of exec and discovered that I could not get strace to set PT_PTRACE_CAP even when running strace as root with a full set of caps. This change fixes the above issue with strace, allowing a setuid executable to be straced as root without disabling setuid. More fundamentally, this change allows what is allowable at all times, by using the correct information in its decision. Cc: stable@vger.kernel.org Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12") Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 2 +- include/linux/capability.h | 1 + include/linux/ptrace.h | 1 - include/linux/sched.h | 1 + kernel/capability.c | 20 ++++++++++++++++++++ kernel/ptrace.c | 12 +++++++----- 6 files changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 4e497b9ee71e..3cf2cfced97a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1406,7 +1406,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) unsigned n_fs; if (p->ptrace) { - if (p->ptrace & PT_PTRACE_CAP) + if (ptracer_capable(p, current_user_ns())) bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; else bprm->unsafe |= LSM_UNSAFE_PTRACE; diff --git a/include/linux/capability.h b/include/linux/capability.h index dbc21c719ce6..d6088e2a7668 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -242,6 +242,7 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) #endif /* CONFIG_MULTIUSER */ extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); +extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 504c98a278d4..e13bfdf7f314 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -19,7 +19,6 @@ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ -#define PT_PTRACE_CAP 0x00000004 /* ptracer can follow suid-exec */ #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..e9f693598e15 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1656,6 +1656,7 @@ struct task_struct { struct 
list_head cpu_timers[3]; /* process credentials */ + const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */ const struct cred __rcu *real_cred; /* objective and real subjective task * credentials (COW) */ const struct cred __rcu *cred; /* effective (overridable) subjective task diff --git a/kernel/capability.c b/kernel/capability.c index 00411c82dac5..dfa0e4528b0b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -473,3 +473,23 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) kgid_has_mapping(ns, inode->i_gid); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); + +/** + * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace + * @tsk: The task that may be ptraced + * @ns: The user namespace to search for CAP_SYS_PTRACE in + * + * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE + * in the specified user namespace. + */ +bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) +{ + int ret = 0; /* An absent tracer adds no restrictions */ + const struct cred *cred; + rcu_read_lock(); + cred = rcu_dereference(tsk->ptracer_cred); + if (cred) + ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + rcu_read_unlock(); + return (ret == 0); +} diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 282821557183..e82c15cadd6d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -39,6 +39,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) BUG_ON(!list_empty(&child->ptrace_entry)); list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; + rcu_read_lock(); + child->ptracer_cred = get_cred(__task_cred(new_parent)); + rcu_read_unlock(); } /** @@ -71,12 +74,16 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) */ void __ptrace_unlink(struct task_struct *child) { + const struct cred *old_cred; BUG_ON(!child->ptrace); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + old_cred = child->ptracer_cred; + child->ptracer_cred = NULL; + put_cred(old_cred); spin_lock(&child->sighand->siglock); child->ptrace = 0; @@ -326,11 +333,6 @@ static int ptrace_attach(struct task_struct *task, long request, task_lock(task); retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); - if (!retval) { - struct mm_struct *mm = task->mm; - if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE)) - flags |= PT_PTRACE_CAP; - } task_unlock(task); if (retval) goto unlock_creds; -- cgit v1.2.3 From 84d77d3f06e7e8dea057d10e8ec77ad71f721be3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 22 Nov 2016 12:06:50 -0600 Subject: ptrace: Don't allow accessing an undumpable mm It is a reasonable expectation that if an executable file is not readable there will be no way for a user without special privileges to read the file. This is enforced in ptrace_attach, but if ptrace is already attached before exec there is no enforcement for read-only executables. As the only way to read such an mm is through access_process_vm, spin off a variant called ptrace_access_vm that will fail if the target process is not being ptraced by the current process, or the current process did not have sufficient privileges when ptracing began to read the target process's mm. In the ptrace implementations, replace access_process_vm by ptrace_access_vm. 
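For context, the path this hardens is the one ordinary PTRACE_PEEKDATA users hit (PTRACE_PEEKDATA -> generic_ptrace_peekdata() -> ptrace_access_vm()); a minimal, hypothetical userspace illustration of such a user (not part of the patch):

	#include <errno.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	static long secret = 0x1234abcd;

	int main(void)
	{
		long word;
		pid_t child = fork();

		if (child == 0) {
			/* Opt in to tracing by the parent, then stop. */
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			raise(SIGSTOP);
			_exit(0);
		}

		waitpid(child, NULL, 0);	/* child is now stopped */

		/* After this patch the read is served by ptrace_access_vm(),
		 * which succeeds only for the child's tracer with the
		 * privileges recorded at attach time. */
		errno = 0;
		word = ptrace(PTRACE_PEEKDATA, child, &secret, NULL);
		if (word == -1 && errno)
			perror("PTRACE_PEEKDATA");
		else
			printf("read 0x%lx from the child\n", word);

		kill(child, SIGKILL);
		return 0;
	}
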
There remain several ptrace sites that still use access_process_vm as they are reading the target executables instructions (for kernel consumption) or register stacks. As such it does not appear necessary to add a permission check to those calls. This bug has always existed in Linux. Fixes: v1.0 Cc: stable@vger.kernel.org Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- arch/alpha/kernel/ptrace.c | 2 +- arch/blackfin/kernel/ptrace.c | 4 ++-- arch/cris/arch-v32/kernel/ptrace.c | 2 +- arch/ia64/kernel/ptrace.c | 2 +- arch/mips/kernel/ptrace32.c | 4 ++-- arch/powerpc/kernel/ptrace32.c | 4 ++-- include/linux/mm.h | 2 ++ include/linux/ptrace.h | 3 +++ kernel/ptrace.c | 42 ++++++++++++++++++++++++++++++++------ mm/memory.c | 2 +- mm/nommu.c | 2 +- 11 files changed, 52 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index 940dfb406591..04abdec7f496 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -283,7 +283,7 @@ long arch_ptrace(struct task_struct *child, long request, /* When I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ case PTRACE_PEEKDATA: - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), + copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); ret = -EIO; if (copied != sizeof(tmp)) diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index 8d79286ee4e8..360d99645163 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -270,7 +270,7 @@ long arch_ptrace(struct task_struct *child, long request, switch (bfin_mem_access_type(addr, to_copy)) { case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: - copied = access_process_vm(child, addr, &tmp, + copied = ptrace_access_vm(child, addr, &tmp, to_copy, FOLL_FORCE); if (copied) break; @@ -323,7 +323,7 @@ long arch_ptrace(struct task_struct *child, long request, switch (bfin_mem_access_type(addr, to_copy)) { case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: - copied = access_process_vm(child, addr, &data, + copied = ptrace_access_vm(child, addr, &data, to_copy, FOLL_FORCE | FOLL_WRITE); break; diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c index f0df654ac6fc..fe1f9cf7b391 100644 --- a/arch/cris/arch-v32/kernel/ptrace.c +++ b/arch/cris/arch-v32/kernel/ptrace.c @@ -147,7 +147,7 @@ long arch_ptrace(struct task_struct *child, long request, /* The trampoline page is globally mapped, no page table to traverse.*/ tmp = *(unsigned long*)addr; } else { - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); + copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 31aa8c0f68e1..36f660da8124 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1159,7 +1159,7 @@ arch_ptrace (struct task_struct *child, long request, case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: /* read word at location addr */ - if (access_process_vm(child, addr, &data, sizeof(data), + if (ptrace_access_vm(child, addr, &data, sizeof(data), FOLL_FORCE) != sizeof(data)) return -EIO; diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 7e71a4e0281b..5fcbdcd7abd0 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -69,7 +69,7 @@ long compat_arch_ptrace(struct task_struct *child, 
compat_long_t request, if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0) break; - copied = access_process_vm(child, (u64)addrOthers, &tmp, + copied = ptrace_access_vm(child, (u64)addrOthers, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; @@ -178,7 +178,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0) break; ret = 0; - if (access_process_vm(child, (u64)addrOthers, &data, + if (ptrace_access_vm(child, (u64)addrOthers, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE) == sizeof(data)) break; diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index 010b7b310237..1e887f3a61a6 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -73,7 +73,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (get_user(addrOthers, (u32 __user * __user *)addr) != 0) break; - copied = access_process_vm(child, (u64)addrOthers, &tmp, + copied = ptrace_access_vm(child, (u64)addrOthers, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; @@ -178,7 +178,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (get_user(addrOthers, (u32 __user * __user *)addr) != 0) break; ret = 0; - if (access_process_vm(child, (u64)addrOthers, &tmp, + if (ptrace_access_vm(child, (u64)addrOthers, &tmp, sizeof(tmp), FOLL_FORCE | FOLL_WRITE) == sizeof(tmp)) break; diff --git a/include/linux/mm.h b/include/linux/mm.h index a92c8d73aeaf..0b5b2e4df14e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1270,6 +1270,8 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void * unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); +extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index e13bfdf7f314..e0e539321ab9 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -8,6 +8,9 @@ #include /* For task_active_pid_ns. */ #include +extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); + /* * Ptrace flags * diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e82c15cadd6d..49ba7c1ade9d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -27,6 +27,35 @@ #include #include +/* + * Access another process' address space via ptrace. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + if (!tsk->ptrace || + (current != tsk->parent) || + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptracer_capable(tsk, mm->user_ns))) { + mmput(mm); + return 0; + } + + ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + mmput(mm); + + return ret; +} + /* * ptrace a task: make the debugger its new parent and @@ -535,7 +564,8 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? 
sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); + retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE); + if (!retval) { if (copied) break; @@ -562,7 +592,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, + retval = ptrace_access_vm(tsk, dst, buf, this_len, FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) @@ -1126,7 +1156,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); + copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1137,7 +1167,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), + copied = ptrace_access_vm(tsk, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1155,7 +1185,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), + ret = ptrace_access_vm(child, addr, &word, sizeof(word), FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; @@ -1165,7 +1195,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), + ret = ptrace_access_vm(child, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/mm/memory.c b/mm/memory.c index e18c57bdc75c..cbb1e5e5f791 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3868,7 +3868,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys); * Access another process' address space as given in mm. If non-NULL, use the * given task for page fault accounting. */ -static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, +int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; diff --git a/mm/nommu.c b/mm/nommu.c index 8b8faaf2a9e9..44265e00b701 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1808,7 +1808,7 @@ void filemap_map_pages(struct fault_env *fe, } EXPORT_SYMBOL(filemap_map_pages); -static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, +int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; -- cgit v1.2.3 From f84df2a6f268de584a201e8911384a2d244876e3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 16 Nov 2016 22:06:51 -0600 Subject: exec: Ensure mm->user_ns contains the execed files When the user namespace support was merged the need to prevent ptrace from revealing the contents of an unreadable executable was overlooked. Correct this oversight by ensuring that the executed file or files are in mm->user_ns, by adjusting mm->user_ns. Use the new function privileged_wrt_inode_uidgid to see if the executable is a member of the user namespace, and as such if having CAP_SYS_PTRACE in the user namespace should allow tracing the executable. 
If not update mm->user_ns to the parent user namespace until an appropriate parent is found. Cc: stable@vger.kernel.org Reported-by: Jann Horn Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.") Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 19 +++++++++++++++++-- include/linux/capability.h | 1 + kernel/capability.c | 16 ++++++++++++++-- 3 files changed, 32 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3cf2cfced97a..fdf53f0c421b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1275,8 +1275,22 @@ EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { - if (inode_permission(file_inode(file), MAY_READ) < 0) + struct inode *inode = file_inode(file); + if (inode_permission(inode, MAY_READ) < 0) { + struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + /* Ensure mm->user_ns contains the executable */ + user_ns = old = bprm->mm->user_ns; + while ((user_ns != &init_user_ns) && + !privileged_wrt_inode_uidgid(user_ns, inode)) + user_ns = user_ns->parent; + + if (old != user_ns) { + bprm->mm->user_ns = get_user_ns(user_ns); + put_user_ns(old); + } + } } EXPORT_SYMBOL(would_dump); @@ -1306,7 +1320,6 @@ void setup_new_exec(struct linux_binprm * bprm) !gid_eq(bprm->cred->gid, current_egid())) { current->pdeath_signal = 0; } else { - would_dump(bprm, bprm->file); if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) set_dumpable(current->mm, suid_dumpable); } @@ -1741,6 +1754,8 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out; + would_dump(bprm, bprm->file); + retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/include/linux/capability.h b/include/linux/capability.h index d6088e2a7668..6ffb67e10c06 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -240,6 +240,7 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) return true; } #endif /* CONFIG_MULTIUSER */ +extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode); extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); diff --git a/kernel/capability.c b/kernel/capability.c index dfa0e4528b0b..4984e1f552eb 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -456,6 +456,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, } EXPORT_SYMBOL(file_ns_capable); +/** + * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? + * @ns: The user namespace in question + * @inode: The inode in question + * + * Return true if the inode uid and gid are within the namespace. 
+ */ +bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode) +{ + return kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); +} + /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question @@ -469,8 +482,7 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && - kgid_has_mapping(ns, inode->i_gid); + return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); -- cgit v1.2.3 From 31eff2434db542763a00074a8368d7bd78d14ea1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Nov 2016 19:35:34 +0100 Subject: sched/nohz: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Cc: rt@linuxtronix.de Link: http://lkml.kernel.org/r/20161117183541.8588-14-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3bcb61b52f6c..71496a20e670 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -390,24 +390,16 @@ static int __init tick_nohz_full_setup(char *str) } __setup("nohz_full=", tick_nohz_full_setup); -static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int tick_nohz_cpu_down(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* - * The boot CPU handles housekeeping duty (unbound timers, - * workqueues, timekeeping, ...) on behalf of full dynticks - * CPUs. It must remain online when nohz full is enabled. - */ - if (tick_nohz_full_running && tick_do_timer_cpu == cpu) - return NOTIFY_BAD; - break; - } - return NOTIFY_OK; + /* + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. + */ + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return -EBUSY; + return 0; } static int tick_nohz_init_all(void) @@ -428,7 +420,7 @@ static int tick_nohz_init_all(void) void __init tick_nohz_init(void) { - int cpu; + int cpu, ret; if (!tick_nohz_full_running) { if (tick_nohz_init_all() < 0) @@ -469,7 +461,10 @@ void __init tick_nohz_init(void) for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); - cpu_notifier(tick_nohz_cpu_down_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "kernel/nohz:predown", NULL, + tick_nohz_cpu_down); + WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); -- cgit v1.2.3 From 478409dd683db76cbcfe7bf8332a37f01deb0a2d Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Mon, 21 Nov 2016 15:57:18 +0800 Subject: tracing: Add hook to function tracing for other subsystems to use Currently Function traces can be only exported to the ring buffer. This adds a trace_export concept which can process traces and export them to a registered destination as an addition to the current one that outputs to Ftrace - i.e. ring buffer. 
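As a sketch of the intended use, a minimal exporter module might look like this (hypothetical module; only struct trace_export and the register/unregister calls below come from this patch):

	#include <linux/module.h>
	#include <linux/trace.h>

	/* Hypothetical sink: a real exporter would copy each entry to its
	 * destination (an STM device, a network socket, ...). */
	static void demo_write(const void *entry, unsigned int size)
	{
		pr_debug("exported trace entry of %u bytes\n", size);
	}

	static struct trace_export demo_export = {
		.write = demo_write,
	};

	static int __init demo_init(void)
	{
		return register_ftrace_export(&demo_export);
	}

	static void __exit demo_exit(void)
	{
		unregister_ftrace_export(&demo_export);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
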
In this way, if we want function traces to be sent to other destinations rather than only to the ring buffer, we just need to register a new trace_export and implement its own .write() function for writing traces to storage. With this patch, only function tracing (trace type is TRACE_FN) is supported. Link: http://lkml.kernel.org/r/1479715043-6534-2-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt --- include/linux/trace.h | 28 +++++++++++ kernel/trace/trace.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 include/linux/trace.h (limited to 'kernel') diff --git a/include/linux/trace.h b/include/linux/trace.h new file mode 100644 index 000000000000..9330a58e2651 --- /dev/null +++ b/include/linux/trace.h @@ -0,0 +1,28 @@ +#ifndef _LINUX_TRACE_H +#define _LINUX_TRACE_H + +#ifdef CONFIG_TRACING +/* + * The trace export - an export of Ftrace output. The trace_export + * can process traces and export them to a registered destination as + * an addition to the current only output of Ftrace - i.e. ring buffer. + * + * If you want traces to be sent to some other place rather than ring + * buffer only, just need to register a new trace_export and implement + * its own .write() function for writing traces to the storage. + * + * next - pointer to the next trace_export + * write - copy traces which have been delt with ->commit() to + * the destination + */ +struct trace_export { + struct trace_export __rcu *next; + void (*write)(const void *, unsigned int); +}; + +int register_ftrace_export(struct trace_export *export); +int unregister_ftrace_export(struct trace_export *export); + +#endif /* CONFIG_TRACING */ + +#endif /* _LINUX_TRACE_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 57069e7f369c..edccdff8a36d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include "trace.h" @@ -2128,6 +2129,129 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_userstack(buffer, flags, pc); } +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event) +{ + struct trace_entry *entry; + unsigned int size = 0; + + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(entry, size); +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled); + +static inline void ftrace_exports_enable(void) +{ + static_branch_enable(&ftrace_exports_enabled); +} + +static inline void ftrace_exports_disable(void) +{ + static_branch_disable(&ftrace_exports_enabled); +} + +void ftrace_exports(struct ring_buffer_event *event) +{ + struct trace_export *export; + + preempt_disable_notrace(); + + export = rcu_dereference_raw_notrace(ftrace_exports_list); + while (export) { + trace_process_export(export, event); + export = rcu_dereference_raw_notrace(export->next); + } + + preempt_enable_notrace(); +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* + * We are entering export into the list but another + * CPU might be walking that list. We need to make sure + * the export->next pointer is valid before another CPU sees + * the export pointer included into the list. 
+ */ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + if (*list == NULL) + ftrace_exports_enable(); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + if (*list == NULL) + ftrace_exports_disable(); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + mutex_lock(&ftrace_export_lock); + + add_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + int ret; + + mutex_lock(&ftrace_export_lock); + + ret = rm_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -2146,8 +2270,11 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) { + if (static_branch_unlikely(&ftrace_exports_enabled)) + ftrace_exports(event); __buffer_unlock_commit(buffer, event); + } } #ifdef CONFIG_STACKTRACE -- cgit v1.2.3 From 7d436400223bb46e9f88e6bba6f8d867acf0d82c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 22 Nov 2016 18:32:03 -0500 Subject: tracing: Add error checks to creation of event files The creation of the set_event_pid file was assigned to a variable "entry" but that variable was never used. Ideally, it should be used to check if the file was created and warn if it was not. The files header_page, header_event should also be checked and a warning if they fail to be created. The "enable" file was moved up, as it is a more crucial file to have and a hard failure (return -ENOMEM) should be returned if it is not created. 
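For reference, a small userspace sketch that touches two of the files created here (assuming tracefs is mounted at /sys/kernel/debug/tracing; the mount point varies, and root is required):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *dir = "/sys/kernel/debug/tracing";
		char path[256], buf[32];
		int fd, n;

		/* Limit event tracing to this process via set_event_pid... */
		snprintf(path, sizeof(path), "%s/set_event_pid", dir);
		fd = open(path, O_WRONLY);
		if (fd < 0) { perror(path); return 1; }
		n = snprintf(buf, sizeof(buf), "%d\n", getpid());
		if (write(fd, buf, n) != n) perror("set_event_pid");
		close(fd);

		/* ...then enable all events via the crucial "enable" file. */
		snprintf(path, sizeof(path), "%s/events/enable", dir);
		fd = open(path, O_WRONLY);
		if (fd < 0) { perror(path); return 1; }
		if (write(fd, "1", 1) != 1) perror("enable");
		close(fd);
		return 0;
	}
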
Reported-by: David Binderman Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 03c0a48c3ac4..ba67ede48822 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2843,20 +2843,32 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } + entry = trace_create_file("enable", 0644, d_events, + tr, &ftrace_tr_enable_fops); + if (!entry) { + pr_warn("Could not create tracefs 'enable' entry\n"); + return -ENOMEM; + } + + /* There are not as crucial, just warn if they are not created */ + entry = tracefs_create_file("set_event_pid", 0644, parent, tr, &ftrace_set_event_pid_fops); + if (!entry) + pr_warn("Could not create tracefs 'set_event_pid' entry\n"); /* ring buffer internal formats */ - trace_create_file("header_page", 0444, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - trace_create_file("header_event", 0444, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); + entry = trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + if (!entry) + pr_warn("Could not create tracefs 'header_page' entry\n"); - trace_create_file("enable", 0644, d_events, - tr, &ftrace_tr_enable_fops); + entry = trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + if (!entry) + pr_warn("Could not create tracefs 'header_event' entry\n"); tr->event_dir = d_events; -- cgit v1.2.3 From 176cedc4ed143745708999155c11b5717cdebb35 Mon Sep 17 00:00:00 2001 From: "T.Zhou" Date: Wed, 23 Nov 2016 08:48:32 +0800 Subject: sched/dl: Fix comment in pick_next_task_dl() Fix cut & paste oversight: s/pull_rt_task/pull_dl_task Signed-off-by: T.Zhou Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: juri.lelli@gmail.com Link: http://lkml.kernel.org/r/20161123004832.GA2983@geo Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c61b461248a3..70ef2b1901e4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1137,7 +1137,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo pull_dl_task(rq); lockdep_repin_lock(&rq->lock, cookie); /* - * pull_rt_task() can drop (and re-acquire) rq->lock; this + * pull_dl_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to * re-start task selection. 
*/ -- cgit v1.2.3 From 2b4d5b2582deffb77b3b4b48a59cd36e9e1e14d9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Nov 2016 07:37:00 +0100 Subject: sched/fair: Clean up the tunable parameter definitions No change in functionality: - align the default values vertically to make them easier to scan - standardize the 'default:' lines - fix minor whitespace typos Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02605f2826a2..aa475896782d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -37,7 +37,6 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -46,31 +45,35 @@ * * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_latency = 6000000ULL; -unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) * * Options are: - * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) - * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling - = SCHED_TUNABLESCALING_LOG; +enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: + * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* - * is kept at sysctl_sched_latency / sysctl_sched_min_granularity + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity */ static unsigned int sched_nr_latency = 8; @@ -82,16 +85,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; /* * SCHED_OTHER wake-up granularity. - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
@@ -102,16 +106,18 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+ * (default: 5 msec, units: microseconds)
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 
 /*
 * The margin used when comparing utilization with CPU capacity:
 * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
 */
-unsigned int capacity_margin = 1280; /* ~20% */
+unsigned int capacity_margin = 1280;
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
@@ -7174,8 +7180,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
 * Something like:
 *
- * { 0 1 2 3 } { 4 5 6 7 }
- * * * * *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
 *
 * If we were to balance group-wise we'd place two tasks in the first group and
 * two tasks in the second group. Clearly this is undesired as it will overload
-- cgit v1.2.3

From 3e9a8aadca4807b4eadd33a50014c9b2767a4f1f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 11:29:58 -0500
Subject: tracing: Create an always_inlined __trace_buffer_lock_reserve()

As Andi Kleen pointed out in the Link below, the trace events have quite a
bit of code execution. A lot of that happens to be calling functions, where
some of them should simply be inlined. One of these functions happens to be
trace_buffer_lock_reserve() which is also a global, but it is used
throughout the file it is defined in. Create a
__trace_buffer_lock_reserve() that is always inlined so that the file can
benefit from it.
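A minimal sketch of the resulting split (signatures abridged from the hunks
below): the body moves into an __always_inline helper, and the old global
symbol becomes a thin wrapper, so callers inside trace.c inline the body
while external callers keep the same API:

	static __always_inline struct ring_buffer_event *
	__trace_buffer_lock_reserve(struct ring_buffer *buffer, int type,
				    unsigned long len, unsigned long flags,
				    int pc);

	struct ring_buffer_event *
	trace_buffer_lock_reserve(struct ring_buffer *buffer, int type,
				  unsigned long len, unsigned long flags,
				  int pc)
	{
		return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
	}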
Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org Reported-by: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 87 +++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index edccdff8a36d..490533726b54 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -739,6 +739,31 @@ static inline void ftrace_trace_stack(struct trace_array *tr, #endif +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned long flags, int pc) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; +} + +static __always_inline struct ring_buffer_event * +__trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) + trace_event_setup(event, type, flags, pc); + + return event; +} + static void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) @@ -795,8 +820,8 @@ int __trace_puts(unsigned long ip, const char *str, int size) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, pc); if (!event) return 0; @@ -843,8 +868,8 @@ int __trace_bputs(unsigned long ip, const char *str) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, pc); if (!event) return 0; @@ -1913,29 +1938,13 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); -static __always_inline void -trace_event_setup(struct ring_buffer_event *event, - int type, unsigned long flags, int pc) -{ - struct trace_entry *ent = ring_buffer_event_data(event); - - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; -} - struct ring_buffer_event * trace_buffer_lock_reserve(struct ring_buffer *buffer, int type, unsigned long len, unsigned long flags, int pc) { - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) - trace_event_setup(event, type, flags, pc); - - return event; + return __trace_buffer_lock_reserve(buffer, type, len, flags, pc); } DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); @@ -2090,8 +2099,8 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, this_cpu_dec(trace_buffered_event_cnt); } - entry = trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. 
Use the temp_buffer
@@ -2100,8 +2109,8 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 */
 if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
 *current_rb = temp_buffer;
- entry = trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
 }
 return entry;
 }
@@ -2262,8 +2271,8 @@ trace_function(struct trace_array *tr,
 struct ring_buffer_event *event;
 struct ftrace_entry *entry;
 
- event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
+ flags, pc);
 if (!event)
 return;
 entry = ring_buffer_event_data(event);
@@ -2342,8 +2351,8 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 
 size *= sizeof(unsigned long);
 
- event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
+ sizeof(*entry) + size, flags, pc);
 if (!event)
 goto out;
 entry = ring_buffer_event_data(event);
@@ -2444,8 +2453,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 
 __this_cpu_inc(user_stack_count);
 
- event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
- sizeof(*entry), flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
+ sizeof(*entry), flags, pc);
 if (!event)
 goto out_drop_count;
 entry = ring_buffer_event_data(event);
@@ -2615,8 +2624,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 local_save_flags(flags);
 size = sizeof(*entry) + sizeof(u32) * len;
 buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ flags, pc);
 if (!event)
 goto out;
 entry = ring_buffer_event_data(event);
@@ -2671,8 +2680,8 @@ __trace_array_vprintk(struct ring_buffer *buffer,
 local_save_flags(flags);
 size = sizeof(*entry) + len + 1;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ flags, pc);
 if (!event)
 goto out;
 entry = ring_buffer_event_data(event);
@@ -5732,8 +5741,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 local_save_flags(irq_flags);
 size = sizeof(*entry) + cnt + 2; /* possible \n added */
 buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, preempt_count());
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ irq_flags, preempt_count());
 if (!event) {
 /* Ring buffer disabled, return as if not open for write */
 written = -EBADF;
@@ -5810,8 +5819,8 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 local_save_flags(irq_flags);
 size = sizeof(*entry) + cnt;
 buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
- irq_flags, preempt_count());
+ event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
+ irq_flags, preempt_count());
 if (!event) {
 /* Ring buffer disabled, return as if not open for write */
 written = -EBADF;
-- cgit v1.2.3

From fa7ffb39efccd574163ebc5dbfe4ff066186f261 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 11:36:30 -0500
Subject: ring-buffer: Make rb_reserve_next_event() always inlined

The function rb_reserve_next_event() is called by two functions:
ring_buffer_lock_reserve() and ring_buffer_write().
This is in a very hot path of the tracing code, and it is best that these
calls are not real function calls. The two callers are basically wrappers
for rb_reserve_next_event(). Removing the function calls can save execution
time in the hot path of tracing.

Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org

Reported-by: Andi Kleen
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c143739b8d7..1f3580cee6cc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2733,7 +2733,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 return event;
 }
 
-static struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
 rb_reserve_next_event(struct ring_buffer *buffer,
 struct ring_buffer_per_cpu *cpu_buffer,
 unsigned long length)
-- cgit v1.2.3

From 929ddbf3ef4e07fef67e93e998020d49d2533724 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 11:40:34 -0500
Subject: ring-buffer: Always inline rb_event_data()

rb_event_data() is the fast path of getting the ring buffer data from an
event. Externally, ring_buffer_event_data() is used to access this
function. But unfortunately, rb_event_data() is not inlined, and calling
ring_buffer_event_data() causes that function to be called again. Force
rb_event_data() to be inlined to lower the number of operations needed when
calling ring_buffer_event_data().

Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org

Reported-by: Andi Kleen
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1f3580cee6cc..2760aaca6d1b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -245,7 +245,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 
 /* inline for ring buffer fast paths */
-static void *
+static __always_inline void *
 rb_event_data(struct ring_buffer_event *event)
 {
 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
-- cgit v1.2.3

From 4239174570da080f3623724d97062bf55de7e36b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 15:52:45 -0500
Subject: tracing: Make tracepoint_printk a static_key

Currently, when tracepoint_printk is set (enabled by the "tp_printk" kernel
command line option), it causes trace events to print via printk(). This is
a very dangerous operation, but is useful for debugging.

The issue is, it's seldom used, but it is always checked even if it's not
enabled by the kernel command line. Instead of having this feature called
by a branch against a variable, turn that variable into a static key, and
this will remove the test and jump.

To simplify things, the functions output_printk() and
trace_event_buffer_commit() were moved from trace_events.c to trace.c.
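The resulting pattern, sketched from the hunks below: the hot path tests
the static key, which compiles down to a no-op until the key is enabled,
and only the slow paths (boot code and the sysctl handler) flip it:

	static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);

	/* hot path: no test and jump until the key is enabled */
	if (static_key_false(&tracepoint_printk_key.key))
		output_printk(fbuffer);

	/* slow path: patch the branch in or out */
	if (tracepoint_printk)
		static_key_enable(&tracepoint_printk_key.key);
	else
		static_key_disable(&tracepoint_printk_key.key);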
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 +++ kernel/sysctl.c | 2 +- kernel/trace/trace.c | 78 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 40 ----------------------- 4 files changed, 83 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b3d34d3e0e7e..8700049fd0e5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -945,6 +945,10 @@ extern int __disable_trace_on_warning; #define INIT_TRACE_RECURSION .trace_recursion = 0, #endif +int tracepoint_printk_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } #endif /* CONFIG_TRACING */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..6ccc60dfbc7a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -634,7 +634,7 @@ static struct ctl_table kern_table[] = { .data = &tracepoint_printk, .maxlen = sizeof(tracepoint_printk), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = tracepoint_printk_sysctl, }, #endif #ifdef CONFIG_KEXEC_CORE diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 490533726b54..725e8b2c453f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -69,6 +69,7 @@ bool __read_mostly tracing_selftest_disabled; /* Pipe tracepoints to printk */ struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; +static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { @@ -2116,6 +2117,81 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); +static DEFINE_SPINLOCK(tracepoint_iter_lock); +static DEFINE_MUTEX(tracepoint_printk_mutex); + +static void output_printk(struct trace_event_buffer *fbuffer) +{ + struct trace_event_call *event_call; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + /* We should never get here if iter is NULL */ + if (WARN_ON_ONCE(!iter)) + return; + + event_call = fbuffer->trace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + event = &fbuffer->trace_file->event_call->event; + + spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + +int tracepoint_printk_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int save_tracepoint_printk; + int ret; + + mutex_lock(&tracepoint_printk_mutex); + save_tracepoint_printk = tracepoint_printk; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + /* + * This will force exiting early, as tracepoint_printk + * is always zero when tracepoint_printk_iter is not allocated + */ + if (!tracepoint_print_iter) + tracepoint_printk = 0; + + if (save_tracepoint_printk == tracepoint_printk) + goto out; + + if (tracepoint_printk) + static_key_enable(&tracepoint_printk_key.key); + else + static_key_disable(&tracepoint_printk_key.key); + + out: + mutex_unlock(&tracepoint_printk_mutex); + + return ret; +} + +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) +{ + if 
(static_key_false(&tracepoint_printk_key.key)) + output_printk(fbuffer); + + event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -7977,6 +8053,8 @@ void __init trace_init(void) kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); if (WARN_ON(!tracepoint_print_iter)) tracepoint_printk = 0; + else + static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); trace_event_init(); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ba67ede48822..d35fc2b0d304 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -283,46 +283,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, } EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); -static DEFINE_SPINLOCK(tracepoint_iter_lock); - -static void output_printk(struct trace_event_buffer *fbuffer) -{ - struct trace_event_call *event_call; - struct trace_event *event; - unsigned long flags; - struct trace_iterator *iter = tracepoint_print_iter; - - if (!iter) - return; - - event_call = fbuffer->trace_file->event_call; - if (!event_call || !event_call->event.funcs || - !event_call->event.funcs->trace) - return; - - event = &fbuffer->trace_file->event_call->event; - - spin_lock_irqsave(&tracepoint_iter_lock, flags); - trace_seq_init(&iter->seq); - iter->ent = fbuffer->entry; - event_call->event.funcs->trace(iter, 0, event); - trace_seq_putc(&iter->seq, 0); - printk("%s", iter->seq.buffer); - - spin_unlock_irqrestore(&tracepoint_iter_lock, flags); -} - -void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) -{ - if (tracepoint_printk) - output_printk(fbuffer); - - event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, - fbuffer->event, fbuffer->entry, - fbuffer->flags, fbuffer->pc); -} -EXPORT_SYMBOL_GPL(trace_event_buffer_commit); - int trace_event_reg(struct trace_event_call *call, enum trace_reg type, void *data) { -- cgit v1.2.3 From 52ffabe3848a1ebd944cdf7801a77247b1cb46d5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 23 Nov 2016 20:28:38 -0500 Subject: tracing: Make __buffer_unlock_commit() always_inline The function __buffer_unlock_commit() is called in a few places outside of trace.c. But for the most part, it should really be inlined, as it is in the hot path of the trace_events. For the callers outside of trace.c, create a new function trace_buffer_unlock_commit_nostack(), as the reason it was used was to avoid the stack tracing that trace_buffer_unlock_commit() could do. 
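A sketch of the new wrapper (taken from the hunks below): callers outside
trace.c get a real function that skips the stack dump, while callers inside
trace.c now inline __buffer_unlock_commit() directly:

	void
	trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
					   struct ring_buffer_event *event)
	{
		__buffer_unlock_commit(buffer, event);
	}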
Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org Reported-by: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 41 +++++++++++++++++++++++------------- kernel/trace/trace.h | 4 ++-- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_functions_graph.c | 4 ++-- kernel/trace/trace_hwlat.c | 2 +- 5 files changed, 32 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 725e8b2c453f..60416bf7c591 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -794,6 +794,22 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); + +static __always_inline void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + } else + ring_buffer_unlock_commit(buffer, event); +} + /** * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller @@ -2059,21 +2075,6 @@ void trace_buffered_event_disable(void) preempt_enable(); } -void -__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) -{ - __this_cpu_write(trace_cmdline_save, true); - - /* If this is the temp buffer, we need to commit fully */ - if (this_cpu_read(trace_buffered_event) == event) { - /* Length is in event->array[0] */ - ring_buffer_write(buffer, event->array[0], &event->array[1]); - /* Release the temp buffer */ - this_cpu_dec(trace_buffered_event_cnt); - } else - ring_buffer_unlock_commit(buffer, event); -} - static struct ring_buffer *temp_buffer; struct ring_buffer_event * @@ -2214,6 +2215,16 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_userstack(buffer, flags, pc); } +/* + * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. 
+ */
+void
+trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ __buffer_unlock_commit(buffer, event);
+}
+
 static void
 trace_process_export(struct trace_export *export,
 struct ring_buffer_event *event)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9294f8606ade..37602e722336 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -602,8 +602,8 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 int *ent_cpu, u64 *ent_ts);
 
-void __buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event);
+void trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+ struct ring_buffer_event *event);
 
 int trace_empty(struct trace_iterator *iter);
 
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 3a2a73716a5b..75489de546b6 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -81,7 +81,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 entry->correct = val == expect;
 
 if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
 
 out:
 current->trace_recursion &= ~TRACE_BRANCH_BIT;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4e480e870474..8e1a115439fa 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -358,7 +358,7 @@ int __trace_graph_entry(struct trace_array *tr,
 entry = ring_buffer_event_data(event);
 entry->graph_ent = *trace;
 if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
 
 return 1;
 }
@@ -469,7 +469,7 @@ void __trace_graph_return(struct trace_array *tr,
 entry = ring_buffer_event_data(event);
 entry->ret = *trace;
 if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b97286c48735..775569ec50d0 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -127,7 +127,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
 entry->nmi_count = sample->nmi_count;
 
 if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
 /* Macros to encapsulate the time capturing infrastructure */
-- cgit v1.2.3

From 2289d5672f99d36764cbf66e13b5401f700e7043 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 20:35:32 -0500
Subject: ring-buffer: Force inline of hotpath helper functions

There are several small helper functions in ring_buffer.c that are used in
the hot path. For some reason, even though they are marked inline, gcc
tends not to enforce it. Make sure these functions are always inlined.
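For reference, plain "inline" is only a hint that gcc may ignore, while
__always_inline expands to the attribute that forces the decision (abridged
from include/linux/compiler-gcc.h):

	#define __always_inline	inline __attribute__((always_inline))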
Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org

Reported-by: Andi Kleen
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2760aaca6d1b..04068c0ed927 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1829,48 +1829,48 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
 
-static inline void *
+static __always_inline void *
 __rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
 {
 return bpage->data + index;
 }
 
-static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
+static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
 {
 return bpage->page->data + index;
 }
 
-static inline struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
 rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
 {
 return __rb_page_index(cpu_buffer->reader_page,
 cpu_buffer->reader_page->read);
 }
 
-static inline struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
 rb_iter_head_event(struct ring_buffer_iter *iter)
 {
 return __rb_page_index(iter->head_page, iter->head);
 }
 
-static inline unsigned rb_page_commit(struct buffer_page *bpage)
+static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
 {
 return local_read(&bpage->page->commit);
 }
 
 /* Size is determined by what has been committed */
-static inline unsigned rb_page_size(struct buffer_page *bpage)
+static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
 {
 return rb_page_commit(bpage);
 }
 
-static inline unsigned
+static __always_inline unsigned
 rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
 {
 return rb_page_commit(cpu_buffer->commit_page);
 }
 
-static inline unsigned
+static __always_inline unsigned
 rb_event_index(struct ring_buffer_event *event)
 {
 unsigned long addr = (unsigned long)event;
-- cgit v1.2.3

From babe3fce95e6490b88a2a21d90eb4ba9884edb82 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 20:38:39 -0500
Subject: ring-buffer: Force rb_update_write_stamp() to be inlined

The function rb_update_write_stamp() is in the hot path of the ring buffer
recording. Make sure that it is inlined as well. There are not many places
that call it.
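One way to spot-check the result (a hypothetical build-tree check, not part
of the patch): a static function that has been inlined at every call site
emits no symbol in the object file, so after this change

	$ nm kernel/trace/ring_buffer.o | grep rb_update_write_stamp

should typically print nothing.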
Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org Reported-by: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 04068c0ed927..04579a0c7711 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2486,7 +2486,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static inline bool +static __always_inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -2500,7 +2500,7 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static void +static __always_inline void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { -- cgit v1.2.3 From 38e11df134297ea3860c7aad8263ece27db01308 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 23 Nov 2016 20:42:31 -0500 Subject: ring-buffer: Force rb_end_commit() and rb_set_commit_to_write() inline Both rb_end_commit() and rb_set_commit_to_write() are in the fast path of the ring buffer recording. Make sure they are always inlined. Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org Reported-by: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 04579a0c7711..f17476f9d7f4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2386,7 +2386,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } -static void +static __always_inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long max_count; @@ -2441,7 +2441,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) goto again; } -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; -- cgit v1.2.3 From 83929cce95251cc77e5659bf493bd424ae0e7a67 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 23 Nov 2016 11:33:37 +0100 Subject: sched/autogroup: Fix 64-bit kernel nice level adjustment Michael Kerrisk reported: > Regarding the previous paragraph... My tests indicate > that writing *any* value to the autogroup [nice priority level] > file causes the task group to get a lower priority. Because autogroup didn't call the then meaningless scale_load()... Autogroup nice level adjustment has been broken ever since load resolution was increased for 64-bit kernels. Use scale_load() to scale group weight. Michael Kerrisk tested this patch to fix the problem: > Applied and tested against 4.9-rc6 on an Intel u7 (4 cores). > Test setup: > > Terminal window 1: running 40 CPU burner jobs > Terminal window 2: running 40 CPU burner jobs > Terminal window 1: running 1 CPU burner job > > Demonstrated that: > * Writing "0" to the autogroup file for TW1 now causes no change > to the rate at which the process on the terminal consume CPU. > * Writing -20 to the autogroup file for TW1 caused those processes > to get the lion's share of CPU while TW2 TW3 get a tiny amount. 
> * Writing -20 to the autogroup files for TW1 and TW3 allowed the > process on TW3 to get as much CPU as it was getting as when > the autogroup nice values for both terminals were 0. Reported-by: Michael Kerrisk Tested-by: Michael Kerrisk Signed-off-by: Mike Galbraith Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-man Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1479897217.4306.6.camel@gmx.de Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index f1c8fd566246..da39489d2d80 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -212,6 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; + unsigned long shares; int err; if (nice < MIN_NICE || nice > MAX_NICE) @@ -230,9 +231,10 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) next = HZ / 10 + jiffies; ag = autogroup_task_get(p); + shares = scale_load(sched_prio_to_weight[nice + 20]); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, shares); if (!err) ag->nice = nice; up_write(&ag->lock); -- cgit v1.2.3 From afe06efdf07c12fd9370d5cce5383398cedf6c90 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 22 Nov 2016 12:23:53 -0800 Subject: sched: Extend scheduler's asym packing We generalize the scheduler's asym packing to provide an ordering of the cpu beyond just the cpu number. This allows the use of the ASYM_PACKING scheduler machinery to move loads to preferred CPU in a sched domain. The preference is defined with the cpu priority given by arch_asym_cpu_priority(cpu). We also record the most preferred cpu in a sched group when we build the cpu's capacity for fast lookup of preferred cpu during load balancing. 
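A worked sketch of the new ordering (the helpers are taken from the hunks
below; the override at the end is hypothetical): sched_asym_prefer(a, b) is
true when CPU a has the higher arch priority, and the weak default of -cpu
preserves the old lowest-number-wins behavior, e.g. sched_asym_prefer(0, 2)
compares 0 > -2 and is true:

	static inline bool sched_asym_prefer(int a, int b)
	{
		return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
	}

	/* default: lower-numbered CPUs keep higher priority */
	int __weak arch_asym_cpu_priority(int cpu)
	{
		return -cpu;
	}

	/* hypothetical arch override, e.g. fed from firmware perf ranks */
	int arch_asym_cpu_priority(int cpu)
	{
		return per_cpu(cpu_perf_rank, cpu);	/* assumed per-cpu data */
	}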
Co-developed-by: Peter Zijlstra (Intel) Signed-off-by: Tim Chen Acked-by: Peter Zijlstra (Intel) Cc: linux-pm@vger.kernel.org Cc: jolsa@redhat.com Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: Srinivas Pandruvada Cc: bp@suse.de Link: http://lkml.kernel.org/r/0e73ae12737dfaafa46c07066cc7c5d3f1675e46.1479844244.git.tim.c.chen@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 15 +++++++++++++++ kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++++++++----------------- kernel/sched/sched.h | 6 ++++++ 4 files changed, 59 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 19abba04ceca..fe9a499d5aa4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1077,6 +1077,8 @@ static inline int cpu_numa_flags(void) } #endif +extern int arch_asym_cpu_priority(int cpu); + struct sched_domain_attr { int relax_domain_level; }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dc64bd71ed2b..393759bd526e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6303,7 +6303,22 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) WARN_ON(!sg); do { + int cpu, max_cpu = -1; + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + + if (!(sd->flags & SD_ASYM_PACKING)) + goto next; + + for_each_cpu(cpu, sched_group_cpus(sg)) { + if (max_cpu < 0) + max_cpu = cpu; + else if (sched_asym_prefer(cpu, max_cpu)) + max_cpu = cpu; + } + sg->asym_prefer_cpu = max_cpu; + +next: sg = sg->next; } while (sg != sd->groups); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa475896782d..18d9e75f1f6e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -97,6 +97,16 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SMP +/* + * For asym packing, by default the lower numbered cpu has higher priority. + */ +int __weak arch_asym_cpu_priority(int cpu) +{ + return -cpu; +} +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -7388,16 +7398,18 @@ asym_packing: if (env->idle == CPU_NOT_IDLE) return true; /* - * ASYM_PACKING needs to move all the work to the lowest - * numbered CPUs in the group, therefore mark all groups - * higher than ourself as busy. + * ASYM_PACKING needs to move all the work to the highest + * prority CPUs in the group, therefore mark all groups + * of lower priority than ourself as busy. 
*/ - if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && + sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { if (!sds->busiest) return true; - /* Prefer to move from highest possible cpu's work */ - if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) + /* Prefer to move from lowest priority cpu's work */ + if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, + sg->asym_prefer_cpu)) return true; } @@ -7549,8 +7561,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (!sds->busiest) return 0; - busiest_cpu = group_first_cpu(sds->busiest); - if (env->dst_cpu > busiest_cpu) + busiest_cpu = sds->busiest->asym_prefer_cpu; + if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) return 0; env->imbalance = DIV_ROUND_CLOSEST( @@ -7888,10 +7900,11 @@ static int need_active_balance(struct lb_env *env) /* * ASYM_PACKING needs to force migrate tasks from busy but - * higher numbered CPUs in order to pack all tasks in the - * lowest numbered CPUs. + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. */ - if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) + if ((sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(env->dst_cpu, env->src_cpu)) return 1; } @@ -8740,7 +8753,7 @@ static inline bool nohz_kick_needed(struct rq *rq) unsigned long now = jiffies; struct sched_domain_shared *sds; struct sched_domain *sd; - int nr_busy, cpu = rq->cpu; + int nr_busy, i, cpu = rq->cpu; bool kick = false; if (unlikely(rq->idle_balance)) @@ -8791,12 +8804,18 @@ static inline bool nohz_kick_needed(struct rq *rq) } sd = rcu_dereference(per_cpu(sd_asym, cpu)); - if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) { - kick = true; - goto unlock; - } + if (sd) { + for_each_cpu(i, sched_domain_span(sd)) { + if (i == cpu || + !cpumask_test_cpu(i, nohz.idle_cpus_mask)) + continue; + if (sched_asym_prefer(i, cpu)) { + kick = true; + goto unlock; + } + } + } unlock: rcu_read_unlock(); return kick; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d7e39317d688..7b34c7826ca5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -540,6 +540,11 @@ struct dl_rq { #ifdef CONFIG_SMP +static inline bool sched_asym_prefer(int a, int b) +{ + return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); +} + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -908,6 +913,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* cpu of highest priority in group */ /* * The CPUs this group covers. -- cgit v1.2.3 From d06e622d3d9206e6a2cc45a0f9a3256da8773ff4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 24 Nov 2016 13:51:11 +0530 Subject: cpufreq: schedutil: Rectify comment in sugov_irq_work() function This patch rectifies a comment present in sugov_irq_work() function to follow proper grammar. Suggested-by: Ingo Molnar Signed-off-by: Viresh Kumar Acked-by: Ingo Molnar Signed-off-by: Rafael J. 
Wysocki
---
 kernel/sched/cpufreq_schedutil.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 42a220e78f00..fd4659313640 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -315,15 +315,15 @@ static void sugov_irq_work(struct irq_work *irq_work)
 sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
 
 /*
- * For Real Time and Deadline tasks, schedutil governor shoots the
- * frequency to maximum. And special care must be taken to ensure that
- * this kthread doesn't result in that.
+ * For RT and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't result in the same behavior.
 *
 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
- * updated only at the end of the sugov_work() and before that schedutil
- * rejects all other frequency scaling requests.
+ * updated only at the end of the sugov_work() function and before that
+ * the schedutil governor rejects all other frequency scaling requests.
 *
- * Though there is a very rare case where the RT thread yields right
+ * There is a very rare case though, where the RT thread yields right
 * after the work_in_progress flag is cleared. The effects of that are
 * neglected for now.
 */
-- cgit v1.2.3

From 3007098494bec614fb55dee7bc0410bb7db5ad18 Mon Sep 17 00:00:00 2001
From: Daniel Mack
Date: Wed, 23 Nov 2016 16:52:26 +0100
Subject: cgroup: add support for eBPF programs

This patch adds two sets of eBPF program pointers to struct cgroup: one set
for programs that are directly pinned to a cgroup, and one for those that
are effective for it.

To illustrate the logic behind that, assume the following example cgroup
hierarchy.

  A - B - C
        \ D - E

If only B has a program attached, it will be effective for B, C, D and E.
If D then attaches a program itself, that will be effective for both D and
E, and the program in B will only affect B and C. Only one program of a
given type is effective for a cgroup.

Attaching and detaching programs will be done through the bpf(2) syscall.
For now, ingress and egress inet socket filtering are the only supported
use-cases.

Signed-off-by: Daniel Mack
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 include/linux/bpf-cgroup.h | 79 +++++++++++++++++++++
 include/linux/cgroup-defs.h | 4 ++
 init/Kconfig | 12 ++++
 kernel/bpf/Makefile | 1 +
 kernel/bpf/cgroup.c | 167 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup.c | 18 +++++
 6 files changed, 281 insertions(+)
 create mode 100644 include/linux/bpf-cgroup.h
 create mode 100644 kernel/bpf/cgroup.c

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
new file mode 100644
index 000000000000..ec80d0c0953e
--- /dev/null
+++ b/include/linux/bpf-cgroup.h
@@ -0,0 +1,79 @@
+#ifndef _BPF_CGROUP_H
+#define _BPF_CGROUP_H
+
+#include
+#include
+#include
+
+struct sock;
+struct cgroup;
+struct sk_buff;
+
+#ifdef CONFIG_CGROUP_BPF
+
+extern struct static_key_false cgroup_bpf_enabled_key;
+#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+
+struct cgroup_bpf {
+ /*
+ * Store two sets of bpf_prog pointers, one for programs that are
+ * pinned directly to this cgroup, and one for those that are effective
+ * when this cgroup is accessed.
+ */ + struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE]; +}; + +void cgroup_bpf_put(struct cgroup *cgrp); +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); + +void __cgroup_bpf_update(struct cgroup *cgrp, + struct cgroup *parent, + struct bpf_prog *prog, + enum bpf_attach_type type); + +/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type); + +int __cgroup_bpf_run_filter(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type); + +/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */ +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter(sk, skb, \ + BPF_CGROUP_INET_INGRESS); \ + \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + typeof(sk) __sk = sk_to_full_sk(sk); \ + if (sk_fullsock(__sk)) \ + __ret = __cgroup_bpf_run_filter(__sk, skb, \ + BPF_CGROUP_INET_EGRESS); \ + } \ + __ret; \ +}) + +#else + +struct cgroup_bpf {}; +static inline void cgroup_bpf_put(struct cgroup *cgrp) {} +static inline void cgroup_bpf_inherit(struct cgroup *cgrp, + struct cgroup *parent) {} + +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) + +#endif /* CONFIG_CGROUP_BPF */ + +#endif /* _BPF_CGROUP_H */ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5b17de62c962..861b4677fc5b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -300,6 +301,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + /* used to store eBPF programs */ + struct cgroup_bpf bpf; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/init/Kconfig b/init/Kconfig index 34407f15e6d3..405120b5f13e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1154,6 +1154,18 @@ config CGROUP_PERF Say N if unsure. +config CGROUP_BPF + bool "Support for eBPF programs attached to cgroups" + depends on BPF_SYSCALL && SOCK_CGROUP_DATA + help + Allow attaching eBPF programs to a cgroup using the bpf(2) + syscall command BPF_PROG_ATTACH. + + In which context these programs are accessed depends on the type + of attachment. For instance, programs that are attached using + BPF_CGROUP_INET_INGRESS will be executed on the ingress path of + inet sockets. + config CGROUP_DEBUG bool "Example controller" default n diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c4d89d6e2058..1276474ac3cd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +obj-$(CONFIG_CGROUP_BPF) += cgroup.o diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c new file mode 100644 index 000000000000..a0ab43f264b0 --- /dev/null +++ b/kernel/bpf/cgroup.c @@ -0,0 +1,167 @@ +/* + * Functions to manage eBPF programs attached to cgroups + * + * Copyright (c) 2016 Daniel Mack + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. 
See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
+ struct bpf_prog *prog = cgrp->bpf.prog[type];
+
+ if (prog) {
+ bpf_prog_put(prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ }
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
+ struct bpf_prog *e;
+
+ e = rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ rcu_assign_pointer(cgrp->bpf.effective[type], e);
+ }
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin
+ * @type: Type of pinning operation (ingress/egress)
+ *
+ * Each cgroup has a set of two pointers for bpf programs; one for eBPF
+ * programs it owns, and which is effective for execution.
+ *
+ * If @prog is not %NULL, this function attaches a new program to the cgroup and
+ * releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with own programs are left
+ * untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+void __cgroup_bpf_update(struct cgroup *cgrp,
+ struct cgroup *parent,
+ struct bpf_prog *prog,
+ enum bpf_attach_type type)
+{
+ struct bpf_prog *old_prog, *effective;
+ struct cgroup_subsys_state *pos;
+
+ old_prog = xchg(cgrp->bpf.prog + type, prog);
+
+ effective = (!prog && parent) ?
+ rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)) : + prog; + + css_for_each_descendant_pre(pos, &cgrp->self) { + struct cgroup *desc = container_of(pos, struct cgroup, self); + + /* skip the subtree if the descendant has its own program */ + if (desc->bpf.prog[type] && desc != cgrp) + pos = css_rightmost_descendant(pos); + else + rcu_assign_pointer(desc->bpf.effective[type], + effective); + } + + if (prog) + static_branch_inc(&cgroup_bpf_enabled_key); + + if (old_prog) { + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } +} + +/** + * __cgroup_bpf_run_filter() - Run a program for packet filtering + * @sk: The socken sending or receiving traffic + * @skb: The skb that is being sent or received + * @type: The type of program to be exectuted + * + * If no socket is passed, or the socket is not of type INET or INET6, + * this function does nothing and returns 0. + * + * The program type passed in via @type must be suitable for network + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret = 0; + + if (!sk || !sk_fullsock(sk)) + return 0; + + if (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) { + unsigned int offset = skb->data - skb_network_header(skb); + + __skb_push(skb, offset); + ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; + __skb_pull(skb, offset); + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 85bc9beb046d..2ee9ec3051b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + + cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); + if (parent) + cgroup_bpf_inherit(cgrp, parent); + cgroup_propagate_control(cgrp); /* @cgrp doesn't have dir yet so the following will only create csses */ @@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void) } subsys_initcall(cgroup_namespaces_init); +#ifdef CONFIG_CGROUP_BPF +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + mutex_lock(&cgroup_mutex); + __cgroup_bpf_update(cgrp, parent, prog, type); + mutex_unlock(&cgroup_mutex); +} +#endif /* CONFIG_CGROUP_BPF */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit v1.2.3 From f4324551489e8781d838f941b7aee4208e52e8bf Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:27 +0100 Subject: bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands Extend the bpf(2) syscall by two new commands, BPF_PROG_ATTACH and BPF_PROG_DETACH which allow attaching and detaching eBPF programs to a target. 
On the API level, the target could be anything that has an fd in userspace,
hence the field in union bpf_attr is called 'target_fd'.

When called with BPF_ATTACH_TYPE_CGROUP_INET_{E,IN}GRESS, the target is
expected to be a valid file descriptor of a cgroup v2 directory which has
the bpf controller enabled. These are the only use-cases implemented by
this patch at this point, but more can be added.

If a program of the given type already exists in the given cgroup, the
program is swapped atomically, so userspace does not have to drop an
existing program before installing a new one, which would otherwise leave a
gap in which no program is attached.

For more information on the propagation logic to subcgroups, please refer
to the bpf cgroup controller implementation.

The API is guarded by CAP_NET_ADMIN.

Signed-off-by: Daniel Mack
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 include/uapi/linux/bpf.h | 8 +++++
 kernel/bpf/syscall.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5ae679fac993..1370a9d1456f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,8 @@ enum bpf_cmd {
 BPF_PROG_LOAD,
 BPF_OBJ_PIN,
 BPF_OBJ_GET,
+ BPF_PROG_ATTACH,
+ BPF_PROG_DETACH,
 };
 
 enum bpf_map_type {
@@ -159,6 +161,12 @@ union bpf_attr {
 __aligned_u64 pathname;
 __u32 bpf_fd;
 };
+
+ struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+ __u32 target_fd; /* container object to attach to */
+ __u32 attach_bpf_fd; /* eBPF program to attach */
+ __u32 attach_type;
+ };
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb15498b8d55..1090d16a31c1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -835,6 +835,77 @@ static int bpf_obj_get(const union bpf_attr *attr)
 return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
 }
 
+#ifdef CONFIG_CGROUP_BPF
+
+#define BPF_PROG_ATTACH_LAST_FIELD attach_type
+
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_ATTACH))
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ prog = bpf_prog_get_type(attr->attach_bpf_fd,
+ BPF_PROG_TYPE_CGROUP_SKB);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp)) {
+ bpf_prog_put(prog);
+ return PTR_ERR(cgrp);
+ }
+
+ cgroup_bpf_update(cgrp, prog, attr->attach_type);
+ cgroup_put(cgrp);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define BPF_PROG_DETACH_LAST_FIELD attach_type
+
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+ struct cgroup *cgrp;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_DETACH))
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ cgroup_bpf_update(cgrp, NULL, attr->attach_type);
+ cgroup_put(cgrp);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
 SYSCALL_DEFINE3(bpf, int, cmd, union
bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; + +#ifdef CONFIG_CGROUP_BPF + case BPF_PROG_ATTACH: + err = bpf_prog_attach(&attr); + break; + case BPF_PROG_DETACH: + err = bpf_prog_detach(&attr); + break; +#endif + default: err = -EINVAL; break; -- cgit v1.2.3 From 7fd8329ba502ef76dd91db561c7aed696b2c7720 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 21 Sep 2016 13:47:22 +0200 Subject: taint/module: Clean up global and module taint flags handling The commit 66cc69e34e86a231 ("Fix: module signature vs tracepoints: add new TAINT_UNSIGNED_MODULE") updated module_taint_flags() to potentially print one more character. But it did not increase the size of the corresponding buffers in m_show() and print_modules(). We recently made the same mistake when adding a taint flag for livepatching, see https://lkml.kernel.org/r/cfba2c823bb984690b73572aaae1db596b54a082.1472137475.git.jpoimboe@redhat.com Also, struct module uses an incompatible type for the mod->taints flags. It survived from the commit 2bc2d61a9638dab670d ("[PATCH] list module taint flags in Oops/panic"). At that time, "int" was used for the global taint flags. But only the global taint flags were later changed to "unsigned long" by the commit 25ddbb18aae33ad2 ("Make the taint flags reliable"). This patch defines TAINT_FLAGS_COUNT that can be used to create arrays and buffers of the right size. Note that we could not use an enum because the taint flag indexes are also used in assembly code. Then it reworks the table that describes the taint flags. The TAINT_* numbers can now be used as the index. Instead of the bit number, each entry records whether the taint flag is also shown per-module. Finally, it uses "unsigned long", bit operations, and the updated taint_flags table also for mod->taints. It is not optimal because only a few taint flags can be printed by module_taint_flags(). But it is better to be on the safe side. IMHO, it is not worth the optimization and this is a good compromise.
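To see the resulting shape in one place, here is a standalone sketch (ordinary userspace C with assumed names and a shortened table; the kernel struct uses members called true/false, which stdbool.h reserves, so they are renamed here):

	#include <stdbool.h>
	#include <stdio.h>

	#define TAINT_FLAGS_COUNT 3

	struct taint_flag { char c_true, c_false; bool module; };

	static const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
		{ 'P', 'G', true  },	/* index 0: set by proprietary modules */
		{ 'W', ' ', false },	/* index 1: kernel-wide only */
		{ 'O', ' ', true  },	/* index 2: set by out-of-tree modules */
	};

	/* One loop serves both the global and the per-module output. */
	static void show_taint(unsigned long mask, bool module_only)
	{
		int i;

		for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
			if (module_only && !taint_flags[i].module)
				continue;
			putchar(mask & (1UL << i) ? taint_flags[i].c_true
						  : taint_flags[i].c_false);
		}
		putchar('\n');
	}

Calling show_taint(mask, false) models print_tainted(), while show_taint(mask, true) models module_flags_taint() in the diff below.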
Signed-off-by: Petr Mladek Link: http://lkml.kernel.org/r/1474458442-21581-1-git-send-email-pmladek@suse.com [jeyu@redhat.com: fix broken lkml link in changelog] Signed-off-by: Jessica Yu --- include/linux/kernel.h | 9 +++++++++ include/linux/module.h | 2 +- kernel/module.c | 33 +++++++++++++------------------ kernel/panic.c | 53 ++++++++++++++++++++++++-------------------------- 4 files changed, 48 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index bc6ed52a39b9..441def77246d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -506,6 +506,15 @@ extern enum system_states { #define TAINT_UNSIGNED_MODULE 13 #define TAINT_SOFTLOCKUP 14 #define TAINT_LIVEPATCH 15 +#define TAINT_FLAGS_COUNT 16 + +struct taint_flag { + char true; /* character printed when tainted */ + char false; /* character printed when not tainted */ + bool module; /* also show as a per-module taint flag */ +}; + +extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT]; extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] diff --git a/include/linux/module.h b/include/linux/module.h index 0c3207d26ac0..f6ee569c62bb 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -399,7 +399,7 @@ struct module { /* Arch-specific module values */ struct mod_arch_specific arch; - unsigned int taints; /* same bits as kernel:tainted */ + unsigned long taints; /* same bits as kernel:taint_flags */ #ifdef CONFIG_GENERIC_BUG /* Support for BUG */ diff --git a/kernel/module.c b/kernel/module.c index f57dd63186e6..a4acd8f403ae 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -330,7 +330,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, enum lockdep_ok lockdep_ok) { add_taint(flag, lockdep_ok); - mod->taints |= (1U << flag); + set_bit(flag, &mod->taints); } /* @@ -1138,24 +1138,13 @@ static inline int module_unload_init(struct module *mod) static size_t module_flags_taint(struct module *mod, char *buf) { size_t l = 0; + int i; + + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + if (taint_flags[i].module && test_bit(i, &mod->taints)) + buf[l++] = taint_flags[i].true; + } - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[l++] = 'P'; - if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[l++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[l++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[l++] = 'C'; - if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) - buf[l++] = 'E'; - if (mod->taints & (1 << TAINT_LIVEPATCH)) - buf[l++] = 'K'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ return l; } @@ -4041,6 +4030,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, } #endif /* CONFIG_KALLSYMS */ +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) + +/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ static char *module_flags(struct module *mod, char *buf) { int bx = 0; @@ -4085,7 +4078,7 @@ static void m_stop(struct seq_file *m, void *p) static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); - char buf[8]; + char buf[MODULE_FLAGS_BUF_SIZE]; /* We always ignore unformed modules. 
*/ if (mod->state == MODULE_STATE_UNFORMED) @@ -4256,7 +4249,7 @@ EXPORT_SYMBOL_GPL(__module_text_address); void print_modules(void) { struct module *mod; - char buf[8]; + char buf[MODULE_FLAGS_BUF_SIZE]; printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ diff --git a/kernel/panic.c b/kernel/panic.c index e6480e20379e..c51edaa04fce 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -298,30 +298,27 @@ void panic(const char *fmt, ...) EXPORT_SYMBOL(panic); - -struct tnt { - u8 bit; - char true; - char false; -}; - -static const struct tnt tnts[] = { - { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, - { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' }, - { TAINT_FORCED_RMMOD, 'R', ' ' }, - { TAINT_MACHINE_CHECK, 'M', ' ' }, - { TAINT_BAD_PAGE, 'B', ' ' }, - { TAINT_USER, 'U', ' ' }, - { TAINT_DIE, 'D', ' ' }, - { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, - { TAINT_WARN, 'W', ' ' }, - { TAINT_CRAP, 'C', ' ' }, - { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, - { TAINT_OOT_MODULE, 'O', ' ' }, - { TAINT_UNSIGNED_MODULE, 'E', ' ' }, - { TAINT_SOFTLOCKUP, 'L', ' ' }, - { TAINT_LIVEPATCH, 'K', ' ' }, +/* + * TAINT_FORCED_RMMOD could be a per-module flag but the module + * is being removed anyway. + */ +const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { + { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ + { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ + { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ + { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ + { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ + { 'B', ' ', false }, /* TAINT_BAD_PAGE */ + { 'U', ' ', false }, /* TAINT_USER */ + { 'D', ' ', false }, /* TAINT_DIE */ + { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ + { 'W', ' ', false }, /* TAINT_WARN */ + { 'C', ' ', true }, /* TAINT_CRAP */ + { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ + { 'O', ' ', true }, /* TAINT_OOT_MODULE */ + { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ + { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ + { 'K', ' ', true }, /* TAINT_LIVEPATCH */ }; /** @@ -348,16 +345,16 @@ static const struct tnt tnts[] = { */ const char *print_tainted(void) { - static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")]; + static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; if (tainted_mask) { char *s; int i; s = buf + sprintf(buf, "Tainted: "); - for (i = 0; i < ARRAY_SIZE(tnts); i++) { - const struct tnt *t = &tnts[i]; - *s++ = test_bit(t->bit, &tainted_mask) ? + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + const struct taint_flag *t = &taint_flags[i]; + *s++ = test_bit(i, &tainted_mask) ? t->true : t->false; } *s = 0; -- cgit v1.2.3 From 885a78d4a5b3ad2d7c41d1819b001d7957f442cd Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Thu, 20 Oct 2016 17:18:12 +0100 Subject: module: Ensure a module's state is set accordingly during module coming cleanup code In load_module(), in the event of an error (e.g. unknown module parameters), we go on to perform some module-coming cleanup operations. At this point the module is still in a "formed" state even though it is actually going away. This patch updates the module's state so that anyone on the module_notify_list waiting for a module-going-away notification will be notified accordingly.
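As a sketch of the kind of listener this helps (a hypothetical module using the standard notifier API), a callback can now trust mod->state during the failed-load cleanup notification:

	#include <linux/module.h>
	#include <linux/notifier.h>

	static int demo_module_event(struct notifier_block *nb,
				     unsigned long action, void *data)
	{
		struct module *mod = data;

		/* With this patch, action and mod->state agree here. */
		if (action == MODULE_STATE_GOING)
			pr_info("%s: going away (state=%d)\n",
				mod->name, mod->state);

		return NOTIFY_OK;
	}

	static struct notifier_block demo_module_nb = {
		.notifier_call = demo_module_event,
	};

	/* register_module_notifier(&demo_module_nb); would run at init time. */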
Signed-off-by: Aaron Tomlin Acked-by: Rusty Russell Reviewed-by: Miroslav Benes Link: http://lkml.kernel.org/r/1476980293-19062-2-git-send-email-atomlin@redhat.com Signed-off-by: Jessica Yu --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a4acd8f403ae..f082832ad3ad 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3697,6 +3697,7 @@ static int load_module(struct load_info *info, const char __user *uargs, sysfs_cleanup: mod_sysfs_teardown(mod); coming_cleanup: + mod->state = MODULE_STATE_GOING; blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); -- cgit v1.2.3 From 905dd707fc856bae57de144dcf873583881f9489 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Thu, 27 Oct 2016 10:36:06 +0100 Subject: module: When modifying a module's text ignore modules which are going away too By default, during the access permission modification of a module's core and init pages, we only ignore modules that are malformed. However, for a module which is going away, it does not make sense to change its text to RO since the module should be RW before deallocation. This patch makes set_all_modules_text_ro() skip modules which are going away too. Signed-off-by: Aaron Tomlin Acked-by: Rusty Russell Link: http://lkml.kernel.org/r/1477560966-781-1-git-send-email-atomlin@redhat.com [jeyu@redhat.com: add comment as suggested by Steven Rostedt] Signed-off-by: Jessica Yu --- kernel/module.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f082832ad3ad..927a67e30855 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1958,7 +1958,13 @@ void set_all_modules_text_ro(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) + /* + * Ignore going modules since it's possible that ro + * protection has already been disabled, otherwise we'll + * run into protection faults at module deallocation. + */ + if (mod->state == MODULE_STATE_UNFORMED || + mod->state == MODULE_STATE_GOING) continue; frob_text(&mod->core_layout, set_memory_ro); -- cgit v1.2.3 From 71d9f5079358c148e71eba930e436a7a0cb35d95 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 16 Nov 2016 16:45:48 +0100 Subject: module: Fix a comment above strong_try_module_get() The comment above the strong_try_module_get() function is no longer true. Its return values changed with commit c9a3ba55bb5d ("module: wait for dependent modules doing init."). Signed-off-by: Miroslav Benes Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1611161635330.12580@pobox.suse.cz [jeyu@redhat.com: style fixes to make checkpatch.pl happy] Signed-off-by: Jessica Yu --- kernel/module.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 927a67e30855..6281c70683d3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -313,8 +313,11 @@ struct load_info { } index; }; -/* We require a truly strong try_module_get(): 0 means failure due to ongoing or failed initialization etc. */ +/* + * We require a truly strong try_module_get(): 0 means success. + * Otherwise an error is returned due to ongoing or failed + * initialization etc.
+ */ static inline int strong_try_module_get(struct module *mod) { BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); -- cgit v1.2.3 From 39290b389ea2654f9190e3b48c57d27b24def83e Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 14 Nov 2016 15:15:05 +0900 Subject: module: extend 'rodata=off' boot cmdline parameter to module mappings The current "rodata=off" parameter disables read-only kernel mappings under CONFIG_DEBUG_RODATA: commit d2aa1acad22f ("mm/init: Add 'rodata=off' boot cmdline parameter to disable read-only kernel mappings") This patch is a logical extension to module mappings, i.e. read-only mappings at module loading can be disabled even if CONFIG_DEBUG_SET_MODULE_RONX is enabled (mainly for debug use). Please note, however, that it only affects RO/RW permissions, keeping NX set. This is the first step to make CONFIG_DEBUG_SET_MODULE_RONX mandatory (always-on) in the future, as CONFIG_DEBUG_RODATA already is on x86 and arm64. Suggested-by: and Acked-by: Mark Rutland Signed-off-by: AKASHI Takahiro Reviewed-by: Kees Cook Acked-by: Rusty Russell Link: http://lkml.kernel.org/r/20161114061505.15238-1-takahiro.akashi@linaro.org Signed-off-by: Jessica Yu --- include/linux/init.h | 3 +++ init/main.c | 7 +++++-- kernel/module.c | 20 +++++++++++++++++--- 3 files changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/init.h b/include/linux/init.h index e30104ceb86d..885c3e6d0f9d 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -126,6 +126,9 @@ void prepare_namespace(void); void __init load_default_modules(void); int __init init_rootfs(void); +#if defined(CONFIG_DEBUG_RODATA) || defined(CONFIG_DEBUG_SET_MODULE_RONX) +extern bool rodata_enabled; +#endif #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); #endif diff --git a/init/main.c b/init/main.c index 2858be732f6d..959a24242988 100644 --- a/init/main.c +++ b/init/main.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include @@ -914,14 +915,16 @@ static int try_to_run_init_process(const char *init_filename) static noinline void __init kernel_init_freeable(void); -#ifdef CONFIG_DEBUG_RODATA -static bool rodata_enabled = true; +#if defined(CONFIG_DEBUG_RODATA) || defined(CONFIG_DEBUG_SET_MODULE_RONX) +bool rodata_enabled __ro_after_init = true; static int __init set_debug_rodata(char *str) { return strtobool(str, &rodata_enabled); } __setup("rodata=", set_debug_rodata); +#endif +#ifdef CONFIG_DEBUG_RODATA static void mark_readonly(void) { if (rodata_enabled) diff --git a/kernel/module.c b/kernel/module.c index 6281c70683d3..039ce82803f7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1902,6 +1902,9 @@ static void frob_writable_data(const struct module_layout *layout, /* livepatching wants to disable read-only so it can frob module.
*/ void module_disable_ro(const struct module *mod) { + if (!rodata_enabled) + return; + frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); frob_ro_after_init(&mod->core_layout, set_memory_rw); @@ -1911,6 +1914,9 @@ void module_disable_ro(const struct module *mod) void module_enable_ro(const struct module *mod, bool after_init) { + if (!rodata_enabled) + return; + frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); @@ -1943,6 +1949,9 @@ void set_all_modules_text_rw(void) { struct module *mod; + if (!rodata_enabled) + return; + mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) @@ -1959,6 +1968,9 @@ void set_all_modules_text_ro(void) { struct module *mod; + if (!rodata_enabled) + return; + mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { /* @@ -1978,10 +1990,12 @@ void set_all_modules_text_ro(void) static void disable_ro_nx(const struct module_layout *layout) { - frob_text(layout, set_memory_rw); - frob_rodata(layout, set_memory_rw); + if (rodata_enabled) { + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_rw); + } frob_rodata(layout, set_memory_x); - frob_ro_after_init(layout, set_memory_rw); frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } -- cgit v1.2.3 From 88575199cc65de99a156888629a68180c830eff2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 26 Nov 2016 01:28:04 +0100 Subject: bpf: drop unnecessary context cast from BPF_PROG_RUN bpf_func has long since ceased to take only struct sk_buff * as input. Make the context generic as void *, so that callers don't need to cast it each time they call BPF_PROG_RUN(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- include/linux/filter.h | 6 +++--- kernel/events/core.c | 2 +- kernel/seccomp.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index eb3715700c95..876ab3a92ad5 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1518,7 +1518,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, unsigned int len) xdp.data = data; xdp.data_end = data + len; - return BPF_PROG_RUN(prog, (void *)&xdp); + return BPF_PROG_RUN(prog, &xdp); } /** diff --git a/include/linux/filter.h b/include/linux/filter.h index 1f09c521adfe..7f246a281435 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -408,8 +408,8 @@ struct bpf_prog { enum bpf_prog_type type; /* Type of BPF program */ struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - unsigned int (*bpf_func)(const struct sk_buff *skb, - const struct bpf_insn *filter); + unsigned int (*bpf_func)(const void *ctx, + const struct bpf_insn *insn); /* Instructions for interpreter */ union { struct sock_filter insns[0]; @@ -504,7 +504,7 @@ static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, u32 ret; rcu_read_lock(); - ret = BPF_PROG_RUN(prog, (void *)xdp); + ret = BPF_PROG_RUN(prog, xdp); rcu_read_unlock(); return ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6ee1febdf6ff..22cc734aa1b2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7726,7 +7726,7 @@ static void bpf_overflow_handler(struct perf_event *event, if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); - ret = BPF_PROG_RUN(event->prog, (void *)&ctx); + ret = BPF_PROG_RUN(event->prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0db7c8a2afe2..bff9c774987a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -195,7 +195,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) * value always takes priority (ignoring the DATA). */ for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); + u32 cur_ret = BPF_PROG_RUN(f->prog, sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; -- cgit v1.2.3 From 21116b7068b9b66ac16b2fe3675469f459968c3f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 26 Nov 2016 01:28:07 +0100 Subject: bpf: add owner_prog_type and accounted mem to array map's fdinfo Allow for checking the owner_prog_type of a program array map. In some cases bpf(2) can return -EINVAL /after/ the verifier passed and did all the rewrites of the bpf program. The reason we can fail at this late stage is that program array maps are incompatible. Allow users to inspect this earlier, after they have obtained the map fd through the BPF_OBJ_GET command. tc will get support for this. Also, display how much we charged the map with regard to RLIMIT_MEMLOCK. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/syscall.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1090d16a31c1..4caa18e6860a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -138,18 +138,31 @@ static int bpf_map_release(struct inode *inode, struct file *filp) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; + const struct bpf_array *array; + u32 owner_prog_type = 0; + + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + array = container_of(map, struct bpf_array, map); + owner_prog_type = array->owner_prog_type; + } seq_printf(m, "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" "max_entries:\t%u\n" - "map_flags:\t%#x\n", + "map_flags:\t%#x\n" + "memlock:\t%llu\n", map->map_type, map->key_size, map->value_size, map->max_entries, - map->map_flags); + map->map_flags, + map->pages * 1ULL << PAGE_SHIFT); + + if (owner_prog_type) + seq_printf(m, "owner_prog_type:\t%u\n", + owner_prog_type); } #endif -- cgit v1.2.3 From a3af5f80010625a9ffbe8edd4bae615a7516b6bc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 26 Nov 2016 01:28:08 +0100 Subject: bpf: allow for mount options to specify permissions Since we recently converted the BPF filesystem over to use mount_nodev(), we now have the possibility to also hold mount options in sb's s_fs_info. This work implements mount options support for specifying permissions on the sb's inode, which will be used by tc when it manually needs to mount the fs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 2565809fbb34..0b030c9126d3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -364,15 +365,66 @@ static void bpf_evict_inode(struct inode *inode) static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, + .show_options = generic_show_options, .evict_inode = bpf_evict_inode, }; +enum { + OPT_MODE, + OPT_ERR, +}; + +static const match_table_t bpf_mount_tokens = { + { OPT_MODE, "mode=%o" }, + { OPT_ERR, NULL }, +}; + +struct bpf_mount_opts { + umode_t mode; +}; + +static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option, token; + char *ptr; + + opts->mode = S_IRWXUGO; + + while ((ptr = strsep(&data, ",")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, bpf_mount_tokens, args); + switch (token) { + case OPT_MODE: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* We might like to report bad mount options here, but + * traditionally we've ignored all mount options, so we'd + * better continue to ignore non-existing options for bpf. 
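+ * A stray option handed down by a generic mount helper, for example, + * should not fail the whole mount.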
+ */ + } + } + + return 0; +} + static int bpf_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr bpf_rfiles[] = { { "" } }; + struct bpf_mount_opts opts; struct inode *inode; int ret; + save_mount_options(sb, data); + + ret = bpf_parse_options(data, &opts); + if (ret) + return ret; + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -382,7 +434,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | S_IRWXUGO; + inode->i_mode |= S_ISVTX | opts.mode; return 0; } -- cgit v1.2.3 From bb8313b603eb8fd52de48a079bfcd72dcab2ef1e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 28 Nov 2016 23:03:04 -0800 Subject: cpuidle: Allow enforcing deepest idle state selection When idle injection is used to cap power, we need to override the governor's choice of idle states. For this reason, make it possible for the deepest idle state selection to be enforced by setting a flag on a given CPU to achieve the maximum potential power draw reduction. Signed-off-by: Jacob Pan [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 13 ++++++++++++- include/linux/cpuidle.h | 7 ++++++- kernel/sched/idle.c | 13 ++++++++----- 3 files changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207abb5a4..afc005b917fe 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,17 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -#ifdef CONFIG_SUSPEND +/* Set the current cpu to use the deepest idle state, override governors */ +void cpuidle_use_deepest_state(bool enable) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + dev->use_deepest_state = enable; + preempt_enable(); +} + /** * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU.
@@ -109,6 +119,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return find_deepest_state(drv, dev, UINT_MAX, 0, false); } +#ifdef CONFIG_SUSPEND static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 15deea449edc..da346f2817a8 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -74,6 +74,7 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; + unsigned int use_deepest_state:1; unsigned int cpu; int last_residency; @@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #endif -#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) +#ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev); +extern void cpuidle_use_deepest_state(bool enable); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) +{ +} #endif /* kernel/sched/idle.c */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1d8718d5300d..513e4dfeeae7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -164,11 +164,14 @@ static void cpuidle_idle_call(void) * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. */ - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state > 0) { - local_irq_enable(); - goto exit_idle; + + if (idle_should_freeze() || dev->use_deepest_state) { + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state > 0) { + local_irq_enable(); + goto exit_idle; + } } next_state = cpuidle_find_deepest_state(drv, dev); -- cgit v1.2.3 From c1de45ca831acee9b72c9320dde447edafadb43f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Nov 2016 23:03:05 -0800 Subject: sched/idle: Add support for tasks that inject idle Idle injection drivers such as Intel powerclamp and ACPI PAD drivers use realtime tasks to take control of a CPU and then inject idle. There are two issues with this approach: 1. Low efficiency: the injected idle task is treated as busy, so sched ticks do not stop during the injected idle period; the result of these unwanted wakeups can be a ~20% loss in power savings. 2. Idle accounting: injected idle time is presented to the user as busy. This patch addresses the issues by introducing a new PF_IDLE flag which allows any given task to be treated as an idle task while the flag is set. Therefore, idle injection tasks can run through the normal flow of NOHZ idle enter/exit to get the correct accounting as well as tick stop when possible. The implication is that the idle task is then no longer limited to PID == 0. Acked-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Jacob Pan Signed-off-by: Rafael J.
Wysocki --- include/linux/cpu.h | 2 + include/linux/sched.h | 3 +- kernel/fork.c | 2 +- kernel/sched/core.c | 1 + kernel/sched/idle.c | 162 +++++++++++++++++++++++++++++++------------------- 5 files changed, 107 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b886dc17f2f3..ac0efae38072 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -245,6 +245,8 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); +void play_idle(unsigned long duration_ms); + #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..114c7fcb6af6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2254,6 +2254,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ @@ -2609,7 +2610,7 @@ extern struct task_struct *idle_task(int cpu); */ static inline bool is_idle_task(const struct task_struct *p) { - return p->pid == 0; + return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..5074b2f0827b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1537,7 +1537,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..63b3a8a49884 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5285,6 +5285,7 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + idle->flags |= PF_IDLE; kasan_unpoison_task_stack(idle); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 513e4dfeeae7..6a4bae0a649d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -205,76 +205,65 @@ exit_idle: * * Called with polling cleared. */ -static void cpu_idle_loop(void) +static void do_idle(void) { - int cpu = smp_processor_id(); - - while (1) { - /* - * If the arch has a polling bit, we maintain an invariant: - * - * Our polling bit is clear if we're not scheduled (i.e. if - * rq->curr != rq->idle). This means that, if rq->idle has - * the polling bit set, then setting need_resched is - * guaranteed to cause the cpu to reschedule. - */ - - __current_set_polling(); - quiet_vmstat(); - tick_nohz_idle_enter(); + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if rq->curr != + * rq->idle). This means that, if rq->idle has the polling bit set, + * then setting need_resched is guaranteed to cause the CPU to + * reschedule. 
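+ * + * A remote wakeup can then skip the resched IPI entirely and rely on + * this polling loop to notice need_resched.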
+ */ - while (!need_resched()) { - check_pgt_cache(); - rmb(); + __current_set_polling(); + tick_nohz_idle_enter(); - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } + while (!need_resched()) { + check_pgt_cache(); + rmb(); - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) - cpu_idle_poll(); - else - cpuidle_idle_call(); - - arch_cpu_idle_exit(); + if (cpu_is_offline(smp_processor_id())) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); } - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - __current_clr_polling(); + local_irq_disable(); + arch_cpu_idle_enter(); /* - * We promise to call sched_ttwu_pending and reschedule - * if need_resched is set while polling is set. That - * means that clearing polling needs to be visible - * before doing these things. + * In poll mode we reenable interrupts and spin. Also if we + * detected in the wakeup from idle path that the tick + * broadcast device expired for us, we don't want to go deep + * idle as we know that the IPI is going to arrive right away. */ - smp_mb__after_atomic(); - - sched_ttwu_pending(); - schedule_preempt_disabled(); + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } + + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will not have had + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending() and reschedule if + * need_resched() is set while polling is set. That means that clearing + * polling needs to be visible before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); } bool cpu_in_idle(unsigned long pc) @@ -283,6 +272,56 @@ bool cpu_in_idle(unsigned long pc) pc < (unsigned long)__cpuidle_text_end; } +struct idle_timer { + struct hrtimer timer; + int done; +}; + +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) +{ + struct idle_timer *it = container_of(timer, struct idle_timer, timer); + + WRITE_ONCE(it->done, 1); + set_tsk_need_resched(current); + + return HRTIMER_NORESTART; +} + +void play_idle(unsigned long duration_ms) +{ + struct idle_timer it; + + /* + * Only FIFO tasks can disable the tick since they don't need the forced + * preemption. 
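+ * The WARN_ON_ONCE() checks below therefore insist on a pinned per-CPU + * FIFO kthread before any idle time is injected.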
+ */ WARN_ON_ONCE(current->policy != SCHED_FIFO); + WARN_ON_ONCE(current->nr_cpus_allowed != 1); + WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); + WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); + WARN_ON_ONCE(!duration_ms); + + rcu_sleep_check(); + preempt_disable(); + current->flags |= PF_IDLE; + cpuidle_use_deepest_state(true); + + it.done = 0; + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + + while (!READ_ONCE(it.done)) + do_idle(); + + cpuidle_use_deepest_state(false); + current->flags &= ~PF_IDLE; + + preempt_fold_need_resched(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(play_idle); + void cpu_startup_entry(enum cpuhp_state state) { /* @@ -302,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state) #endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); - cpu_idle_loop(); + while (1) + do_idle(); } -- cgit v1.2.3 From 948a5312f41658f7b76a598a139ef1f4dea09ca9 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:22 -0800 Subject: timekeeping: Add a fast and NMI safe boot clock This boot clock can be used as a tracing clock and will account for suspend time. To keep it NMI safe since we're accessing from tracing, we're not using a separate timekeeper with updates to monotonic clock and boot offset protected with seqlocks. This has the following minor side effects: (1) It's possible that a timestamp be taken after the boot offset is updated but before the timekeeper is updated. If this happens, the new boot offset is added to the old timekeeping making the clock appear to update slightly earlier: CPU 0 CPU 1 timekeeping_inject_sleeptime64() __timekeeping_inject_sleeptime(tk, delta); timestamp(); timekeeping_update(tk, TK_CLEAR_NTP...); (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be partially updated. Since the tk->offs_boot update is a rare event, this should be a rare occurrence which postprocessing should be able to handle. Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Reviewed-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Richard Cochran Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1480372524-15181-6-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 09168c52ab64..361f8bf1429d 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -249,6 +249,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); +extern u64 ktime_get_boot_fast_ns(void); /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 37dec7e3db43..b2286e94c934 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -425,6 +425,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks.
This has the following minor side effects: + * + * (1) It's possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; -- cgit v1.2.3 From 80ec3552107ac16a836dbff4cf3c23fdd3256ee3 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:23 -0800 Subject: trace: Add an option for boot clock as trace clock Unlike the monotonic clock, the boot clock as a trace clock will account for time spent in suspend, which is useful for tracing suspend/resume. This uses the fast boot clock infrastructure introduced earlier. Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Reviewed-by: Thomas Gleixner Acked-by: Steven Rostedt Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1480372524-15181-7-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8696ce6bf2f6..f7b64db285df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1125,6 +1125,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; -- cgit v1.2.3 From 8fae47705685fcaa75a1fe4c8c3e18300a702979 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sun, 20 Nov 2016 16:47:55 -0500 Subject: audit: add support for session ID user filter Define AUDIT_SESSIONID in the uapi and add support for specifying user filters based on the session ID. Also add the new session ID filter to the feature bitmap so userspace knows it is available. https://github.com/linux-audit/audit-kernel/issues/4 RFE: add a session ID filter to the kernel's user filter Signed-off-by: Richard Guy Briggs [PM: combine multiple patches from Richard into this one] Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 5 ++++- kernel/auditfilter.c | 2 ++ kernel/auditsc.c | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 82e8aa59446b..c8dc97bc2c1b 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -254,6 +254,7 @@ #define AUDIT_OBJ_LEV_LOW 22 #define AUDIT_OBJ_LEV_HIGH 23 #define AUDIT_LOGINUID_SET 24 +#define AUDIT_SESSIONID 25 /* Session ID */ /* These are ONLY useful when checking * at syscall exit time (AUDIT_AT_EXIT).
*/ @@ -329,9 +330,11 @@ enum { #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 #define AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH 0x00000004 +#define AUDIT_FEATURE_BITMAP_SESSIONID_FILTER 0x00000010 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \ AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \ - AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH) + AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH | \ + AUDIT_FEATURE_BITMAP_SESSIONID_FILTER) /* deprecated: AUDIT_VERSION_* */ #define AUDIT_VERSION_LATEST AUDIT_FEATURE_BITMAP_ALL diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 632e90d1005f..880519d6cf2a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -363,6 +363,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_EXIT: case AUDIT_SUCCESS: case AUDIT_INODE: + case AUDIT_SESSIONID: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; @@ -476,6 +477,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (!gid_valid(f->gid)) goto exit_free; break; + case AUDIT_SESSIONID: case AUDIT_ARCH: entry->rule.arch_f = f; break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d161b17ce8ce..f78cb1b3fa74 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -446,6 +446,7 @@ static int audit_filter_rules(struct task_struct *tsk, const struct cred *cred; int i, need_sid = 1; u32 sid; + unsigned int sessionid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); @@ -508,6 +509,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FSGID: result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; + case AUDIT_SESSIONID: + sessionid = audit_get_sessionid(current); + result = audit_comparator(sessionid, f->op, f->val); + break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); break; -- cgit v1.2.3 From faaae2a581435f32781a105dda3501df388fddcb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Nov 2016 15:20:14 -0800 Subject: Re-enable CONFIG_MODVERSIONS in a slightly weaker form This enables CONFIG_MODVERSIONS again, but allows for missing symbol CRC information in order to work around the issue that newer binutils versions seem to occasionally drop the CRC on the floor. binutils 2.26 seems to work fine, while binutils 2.27 seems to break MODVERSIONS of symbols that have been defined in assembler files. [ We've had random missing CRC's before - it may be an old problem that just is now reliably triggered with the weak asm symbols and a new version of binutils ] Some day I really do want to remove MODVERSIONS entirely. Sadly, today does not appear to be that day: Debian people apparently do want the option to enable MODVERSIONS to make it easier to have external modules across kernel versions, and this seems to be a fairly minimal fix for the annoying problem. Cc: Ben Hutchings Acked-by: Michal Marek Signed-off-by: Linus Torvalds --- init/Kconfig | 1 - kernel/module.c | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index c4fbc1e55c25..34407f15e6d3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1945,7 +1945,6 @@ config MODULE_FORCE_UNLOAD config MODVERSIONS bool "Module versioning support" - depends on BROKEN help Usually, you have to use modules compiled with your kernel. 
Saying Y here makes it sometimes possible to use modules diff --git a/kernel/module.c b/kernel/module.c index f57dd63186e6..0e54d5bf0097 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1301,8 +1301,9 @@ static int check_version(Elf_Shdr *sechdrs, goto bad_version; } - pr_warn("%s: no symbol version for %s\n", mod->name, symname); - return 0; + /* Broken toolchain. Warn once, then let it go.. */ + pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); + return 1; bad_version: pr_warn("%s: disagrees about version of symbol %s\n", -- cgit v1.2.3 From 01ae87eab53675cbdabd5c4d727c4a35e397cce0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Mon, 28 Nov 2016 14:11:04 +0100 Subject: bpf: cgroup: fix documentation of __cgroup_bpf_update() There's a 'not' missing in one paragraph. Add it. Fixes: 3007098494be ("cgroup: add support for eBPF programs") Signed-off-by: Daniel Mack Reported-by: Rami Rosen Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a0ab43f264b0..8c784f8c67cd 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -66,8 +66,8 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) * Each cgroup has a set of two pointers for bpf programs; one for eBPF * programs it owns, and which is effective for execution. * - * If @prog is %NULL, this function attaches a new program to the cgroup and - * releases the one that is currently attached, if any. @prog is then made + * If @prog is not %NULL, this function attaches a new program to the cgroup + * and releases the one that is currently attached, if any. @prog is then made * the effective program of type @type in that cgroup. * * If @prog is %NULL, the currently attached program of type @type is released, -- cgit v1.2.3 From f8319483f57f1ca22370f4150bb990aca7728a67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Nov 2016 14:32:25 +1100 Subject: locking/lockdep: Provide a type check for lock_is_held Christoph requested lockdep_assert_held() variants that distinguish between held-for-read or held-for-write. Provide: int lock_is_held_type(struct lockdep_map *lock, int read) which takes the same argument as lock_acquire(.read) and matches it to the held_lock instance. Use of this function should be gated by the debug_locks variable. When that is 0 the return value of the lock_is_held_type() function is undefined. This is done to allow both negative and positive tests for holding locks. By default we provide (positive) lockdep_assert_held{,_exclusive,_read}() macros. Requested-by: Christoph Hellwig Signed-off-by: Peter Zijlstra (Intel) Tested-by: Jens Axboe Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- include/linux/lockdep.h | 25 +++++++++++++++++++++++-- kernel/locking/lockdep.c | 20 ++++++++++++-------- 2 files changed, 35 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c1458fede1f9..1e327bb80838 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -338,9 +338,18 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); -#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) +/* + * Same "read" as for lock_acquire(), except -1 means any. 
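+ * (0 checks for a lock held for write, 1 for one held for read.)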
+ */ +extern int lock_is_held_type(struct lockdep_map *lock, int read); + +static inline int lock_is_held(struct lockdep_map *lock) +{ + return lock_is_held_type(lock, -1); +} -extern int lock_is_held(struct lockdep_map *lock); +#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) +#define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, @@ -372,6 +381,14 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); WARN_ON(debug_locks && !lockdep_is_held(l)); \ } while (0) +#define lockdep_assert_held_exclusive(l) do { \ + WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \ + } while (0) + +#define lockdep_assert_held_read(l) do { \ + WARN_ON(debug_locks && !lockdep_is_held_type(l, 1)); \ + } while (0) + #define lockdep_assert_held_once(l) do { \ WARN_ON_ONCE(debug_locks && !lockdep_is_held(l)); \ } while (0) @@ -428,7 +445,11 @@ struct lock_class_key { }; #define lockdep_depth(tsk) (0) +#define lockdep_is_held_type(l, r) (1) + #define lockdep_assert_held(l) do { (void)(l); } while (0) +#define lockdep_assert_held_exclusive(l) do { (void)(l); } while (0) +#define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_recursing(tsk) (0) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..cff580a6edf9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3188,7 +3188,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; } -static int __lock_is_held(struct lockdep_map *lock); +static int __lock_is_held(struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. 
@@ -3329,7 +3329,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock)) + if (nest_lock && !__lock_is_held(nest_lock, -1)) return print_lock_nested_lock_not_held(curr, hlock, ip); if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) @@ -3576,7 +3576,7 @@ found_it: return 1; } -static int __lock_is_held(struct lockdep_map *lock) +static int __lock_is_held(struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3584,8 +3584,12 @@ static int __lock_is_held(struct lockdep_map *lock) for (i = 0; i < curr->lockdep_depth; i++) { struct held_lock *hlock = curr->held_locks + i; - if (match_held_lock(hlock, lock)) - return 1; + if (match_held_lock(hlock, lock)) { + if (read == -1 || hlock->read == read) + return 1; + + return 0; + } } return 0; @@ -3769,7 +3773,7 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held(struct lockdep_map *lock) +int lock_is_held_type(struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; @@ -3781,13 +3785,13 @@ int lock_is_held(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - ret = __lock_is_held(lock); + ret = __lock_is_held(lock, read); current->lockdep_recursion = 0; raw_local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(lock_is_held); +EXPORT_SYMBOL_GPL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { -- cgit v1.2.3 From 60fe3910bb029e3671ce7ac080a7acb7e032b9e0 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 29 Nov 2016 23:45:47 +1100 Subject: kexec_file: Allow arch-specific memory walking for kexec_add_buffer Allow architectures to specify a different memory walking function for kexec_add_buffer. x86 uses iomem to track reserved memory ranges, but PowerPC uses the memblock subsystem. Signed-off-by: Thiago Jung Bauermann Acked-by: Dave Young Acked-by: Balbir Singh Signed-off-by: Michael Ellerman --- include/linux/kexec.h | 29 ++++++++++++++++++++++++++++- kernel/kexec_file.c | 30 ++++++++++++++++++++++-------- kernel/kexec_internal.h | 16 ---------------- 3 files changed, 50 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 406c33dcae13..5e320ddaaa82 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -148,7 +148,34 @@ struct kexec_file_ops { kexec_verify_sig_t *verify_sig; #endif }; -#endif + +/** + * struct kexec_buf - parameters for finding a place for a buffer in memory + * @image: kexec image in which memory to search. + * @buffer: Contents which will be copied to the allocated memory. + * @bufsz: Size of @buffer. + * @mem: On return will have address of the buffer in memory. + * @memsz: Size for the buffer in memory. + * @buf_align: Minimum alignment needed. + * @buf_min: The buffer can't be placed below this address. + * @buf_max: The buffer can't be placed above this address. + * @top_down: Allocate from top of memory. 
+ */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long mem; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; +}; + +int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(u64, u64, void *)); +#endif /* CONFIG_KEXEC_FILE */ struct kimage { kimage_entry_t head; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 037c321c5618..f865674bff51 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -428,6 +428,27 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } +/** + * arch_kexec_walk_mem - call func(data) on free memory regions + * @kbuf: Context info for the search. Also passed to @func. + * @func: Function to call for each memory region. + * + * Return: The memory walk will stop when func returns a non-zero value + * and that value will be returned. If all free regions are visited without + * func returning non-zero, then zero will be returned. + */ +int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(u64, u64, void *)) +{ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return walk_iomem_res_desc(crashk_res.desc, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, + kbuf, func); + else + return walk_system_ram_res(0, ULONG_MAX, kbuf, func); +} + /* * Helper function for placing a buffer in a kexec segment. This assumes * that kexec_mutex is held. @@ -474,14 +495,7 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, kbuf->top_down = top_down; /* Walk the RAM ranges and allocate a suitable range for the buffer */ - if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res_desc(crashk_res.desc, - IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); - else - ret = walk_system_ram_res(0, -1, kbuf, - locate_mem_hole_callback); + ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); if (ret != 1) { /* A suitable memory range could not be found for buffer */ return -EADDRNOTAVAIL; diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 0a52315d9c62..4cef7e4706b0 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -20,22 +20,6 @@ struct kexec_sha_region { unsigned long len; }; -/* - * Keeps track of buffer parameters as provided by caller for requesting - * memory placement of buffer. - */ -struct kexec_buf { - struct kimage *image; - char *buffer; - unsigned long bufsz; - unsigned long mem; - unsigned long memsz; - unsigned long buf_align; - unsigned long buf_min; - unsigned long buf_max; - bool top_down; /* allocate from top of memory hole */ -}; - void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } -- cgit v1.2.3 From ec2b9bfaac44ea889625a6b9473d33898b10d35f Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 29 Nov 2016 23:45:48 +1100 Subject: kexec_file: Change kexec_add_buffer to take kexec_buf as argument. This is done to simplify the kexec_add_buffer argument list. Adapt all callers to set up a kexec_buf to pass to kexec_add_buffer. In addition, change the type of kexec_buf.buffer from char * to void *. There is no particular reason for it to be a char *, and the change allows us to get rid of 3 existing casts to char * in the code. 
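For illustration, a sketch of the new caller pattern (hypothetical arch-side code; image, payload, payload_len and load_addr are assumed to exist in the surrounding function):

	struct kexec_buf kbuf = { .image = image, .buf_min = 0,
				  .buf_max = ULONG_MAX, .top_down = true };
	int ret;

	kbuf.buffer = payload;
	kbuf.bufsz = payload_len;
	kbuf.memsz = PAGE_ALIGN(payload_len);
	kbuf.buf_align = PAGE_SIZE;
	ret = kexec_add_buffer(&kbuf);
	if (!ret)
		load_addr = kbuf.mem;	/* placement chosen by the memory walker */

All parameters travel in the struct and the chosen address comes back in kbuf.mem, which is exactly the shape the x86 conversions below adopt.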
Signed-off-by: Thiago Jung Bauermann Acked-by: Dave Young Acked-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/x86/kernel/crash.c | 37 ++++++++-------- arch/x86/kernel/kexec-bzimage64.c | 48 +++++++++++---------- include/linux/kexec.h | 8 +--- kernel/kexec_file.c | 88 ++++++++++++++++++--------------------- 4 files changed, 87 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 650830e39e3a..3741461c63a0 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -631,9 +631,9 @@ static int determine_backup_region(u64 start, u64 end, void *arg) int crash_load_segments(struct kimage *image) { - unsigned long src_start, src_sz, elf_sz; - void *elf_addr; int ret; + struct kexec_buf kbuf = { .image = image, .buf_min = 0, + .buf_max = ULONG_MAX, .top_down = false }; /* * Determine and load a segment for backup area. First 640K RAM @@ -647,43 +647,44 @@ int crash_load_segments(struct kimage *image) if (ret < 0) return ret; - src_start = image->arch.backup_src_start; - src_sz = image->arch.backup_src_sz; - /* Add backup segment. */ - if (src_sz) { + if (image->arch.backup_src_sz) { + kbuf.buffer = &crash_zero_bytes; + kbuf.bufsz = sizeof(crash_zero_bytes); + kbuf.memsz = image->arch.backup_src_sz; + kbuf.buf_align = PAGE_SIZE; /* * Ideally there is no source for backup segment. This is * copied in purgatory after crash. Just add a zero filled * segment for now to make sure checksum logic works fine. */ - ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, - sizeof(crash_zero_bytes), src_sz, - PAGE_SIZE, 0, -1, 0, - &image->arch.backup_load_addr); + ret = kexec_add_buffer(&kbuf); if (ret) return ret; + image->arch.backup_load_addr = kbuf.mem; pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", - image->arch.backup_load_addr, src_start, src_sz); + image->arch.backup_load_addr, + image->arch.backup_src_start, kbuf.memsz); } /* Prepare elf headers and add a segment */ - ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); if (ret) return ret; - image->arch.elf_headers = elf_addr; - image->arch.elf_headers_sz = elf_sz; + image->arch.elf_headers = kbuf.buffer; + image->arch.elf_headers_sz = kbuf.bufsz; - ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, - ELF_CORE_HEADER_ALIGN, 0, -1, 0, - &image->arch.elf_load_addr); + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + ret = kexec_add_buffer(&kbuf); if (ret) { vfree((void *)image->arch.elf_headers); return ret; } + image->arch.elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_load_addr, elf_sz, elf_sz); + image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz); return ret; } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 3407b148c240..d0a814a9d96a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -331,17 +331,17 @@ static void *bzImage64_load(struct kimage *image, char *kernel, struct setup_header *header; int setup_sects, kern16_size, ret = 0; - unsigned long setup_header_size, params_cmdline_sz, params_misc_sz; + unsigned long setup_header_size, params_cmdline_sz; struct boot_params *params; unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; unsigned long purgatory_load_addr; - unsigned long kernel_bufsz, kernel_memsz, kernel_align; - char *kernel_buf; struct bzimage64_data *ldata; 
struct kexec_entry64_regs regs64; void *stack; unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, + .top_down = true }; header = (struct setup_header *)(kernel + setup_hdr_offset); setup_sects = header->setup_sects; @@ -402,11 +402,11 @@ static void *bzImage64_load(struct kimage *image, char *kernel, params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); - params_misc_sz = params_cmdline_sz + efi_map_sz + + kbuf.bufsz = params_cmdline_sz + efi_map_sz + sizeof(struct setup_data) + sizeof(struct efi_setup_data); - params = kzalloc(params_misc_sz, GFP_KERNEL); + params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -418,37 +418,41 @@ static void *bzImage64_load(struct kimage *image, char *kernel, /* Is there a limit on setup header size? */ memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); - ret = kexec_add_buffer(image, (char *)params, params_misc_sz, - params_misc_sz, 16, MIN_BOOTPARAM_ADDR, - ULONG_MAX, 1, &bootparam_load_addr); + kbuf.buffer = params; + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = 16; + kbuf.buf_min = MIN_BOOTPARAM_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + bootparam_load_addr = kbuf.mem; pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - bootparam_load_addr, params_misc_sz, params_misc_sz); + bootparam_load_addr, kbuf.bufsz, kbuf.bufsz); /* Load kernel */ - kernel_buf = kernel + kern16_size; - kernel_bufsz = kernel_len - kern16_size; - kernel_memsz = PAGE_ALIGN(header->init_size); - kernel_align = header->kernel_alignment; - - ret = kexec_add_buffer(image, kernel_buf, - kernel_bufsz, kernel_memsz, kernel_align, - MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, - &kernel_load_addr); + kbuf.buffer = kernel + kern16_size; + kbuf.bufsz = kernel_len - kern16_size; + kbuf.memsz = PAGE_ALIGN(header->init_size); + kbuf.buf_align = header->kernel_alignment; + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + kernel_load_addr = kbuf.mem; pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kernel_load_addr, kernel_memsz, kernel_memsz); + kernel_load_addr, kbuf.bufsz, kbuf.memsz); /* Load initrd high */ if (initrd) { - ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, - PAGE_SIZE, MIN_INITRD_LOAD_ADDR, - ULONG_MAX, 1, &initrd_load_addr); + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.buf_min = MIN_INITRD_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + initrd_load_addr = kbuf.mem; pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", initrd_load_addr, initrd_len, initrd_len); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 5e320ddaaa82..437ef1b47428 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -163,7 +163,7 @@ struct kexec_file_ops { */ struct kexec_buf { struct kimage *image; - char *buffer; + void *buffer; unsigned long bufsz; unsigned long mem; unsigned long memsz; @@ -175,6 +175,7 @@ struct kexec_buf { int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)); +extern int kexec_add_buffer(struct kexec_buf *kbuf); #endif /* CONFIG_KEXEC_FILE */ struct kimage { @@ -239,11 +240,6 @@ extern 
asmlinkage long sys_kexec_load(unsigned long entry, struct kexec_segment __user *segments, unsigned long flags); extern int kernel_kexec(void); -extern int kexec_add_buffer(struct kimage *image, char *buffer, - unsigned long bufsz, unsigned long memsz, - unsigned long buf_align, unsigned long buf_min, - unsigned long buf_max, bool top_down, - unsigned long *load_addr); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); extern int kexec_load_purgatory(struct kimage *image, unsigned long min, diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f865674bff51..efd2c094af7e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -449,25 +449,27 @@ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } -/* - * Helper function for placing a buffer in a kexec segment. This assumes - * that kexec_mutex is held. +/** + * kexec_add_buffer - place a buffer in a kexec segment + * @kbuf: Buffer contents and memory parameters. + * + * This function assumes that kexec_mutex is held. + * On successful return, @kbuf->mem will have the physical address of + * the buffer in memory. + * + * Return: 0 on success, negative errno on error. */ -int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, - unsigned long memsz, unsigned long buf_align, - unsigned long buf_min, unsigned long buf_max, - bool top_down, unsigned long *load_addr) +int kexec_add_buffer(struct kexec_buf *kbuf) { struct kexec_segment *ksegment; - struct kexec_buf buf, *kbuf; int ret; /* Currently adding segment this way is allowed only in file mode */ - if (!image->file_mode) + if (!kbuf->image->file_mode) return -EINVAL; - if (image->nr_segments >= KEXEC_SEGMENT_MAX) + if (kbuf->image->nr_segments >= KEXEC_SEGMENT_MAX) return -EINVAL; /* @@ -477,22 +479,14 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, * logic goes through list of segments to make sure there are * no destination overlaps. */ - if (!list_empty(&image->control_pages)) { + if (!list_empty(&kbuf->image->control_pages)) { WARN_ON(1); return -EINVAL; } - memset(&buf, 0, sizeof(struct kexec_buf)); - kbuf = &buf; - kbuf->image = image; - kbuf->buffer = buffer; - kbuf->bufsz = bufsz; - - kbuf->memsz = ALIGN(memsz, PAGE_SIZE); - kbuf->buf_align = max(buf_align, PAGE_SIZE); - kbuf->buf_min = buf_min; - kbuf->buf_max = buf_max; - kbuf->top_down = top_down; + /* Ensure minimum alignment needed for segments. 
*/ + kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); + kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); @@ -502,13 +496,12 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, } /* Found a suitable memory range */ - ksegment = &image->segment[image->nr_segments]; + ksegment = &kbuf->image->segment[kbuf->image->nr_segments]; ksegment->kbuf = kbuf->buffer; ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; - image->nr_segments++; - *load_addr = ksegment->mem; + kbuf->image->nr_segments++; return 0; } @@ -630,13 +623,15 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, unsigned long max, int top_down) { struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; - unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned long align, bss_align, bss_sz, bss_pad; + unsigned long entry, load_addr, curr_load_addr, bss_addr, offset; unsigned char *buf_addr, *src; int i, ret = 0, entry_sidx = -1; const Elf_Shdr *sechdrs_c; Elf_Shdr *sechdrs = NULL; - void *purgatory_buf = NULL; + struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, + .buf_min = min, .buf_max = max, + .top_down = top_down }; /* * sechdrs_c points to section headers in purgatory and are read @@ -702,9 +697,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, } /* Determine how much memory is needed to load relocatable object. */ - buf_align = 1; bss_align = 1; - buf_sz = 0; bss_sz = 0; for (i = 0; i < pi->ehdr->e_shnum; i++) { @@ -713,10 +706,10 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, align = sechdrs[i].sh_addralign; if (sechdrs[i].sh_type != SHT_NOBITS) { - if (buf_align < align) - buf_align = align; - buf_sz = ALIGN(buf_sz, align); - buf_sz += sechdrs[i].sh_size; + if (kbuf.buf_align < align) + kbuf.buf_align = align; + kbuf.bufsz = ALIGN(kbuf.bufsz, align); + kbuf.bufsz += sechdrs[i].sh_size; } else { /* bss section */ if (bss_align < align) @@ -728,32 +721,31 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, /* Determine the bss padding required to align bss properly */ bss_pad = 0; - if (buf_sz & (bss_align - 1)) - bss_pad = bss_align - (buf_sz & (bss_align - 1)); + if (kbuf.bufsz & (bss_align - 1)) + bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1)); - memsz = buf_sz + bss_pad + bss_sz; + kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz; /* Allocate buffer for purgatory */ - purgatory_buf = vzalloc(buf_sz); - if (!purgatory_buf) { + kbuf.buffer = vzalloc(kbuf.bufsz); + if (!kbuf.buffer) { ret = -ENOMEM; goto out; } - if (buf_align < bss_align) - buf_align = bss_align; + if (kbuf.buf_align < bss_align) + kbuf.buf_align = bss_align; /* Add buffer to segment list */ - ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, - buf_align, min, max, top_down, - &pi->purgatory_load_addr); + ret = kexec_add_buffer(&kbuf); if (ret) goto out; + pi->purgatory_load_addr = kbuf.mem; /* Load SHF_ALLOC sections */ - buf_addr = purgatory_buf; + buf_addr = kbuf.buffer; load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + buf_sz + bss_pad; + bss_addr = load_addr + kbuf.bufsz + bss_pad; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -799,11 +791,11 @@ static int 
__kexec_load_purgatory(struct kimage *image, unsigned long min, * Used later to identify which section is purgatory and skip it * from checksumming. */ - pi->purgatory_buf = purgatory_buf; + pi->purgatory_buf = kbuf.buffer; return ret; out: vfree(sechdrs); - vfree(purgatory_buf); + vfree(kbuf.buffer); return ret; } -- cgit v1.2.3 From e2e806f9e437b46a3fc8f3174a225c73f2e38c3d Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 29 Nov 2016 23:45:49 +1100 Subject: kexec_file: Factor out kexec_locate_mem_hole from kexec_add_buffer. kexec_locate_mem_hole will be used by the PowerPC kexec_file_load implementation to find free memory for the purgatory stack. Signed-off-by: Thiago Jung Bauermann Acked-by: Dave Young Signed-off-by: Michael Ellerman --- include/linux/kexec.h | 1 + kernel/kexec_file.c | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 437ef1b47428..a33f63351f86 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -176,6 +176,7 @@ struct kexec_buf { int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)); extern int kexec_add_buffer(struct kexec_buf *kbuf); +int kexec_locate_mem_hole(struct kexec_buf *kbuf); #endif /* CONFIG_KEXEC_FILE */ struct kimage { diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index efd2c094af7e..0c2df7f73792 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -449,6 +449,23 @@ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +/** + * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel + * @kbuf: Parameters for the memory search. + * + * On success, kbuf->mem will have the start address of the memory region found. + * + * Return: 0 on success, negative errno on error. + */ +int kexec_locate_mem_hole(struct kexec_buf *kbuf) +{ + int ret; + + ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); + + return ret == 1 ? 0 : -EADDRNOTAVAIL; +} + /** * kexec_add_buffer - place a buffer in a kexec segment * @kbuf: Buffer contents and memory parameters. @@ -489,11 +506,9 @@ int kexec_add_buffer(struct kexec_buf *kbuf) kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); /* Walk the RAM ranges and allocate a suitable range for the buffer */ - ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); - if (ret != 1) { - /* A suitable memory range could not be found for buffer */ - return -EADDRNOTAVAIL; - } + ret = kexec_locate_mem_hole(kbuf); + if (ret) + return ret; /* Found a suitable memory range */ ksegment = &kbuf->image->segment[kbuf->image->nr_segments]; -- cgit v1.2.3 From e2d2afe15ed452f91797a80dbc0a17838ba03ed4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 Nov 2016 12:27:09 -0500 Subject: bpf: fix states equal logic for varlen access If we have a branch that looks something like this int foo = map->value; if (condition) { foo += blah; } else { foo = bar; } map->array[foo] = baz; We will incorrectly assume that the !condition branch is equal to the condition branch as the register for foo will be UNKNOWN_VALUE in both cases. We need to adjust this logic to only do this if we didn't do a varlen access after we processed the !condition branch, otherwise we have different ranges and need to check the other branch as well. 
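Condensed from the verifier diff below, the equivalence check effectively becomes:

	/* Condensed sketch: the UNKNOWN_VALUE shortcut for the old state
	 * only applies when no variable-length map access happened on
	 * this path; otherwise the ranges differ and the other branch
	 * must be explored as well.
	 */
	bool varlen_map_access = env->varlen_map_value_access;

	if (rold->type == NOT_INIT ||
	    (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
	     rcur->type != NOT_INIT))
		continue;	/* registers considered equivalent */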
Fixes: 484611357c19 ("bpf: allow access into map value arrays") Reported-by: Jann Horn Signed-off-by: Josef Bacik Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a936159c6e0..8199821f54cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2454,6 +2454,7 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { + bool varlen_map_access = env->varlen_map_value_access; struct bpf_reg_state *rold, *rcur; int i; @@ -2467,12 +2468,17 @@ static bool states_equal(struct bpf_verifier_env *env, /* If the ranges were not the same, but everything else was and * we didn't do a variable access into a map then we are a-ok. */ - if (!env->varlen_map_value_access && + if (!varlen_map_access && rold->type == rcur->type && rold->imm == rcur->imm) continue; + /* If we didn't map access then again we don't care about the + * mismatched range values and it's ok if our old type was + * UNKNOWN and we didn't go to a NOT_INIT'ed reg. + */ if (rold->type == NOT_INIT || - (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) + (!varlen_map_access && rold->type == UNKNOWN_VALUE && + rcur->type != NOT_INIT)) continue; if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && -- cgit v1.2.3 From 4a057549d6044c2dea47e80f8369a76225ec9d90 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 28 Nov 2016 14:35:21 -0800 Subject: alarmtimer: Add tracepoints for alarm timers Alarm timers are one of the mechanisms to wake up a system from suspend, but there exist no tracepoints to analyse which process/thread armed an alarmtimer. Add tracepoints for start/cancel/expire of individual alarm timers and one for tracing the suspend time decision when to resume the system. The following trace excerpt illustrates the new mechanism: Binder:3292_2-3304 [000] d..2 149.981123: alarmtimer_cancel: alarmtimer:ffffffc1319a7800 type:REALTIME expires:1325463120000000000 now:1325376810370370245 Binder:3292_2-3304 [000] d..2 149.981136: alarmtimer_start: alarmtimer:ffffffc1319a7800 type:REALTIME expires:1325376840000000000 now:1325376810370384591 Binder:3292_9-3953 [000] d..2 150.212991: alarmtimer_cancel: alarmtimer:ffffffc1319a5a00 type:BOOTTIME expires:179552000000 now:150154008122 Binder:3292_9-3953 [000] d..2 150.213006: alarmtimer_start: alarmtimer:ffffffc1319a5a00 type:BOOTTIME expires:179551000000 now:150154025622 system_server-3000 [002] ...1 162.701940: alarmtimer_suspend: alarmtimer type:REALTIME expires:1325376840000000000 The wakeup time which is selected at suspend time allows to map it back to the task arming the timer: Binder:3292_2. [ tglx: Store alarm timer expiry time instead of some useless RTC relative information, add proper type information for wakeups which are handled via the clock_nanosleep/freezer and massage the changelog. 
] Signed-off-by: Baolin Wang Signed-off-by: John Stultz Acked-by: Steven Rostedt Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1480372524-15181-5-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/alarmtimer.h | 5 ++ include/trace/events/alarmtimer.h | 96 +++++++++++++++++++++++++++++++++++++++ kernel/time/alarmtimer.c | 53 +++++++++++++++++---- 3 files changed, 144 insertions(+), 10 deletions(-) create mode 100644 include/trace/events/alarmtimer.h (limited to 'kernel') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 9d8031257a90..c70aac13244a 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -10,7 +10,12 @@ enum alarmtimer_type { ALARM_REALTIME, ALARM_BOOTTIME, + /* Supported types end here */ ALARM_NUMTYPE, + + /* Used for tracing information. No usable types. */ + ALARM_REALTIME_FREEZER, + ALARM_BOOTTIME_FREEZER, }; enum alarmtimer_restart { diff --git a/include/trace/events/alarmtimer.h b/include/trace/events/alarmtimer.h new file mode 100644 index 000000000000..a1c108c16c9c --- /dev/null +++ b/include/trace/events/alarmtimer.h @@ -0,0 +1,96 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM alarmtimer + +#if !defined(_TRACE_ALARMTIMER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ALARMTIMER_H + +#include +#include +#include + +TRACE_DEFINE_ENUM(ALARM_REALTIME); +TRACE_DEFINE_ENUM(ALARM_BOOTTIME); +TRACE_DEFINE_ENUM(ALARM_REALTIME_FREEZER); +TRACE_DEFINE_ENUM(ALARM_BOOTTIME_FREEZER); + +#define show_alarm_type(type) __print_flags(type, " | ", \ + { 1 << ALARM_REALTIME, "REALTIME" }, \ + { 1 << ALARM_BOOTTIME, "BOOTTIME" }, \ + { 1 << ALARM_REALTIME_FREEZER, "REALTIME Freezer" }, \ + { 1 << ALARM_BOOTTIME_FREEZER, "BOOTTIME Freezer" }) + +TRACE_EVENT(alarmtimer_suspend, + + TP_PROTO(ktime_t expires, int flag), + + TP_ARGS(expires, flag), + + TP_STRUCT__entry( + __field(s64, expires) + __field(unsigned char, alarm_type) + ), + + TP_fast_assign( + __entry->expires = expires.tv64; + __entry->alarm_type = flag; + ), + + TP_printk("alarmtimer type:%s expires:%llu", + show_alarm_type((1 << __entry->alarm_type)), + __entry->expires + ) +); + +DECLARE_EVENT_CLASS(alarm_class, + + TP_PROTO(struct alarm *alarm, ktime_t now), + + TP_ARGS(alarm, now), + + TP_STRUCT__entry( + __field(void *, alarm) + __field(unsigned char, alarm_type) + __field(s64, expires) + __field(s64, now) + ), + + TP_fast_assign( + __entry->alarm = alarm; + __entry->alarm_type = alarm->type; + __entry->expires = alarm->node.expires.tv64; + __entry->now = now.tv64; + ), + + TP_printk("alarmtimer:%p type:%s expires:%llu now:%llu", + __entry->alarm, + show_alarm_type((1 << __entry->alarm_type)), + __entry->expires, + __entry->now + ) +); + +DEFINE_EVENT(alarm_class, alarmtimer_fired, + + TP_PROTO(struct alarm *alarm, ktime_t now), + + TP_ARGS(alarm, now) +); + +DEFINE_EVENT(alarm_class, alarmtimer_start, + + TP_PROTO(struct alarm *alarm, ktime_t now), + + TP_ARGS(alarm, now) +); + +DEFINE_EVENT(alarm_class, alarmtimer_cancel, + + TP_PROTO(struct alarm *alarm, ktime_t now), + + TP_ARGS(alarm, now) +); + +#endif /* _TRACE_ALARMTIMER_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index a15caa3d1721..9b08ca391aed 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /** * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to 
the base @@ -40,7 +43,9 @@ static struct alarm_base { clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; -/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +/* freezer information to handle clock_nanosleep triggered wakeups */ +static enum alarmtimer_type freezer_alarmtype; +static ktime_t freezer_expires; static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); @@ -194,6 +199,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); + trace_alarmtimer_fired(alarm, base->gettime()); return ret; } @@ -218,15 +224,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); */ static int alarmtimer_suspend(struct device *dev) { - struct rtc_time tm; - ktime_t min, now; - unsigned long flags; + ktime_t min, now, expires; + int i, ret, type; struct rtc_device *rtc; - int i; - int ret; + unsigned long flags; + struct rtc_time tm; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; freezer_delta = ktime_set(0, 0); spin_unlock_irqrestore(&freezer_delta_lock, flags); @@ -247,8 +254,11 @@ static int alarmtimer_suspend(struct device *dev) if (!next) continue; delta = ktime_sub(next->expires, base->gettime()); - if (!min.tv64 || (delta.tv64 < min.tv64)) + if (!min.tv64 || (delta.tv64 < min.tv64)) { + expires = next->expires; min = delta; + type = i; + } } if (min.tv64 == 0) return 0; @@ -258,6 +268,8 @@ static int alarmtimer_suspend(struct device *dev) return -EBUSY; } + trace_alarmtimer_suspend(expires, type); + /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); rtc_read_time(rtc, &tm); @@ -295,15 +307,32 @@ static int alarmtimer_resume(struct device *dev) static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { - ktime_t delta; + struct alarm_base *base; unsigned long flags; - struct alarm_base *base = &alarm_bases[type]; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } delta = ktime_sub(absexp, base->gettime()); spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) + if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) { freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } spin_unlock_irqrestore(&freezer_delta_lock, flags); } @@ -342,6 +371,8 @@ void alarm_start(struct alarm *alarm, ktime_t start) alarmtimer_enqueue(base, alarm); hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_start(alarm, base->gettime()); } EXPORT_SYMBOL_GPL(alarm_start); @@ -390,6 +421,8 @@ int alarm_try_to_cancel(struct alarm *alarm) if (ret >= 0) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_cancel(alarm, base->gettime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); -- cgit v1.2.3 From 60602982720f3a77366ee3e493a6e3d15e7e84f5 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 29 Nov 2016 09:14:56 -0800 Subject: audit: remove useless synchronize_net() netlink kernel socket is protected by refcount, not RCU. Its rcv path is neither protected by RCU. So the synchronize_net() is just pointless. 
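As the diff below shows, teardown then reduces to releasing the refcounted socket. A condensed sketch (struct audit_net and its nlsk field as in kernel/audit.c):

	/* Condensed from the diff below: netlink_kernel_release() operates
	 * on the socket refcount, so no RCU grace period is needed before
	 * clearing the pointer.
	 */
	static void __net_exit audit_net_exit(struct net *net)
	{
		struct audit_net *aunet = net_generic(net, audit_net_id);
		struct sock *sock = aunet->nlsk;

		netlink_kernel_release(sock);
		aunet->nlsk = NULL;
	}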
Cc: Richard Guy Briggs Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- kernel/audit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 92c463d2d1c7..67b9fbd871be 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1172,9 +1172,8 @@ static void __net_exit audit_net_exit(struct net *net) audit_sock = NULL; } - RCU_INIT_POINTER(aunet->nlsk, NULL); - synchronize_net(); netlink_kernel_release(sock); + aunet->nlsk = NULL; } static struct pernet_operations audit_net_ops __net_initdata = { -- cgit v1.2.3 From b32614c03413f8a6025d8677c2b7c0ee976e63d4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sun, 27 Nov 2016 00:13:34 +0100 Subject: tracing/rb: Convert to hotplug state machine Install the callbacks via the state machine. The notifier in struct ring_buffer is replaced by the multi instance interface. Upon __ring_buffer_alloc() invocation, cpuhp_state_add_instance() will invoke the trace_rb_cpu_prepare() on each CPU. This callback may now fail. This means __ring_buffer_alloc() will fail and cleanup (like previously) and during a CPU up event this failure will not allow the CPU to come up. Signed-off-by: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161126231350.10321-7-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + include/linux/ring_buffer.h | 6 ++ kernel/trace/ring_buffer.c | 135 +++++++++++++++----------------------------- kernel/trace/trace.c | 15 ++++- 4 files changed, 66 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e3771fb959c0..18bcfeb2463e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -62,6 +62,7 @@ enum cpuhp_state { CPUHP_TOPOLOGY_PREPARE, CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, + CPUHP_TRACE_RB_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_NOTF_ERR_INJ_PREPARE, CPUHP_MIPS_SOC_PREPARE, diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 4acc552e9279..b6d4568795a7 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -198,4 +198,10 @@ enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; +#ifdef CONFIG_RING_BUFFER +int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); +#else +#define trace_rb_cpu_prepare NULL +#endif + #endif /* _LINUX_RING_BUFFER_H */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c143739b8d7..a7a055f167c7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -479,9 +479,7 @@ struct ring_buffer { struct ring_buffer_per_cpu **buffers; -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif + struct hlist_node node; u64 (*clock)(void); struct rb_irq_work irq_work; @@ -1274,11 +1272,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); -#endif - /** * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. 
@@ -1296,6 +1289,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, long nr_pages; int bsize; int cpu; + int ret; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1318,17 +1312,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (nr_pages < 2) nr_pages = 2; - /* - * In case of non-hotplug cpu, if the ring-buffer is allocated - * in early initcall, it will not be notified of secondary cpus. - * In that off case, we need to allocate for all possible cpus. - */ -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - cpumask_copy(buffer->cpumask, cpu_online_mask); -#else - cpumask_copy(buffer->cpumask, cpu_possible_mask); -#endif buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -1337,19 +1320,15 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; - for_each_buffer_cpu(buffer, cpu) { - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) - goto fail_free_buffers; - } + cpu = raw_smp_processor_id(); + cpumask_set_cpu(cpu, buffer->cpumask); + buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; -#ifdef CONFIG_HOTPLUG_CPU - buffer->cpu_notify.notifier_call = rb_cpu_notify; - buffer->cpu_notify.priority = 0; - __register_cpu_notifier(&buffer->cpu_notify); - cpu_notifier_register_done(); -#endif + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; mutex_init(&buffer->mutex); @@ -1364,9 +1343,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif fail_free_buffer: kfree(buffer); @@ -1383,18 +1359,11 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - __unregister_cpu_notifier(&buffer->cpu_notify); -#endif + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif - kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); @@ -4633,62 +4602,48 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +/* + * We only allocate new buffers, never free them if the CPU goes down. + * If we were to free the buffer, then the user would lose any trace that was in + * the buffer. 
+ */ +int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { - struct ring_buffer *buffer = - container_of(self, struct ring_buffer, cpu_notify); - long cpu = (long)hcpu; + struct ring_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (cpumask_test_cpu(cpu, buffer->cpumask)) - return NOTIFY_OK; - - nr_pages = 0; - nr_pages_same = 1; - /* check if all cpu sizes are same */ - for_each_buffer_cpu(buffer, cpu_i) { - /* fill in the size from first enabled cpu */ - if (nr_pages == 0) - nr_pages = buffer->buffers[cpu_i]->nr_pages; - if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { - nr_pages_same = 0; - break; - } - } - /* allocate minimum pages, user can later expand it */ - if (!nr_pages_same) - nr_pages = 2; - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) { - WARN(1, "failed to allocate ring buffer on CPU %ld\n", - cpu); - return NOTIFY_OK; + buffer = container_of(node, struct ring_buffer, node); + if (cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; } - smp_wmb(); - cpumask_set_cpu(cpu, buffer->cpumask); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* - * Do nothing. - * If we were to free the buffer, then the user would - * lose any trace that was in the buffer. - */ - break; - default: - break; } - return NOTIFY_OK; + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %u\n", + cpu); + return -ENOMEM; + } + smp_wmb(); + cpumask_set_cpu(cpu, buffer->cpumask); + return 0; } -#endif #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8696ce6bf2f6..465d56febc5b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7659,10 +7659,21 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); + /* + * The prepare callbacks allocates some memory for the ring buffer. We + * don't free the buffer if the if the CPU goes down. If we were to free + * the buffer, then the user would lose any trace that was in the + * buffer. The memory will be removed once the "instance" is removed. 
+ */ + ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, + "trace/RB:preapre", trace_rb_cpu_prepare, + NULL); + if (ret < 0) + goto out_free_cpumask; /* Used for event triggers */ temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) - goto out_free_cpumask; + goto out_rm_hp_state; if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; @@ -7723,6 +7734,8 @@ out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: ring_buffer_free(temp_buffer); +out_rm_hp_state: + cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: -- cgit v1.2.3 From dbb26055defd03d59f678cb5f2c992abe05b064a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:41 +0000 Subject: locking/rtmutex: Prevent dequeue vs. unlock race David reported a futex/rtmutex state corruption. It's caused by the following problem: CPU0 CPU1 CPU2 l->owner=T1 rt_mutex_lock(l) lock(l->wait_lock) l->owner = T1 | HAS_WAITERS; enqueue(T2) boost() unlock(l->wait_lock) schedule() rt_mutex_lock(l) lock(l->wait_lock) l->owner = T1 | HAS_WAITERS; enqueue(T3) boost() unlock(l->wait_lock) schedule() signal(->T2) signal(->T3) lock(l->wait_lock) dequeue(T2) deboost() unlock(l->wait_lock) lock(l->wait_lock) dequeue(T3) ===> wait list is now empty deboost() unlock(l->wait_lock) lock(l->wait_lock) fixup_rt_mutex_waiters() if (wait_list_empty(l)) { owner = l->owner & ~HAS_WAITERS; l->owner = owner ==> l->owner = T1 } lock(l->wait_lock) rt_mutex_unlock(l) fixup_rt_mutex_waiters() if (wait_list_empty(l)) { owner = l->owner & ~HAS_WAITERS; cmpxchg(l->owner, T1, NULL) ===> Success (l->owner = NULL) l->owner = owner ==> l->owner = T1 } That means the problem is caused by fixup_rt_mutex_waiters() which does the RMW to clear the waiters bit unconditionally when there are no waiters in the rtmutexes rbtree. This can be fatal: A concurrent unlock can release the rtmutex in the fastpath because the waiters bit is not set. If the cmpxchg() gets in the middle of the RMW operation then the previous owner, which just unlocked the rtmutex is set as the owner again when the write takes place after the successfull cmpxchg(). The solution is rather trivial: verify that the owner member of the rtmutex has the waiters bit set before clearing it. This does not require a cmpxchg() or other atomic operations because the waiters bit can only be set and cleared with the rtmutex wait_lock held. It's also safe against the fast path unlock attempt. The unlock attempt via cmpxchg() will either see the bit set and take the slowpath or see the bit cleared and release it atomically in the fastpath. It's remarkable that the test program provided by David triggers on ARM64 and MIPS64 really quick, but it refuses to reproduce on x86-64, while the problem exists there as well. That refusal might explain that this got not discovered earlier despite the bug existing from day one of the rtmutex implementation more than 10 years ago. Thanks to David for meticulously instrumenting the code and providing the information which allowed to decode this subtle problem. 
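Condensed, the repaired fixup_rt_mutex_waiters() (full version with commentary in the diff below) reads the owner word once and clears the waiters bit only when it is still set, so it can no longer overwrite a concurrent fastpath cmpxchg() unlock:

	/* Core of the fix: both tests and the RMW are serialized by
	 * lock->wait_lock, and a fastpath unlock's cmpxchg() simply
	 * fails while the waiters bit is still set.
	 */
	static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
	{
		unsigned long owner, *p = (unsigned long *) &lock->owner;

		if (rt_mutex_has_waiters(lock))
			return;

		owner = READ_ONCE(*p);
		if (owner & RT_MUTEX_HAS_WAITERS)
			WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
	}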
Reported-by: David Daney Tested-by: David Daney Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Will Deacon Cc: stable@vger.kernel.org Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core") Link: http://lkml.kernel.org/r/20161130210030.351136722@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1ec0f48962b3..2c49d76f96c3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) static void fixup_rt_mutex_waiters(struct rt_mutex *lock) { - if (!rt_mutex_has_waiters(lock)) - clear_rt_mutex_waiters(lock); + unsigned long owner, *p = (unsigned long *) &lock->owner; + + if (rt_mutex_has_waiters(lock)) + return; + + /* + * The rbtree has no waiters enqueued, now make sure that the + * lock->owner still has the waiters bit set, otherwise the + * following can happen: + * + * CPU 0 CPU 1 CPU2 + * l->owner=T1 + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T2) + * boost() + * unlock(l->lock) + * block() + * + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T3) + * boost() + * unlock(l->lock) + * block() + * signal(->T2) signal(->T3) + * lock(l->lock) + * dequeue(T2) + * deboost() + * unlock(l->lock) + * lock(l->lock) + * dequeue(T3) + * ==> wait list is empty + * deboost() + * unlock(l->lock) + * lock(l->lock) + * fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * l->owner = owner + * owner = l->owner & ~HAS_WAITERS; + * ==> l->owner = T1 + * } + * lock(l->lock) + * rt_mutex_unlock(l) fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * owner = l->owner & ~HAS_WAITERS; + * cmpxchg(l->owner, T1, NULL) + * ===> Success (l->owner = NULL) + * + * l->owner = owner + * ==> l->owner = T1 + * } + * + * With the check for the waiter bit in place T3 on CPU2 will not + * overwrite. All tasks fiddling with the waiters bit are + * serialized by l->lock, so nothing else can modify the waiters + * bit. If the bit is set then nothing can change l->owner either + * so the simple RMW is safe. The cmpxchg() will simply fail if it + * happens in the middle of the RMW because the waiters bit is + * still set. + */ + owner = READ_ONCE(*p); + if (owner & RT_MUTEX_HAS_WAITERS) + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); } /* -- cgit v1.2.3 From 1be5d4fa0af34fb7bafa205aeb59f5c7cc7a089d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:42 +0000 Subject: locking/rtmutex: Use READ_ONCE() in rt_mutex_owner() While debugging the rtmutex unlock vs. dequeue race Will suggested to use READ_ONCE() in rt_mutex_owner() as it might race against the cmpxchg_release() in unlock_rt_mutex_safe(). Will: "It's a minor thing which will most likely not matter in practice" Careful search did not unearth an actual problem in todays code, but it's better to be safe than surprised. 
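The hardened accessor (shown in the diff below) then reads lock->owner exactly once before masking:

	static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
	{
		unsigned long owner = (unsigned long) READ_ONCE(lock->owner);

		return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL);
	}

READ_ONCE() keeps the compiler from refetching lock->owner, so the mask is guaranteed to be applied to the same snapshot that was loaded.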
Suggested-by: Will Deacon Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: David Daney Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Link: http://lkml.kernel.org/r/20161130210030.431379999@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex_common.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 4f5f83c7d2d3..e317e1cbb3eb 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -75,8 +75,9 @@ task_top_pi_waiter(struct task_struct *p) static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL); } /* -- cgit v1.2.3 From b5016e8203003c44264ec88fe2276ff54a51f689 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:44 +0000 Subject: locking/rtmutex: Get rid of RT_MUTEX_OWNER_MASKALL This is a left over from the original rtmutex implementation which used both bit0 and bit1 in the owner pointer. Commit: 8161239a8bcc ("rtmutex: Simplify PI algorithm and make highest prio task get lock") ... removed the usage of bit1, but kept the extra mask around. This is confusing at best. Remove it and just use RT_MUTEX_HAS_WAITERS for the masking. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: David Daney Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Will Deacon Link: http://lkml.kernel.org/r/20161130210030.509567906@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex_common.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index e317e1cbb3eb..990134617b4c 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -71,13 +71,12 @@ task_top_pi_waiter(struct task_struct *p) * lock->owner state tracking: */ #define RT_MUTEX_HAS_WAITERS 1UL -#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL); + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } /* -- cgit v1.2.3 From 84d82ec5b9046ecdf16031d3e93a66ef50257402 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:45 +0000 Subject: locking/rtmutex: Explain locking rules for rt_mutex_proxy_unlock()/init_proxy_locked() While debugging the unlock vs. dequeue race which resulted in state corruption of futexes the lockless nature of rt_mutex_proxy_unlock() caused some confusion. Add commentry to explain why it is safe to do this lockless. Add matching comments to rt_mutex_init_proxy_locked() for completeness sake. 
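For illustration only (hypothetical call sites, not part of the patch), the PI-futex lifecycle these comments document looks roughly like this:

	/* Hypothetical sketch: the pi_state embedding the rtmutex is
	 * either not yet visible to other tasks (init) or about to be
	 * freed (unlock), so no serialization is needed around these
	 * calls.
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, owner);	/* pi_state still private */
	...
	rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);	/* pi_state going away */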
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: David Daney Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Will Deacon Link: http://lkml.kernel.org/r/20161130210030.591941927@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6e6cab7ac12f..2f443ed2320a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1619,11 +1619,15 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a * proxy owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @proxy_owner:the task to set as owner * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This initializes the rtmutex and + * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not + * possible at this point because the pi_state which contains the rtmutex + * is not yet visible to other tasks. */ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) @@ -1637,10 +1641,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, /** * rt_mutex_proxy_unlock - release a lock on behalf of owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This merrily cleans up the rtmutex + * (debugging) state. Concurrent operations on this rt_mutex are not + * possible because it belongs to the pi_state which is about to be freed + * and it is not longer visible to other tasks. */ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) -- cgit v1.2.3 From 3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 30 Nov 2016 17:10:10 +0100 Subject: bpf: BPF for lightweight tunnel infrastructure Registers new BPF program types which correspond to the LWT hooks: - BPF_PROG_TYPE_LWT_IN => dst_input() - BPF_PROG_TYPE_LWT_OUT => dst_output() - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit() The separate program types are required to differentiate between the capabilities each LWT hook allows: * Programs attached to dst_input() or dst_output() are restricted and may only read the data of an skb. This prevent modification and possible invalidation of already validated packet headers on receive and the construction of illegal headers while the IP headers are still being assembled. * Programs attached to lwtunnel_xmit() are allowed to modify packet content as well as prepending an L2 header via a newly introduced helper bpf_skb_change_head(). This is safe as lwtunnel_xmit() is invoked after the IP header has been assembled completely. All BPF programs receive an skb with L3 headers attached and may return one of the following error codes: BPF_OK - Continue routing as per nexthop BPF_DROP - Drop skb and return EPERM BPF_REDIRECT - Redirect skb to device as per redirect() helper. (Only valid in lwtunnel_xmit() context) The return codes are binary compatible with their TC_ACT_ relatives to ease compatibility. Signed-off-by: Thomas Graf Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 32 +++- include/uapi/linux/lwtunnel.h | 23 +++ kernel/bpf/verifier.c | 14 +- net/Kconfig | 8 + net/core/Makefile | 1 + net/core/filter.c | 173 ++++++++++++++++++ net/core/lwt_bpf.c | 396 ++++++++++++++++++++++++++++++++++++++++++ net/core/lwtunnel.c | 2 + 9 files changed, 646 insertions(+), 5 deletions(-) create mode 100644 net/core/lwt_bpf.c (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7f246a281435..7ba644626553 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -438,7 +438,7 @@ struct xdp_buff { }; /* compute the linear packet data range [data, data_end) which - * will be accessed by cls_bpf and act_bpf programs + * will be accessed by cls_bpf, act_bpf and lwt programs */ static inline void bpf_compute_data_end(struct sk_buff *skb) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1370a9d1456f..22ac82792687 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,6 +101,9 @@ enum bpf_prog_type { BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, }; enum bpf_attach_type { @@ -409,6 +412,16 @@ union bpf_attr { * * int bpf_get_numa_node_id() * Return: Id of current NUMA node. + * + * int bpf_skb_change_head() + * Grows headroom of skb and adjusts MAC header offset accordingly. + * Will extends/reallocae as required automatically. + * May change skb data pointer and will thus invalidate any check + * performed for direct packet access. + * @skb: pointer to skb + * @len: length of header to be pushed in front + * @flags: Flags (unused for now) + * Return: 0 on success or negative error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -453,7 +466,8 @@ union bpf_attr { FN(skb_pull_data), \ FN(csum_update), \ FN(set_hash_invalid), \ - FN(get_numa_node_id), + FN(get_numa_node_id), \ + FN(skb_change_head), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -537,6 +551,22 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* Generic BPF return codes which all BPF program types may support. + * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled seprately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes */ +}; + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. 
Unknown return codes will result diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 453cc6215bfd..92724cba1eba 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -10,6 +10,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_ILA, LWTUNNEL_ENCAP_IP6, LWTUNNEL_ENCAP_SEG6, + LWTUNNEL_ENCAP_BPF, __LWTUNNEL_ENCAP_MAX, }; @@ -43,4 +44,26 @@ enum lwtunnel_ip6_t { #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) +enum { + LWT_BPF_PROG_UNSPEC, + LWT_BPF_PROG_FD, + LWT_BPF_PROG_NAME, + __LWT_BPF_PROG_MAX, +}; + +#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1) + +enum { + LWT_BPF_UNSPEC, + LWT_BPF_IN, + LWT_BPF_OUT, + LWT_BPF_XMIT, + LWT_BPF_XMIT_HEADROOM, + __LWT_BPF_MAX, +}; + +#define LWT_BPF_MAX (__LWT_BPF_MAX - 1) + +#define LWT_BPF_MAX_HEADROOM 256 + #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8740c5fa02fc..8135cb1077ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, - const struct bpf_call_arg_meta *meta) + const struct bpf_call_arg_meta *meta, + enum bpf_access_type t) { switch (env->prog->type) { + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + /* dst_input() and dst_output() can't write for now */ + if (t == BPF_WRITE) + return false; case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_XMIT: if (meta) return meta->pkt_access; @@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + if (type == PTR_TO_PACKET && + !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; } diff --git a/net/Kconfig b/net/Kconfig index 7b6cd340b72b..a1005007224c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -402,6 +402,14 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config LWTUNNEL_BPF + bool "Execute BPF program as route nexthop action" + depends on LWTUNNEL + default y if LWTUNNEL=y + ---help--- + Allows to run BPF programs as a nexthop action following a route + lookup for incoming and outgoing packets. 
+ config DST_CACHE bool default n diff --git a/net/core/Makefile b/net/core/Makefile index d6508c2ddca5..f6761b6e3b29 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/filter.c b/net/core/filter.c index 698a262b8ebb..1c4d0faf22c8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1689,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header)) { + kfree_skb(skb); + return -ERANGE; + } + bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); @@ -2188,12 +2194,53 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + u32 max_len = __bpf_skb_max_len(skb); + u32 new_len = skb->len + head_room; + int ret; + + if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + new_len < skb->len)) + return -EINVAL; + + ret = skb_cow(skb, head_room); + if (likely(!ret)) { + /* Idea for this helper is that we currently only + * allow to expand on mac header. This means that + * skb->protocol network header, etc, stay as is. + * Compared to bpf_skb_change_tail(), we're more + * flexible due to not needing to linearize or + * reset GSO. Intention for this helper is to be + * used by an L3 skb that needs to push mac header + * for redirection into L2 device. 
+ */ + __skb_push(skb, head_room); + memset(skb->data, 0, head_room); + skb_reset_mac_header(skb); + } + + bpf_compute_data_end(skb); + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_head_proto = { + .func = bpf_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || func == bpf_skb_store_bytes || func == bpf_skb_change_proto || + func == bpf_skb_change_head || func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || @@ -2639,6 +2686,68 @@ cg_skb_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +lwt_inout_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_xmit_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + default: + return lwt_inout_func_proto(func_id); + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2676,6 +2785,39 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool lwt_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, priority): + case offsetof(struct __sk_buff, cb[0]) ... 
+ offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_access(off, size, type); +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -3007,6 +3149,19 @@ static const struct bpf_verifier_ops cg_skb_ops = { .convert_ctx_access = sk_filter_convert_ctx_access, }; +static const struct bpf_verifier_ops lwt_inout_ops = { + .get_func_proto = lwt_inout_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_xmit_ops = { + .get_func_proto = lwt_xmit_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3032,6 +3187,21 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = { .type = BPF_PROG_TYPE_CGROUP_SKB, }; +static struct bpf_prog_type_list lwt_in_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_IN, +}; + +static struct bpf_prog_type_list lwt_out_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_OUT, +}; + +static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { + .ops = &lwt_xmit_ops, + .type = BPF_PROG_TYPE_LWT_XMIT, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); @@ -3039,6 +3209,9 @@ static int __init register_sk_filter_ops(void) bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&lwt_in_type); + bpf_register_prog_type(&lwt_out_type); + bpf_register_prog_type(&lwt_xmit_type); return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c new file mode 100644 index 000000000000..71bb3e2eca08 --- /dev/null +++ b/net/core/lwt_bpf.c @@ -0,0 +1,396 @@ +/* Copyright (c) 2016 Thomas Graf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + +struct bpf_lwt { + struct bpf_lwt_prog in; + struct bpf_lwt_prog out; + struct bpf_lwt_prog xmit; + int family; +}; + +#define MAX_PROG_NAME 256 + +static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct bpf_lwt *)lwt->data; +} + +#define NO_REDIRECT false +#define CAN_REDIRECT true + +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, + struct dst_entry *dst, bool can_redirect) +{ + int ret; + + /* Preempt disable is needed to protect per-cpu redirect_info between + * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and + * access to maps strictly require a rcu_read_lock() for protection, + * mixing with BH RCU lock doesn't work. 
+ */ + preempt_disable(); + rcu_read_lock(); + bpf_compute_data_end(skb); + ret = bpf_prog_run_save_cb(lwt->prog, skb); + rcu_read_unlock(); + + switch (ret) { + case BPF_OK: + break; + + case BPF_REDIRECT: + if (unlikely(!can_redirect)) { + pr_warn_once("Illegal redirect return code in prog %s\n", + lwt->name ? : ""); + ret = BPF_OK; + } else { + ret = skb_do_redirect(skb); + if (ret == 0) + ret = BPF_REDIRECT; + } + break; + + case BPF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + + default: + pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); + kfree_skb(skb); + ret = -EINVAL; + break; + } + + preempt_enable(); + + return ret; +} + +static int bpf_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->in.prog) { + ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_input)) { + pr_warn_once("orig_input not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_input(skb); +} + +static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->out.prog) { + ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_output)) { + pr_warn_once("orig_output not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_output(net, sk, skb); +} + +static int xmit_check_hhlen(struct sk_buff *skb) +{ + int hh_len = skb_dst(skb)->dev->hard_header_len; + + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) + return -ENOMEM; + } + + return 0; +} + +static int bpf_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->xmit.prog) { + int ret; + + ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); + switch (ret) { + case BPF_OK: + /* If the header was expanded, headroom might be too + * small for L2 header to come, expand as needed. 
+ */ + ret = xmit_check_hhlen(skb); + if (unlikely(ret)) + return ret; + + return LWTUNNEL_XMIT_CONTINUE; + case BPF_REDIRECT: + return LWTUNNEL_XMIT_DONE; + default: + return ret; + } + } + + return LWTUNNEL_XMIT_CONTINUE; +} + +static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) +{ + if (prog->prog) + bpf_prog_put(prog->prog); + + kfree(prog->name); +} + +static void bpf_destroy_state(struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + bpf_lwt_prog_destroy(&bpf->in); + bpf_lwt_prog_destroy(&bpf->out); + bpf_lwt_prog_destroy(&bpf->xmit); +} + +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, + enum bpf_prog_type type) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); + if (!prog->name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, type); + if (IS_ERR(p)) + return PTR_ERR(p); + + prog->prog = p; + + return 0; +} + +static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { + [LWT_BPF_IN] = { .type = NLA_NESTED, }, + [LWT_BPF_OUT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, +}; + +static int bpf_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct nlattr *tb[LWT_BPF_MAX + 1]; + struct lwtunnel_state *newts; + struct bpf_lwt *bpf; + int ret; + + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*bpf)); + if (!newts) + return -ENOMEM; + + newts->type = LWTUNNEL_ENCAP_BPF; + bpf = bpf_lwt_lwtunnel(newts); + + if (tb[LWT_BPF_IN]) { + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, + BPF_PROG_TYPE_LWT_IN); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_OUT]) { + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, + BPF_PROG_TYPE_LWT_OUT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT]) { + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, + BPF_PROG_TYPE_LWT_XMIT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT_HEADROOM]) { + u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); + + if (headroom > LWT_BPF_MAX_HEADROOM) { + ret = -ERANGE; + goto errout; + } + + newts->headroom = headroom; + } + + bpf->family = family; + *ts = newts; + + return 0; + +errout: + bpf_destroy_state(newts); + kfree(newts); + return ret; +} + +static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, + struct bpf_lwt_prog *prog) +{ + struct nlattr *nest; + + if (!prog->prog) + return 0; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (prog->name && + nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) + return -EMSGSIZE; + + return 
nla_nest_end(skb, nest); +} + +static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) + return -EMSGSIZE; + + return 0; +} + +static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + int nest_len = nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ + 0; + + return nest_len + /* LWT_BPF_IN */ + nest_len + /* LWT_BPF_OUT */ + nest_len + /* LWT_BPF_XMIT */ + 0; +} + +int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +{ + /* FIXME: + * The LWT state is currently rebuilt for delete requests which + * results in a new bpf_prog instance. Comparing names for now. + */ + if (!a->name && !b->name) + return 0; + + if (!a->name || !b->name) + return 1; + + return strcmp(a->name, b->name); +} + +static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); + struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); + + return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || + bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || + bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); +} + +static const struct lwtunnel_encap_ops bpf_encap_ops = { + .build_state = bpf_build_state, + .destroy_state = bpf_destroy_state, + .input = bpf_input, + .output = bpf_output, + .xmit = bpf_xmit, + .fill_encap = bpf_fill_encap_info, + .get_encap_size = bpf_encap_nlsize, + .cmp_encap = bpf_encap_cmp, +}; + +static int __init bpf_lwt_init(void) +{ + return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); +} + +subsys_initcall(bpf_lwt_init) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 03976e939818..a5d4e866ce88 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; + case LWTUNNEL_ENCAP_BPF: + return "BPF"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: -- cgit v1.2.3 From b2cd12574aa3e1625f471ff57cde7f628a18a46b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 1 Dec 2016 08:48:03 -0800 Subject: bpf: Refactor cgroups code in prep for new type Code move and rename only; no functional change intended. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 46 +++++++++++++++++++++++----------------------- kernel/bpf/cgroup.c | 10 +++++----- kernel/bpf/syscall.c | 28 +++++++++++++++------------- 3 files changed, 43 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 0cf1adfadd2d..af2ca8b432c0 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -36,31 +36,31 @@ void cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type); -int __cgroup_bpf_run_filter(struct sock *sk, - struct sk_buff *skb, - enum bpf_attach_type type); - -/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. 
*/ -#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter(sk, skb, \ - BPF_CGROUP_INET_INGRESS); \ - \ - __ret; \ +int __cgroup_bpf_run_filter_skb(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type); + +/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ + BPF_CGROUP_INET_INGRESS); \ + \ + __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ - typeof(sk) __sk = sk_to_full_sk(sk); \ - if (sk_fullsock(__sk)) \ - __ret = __cgroup_bpf_run_filter(__sk, skb, \ - BPF_CGROUP_INET_EGRESS); \ - } \ - __ret; \ +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + typeof(sk) __sk = sk_to_full_sk(sk); \ + if (sk_fullsock(__sk)) \ + __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ + BPF_CGROUP_INET_EGRESS); \ + } \ + __ret; \ }) #else diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8c784f8c67cd..8fe55ffd109d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -118,7 +118,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp, } /** - * __cgroup_bpf_run_filter() - Run a program for packet filtering + * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received * @type: The type of program to be executed @@ -132,9 +132,9 @@ void __cgroup_bpf_update(struct cgroup *cgrp, * This function will return %-EPERM if an attached program was found * and it returned != 1 during execution. In all other cases, 0 is returned.
*/ -int __cgroup_bpf_run_filter(struct sock *sk, - struct sk_buff *skb, - enum bpf_attach_type type) +int __cgroup_bpf_run_filter_skb(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type) { struct bpf_prog *prog; struct cgroup *cgrp; @@ -164,4 +164,4 @@ int __cgroup_bpf_run_filter(struct sock *sk, return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter); +EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4caa18e6860a..5518a6839ab1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -856,6 +856,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) { struct bpf_prog *prog; struct cgroup *cgrp; + enum bpf_prog_type ptype; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -866,25 +867,26 @@ static int bpf_prog_attach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_CGROUP_SKB); - if (IS_ERR(prog)) - return PTR_ERR(prog); - - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) { - bpf_prog_put(prog); - return PTR_ERR(cgrp); - } - - cgroup_bpf_update(cgrp, prog, attr->attach_type); - cgroup_put(cgrp); + ptype = BPF_PROG_TYPE_CGROUP_SKB; break; default: return -EINVAL; } + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + + cgroup_bpf_update(cgrp, prog, attr->attach_type); + cgroup_put(cgrp); + return 0; } -- cgit v1.2.3 From 61023658760032e97869b07d54be9681d2529e77 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 1 Dec 2016 08:48:04 -0800 Subject: bpf: Add new cgroup attach type to enable sock modifications Add a new cgroup-based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and run any time a process in the cgroup opens an AF_INET or AF_INET6 socket. Currently only sk_bound_dev_if is exported to userspace for modification by a bpf program. This allows a cgroup to be configured such that AF_INET{6} sockets opened by processes are automatically bound to a specific device. In turn, this enables the running of programs that do not support SO_BINDTODEVICE in a specific VRF context / L3 domain. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 14 +++++++++++ include/uapi/linux/bpf.h | 6 +++++ kernel/bpf/cgroup.c | 33 ++++++++++++++++++++++++ kernel/bpf/syscall.c | 5 +++- net/core/filter.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 12 ++++++++- net/ipv6/af_inet6.c | 8 ++++++ 7 files changed, 138 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index af2ca8b432c0..7b6e5d168c95 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -40,6 +40,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled.
*/ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -63,6 +66,16 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, \ + BPF_CGROUP_INET_SOCK_CREATE); \ + } \ + __ret; \ +}) + #else struct cgroup_bpf {}; @@ -72,6 +85,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 22ac82792687..bfe5e31a1288 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,6 +101,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, BPF_PROG_TYPE_LWT_IN, BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, @@ -109,6 +110,7 @@ enum bpf_attach_type { BPF_CGROUP_INET_INGRESS, BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, __MAX_BPF_ATTACH_TYPE }; @@ -567,6 +569,10 @@ enum bpf_ret_code { /* >127 are reserved for prog type specific return codes */ }; +struct bpf_sock { + __u32 bound_dev_if; +}; + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. Unknown return codes will result diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8fe55ffd109d..a515f7b007c6 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -165,3 +165,36 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); + +/** + * __cgroup_bpf_run_filter_sk() - Run a program on a sock + * @sk: sock structure to manipulate + * @type: The type of program to be executed + * + * The socket passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if an attached program was found + * and it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sk) == 1 ?
0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5518a6839ab1..85af86c496cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -869,7 +869,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_EGRESS: ptype = BPF_PROG_TYPE_CGROUP_SKB; break; - + case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; default: return -EINVAL; } @@ -905,6 +907,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); diff --git a/net/core/filter.c b/net/core/filter.c index 1c4d0faf22c8..0ab252e462aa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2818,6 +2818,32 @@ static bool lwt_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + break; + default: + return false; + } + } + + if (off < 0 || off + size > sizeof(struct bpf_sock)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + + if (size != sizeof(__u32)) + return false; + + return true; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -3076,6 +3102,30 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, + int dst_reg, int src_reg, + int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + } + + return insn - insn_buf; +} + static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -3162,6 +3212,12 @@ static const struct bpf_verifier_ops lwt_xmit_ops = { .gen_prologue = tc_cls_act_prologue, }; +static const struct bpf_verifier_ops cg_sock_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = sock_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3202,6 +3258,11 @@ static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { .type = BPF_PROG_TYPE_LWT_XMIT, }; +static struct bpf_prog_type_list cg_sock_type __read_mostly = { + .ops = &cg_sock_ops, + .type = BPF_PROG_TYPE_CGROUP_SOCK +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); @@ -3209,6 +3270,7 @@ static int __init register_sk_filter_ops(void) bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&cg_sock_type); 
bpf_register_prog_type(&lwt_in_type); bpf_register_prog_type(&lwt_out_type); bpf_register_prog_type(&lwt_xmit_type); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5ddf5cda07f4..24d2550492ee 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -374,8 +374,18 @@ lookup_protocol: if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) + if (err) { + sk_common_release(sk); + goto out; + } + } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { sk_common_release(sk); + goto out; + } } out: return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d424f3a3737a..237e654ba717 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -258,6 +258,14 @@ lookup_protocol: goto out; } } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { + sk_common_release(sk); + goto out; + } + } out: return err; out_rcu_unlock: -- cgit v1.2.3 From 450630975da9e7dffe540753e169dc4da5fe7c29 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Dec 2016 18:24:56 -0500 Subject: don't open-code file_inode() Signed-off-by: Al Viro --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++-- drivers/staging/greybus/camera.c | 4 ++-- drivers/staging/greybus/es2.c | 6 +++--- drivers/staging/greybus/svc.c | 6 +++--- drivers/staging/greybus/timesync.c | 2 +- drivers/target/target_core_configfs.c | 2 +- fs/aio.c | 6 +++--- fs/autofs4/inode.c | 2 +- fs/fcntl.c | 2 +- fs/orangefs/file.c | 2 +- fs/orangefs/orangefs-debugfs.c | 6 ++++-- fs/overlayfs/copy_up.c | 2 +- kernel/audit_watch.c | 4 ++-- kernel/events/core.c | 2 +- kernel/locking/qspinlock_stat.h | 12 ++---------- security/smack/smack_lsm.c | 2 +- 18 files changed, 40 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3161d77bf299..3c2858972217 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2501,7 +2501,7 @@ static void amdgpu_debugfs_remove_files(struct amdgpu_device *adev) static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; bool pm_pg_lock, use_bank; @@ -2570,7 +2570,7 @@ end: static ssize_t amdgpu_debugfs_regs_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2601,7 +2601,7 @@ static ssize_t amdgpu_debugfs_regs_write(struct file *f, const char __user *buf, static ssize_t amdgpu_debugfs_regs_pcie_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2628,7 +2628,7 @@ static ssize_t amdgpu_debugfs_regs_pcie_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_regs_pcie_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2656,7 +2656,7 @@ static ssize_t amdgpu_debugfs_regs_pcie_write(struct file *f, const char __user static ssize_t 
amdgpu_debugfs_regs_didt_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2683,7 +2683,7 @@ static ssize_t amdgpu_debugfs_regs_didt_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_regs_didt_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2711,7 +2711,7 @@ static ssize_t amdgpu_debugfs_regs_didt_write(struct file *f, const char __user static ssize_t amdgpu_debugfs_regs_smc_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2738,7 +2738,7 @@ static ssize_t amdgpu_debugfs_regs_smc_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_regs_smc_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2766,7 +2766,7 @@ static ssize_t amdgpu_debugfs_regs_smc_write(struct file *f, const char __user * static ssize_t amdgpu_debugfs_gca_config_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; uint32_t *config, no_regs = 0; @@ -2836,7 +2836,7 @@ static ssize_t amdgpu_debugfs_gca_config_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_sensor_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; int idx, r; int32_t value; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 3cb5e903cd62..2636f619569e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -283,7 +283,7 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring) static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_ring *ring = (struct amdgpu_ring*)f->f_inode->i_private; + struct amdgpu_ring *ring = file_inode(f)->i_private; int r, i; uint32_t value, result, early[3]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index dcaf691f56b5..a5d00ef2b421 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1412,7 +1412,7 @@ static const struct drm_info_list amdgpu_ttm_debugfs_list[] = { static ssize_t amdgpu_ttm_vram_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -1456,7 +1456,7 @@ static const struct file_operations amdgpu_ttm_vram_fops = { static ssize_t amdgpu_ttm_gtt_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; diff --git a/drivers/staging/greybus/camera.c b/drivers/staging/greybus/camera.c index 
491bdd720c0c..cebb76e7d55c 100644 --- a/drivers/staging/greybus/camera.c +++ b/drivers/staging/greybus/camera.c @@ -1091,7 +1091,7 @@ static ssize_t gb_camera_debugfs_read(struct file *file, char __user *buf, size_t len, loff_t *offset) { const struct gb_camera_debugfs_entry *op = file->private_data; - struct gb_camera *gcam = file->f_inode->i_private; + struct gb_camera *gcam = file_inode(file)->i_private; struct gb_camera_debugfs_buffer *buffer; ssize_t ret; @@ -1113,7 +1113,7 @@ static ssize_t gb_camera_debugfs_write(struct file *file, loff_t *offset) { const struct gb_camera_debugfs_entry *op = file->private_data; - struct gb_camera *gcam = file->f_inode->i_private; + struct gb_camera *gcam = file_inode(file)->i_private; ssize_t ret; char *kbuf; diff --git a/drivers/staging/greybus/es2.c b/drivers/staging/greybus/es2.c index baab460eeaa3..d367cdffaa10 100644 --- a/drivers/staging/greybus/es2.c +++ b/drivers/staging/greybus/es2.c @@ -1250,7 +1250,7 @@ static int apb_log_poll(void *data) static ssize_t apb_log_read(struct file *f, char __user *buf, size_t count, loff_t *ppos) { - struct es2_ap_dev *es2 = f->f_inode->i_private; + struct es2_ap_dev *es2 = file_inode(f)->i_private; ssize_t ret; size_t copied; char *tmp_buf; @@ -1304,7 +1304,7 @@ static void usb_log_disable(struct es2_ap_dev *es2) static ssize_t apb_log_enable_read(struct file *f, char __user *buf, size_t count, loff_t *ppos) { - struct es2_ap_dev *es2 = f->f_inode->i_private; + struct es2_ap_dev *es2 = file_inode(f)->i_private; int enable = !IS_ERR_OR_NULL(es2->apb_log_task); char tmp_buf[3]; @@ -1317,7 +1317,7 @@ static ssize_t apb_log_enable_write(struct file *f, const char __user *buf, { int enable; ssize_t retval; - struct es2_ap_dev *es2 = f->f_inode->i_private; + struct es2_ap_dev *es2 = file_inode(f)->i_private; retval = kstrtoint_from_user(buf, count, 10, &enable); if (retval) diff --git a/drivers/staging/greybus/svc.c b/drivers/staging/greybus/svc.c index 550055ec27a5..8779270cadc1 100644 --- a/drivers/staging/greybus/svc.c +++ b/drivers/staging/greybus/svc.c @@ -757,7 +757,7 @@ static int gb_svc_version_request(struct gb_operation *op) static ssize_t pwr_debugfs_voltage_read(struct file *file, char __user *buf, size_t len, loff_t *offset) { - struct svc_debugfs_pwrmon_rail *pwrmon_rails = file->f_inode->i_private; + struct svc_debugfs_pwrmon_rail *pwrmon_rails = file_inode(file)->i_private; struct gb_svc *svc = pwrmon_rails->svc; int ret, desc; u32 value; @@ -780,7 +780,7 @@ static ssize_t pwr_debugfs_voltage_read(struct file *file, char __user *buf, static ssize_t pwr_debugfs_current_read(struct file *file, char __user *buf, size_t len, loff_t *offset) { - struct svc_debugfs_pwrmon_rail *pwrmon_rails = file->f_inode->i_private; + struct svc_debugfs_pwrmon_rail *pwrmon_rails = file_inode(file)->i_private; struct gb_svc *svc = pwrmon_rails->svc; int ret, desc; u32 value; @@ -803,7 +803,7 @@ static ssize_t pwr_debugfs_current_read(struct file *file, char __user *buf, static ssize_t pwr_debugfs_power_read(struct file *file, char __user *buf, size_t len, loff_t *offset) { - struct svc_debugfs_pwrmon_rail *pwrmon_rails = file->f_inode->i_private; + struct svc_debugfs_pwrmon_rail *pwrmon_rails = file_inode(file)->i_private; struct gb_svc *svc = pwrmon_rails->svc; int ret, desc; u32 value; diff --git a/drivers/staging/greybus/timesync.c b/drivers/staging/greybus/timesync.c index 2e68af7dea6d..09ce00484b3f 100644 --- a/drivers/staging/greybus/timesync.c +++ b/drivers/staging/greybus/timesync.c @@ -921,7 +921,7 @@ 
EXPORT_SYMBOL_GPL(gb_timesync_schedule_asynchronous); static ssize_t gb_timesync_ping_read(struct file *file, char __user *ubuf, size_t len, loff_t *offset, bool ktime) { - struct gb_timesync_svc *timesync_svc = file->f_inode->i_private; + struct gb_timesync_svc *timesync_svc = file_inode(file)->i_private; char *buf; ssize_t ret = 0; diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 2001005bef45..a35a347ec357 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -143,7 +143,7 @@ static ssize_t target_core_item_dbroot_store(struct config_item *item, pr_err("db_root: cannot open: %s\n", db_root_stage); return -EINVAL; } - if (!S_ISDIR(fp->f_inode->i_mode)) { + if (!S_ISDIR(file_inode(fp)->i_mode)) { filp_close(fp, 0); mutex_unlock(&g_tf_lock); pr_err("db_root: not a directory: %s\n", db_root_stage); diff --git a/fs/aio.c b/fs/aio.c index 428484f2f841..8edf253484af 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -277,10 +277,10 @@ static void put_aio_ring_file(struct kioctx *ctx) struct address_space *i_mapping; if (aio_ring_file) { - truncate_setsize(aio_ring_file->f_inode, 0); + truncate_setsize(file_inode(aio_ring_file), 0); /* Prevent further access to the kioctx from migratepages */ - i_mapping = aio_ring_file->f_inode->i_mapping; + i_mapping = aio_ring_file->f_mapping; spin_lock(&i_mapping->private_lock); i_mapping->private_data = NULL; ctx->aio_ring_file = NULL; @@ -483,7 +483,7 @@ static int aio_setup_ring(struct kioctx *ctx) for (i = 0; i < nr_pages; i++) { struct page *page; - page = find_or_create_page(file->f_inode->i_mapping, + page = find_or_create_page(file->f_mapping, i, GFP_HIGHUSER | __GFP_ZERO); if (!page) break; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 438b5bf675b6..09e7d68dff02 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -94,7 +94,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",indirect"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) - seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino); + seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); else seq_printf(m, ",pipe_ino=-1"); #endif diff --git a/fs/fcntl.c b/fs/fcntl.c index 350a2c8cfd28..6e2771c210f6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -52,7 +52,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ - if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) { + if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) { if (!filp->f_mapping || !filp->f_mapping->a_ops || !filp->f_mapping->a_ops->direct_IO) return -EINVAL; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 02cc6139ec90..e6bbc8083d77 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -724,7 +724,7 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) { int rc = -EINVAL; - if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { + if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { if (cmd == F_GETLK) { rc = 0; posix_test_lock(filp, fl); diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 38887cc5577f..f1e824979aad 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -434,6 +434,7 @@ static ssize_t orangefs_debug_write(struct file *file, char *debug_string; struct orangefs_kernel_op_s *new_op = NULL; struct client_debug_mask c_mask = { NULL, 0, 0 }; 
+ char *s; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_write: %pD\n", @@ -521,8 +522,9 @@ static ssize_t orangefs_debug_write(struct file *file, } mutex_lock(&orangefs_debug_lock); - memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); - sprintf((char *)file->f_inode->i_private, "%s\n", debug_string); + s = file_inode(file)->i_private; + memset(s, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); + sprintf(s, "%s\n", debug_string); mutex_unlock(&orangefs_debug_lock); *ppos += count; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 36795eed40b0..2838bddb1f91 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -33,7 +33,7 @@ static int ovl_check_fd(const void *data, struct file *f, unsigned int fd) { const struct dentry *dentry = data; - if (f->f_inode == d_inode(dentry)) + if (file_inode(f) == d_inode(dentry)) pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n", f, fd, current->pid, current->comm); return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..99401efef7ac 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -548,8 +548,8 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) exe_file = get_task_exe_file(tsk); if (!exe_file) return 0; - ino = exe_file->f_inode->i_ino; - dev = exe_file->f_inode->i_sb->s_dev; + ino = file_inode(exe_file)->i_ino; + dev = file_inode(exe_file)->i_sb->s_dev; fput(exe_file); return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 6ee1febdf6ff..5134a1c17186 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6701,7 +6701,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { - if (filter->inode != file->f_inode) + if (filter->inode != file_inode(file)) return false; if (filter->offset > offset + size) diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index eb0a599fcf58..e852be4851fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -108,11 +108,7 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - counter = (long)(file->f_inode->i_private); + counter = (long)file_inode(file)->i_private; if (counter >= qstat_num) return -EBADF; @@ -177,11 +173,7 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + if ((long)file_inode(file)->i_private != qstat_reset_cnts) return count; for_each_possible_cpu(cpu) { diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 1cb060293505..ad170310a856 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -225,7 +225,7 @@ static int smk_bu_credfile(const struct cred *cred, struct file *file, { struct task_smack *tsp = cred->security; struct smack_known *sskp = tsp->smk_task; - struct inode *inode = file->f_inode; + struct inode *inode = file_inode(file); struct inode_smack *isp = inode->i_security; char acc[SMK_NUM_ACCESS_TYPE + 1]; -- cgit v1.2.3 From 3c839744b33782b930c5c61df35511ede5e5a574 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: 
Sat, 3 Dec 2016 12:31:33 -0800 Subject: bpf: Preserve const register type on const OR alu ops Occasionally, clang (e.g. version 3.8.1) translates a sum between two constant operands using a BPF_OR instead of a BPF_ADD. The verifier is currently not handling this scenario, and the destination register type becomes UNKNOWN_VALUE even if it's still storing a constant. As a result, the destination register cannot be used as argument to a helper function expecting a ARG_CONST_STACK_*, limiting some use cases. Modify the verifier to handle this case, and add a few tests to make sure all combinations are supported, and stack boundaries are still verified even with BPF_OR. Signed-off-by: Gianluca Borello Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 9 ++++- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/test_verifier.c | 60 +++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e742210750e..38d05da84a49 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1481,14 +1481,19 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); - /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. - * Don't care about overflow or negative values, just add them + /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' + * insn. Don't care about overflow or negative values, just add them */ if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) dst_reg->imm += insn->imm; else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && src_reg->type == CONST_IMM) dst_reg->imm += src_reg->imm; + else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) + dst_reg->imm |= insn->imm; + else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) + dst_reg->imm |= src_reg->imm; else mark_reg_unknown_value(regs, insn->dst_reg); return 0; diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3c59f96e3ed8..071431bedde8 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -1,2 +1,3 @@ test_verifier test_maps +test_lru_map diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 5da2e9d7689c..8d71e44b319d 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2683,6 +2683,66 @@ static struct bpf_test tests[] = { .errstr_unpriv = "R0 pointer arithmetic prohibited", .result_unpriv = REJECT, }, + { + "constant register |= constant should keep constant type", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48), + BPF_MOV64_IMM(BPF_REG_2, 34), + BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 13), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "constant register |= constant should not bypass stack boundary checks", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48), + BPF_MOV64_IMM(BPF_REG_2, 34), + BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 24), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-48 
access_size=58", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "constant register |= constant register should keep constant type", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48), + BPF_MOV64_IMM(BPF_REG_2, 34), + BPF_MOV64_IMM(BPF_REG_4, 13), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "constant register |= constant register should not bypass stack boundary checks", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48), + BPF_MOV64_IMM(BPF_REG_2, 34), + BPF_MOV64_IMM(BPF_REG_4, 24), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-48 access_size=58", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From cbbd26b8b1a6af9c02e2b6523e12bd50cc765059 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 1 Nov 2016 22:09:04 -0400 Subject: [iov_iter] new primitives - copy_from_iter_full() and friends copy_from_iter_full(), copy_from_iter_full_nocache() and csum_and_copy_from_iter_full() - counterparts of copy_from_iter() et.al., advancing iterator only in case of successful full copy and returning whether it had been successful or not. Convert some obvious users. *NOTE* - do not blindly assume that something is a good candidate for those unless you are sure that not advancing iov_iter in failure case is the right thing in this case. Anything that does short read/short write kind of stuff (or is in a loop, etc.) is unlikely to be a good one. 
Signed-off-by: Al Viro --- drivers/bluetooth/hci_vhci.c | 2 +- drivers/net/macvtap.c | 4 +- drivers/net/tun.c | 7 +-- drivers/usb/gadget/function/f_fs.c | 2 +- drivers/usb/gadget/legacy/inode.c | 2 +- drivers/vhost/scsi.c | 3 +- drivers/vhost/vhost.c | 3 +- fs/ncpfs/file.c | 2 +- fs/orangefs/devorangefs-req.c | 13 ++--- include/linux/uio.h | 3 ++ kernel/printk/printk.c | 2 +- lib/iov_iter.c | 98 +++++++++++++++++++++++++++++++++++++- net/atm/common.c | 2 +- net/bluetooth/l2cap_core.c | 6 +-- net/packet/af_packet.c | 5 +- net/tipc/msg.c | 4 +- security/keys/keyctl.c | 2 +- 17 files changed, 121 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index c4a75a18dcae..233e850fdac7 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -181,7 +181,7 @@ static inline ssize_t vhci_get_user(struct vhci_data *data, if (!skb) return -ENOMEM; - if (copy_from_iter(skb_put(skb, len), len, from) != len) { + if (!copy_from_iter_full(skb_put(skb, len), len, from)) { kfree_skb(skb); return -EFAULT; } diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 070e3290aa6e..19d81ca3fb49 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -673,7 +673,6 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, int depth; bool zerocopy = false; size_t linear; - ssize_t n; if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = q->vnet_hdr_sz; @@ -684,8 +683,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, len -= vnet_hdr_len; err = -EFAULT; - n = copy_from_iter(&vnet_hdr, sizeof(vnet_hdr), from); - if (n != sizeof(vnet_hdr)) + if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from)) goto err; iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr)); if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8093e39ae263..4fa2d756548a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1171,7 +1171,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, bool zerocopy = false; int err; u32 rxhash; - ssize_t n; if (!(tun->dev->flags & IFF_UP)) return -EIO; @@ -1181,8 +1180,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, return -EINVAL; len -= sizeof(pi); - n = copy_from_iter(&pi, sizeof(pi), from); - if (n != sizeof(pi)) + if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } @@ -1191,8 +1189,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, return -EINVAL; len -= tun->vnet_hdr_sz; - n = copy_from_iter(&gso, sizeof(gso), from); - if (n != sizeof(gso)) + if (!copy_from_iter_full(&gso, sizeof(gso), from)) return -EFAULT; if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 17989b72cdae..0bfd1e25b431 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -949,7 +949,7 @@ static ssize_t ffs_epfile_io(struct file *file, struct ffs_io_data *io_data) goto error_mutex; } if (!io_data->read && - copy_from_iter(data, data_len, &io_data->data) != data_len) { + !copy_from_iter_full(data, data_len, &io_data->data)) { ret = -EFAULT; goto error_mutex; } diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index bd82dd12deff..10b2576f8b6a 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -667,7 +667,7 @@ 
ep_write_iter(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; } - if (unlikely(copy_from_iter(buf, len, from) != len)) { + if (unlikely(!copy_from_iter_full(buf, len, from))) { value = -EFAULT; goto out; } diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 6e29d053843d..b296985fda0d 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -922,8 +922,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) */ iov_iter_init(&out_iter, WRITE, vq->iov, out, out_size); - ret = copy_from_iter(req, req_size, &out_iter); - if (unlikely(ret != req_size)) { + if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) { vq_err(vq, "Faulted on copy_from_iter\n"); vhost_scsi_send_bad_target(vs, vq, head, out); continue; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c6f2d89c0e97..06e8b81b6253 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1862,8 +1862,7 @@ static int get_indirect(struct vhost_virtqueue *vq, i, count); return -EINVAL; } - if (unlikely(copy_from_iter(&desc, sizeof(desc), &from) != - sizeof(desc))) { + if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) { vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); return -EINVAL; diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index dd38ca1f2ecb..83ca77231707 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -203,7 +203,7 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) bufsize - (pos % bufsize), iov_iter_count(from)); - if (copy_from_iter(bouncebuffer, to_write, from) != to_write) { + if (!copy_from_iter_full(bouncebuffer, to_write, from)) { errno = -EFAULT; break; } diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index 516ffb4dc9a0..b0ced669427e 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -355,7 +355,6 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, __u64 tag; } head; int total = ret = iov_iter_count(iter); - int n; int downcall_size = sizeof(struct orangefs_downcall_s); int head_size = sizeof(head); @@ -372,8 +371,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return -EFAULT; } - n = copy_from_iter(&head, head_size, iter); - if (n < head_size) { + if (!copy_from_iter_full(&head, head_size, iter)) { gossip_err("%s: failed to copy head.\n", __func__); return -EFAULT; } @@ -407,8 +405,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return ret; } - n = copy_from_iter(&op->downcall, downcall_size, iter); - if (n != downcall_size) { + if (!copy_from_iter_full(&op->downcall, downcall_size, iter)) { gossip_err("%s: failed to copy downcall.\n", __func__); goto Efault; } @@ -462,10 +459,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, goto Enomem; } memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size); - n = copy_from_iter(op->downcall.trailer_buf, - op->downcall.trailer_size, - iter); - if (n != op->downcall.trailer_size) { + if (!copy_from_iter_full(op->downcall.trailer_buf, + op->downcall.trailer_size, iter)) { gossip_err("%s: failed to copy trailer.\n", __func__); vfree(op->downcall.trailer_buf); goto Efault; diff --git a/include/linux/uio.h b/include/linux/uio.h index 6e22b544d039..e57c0ccd61c6 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -89,7 +89,9 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_to_iter(const void *addr, size_t 
bytes, struct iov_iter *i); size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); +bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); +bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i); size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); @@ -155,6 +157,7 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) } size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); +bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); int import_iovec(int type, const struct iovec __user * uvector, unsigned nr_segs, unsigned fast_segs, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f7a55e9ff2f7..f6bda15396df 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -748,7 +748,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; buf[len] = '\0'; - if (copy_from_iter(buf, len, from) != len) { + if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f2bd21b93dfc..83c00b7f59b5 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -568,6 +568,31 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(copy_from_iter); +bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) \ + return false; + + iterate_all_kinds(i, bytes, v, ({ + if (__copy_from_user((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len)) + return false; + 0;}), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(copy_from_iter_full); + size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; @@ -587,6 +612,30 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(copy_from_iter_nocache); +bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) \ + return false; + iterate_all_kinds(i, bytes, v, ({ + if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len)) + return false; + 0;}), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(copy_from_iter_full_nocache); + size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { @@ -1008,7 +1057,7 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, } iterate_and_advance(i, bytes, v, ({ int err = 0; - next = csum_and_copy_from_user(v.iov_base, + next = csum_and_copy_from_user(v.iov_base, (to += v.iov_len) - v.iov_len, v.iov_len, 0, &err); if (!err) { @@ -1037,6 +1086,51 @@ size_t csum_and_copy_from_iter(void 
*addr, size_t bytes, __wsum *csum, } EXPORT_SYMBOL(csum_and_copy_from_iter); +bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *to = addr; + __wsum sum, next; + size_t off = 0; + sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) + return false; + iterate_all_kinds(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_from_user(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0, &err); + if (err) + return false; + sum = csum_block_add(sum, next, off); + off += v.iov_len; + 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(csum_and_copy_from_iter_full); + size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { @@ -1051,7 +1145,7 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, - v.iov_base, + v.iov_base, v.iov_len, 0, &err); if (!err) { sum = csum_block_add(sum, next, off); diff --git a/net/atm/common.c b/net/atm/common.c index 6dc12305799e..a3ca922d307b 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -630,7 +630,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) goto out; skb->dev = NULL; /* for paths shared with net_device interfaces */ ATM_SKB(skb)->atm_options = vcc->atm_options; - if (copy_from_iter(skb_put(skb, size), size, &m->msg_iter) != size) { + if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { kfree_skb(skb); error = -EFAULT; goto out; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 577f1c01454a..ce0b5dd01953 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2127,7 +2127,7 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan, struct sk_buff **frag; int sent = 0; - if (copy_from_iter(skb_put(skb, count), count, &msg->msg_iter) != count) + if (!copy_from_iter_full(skb_put(skb, count), count, &msg->msg_iter)) return -EFAULT; sent += count; @@ -2147,8 +2147,8 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan, *frag = tmp; - if (copy_from_iter(skb_put(*frag, count), count, - &msg->msg_iter) != count) + if (!copy_from_iter_full(skb_put(*frag, count), count, + &msg->msg_iter)) return -EFAULT; sent += count; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d2238b204691..588ec202d5ba 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2432,14 +2432,11 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, struct virtio_net_hdr *vnet_hdr) { - int n; - if (*len < sizeof(*vnet_hdr)) return -EINVAL; *len -= sizeof(*vnet_hdr); - n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter); - if (n != sizeof(*vnet_hdr)) + if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; return __packet_snd_vnet_parse(vnet_hdr, *len); diff --git a/net/tipc/msg.c b/net/tipc/msg.c 
index 17201aa8423d..a22be502f1bd 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -268,7 +268,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, __skb_queue_tail(list, skb); skb_copy_to_linear_data(skb, mhdr, mhsz); pktpos = skb->data + mhsz; - if (copy_from_iter(pktpos, dsz, &m->msg_iter) == dsz) + if (copy_from_iter_full(pktpos, dsz, &m->msg_iter)) return dsz; rc = -EFAULT; goto error; @@ -299,7 +299,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, if (drem < pktrem) pktrem = drem; - if (copy_from_iter(pktpos, pktrem, &m->msg_iter) != pktrem) { + if (!copy_from_iter_full(pktpos, pktrem, &m->msg_iter)) { rc = -EFAULT; goto error; } diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index d580ad06b792..f89f1900e58d 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1074,7 +1074,7 @@ long keyctl_instantiate_key_common(key_serial_t id, } ret = -EFAULT; - if (copy_from_iter(payload, plen, from) != plen) + if (!copy_from_iter_full(payload, plen, from)) goto error2; } -- cgit v1.2.3 From 7bd509e311f408f7a5132fcdde2069af65fa05ae Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 4 Dec 2016 23:19:41 +0100 Subject: bpf: add prog_digest and expose it via fdinfo/netlink When loading a BPF program via bpf(2), calculate the digest over the program's instruction stream and store it in struct bpf_prog's digest member. This is done at a point in time before any instructions are rewritten by the verifier. Any unstable map file descriptor number part of the imm field will be zeroed for the hash. fdinfo example output for progs: # cat /proc/1590/fdinfo/5 pos: 0 flags: 02000002 mnt_id: 11 prog_type: 1 prog_jited: 1 prog_digest: b27e8b06da22707513aa97363dfb11c7c3675d28 memlock: 4096 When programs are pinned and retrieved by an ELF loader, the loader can check the program's digest through fdinfo and compare it against one that was generated over the ELF file's program section to see if the program needs to be reloaded. Furthermore, this can also be exposed through other means such as netlink in case of a tc cls/act dump (or xdp in future), but also through tracepoints or other facilities to identify the program. Other than that, the digest can also serve as a base name for the work in progress kallsyms support of programs. The digest doesn't depend/select the crypto layer, since we need to keep dependencies to a minimum. iproute2 will get support for this facility. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + include/linux/filter.h | 7 +++- include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_bpf.h | 1 + kernel/bpf/core.c | 65 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 24 +++++++++++++- kernel/bpf/verifier.c | 2 ++ net/sched/act_bpf.c | 9 ++++++ net/sched/cls_bpf.c | 8 +++++ 9 files changed, 116 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 69d0a7f12a3b..8796ff03f472 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,6 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +void bpf_prog_calc_digest(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index 97338134398f..f078d2b1cff6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -14,6 +14,7 @@ #include #include #include +#include <linux/cryptohash.h> #include @@ -56,6 +57,9 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 +/* Maximum BPF program size in bytes. */ +#define MAX_BPF_SIZE (BPF_MAXINSNS * sizeof(struct bpf_insn)) + /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ @@ -404,8 +408,9 @@ struct bpf_prog { cb_access:1, /* Is control block accessed? */ dst_needed:1; /* Do we need dst entry? */ kmemcheck_bitfield_end(meta); - u32 len; /* Number of filter blocks */ enum bpf_prog_type type; /* Type of BPF program */ + u32 len; /* Number of filter blocks */ + u32 digest[SHA_DIGEST_WORDS]; /* Program digest */ struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ unsigned int (*bpf_func)(const void *ctx, diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 86786d45ee66..1adc0b654996 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -397,6 +397,7 @@ enum { TCA_BPF_NAME, TCA_BPF_FLAGS, TCA_BPF_FLAGS_GEN, + TCA_BPF_DIGEST, __TCA_BPF_MAX, }; diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 063d9d465119..a6b88a6f7f71 100644 --- a/include/uapi/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h @@ -27,6 +27,7 @@ enum { TCA_ACT_BPF_FD, TCA_ACT_BPF_NAME, TCA_ACT_BPF_PAD, + TCA_ACT_BPF_DIGEST, __TCA_ACT_BPF_MAX, }; #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 82a04143368e..bdcc9f4ba767 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,71 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } +#define SHA_BPF_RAW_SIZE \ + round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) + +/* Called under verifier mutex. */ +void bpf_prog_calc_digest(struct bpf_prog *fp) +{ + const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + static u32 ws[SHA_WORKSPACE_WORDS]; + static u8 raw[SHA_BPF_RAW_SIZE]; + struct bpf_insn *dst = (void *)raw; + u32 i, bsize, psize, blocks; + bool was_ld_map; + u8 *todo = raw; + __be32 *result; + __be64 *bits; + + sha_init(fp->digest); + memset(ws, 0, sizeof(ws)); + + /* We need to take out the map fd for the digest calculation + * since they are unstable from user space side.
+ */ + for (i = 0, was_ld_map = false; i < fp->len; i++) { + dst[i] = fp->insnsi[i]; + if (!was_ld_map && + dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + was_ld_map = true; + dst[i].imm = 0; + } else if (was_ld_map && + dst[i].code == 0 && + dst[i].dst_reg == 0 && + dst[i].src_reg == 0 && + dst[i].off == 0) { + was_ld_map = false; + dst[i].imm = 0; + } else { + was_ld_map = false; + } + } + + psize = fp->len * sizeof(struct bpf_insn); + memset(&raw[psize], 0, sizeof(raw) - psize); + raw[psize++] = 0x80; + + bsize = round_up(psize, SHA_MESSAGE_BYTES); + blocks = bsize / SHA_MESSAGE_BYTES; + if (bsize - psize >= sizeof(__be64)) { + bits = (__be64 *)(todo + bsize - sizeof(__be64)); + } else { + bits = (__be64 *)(todo + bsize + bits_offset); + blocks++; + } + *bits = cpu_to_be64((psize - 1) << 3); + + while (blocks--) { + sha_transform(fp->digest, todo, ws); + todo += SHA_MESSAGE_BYTES; + } + + result = (__force __be32 *)fp->digest; + for (i = 0; i < SHA_DIGEST_WORDS; i++) + result[i] = cpu_to_be32(fp->digest[i]); +} + static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85af86c496cd..c0d2b423ce93 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -662,8 +662,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_prog *prog = filp->private_data; + char prog_digest[sizeof(prog->digest) * 2 + 1] = { }; + + bin2hex(prog_digest, prog->digest, sizeof(prog->digest)); + seq_printf(m, + "prog_type:\t%u\n" + "prog_jited:\t%u\n" + "prog_digest:\t%s\n" + "memlock:\t%llu\n", + prog->type, + prog->jited, + prog_digest, + prog->pages * 1ULL << PAGE_SHIFT); +} +#endif + static const struct file_operations bpf_prog_fops = { - .release = bpf_prog_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_prog_show_fdinfo, +#endif + .release = bpf_prog_release, }; int bpf_prog_new_fd(struct bpf_prog *prog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 38d05da84a49..cb37339ca0da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3176,6 +3176,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } + bpf_prog_calc_digest(env->prog); + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 84c1d2da4f8b..1c60317f0121 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -117,10 +117,19 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, struct sk_buff *skb) { + struct nlattr *nla; + if (prog->bpf_name && nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; + nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST, + sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index f70e03d2d2c8..adc776048d1a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -549,10 +549,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, struct sk_buff *skb) { + struct nlattr *nla; + if (prog->bpf_name && nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) 
return -EMSGSIZE; + nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } -- cgit v1.2.3 From 3cd5eca8d7a2fe43098df4c33a1272fe6945cac9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:19:09 -0500 Subject: fsnotify: constify 'data' passed to ->handle_event() Signed-off-by: Al Viro --- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fanotify/fanotify.c | 8 ++++---- fs/notify/fanotify/fanotify.h | 2 +- fs/notify/inotify/inotify.h | 2 +- fs/notify/inotify/inotify_fsnotify.c | 4 ++-- include/linux/fsnotify_backend.h | 2 +- kernel/audit_fsnotify.c | 10 +++++----- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 8 ++++---- 9 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 6faaf710e563..5a4ec309e283 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -85,7 +85,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e0e5f7c3c99f..bbc175d4213d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -90,10 +90,10 @@ static int fanotify_get_response(struct fsnotify_group *group, static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, u32 event_mask, - void *data, int data_type) + const void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; - struct path *path = data; + const struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, @@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path) + const struct path *path) { struct fanotify_event_info *event; @@ -177,7 +177,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { int ret = 0; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 2a5fb14115df..4500a74f8d38 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -47,4 +47,4 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path); + const struct path *path); diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index ed855ef6f077..a6f5907a3fee 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -26,7 +26,7 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops 
inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 2cd900c2c737..19e7ec109a75 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -66,7 +66,7 @@ int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; @@ -80,7 +80,7 @@ int inotify_handle_event(struct fsnotify_group *group, if ((inode_mark->mask & FS_EXCL_UNLINK) && (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; + const struct path *path = data; if (d_unlinked(path->dentry)) return 0; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 79467b239fcf..d357041bbec8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -96,7 +96,7 @@ struct fsnotify_ops { struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f84f8d06e1f6..1173f7cc7ba3 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -74,7 +74,7 @@ int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_ } static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, - struct inode *inode) + const struct inode *inode) { audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; audit_mark->ino = inode ? 
inode->i_ino : AUDIT_INO_UNSET; @@ -168,11 +168,11 @@ static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { struct audit_fsnotify_mark *audit_mark; - struct inode *inode = NULL; + const struct inode *inode = NULL; audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); @@ -180,10 +180,10 @@ static int audit_mark_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = ((struct path *)data)->dentry->d_inode; + inode = ((const struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 25772476fa4a..3a2f5dfe8093 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -948,7 +948,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..f476c46b9c20 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -472,10 +472,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { - struct inode *inode; + const struct inode *inode; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); @@ -484,10 +484,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = d_backing_inode(((struct path *)data)->dentry); + inode = d_backing_inode(((const struct path *)data)->dentry); break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); -- cgit v1.2.3 From 8bd107633b64195a0748b05236c3d14db0a8bed4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:36:51 -0500 Subject: audit_log_{name,link_denied}: constify struct path Signed-off-by: Al Viro --- include/linux/audit.h | 2 +- kernel/audit.c | 4 ++-- kernel/audit.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 9d4443f93db6..f51fca8d0b6f 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -147,7 +147,7 @@ extern void audit_log_d_path(struct audit_buffer *ab, extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_link_denied(const char *operation, - struct path *link); + const struct path *link); extern void audit_log_lost(const char *message); #ifdef CONFIG_SECURITY extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); diff --git a/kernel/audit.c b/kernel/audit.c index f1ca11613379..06008c422bd5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1760,7 +1760,7 @@ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, * @call_panic: optional pointer to int that will be updated if secid fails 
*/ void audit_log_name(struct audit_context *context, struct audit_names *n, - struct path *path, int record_num, int *call_panic) + const struct path *path, int record_num, int *call_panic) { struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); @@ -1948,7 +1948,7 @@ EXPORT_SYMBOL(audit_log_task_info); * @operation: specific link operation * @link: the path that triggered the restriction */ -void audit_log_link_denied(const char *operation, struct path *link) +void audit_log_link_denied(const char *operation, const struct path *link) { struct audit_buffer *ab; struct audit_names *name; diff --git a/kernel/audit.h b/kernel/audit.h index 431444c3708b..960d49c9db5e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -212,7 +212,7 @@ extern void audit_copy_inode(struct audit_names *name, extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, - struct audit_names *n, struct path *path, + struct audit_names *n, const struct path *path, int record_num, int *call_panic); extern int audit_pid; -- cgit v1.2.3 From 8fc31ce8896fc3cea1d79688c8ff950ad4e73afe Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Sun, 4 Dec 2016 00:46:17 -0800 Subject: perf/core: Remove invalid warning from list_update_cgroup_event() The warning introduced in commit: 864c2357ca89 ("perf/core: Do not set cpuctx->cgrp for unscheduled cgroups") assumed that a cgroup switch always precedes list_del_event. This is not the case. Remove the warning. Make sure that cpuctx->cgrp is NULL until a cgroup event is sched in or ctx->nr_cgroups == 0. Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Fenghua Yu Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Nilay Vaish Cc: Paul Turner Cc: Peter Zijlstra Cc: Ravi V Shankar Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vikas Shivappa Cc: Vince Weaver Link: http://lkml.kernel.org/r/1480841177-27299-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6ee1febdf6ff..02c8421f8c01 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -903,17 +903,14 @@ list_update_cgroup_event(struct perf_event *event, */ cpuctx = __get_cpu_context(ctx); - /* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */ - if (perf_cgroup_from_task(current, ctx) != event->cgrp) { - /* - * We are removing the last cpu event in this context. - * If that event is not active in this cpu, cpuctx->cgrp - * should've been cleared by perf_cgroup_switch. - */ - WARN_ON_ONCE(!add && cpuctx->cgrp); - return; - } - cpuctx->cgrp = add ? event->cgrp : NULL; + /* + * cpuctx->cgrp is NULL until a cgroup event is sched in or + * ctx->nr_cgroup == 0 . + */ + if (add && perf_cgroup_from_task(current, ctx) == event->cgrp) + cpuctx->cgrp = event->cgrp; + else if (!add) + cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From f943fe0faf27991d256e10b5a85f175385c64cdc Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 28 Nov 2016 15:24:43 +0100 Subject: lockdep: Fix report formatting Since commit: 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") printk() requires KERN_CONT to continue log messages.
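[editor: a minimal illustration of the new rule, not part of the patch; "nr" is a made-up variable. A message assembled from several printk() calls must now mark each continuation explicitly:]

	printk(KERN_INFO "requests");
	printk(KERN_CONT " in flight: %d\n", nr);	/* continues the same record */

Without the KERN_CONT marker the second call starts a new log record and the line is split in two.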
Lots of printk() in lockdep.c and print_ip_sym() don't have it. As the result lockdep reports are completely messed up. Add missing KERN_CONT and inline print_ip_sym() where necessary. Example of a messed up report: 0-rc5+ #41 Not tainted ------------------------------------------------------- syz-executor0/5036 is trying to acquire lock: ( rtnl_mutex ){+.+.+.} , at: [] rtnl_lock+0x1c/0x20 but task is already holding lock: ( &net->packet.sklist_lock ){+.+...} , at: [] packet_diag_dump+0x1a6/0x1920 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 ( &net->packet.sklist_lock +.+...} ... Without this patch all scripts that parse kernel bug reports are broken. Signed-off-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andreyknvl@google.com Cc: aryabinin@virtuozzo.com Cc: joe@perches.com Cc: syzkaller@googlegroups.com Link: http://lkml.kernel.org/r/1480343083-48731-1-git-send-email-dvyukov@google.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 111 ++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..4d7ffc0a0d00 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -506,13 +506,13 @@ static void __print_lock_name(struct lock_class *class) name = class->name; if (!name) { name = __get_key_name(class->key, str); - printk("%s", name); + printk(KERN_CONT "%s", name); } else { - printk("%s", name); + printk(KERN_CONT "%s", name); if (class->name_version > 1) - printk("#%d", class->name_version); + printk(KERN_CONT "#%d", class->name_version); if (class->subclass) - printk("/%d", class->subclass); + printk(KERN_CONT "/%d", class->subclass); } } @@ -522,9 +522,9 @@ static void print_lock_name(struct lock_class *class) get_usage_chars(class, usage); - printk(" ("); + printk(KERN_CONT " ("); __print_lock_name(class); - printk("){%s}", usage); + printk(KERN_CONT "){%s}", usage); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -536,7 +536,7 @@ static void print_lockdep_cache(struct lockdep_map *lock) if (!name) name = __get_key_name(lock->key->subkeys, str); - printk("%s", name); + printk(KERN_CONT "%s", name); } static void print_lock(struct held_lock *hlock) @@ -551,13 +551,13 @@ static void print_lock(struct held_lock *hlock) barrier(); if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { - printk("\n"); + printk(KERN_CONT "\n"); return; } print_lock_name(lock_classes + class_idx - 1); - printk(", at: "); - print_ip_sym(hlock->acquire_ip); + printk(KERN_CONT ", at: [<%p>] %pS\n", + (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); } static void lockdep_print_held_locks(struct task_struct *curr) @@ -792,8 +792,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); + printk(KERN_CONT "#%d", class->name_version); + printk(KERN_CONT "\n"); dump_stack(); if (!graph_lock()) { @@ -1071,7 +1071,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) return 0; printk("\n-> #%u", depth); print_lock_name(target->class); - printk(":\n"); + printk(KERN_CONT ":\n"); print_stack_trace(&target->trace, 6); return 0; @@ -1102,11 +1102,11 @@ print_circular_lock_scenario(struct held_lock *src, if (parent != source) { 
printk("Chain exists of:\n "); __print_lock_name(source); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(parent); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(target); - printk("\n\n"); + printk(KERN_CONT "\n\n"); } printk(" Possible unsafe locking scenario:\n\n"); @@ -1114,16 +1114,16 @@ print_circular_lock_scenario(struct held_lock *src, printk(" ---- ----\n"); printk(" lock("); __print_lock_name(target); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(parent); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(target); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(source); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1359,22 +1359,22 @@ static void print_lock_class_header(struct lock_class *class, int depth) printk("%*s->", depth, ""); print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); + printk(KERN_CONT " ops: %lu", class->ops); + printk(KERN_CONT " {\n"); for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); + len += printk(KERN_CONT " at:\n"); print_stack_trace(class->usage_traces + bit, len); } } printk("%*s }\n", depth, ""); - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); + printk("%*s ... key at: [<%p>] %pS\n", + depth, "", class->key, class->key); } /* @@ -1437,11 +1437,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry, if (middle_class != unsafe_class) { printk("Chain exists of:\n "); __print_lock_name(safe_class); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(middle_class); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(unsafe_class); - printk("\n\n"); + printk(KERN_CONT "\n\n"); } printk(" Possible interrupt unsafe locking scenario:\n\n"); @@ -1449,18 +1449,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk(" ---- ----\n"); printk(" lock("); __print_lock_name(unsafe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" local_irq_disable();\n"); printk(" lock("); __print_lock_name(safe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(middle_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" \n"); printk(" lock("); __print_lock_name(safe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1497,9 +1497,9 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock(prev); printk("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); - printk(" ->"); + printk(KERN_CONT " ->"); print_lock_name(hlock_class(next)); - printk("\n"); + printk(KERN_CONT "\n"); printk("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); @@ -1521,8 +1521,7 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); - printk("\nthe dependencies between %s-irq-safe lock", irqclass); - printk(" and the holding lock:\n"); + printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) return 0; print_shortest_lock_dependencies(backwards_entry, prev_root); @@ -1694,10 +1693,10 @@ print_deadlock_scenario(struct held_lock *nxt, printk(" ----\n"); printk(" lock("); __print_lock_name(prev); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" 
lock("); __print_lock_name(next); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); printk(" May be due to missing lock nesting notation\n\n"); } @@ -1891,9 +1890,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); - printk(" => "); + printk(KERN_CONT " => "); print_lock_name(hlock_class(next)); - printk("\n"); + printk(KERN_CONT "\n"); dump_stack(); return graph_lock(); } @@ -2343,11 +2342,11 @@ print_usage_bug_scenario(struct held_lock *lock) printk(" ----\n"); printk(" lock("); __print_lock_name(class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" \n"); printk(" lock("); __print_lock_name(class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2522,14 +2521,18 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); - print_ip_sym(curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); - print_ip_sym(curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); - print_ip_sym(curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); - print_ip_sym(curr->softirq_disable_ip); + printk("hardirqs last enabled at (%u): [<%p>] %pS\n", + curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, + (void *)curr->hardirq_enable_ip); + printk("hardirqs last disabled at (%u): [<%p>] %pS\n", + curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, + (void *)curr->hardirq_disable_ip); + printk("softirqs last enabled at (%u): [<%p>] %pS\n", + curr->softirq_enable_event, (void *)curr->softirq_enable_ip, + (void *)curr->softirq_enable_ip); + printk("softirqs last disabled at (%u): [<%p>] %pS\n", + curr->softirq_disable_event, (void *)curr->softirq_disable_ip, + (void *)curr->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3235,8 +3238,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (very_verbose(class)) { printk("\nacquire class [%p] %s", class->key, class->name); if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); + printk(KERN_CONT "#%d", class->name_version); + printk(KERN_CONT "\n"); dump_stack(); } @@ -3378,7 +3381,7 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(") at:\n"); + printk(KERN_CONT ") at:\n"); print_ip_sym(ip); printk("but there are no more locks to release!\n"); printk("\nother info that might help us debug this:\n"); @@ -3871,7 +3874,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(") at:\n"); + printk(KERN_CONT ") at:\n"); print_ip_sym(ip); printk("but there are no locks held!\n"); printk("\nother info that might help us debug this:\n"); -- cgit v1.2.3 From b18cc3de00ec3442cf40ac60787dbe0703b99e24 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 7 Dec 2016 14:31:33 +0100 Subject: tracing/rb: Init the CPU mask on allocation Before commit b32614c03413 ("tracing/rb: Convert to 
hotplug state machine") the allocated cpumask was initialized to the mask of online or possible CPUs. After the CPU hotplug changes the buffer initialization moved to trace_rb_cpu_prepare() but the cpumask is allocated with alloc_cpumask() and therefor has random content. As a consequence the cpu buffers are not initialized and a later access dereferences a NULL pointer. Use zalloc_cpumask() instead so trace_rb_cpu_prepare() initializes the buffers properly. Fixes: b32614c03413 ("tracing/rb: Convert to hotplug state machine") Reported-by: Tetsuo Handa Signed-off-by: Sebastian Andrzej Siewior Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20161207133133.hzkcqfllxcdi3joz@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a7a055f167c7..89a2611a1635 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1297,7 +1297,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer) return NULL; - if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); -- cgit v1.2.3 From 5304121adae9fc59f4b640f82200868112edd0bd Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Tue, 6 Dec 2016 18:00:43 -0600 Subject: clocksource: export the clocks_calc_mult_shift to use by timestamp code The CPSW CPTS driver is capable of doing timestamping on tx/rx packets and requires to know mult and shift factors for timestamp conversion from raw value to nanoseconds (ptp clock). Now these mult and shift factors are calculated manually and provided through DT, which makes very hard to support of a lot number of platforms, especially if CPTS refclk is not the same for some kind of boards and depends on efuse settings (Keystone 2 platforms). Hence, export clocks_calc_mult_shift() to allow drivers like CPSW CPTS (and other ptp drivesr) to benefit from automaitc calculation of mult and shift factors. Cc: John Stultz Signed-off-by: Murali Karicheri Signed-off-by: Grygorii Strashko Acked-by: Thomas Gleixner Signed-off-by: David S. Miller --- kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7e4fad75acaa..150242ccfcd2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -89,6 +89,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) *mult = tmp; *shift = sft; } +EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); /*[Clocksource internal variables]--------- * curr_clocksource: -- cgit v1.2.3 From ef0915cacd04c9e35be5f9d62a4e4b5b4b9bcfd1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 7 Dec 2016 01:15:44 +0100 Subject: bpf: fix loading of BPF_MAXINSNS sized programs General assumption is that single program can hold up to BPF_MAXINSNS, that is, 4096 number of instructions. It is the case with cBPF and that limit was carried over to eBPF. When recently testing digest, I noticed that it's actually not possible to feed 4096 instructions via bpf(2). The check for > BPF_MAXINSNS was added back then to bpf_check() in cbd357008604 ("bpf: verifier (add ability to receive verification log)"). 
However, 09756af46893 ("bpf: expand BPF syscall with program load/unload") added yet another check that comes before that into bpf_prog_load(), but this time bails out already in case of >= BPF_MAXINSNS. Fix it up and perform the check early in bpf_prog_load(), so we can drop the second one in bpf_check(). It makes sense because a 0-insn program is useless as well, and we don't want to waste any resources doing work up to the bpf_check() point. The existing bpf(2) man page documents E2BIG as the official error for such cases, so just stick with it as well. Fixes: 09756af46893 ("bpf: expand BPF syscall with program load/unload") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 4 ++-- kernel/bpf/verifier.c | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0d2b423ce93..88f609f1c0c3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -786,8 +786,8 @@ static int bpf_prog_load(union bpf_attr *attr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt >= BPF_MAXINSNS) - return -EINVAL; + if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + return -E2BIG; if (type == BPF_PROG_TYPE_KPROBE && attr->kern_version != LINUX_VERSION_CODE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cb37339ca0da..da9fb2a9b7eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3133,9 +3133,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) struct bpf_verifier_env *env; int ret = -EINVAL; - if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) - return -E2BIG; - /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ -- cgit v1.2.3 From 166ad0e1e2132ff0cda08b94af8301655fcabbcd Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Dec 2016 14:44:36 -0800 Subject: kcov: add missing #include <linux/sched.h> In __sanitizer_cov_trace_pc we use task_struct and fields within it, but as we haven't included <linux/sched.h>, it is not guaranteed to be defined. While we usually happen to acquire the definition through a transitive include, this is fragile (and hasn't been true in the past, causing issues with backports). Include <linux/sched.h> to avoid any fragility. [mark.rutland@arm.com: rewrote changelog] Link: http://lkml.kernel.org/r/1481007384-27529-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mark Rutland Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 30e6d05aa5a9..3cbb0c879705 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -7,6 +7,7 @@ #include #include #include +#include <linux/sched.h> #include #include #include -- cgit v1.2.3 From 777c6e0daebb3fcefbbd6f620410a946b07ef6d0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 7 Dec 2016 14:54:38 +0100 Subject: hotplug: Make register and unregister notifier API symmetric Yu Zhao has noticed that __unregister_cpu_notifier only unregisters its notifiers when HOTPLUG_CPU=y while the registration might succeed even when HOTPLUG_CPU=n if MODULE is enabled. This means that e.g. zswap might keep a stale notifier on the list on the manual clean up during the pool tear down and thus corrupt the list.
Resulting in the following [ 144.964346] BUG: unable to handle kernel paging request at ffff880658a2be78 [ 144.971337] IP: [] raw_notifier_chain_register+0x1b/0x40 [ 145.122628] Call Trace: [ 145.125086] [] __register_cpu_notifier+0x18/0x20 [ 145.131350] [] zswap_pool_create+0x273/0x400 [ 145.137268] [] __zswap_param_set+0x1fc/0x300 [ 145.143188] [] ? trace_hardirqs_on+0xd/0x10 [ 145.149018] [] ? kernel_param_lock+0x28/0x30 [ 145.154940] [] ? __might_fault+0x4f/0xa0 [ 145.160511] [] zswap_compressor_param_set+0x17/0x20 [ 145.167035] [] param_attr_store+0x5c/0xb0 [ 145.172694] [] module_attr_store+0x1d/0x30 [ 145.178443] [] sysfs_kf_write+0x4f/0x70 [ 145.183925] [] kernfs_fop_write+0x149/0x180 [ 145.189761] [] __vfs_write+0x18/0x40 [ 145.194982] [] vfs_write+0xb2/0x1a0 [ 145.200122] [] SyS_write+0x52/0xa0 [ 145.205177] [] entry_SYSCALL_64_fastpath+0x12/0x17 This can be even triggered manually by changing /sys/module/zswap/parameters/compressor multiple times. Fix this issue by making unregister APIs symmetric to the register so there are no surprises. Fixes: 47e627bc8c9a ("[PATCH] hotplug: Allow modules to use the cpu hotplug notifiers even if !CONFIG_HOTPLUG_CPU") Reported-and-tested-by: Yu Zhao Signed-off-by: Michal Hocko Cc: linux-mm@kvack.org Cc: Andrew Morton Cc: Dan Streetman Link: http://lkml.kernel.org/r/20161207135438.4310-1-mhocko@kernel.org Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 15 ++++----------- kernel/cpu.c | 2 +- 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b886dc17f2f3..e571128ad99a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -93,22 +93,16 @@ extern bool cpuhp_tasks_frozen; { .notifier_call = fn, .priority = pri }; \ __register_cpu_notifier(&fn##_nb); \ } -#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ -#define cpu_notifier(fn, pri) do { (void)(fn); } while (0) -#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) -#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ -#ifdef CONFIG_HOTPLUG_CPU extern int register_cpu_notifier(struct notifier_block *nb); extern int __register_cpu_notifier(struct notifier_block *nb); extern void unregister_cpu_notifier(struct notifier_block *nb); extern void __unregister_cpu_notifier(struct notifier_block *nb); -#else -#ifndef MODULE -extern int register_cpu_notifier(struct notifier_block *nb); -extern int __register_cpu_notifier(struct notifier_block *nb); -#else +#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ +#define cpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) + static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; @@ -118,7 +112,6 @@ static inline int __register_cpu_notifier(struct notifier_block *nb) { return 0; } -#endif static inline void unregister_cpu_notifier(struct notifier_block *nb) { diff --git a/kernel/cpu.c b/kernel/cpu.c index 29de1a9352c0..217fd2e7f435 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -659,7 +659,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); } -#ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) @@ -676,6 +675,7 @@ void __unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(__unregister_cpu_notifier); +#ifdef CONFIG_HOTPLUG_CPU /** * clear_tasks_mm_cpumask - Safely clear 
tasks' mm_cpumask for a CPU * @cpu: a CPU id -- cgit v1.2.3 From 1da5c46fa965ff90f5ffc080b6ab3fae5e227bc3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:50:57 +0100 Subject: kthread: Make struct kthread kmalloc'ed commit 23196f2e5f5d "kthread: Pin the stack via try_get_task_stack() / put_task_stack() in to_live_kthread() function" is a workaround for the fragile design of struct kthread being allocated on the task stack. struct kthread in its current form should be removed, but this needs cleanups outside of kthread.c. As a first step move struct kthread away from the task stack by making it kmalloc'ed. This allows access to kthread.exited without the magic of trying to pin the task stack and the try logic in to_live_kthread(). Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Chunming Zhou Cc: Roman Pen Cc: Petr Mladek Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175057.GA5330@redhat.com Signed-off-by: Thomas Gleixner --- include/linux/kthread.h | 1 + kernel/fork.c | 2 ++ kernel/kthread.c | 58 ++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 48 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index a6e82a69c363..c1c3e63d52c1 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -48,6 +48,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), __k; \ }) +void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); int kthread_stop(struct task_struct *k); diff --git a/kernel/fork.c b/kernel/fork.c index 600e93b5e539..7ffa16033ded 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -354,6 +354,8 @@ void free_task(struct task_struct *tsk) ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); arch_release_task_struct(tsk); + if (tsk->flags & PF_KTHREAD) + free_kthread_struct(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); diff --git a/kernel/kthread.c b/kernel/kthread.c index be2cc1f9dd57..9d64b6526d0b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -53,14 +53,38 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define __to_kthread(vfork) \ - container_of(vfork, struct kthread, exited) +static inline void set_kthread_struct(void *kthread) +{ + /* + * We abuse ->set_child_tid to avoid the new member and because it + * can't be wrongly copied by copy_process(). We also rely on fact + * that the caller can't exec, so PF_KTHREAD can't be cleared. + */ + current->set_child_tid = (__force void __user *)kthread; +} static inline struct kthread *to_kthread(struct task_struct *k) { - return __to_kthread(k->vfork_done); + WARN_ON(!(k->flags & PF_KTHREAD)); + return (__force void *)k->set_child_tid; +} + +void free_kthread_struct(struct task_struct *k) +{ + /* + * Can be NULL if this kthread was created by kernel_thread() + * or if kmalloc() in kthread() failed. + */ + kfree(to_kthread(k)); } +#define __to_kthread(vfork) \ + container_of(vfork, struct kthread, exited) + +/* + * TODO: kill it and use to_kthread(). But we still need the users + * like kthread_stop() which has to sync with the exiting kthread. + */ static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); @@ -181,14 +205,11 @@ static int kthread(void *_create) int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; - struct kthread self; + struct kthread *self; int ret; - self.flags = 0; - self.data = data; - init_completion(&self.exited); - init_completion(&self.parked); - current->vfork_done = &self.exited; + self = kmalloc(sizeof(*self), GFP_KERNEL); + set_kthread_struct(self); /* If user was SIGKILLed, I release the structure. */ done = xchg(&create->done, NULL); @@ -196,6 +217,19 @@ static int kthread(void *_create) kfree(create); do_exit(-EINTR); } + + if (!self) { + create->result = ERR_PTR(-ENOMEM); + complete(done); + do_exit(-ENOMEM); + } + + self->flags = 0; + self->data = data; + init_completion(&self->exited); + init_completion(&self->parked); + current->vfork_done = &self->exited; + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; @@ -203,12 +237,10 @@ static int kthread(void *_create) schedule(); ret = -EINTR; - - if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { - __kthread_parkme(&self); + if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { + __kthread_parkme(self); ret = threadfn(data); } - /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } -- cgit v1.2.3 From eff9662547f358239b98dfc4a8e6905b494e14d6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:51:00 +0100 Subject: Revert "kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function" This reverts commit 23196f2e5f5d810578a772785807dcdc2b9fdce9. Now that struct kthread is kmalloc'ed and no longer on the task stack, there is no need anymore to pin the stack.
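[editor: for reference, a condensed sketch of the heap-based lookup this revert relies on, taken from the previous patch:]

	/* struct kthread is now kmalloc'ed; the pointer is stashed in the
	 * otherwise-unused ->set_child_tid field, so it stays reachable
	 * without pinning the task stack. */
	static inline struct kthread *to_kthread(struct task_struct *k)
	{
		WARN_ON(!(k->flags & PF_KTHREAD));
		return (__force void *)k->set_child_tid;
	}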
Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Chunming Zhou Cc: Roman Pen Cc: Petr Mladek Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175100.GA5333@redhat.com Signed-off-by: Thomas Gleixner --- kernel/kthread.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9d64b6526d0b..7891a940007d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -88,7 +88,7 @@ void free_kthread_struct(struct task_struct *k) static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork) && try_get_task_stack(k)) + if (likely(vfork)) return __to_kthread(vfork); return NULL; } @@ -473,10 +473,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_live_kthread(k); - if (kthread) { + if (kthread) __kthread_unpark(k, kthread); - put_task_stack(k); - } } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -505,7 +503,6 @@ int kthread_park(struct task_struct *k) wait_for_completion(&kthread->parked); } } - put_task_stack(k); ret = 0; } return ret; @@ -541,7 +538,6 @@ int kthread_stop(struct task_struct *k) __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); - put_task_stack(k); } ret = k->exit_code; put_task_struct(k); -- cgit v1.2.3 From efb29fbfa50c490dac64a9418ebe553be82df781 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:51:03 +0100 Subject: kthread: Don't use to_live_kthread() in kthread_stop() kthread_stop() had to use to_live_kthread() simply because it was not possible to access kthread->exited after the exiting task clears task_struct->vfork_done. Now that to_kthread() is always valid, wake_up_process() + wait_for_completion() can be done unconditionally. It's not an issue anymore if the task has already issued complete_vfork_done() or died. The exiting task can get the spurious wakeup after mm_release() but this is possible without this change too and is fine; do_task_dead() ensures that this can't do any harm. As a further enhancement this could be converted to task_work_add() later, so ->vfork_done can be avoided completely.
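[editor: abridged from the diff below; with kthread->exited off the task stack, the stop path needs no liveness check:]

	kthread = to_kthread(k);
	set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
	__kthread_unpark(k, kthread);
	wake_up_process(k);			/* at worst a spurious wakeup */
	wait_for_completion(&kthread->exited);	/* valid even after exit */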
Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Chunming Zhou Cc: Roman Pen Cc: Petr Mladek Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175103.GA5336@redhat.com Signed-off-by: Thomas Gleixner --- kernel/kthread.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 7891a940007d..4dcbc8b5d6b6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -532,13 +532,11 @@ int kthread_stop(struct task_struct *k) trace_sched_kthread_stop(k); get_task_struct(k); - kthread = to_live_kthread(k); - if (kthread) { - set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - __kthread_unpark(k, kthread); - wake_up_process(k); - wait_for_completion(&kthread->exited); - } + kthread = to_kthread(k); + set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); + __kthread_unpark(k, kthread); + wake_up_process(k); + wait_for_completion(&kthread->exited); ret = k->exit_code; put_task_struct(k); -- cgit v1.2.3 From cf380a4a96e2260742051fa7fc831596bb26cc8b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:51:07 +0100 Subject: kthread: Don't use to_live_kthread() in kthread_[un]park() Now that to_kthread() is always valid, change kthread_park() and kthread_unpark() to use it and kill to_live_kthread(). The conversion of kthread_unpark() is trivial. If KTHREAD_IS_PARKED is set then the task has called complete(&self->parked) and therefore the function cannot race against a concurrent kthread_stop() and exit. kthread_park() is more tricky, because its semantics are not well defined. It returns -ENOSYS if the thread exited, but this can never happen, and as Roman pointed out kthread_park() can obviously block forever if it would race with the exiting kthread. The usage of kthread_park() in cpuhp code (cpu.c, smpboot.c, stop_machine.c) is fine. It can never see an exiting/exited kthread, smpboot_destroy_threads() clears *ht->store, smpboot_park_thread() checks it is not NULL under the same smpboot_threads_lock. cpuhp_threads and cpu_stop_threads never exit, so other callers are fine too. But it has two more users: - watchdog_park_threads(): The code is actually correct, get_online_cpus() ensures that kthread_park() can't race with itself (note that kthread_park() can't handle this race correctly), but it should not use kthread_park() directly. - drivers/gpu/drm/amd/scheduler/gpu_scheduler.c should not use kthread_park() either. kthread_park() must not be called after amd_sched_fini() which does kthread_stop(), otherwise even to_live_kthread() is not safe because task_struct can be already freed and sched->thread can point to nowhere. The usage of kthread_park/unpark should either be restricted to core code which is properly protected against the exit race or made more robust so it is safe to use it in drivers. To catch possible exit issues, add a WARN_ON(PF_EXITING) for now.
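[editor: a hypothetical misuse to make the race concrete; "worker" is a made-up task pointer, not from the patch:]

	kthread_stop(worker);
	/* The thread can no longer call complete(&kthread->parked), so
	 * before this patch the park below could block forever; with the
	 * new WARN_ON(PF_EXITING) check it warns and returns -ENOSYS. */
	kthread_park(worker);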
Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Chunming Zhou Cc: Roman Pen Cc: Petr Mladek Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175107.GA5339@redhat.com Signed-off-by: Thomas Gleixner --- kernel/kthread.c | 69 ++++++++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 4dcbc8b5d6b6..01d27164e5b7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -78,21 +78,6 @@ void free_kthread_struct(struct task_struct *k) kfree(to_kthread(k)); } -#define __to_kthread(vfork) \ - container_of(vfork, struct kthread, exited) - -/* - * TODO: kill it and use to_kthread(). But we still need the users - * like kthread_stop() which has to sync with the exiting kthread. - */ -static struct kthread *to_live_kthread(struct task_struct *k) -{ - struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) - return __to_kthread(vfork); - return NULL; -} - /** * kthread_should_stop - should this kthread return now? * @@ -441,8 +426,18 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; } -static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) +/** + * kthread_unpark - unpark a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return false, wakes it, and + * waits for it to return. If the thread is marked percpu then its + * bound to the cpu again. + */ +void kthread_unpark(struct task_struct *k) { + struct kthread *kthread = to_kthread(k); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * We clear the IS_PARKED bit here as we don't wait @@ -460,22 +455,6 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) wake_up_state(k, TASK_PARKED); } } - -/** - * kthread_unpark - unpark a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * - * Sets kthread_should_park() for @k to return false, wakes it, and - * waits for it to return. If the thread is marked percpu then its - * bound to the cpu again. 
- */ -void kthread_unpark(struct task_struct *k) -{ - struct kthread *kthread = to_live_kthread(k); - - if (kthread) - __kthread_unpark(k, kthread); -} EXPORT_SYMBOL_GPL(kthread_unpark); /** @@ -492,20 +471,20 @@ EXPORT_SYMBOL_GPL(kthread_unpark); */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = to_live_kthread(k); - int ret = -ENOSYS; - - if (kthread) { - if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - if (k != current) { - wake_up_process(k); - wait_for_completion(&kthread->parked); - } + struct kthread *kthread = to_kthread(k); + + if (WARN_ON(k->flags & PF_EXITING)) + return -ENOSYS; + + if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); } - ret = 0; } - return ret; + + return 0; } EXPORT_SYMBOL_GPL(kthread_park); @@ -534,7 +513,7 @@ int kthread_stop(struct task_struct *k) get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - __kthread_unpark(k, kthread); + kthread_unpark(k); wake_up_process(k); wait_for_completion(&kthread->exited); ret = k->exit_code; -- cgit v1.2.3 From 8fb9dcbdc3619741c10c573199d804161c34c89a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:51:10 +0100 Subject: kthread: Don't abuse kthread_create_on_cpu() in __kthread_create_worker() kthread_create_on_cpu() sets KTHREAD_IS_PER_CPU and kthread->cpu; this only makes sense if this kthread can be parked/unparked by cpuhp code. kthread workers never call kthread_parkme() so this has no effect. Change __kthread_create_worker() to simply call kthread_bind(task, cpu). The very fact that kthread_create_on_cpu() doesn't accept a generic fmt shows that it should not be used outside of smpboot.c. Now, the only reason we cannot unexport this helper and move it into smpboot.c is that it sets kthread->cpu and struct kthread is not exported. And the only reason we cannot kill kthread->cpu is that kthread_unpark() is used by drivers/gpu/drm/amd/scheduler/gpu_scheduler.c and thus we cannot turn _unpark into kthread_unpark(struct smp_hotplug_thread *, cpu). Signed-off-by: Oleg Nesterov Tested-by: Petr Mladek Acked-by: Peter Zijlstra (Intel) Reviewed-by: Petr Mladek Cc: Chunming Zhou Cc: Roman Pen Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175110.GA5342@redhat.com Signed-off-by: Thomas Gleixner --- kernel/kthread.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 01d27164e5b7..956495f0efaf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -641,6 +641,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; + int node = -1; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -648,25 +649,17 @@ __kthread_create_worker(int cpu, unsigned int flags, kthread_init_worker(worker); - if (cpu >= 0) { - char name[TASK_COMM_LEN]; - - /* - * kthread_create_worker_on_cpu() allows to pass a generic - * namefmt in compare with kthread_create_on_cpu. We need - * to format it here.
- */ - vsnprintf(name, sizeof(name), namefmt, args); - task = kthread_create_on_cpu(kthread_worker_fn, worker, - cpu, name); - } else { - task = __kthread_create_on_node(kthread_worker_fn, worker, - -1, namefmt, args); - } + if (cpu >= 0) + node = cpu_to_node(cpu); + task = __kthread_create_on_node(kthread_worker_fn, worker, + node, namefmt, args); if (IS_ERR(task)) goto fail_task; + if (cpu >= 0) + kthread_bind(task, cpu); + worker->flags = flags; worker->task = task; wake_up_process(task); -- cgit v1.2.3 From d2a4dd37f6b41fbcad76efbf63124eb3126c66fe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 7 Dec 2016 10:57:59 -0800 Subject: bpf: fix state equivalence Commits 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") and 484611357c19 ("bpf: allow access into map value arrays") by themselves are correct, but in combination they make state equivalence ignore 'id' field of the register state which can lead to accepting an invalid program. Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") Fixes: 484611357c19 ("bpf: allow access into map value arrays") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 14 +++++++------- kernel/bpf/verifier.c | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7453c1281531..a13b031dc6b8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -18,13 +18,6 @@ struct bpf_reg_state { enum bpf_reg_type type; - /* - * Used to determine if any memory access using this register will - * result in a bad access. - */ - s64 min_value; - u64 max_value; - u32 id; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; @@ -40,6 +33,13 @@ struct bpf_reg_state { */ struct bpf_map *map_ptr; }; + u32 id; + /* Used to determine if any memory access using this register will + * result in a bad access. These two fields must be last. + * See states_equal() + */ + s64 min_value; + u64 max_value; }; enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index da9fb2a9b7eb..5b14f85f45c6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2528,7 +2528,7 @@ static bool states_equal(struct bpf_verifier_env *env, * we didn't do a variable access into a map then we are a-ok. */ if (!varlen_map_access && - rold->type == rcur->type && rold->imm == rcur->imm) + memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0) continue; /* If we didn't map access then again we don't care about the -- cgit v1.2.3 From 17bedab2723145d17b14084430743549e6943d03 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 7 Dec 2016 15:53:11 -0800 Subject: bpf: xdp: Allow head adjustment in XDP prog This patch allows XDP prog to extend/remove the packet data at the head (like adding or removing a header). It is done by adding a new XDP helper bpf_xdp_adjust_head(). It also renames bpf_helper_changes_skb_data() to bpf_helper_changes_pkt_data() to better reflect that XDP prog does not work on skb. This patch adds one "xdp_adjust_head" bit to bpf_prog for the XDP-capable driver to check if the XDP prog requires bpf_xdp_adjust_head() support. The driver can then decide to error out during XDP_SETUP_PROG. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S.
Miller --- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- arch/s390/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 5 ++++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 ++++ .../net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++++ drivers/net/ethernet/qlogic/qede/qede_main.c | 5 ++++ include/linux/filter.h | 6 +++-- include/uapi/linux/bpf.h | 11 ++++++++- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 2 ++ kernel/bpf/verifier.c | 2 +- net/core/filter.c | 28 ++++++++++++++++++++-- 13 files changed, 67 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0fe98a567125..73a5cf18fd84 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -766,7 +766,7 @@ emit_clear: func = (u8 *) __bpf_call_base + imm; /* Save skb pointer if we need to re-cache skb data */ - if (bpf_helper_changes_skb_data(func)) + if (bpf_helper_changes_pkt_data(func)) PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -775,7 +775,7 @@ emit_clear: PPC_MR(b2p[BPF_REG_0], 3); /* refresh skb cache */ - if (bpf_helper_changes_skb_data(func)) { + if (bpf_helper_changes_pkt_data(func)) { /* reload skb pointer to r3 */ PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_skb_loads(image, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bee281f3163d..167b31b186c1 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); - if (bpf_helper_changes_skb_data((void *)func)) { + if (bpf_helper_changes_pkt_data((void *)func)) { jit->seen |= SEEN_SKB_CHANGE; /* lg %b1,ST_OFF_SKBP(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fe04a04dab8e..e76d1af60f7a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_skb_data(func); + reload_skb_data = bpf_helper_changes_pkt_data(func); if (reload_skb_data) { EMIT1(0x57); /* push %rdi */ jmp_offset += 22; /* pop, mov, sub, mov */ diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 49a81f1fc1d6..f441eda63bec 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2686,6 +2686,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) int err; int i; + if (prog && prog->xdp_adjust_head) { + en_err(priv, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + xdp_ring_num = prog ? 
priv->rx_ring_num : 0; /* No need to reconfigure buffers when simply swapping the diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 07020276fe73..cbfa38fc72c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3183,6 +3183,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) bool reset, was_opened; int i; + if (prog && prog->xdp_adjust_head) { + netdev_err(netdev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + mutex_lock(&priv->state_lock); if ((netdev->features & NETIF_F_LRO) && prog) { diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 00d9a03be31d..e8d448109e03 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2946,6 +2946,10 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog) }; int err; + if (prog && prog->xdp_adjust_head) { + nn_err(nn, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } if (!prog && !nn->xdp_prog) return 0; if (prog && nn->xdp_prog) { diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index cf1dd1436d93..aecdd1c5c0ea 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2507,6 +2507,11 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) { struct qede_reload_args args; + if (prog && prog->xdp_adjust_head) { + DP_ERR(edev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + /* If we're called, there was already a bpf reference increment */ args.func = &qede_xdp_reload_func; args.u.new_prog = prog; diff --git a/include/linux/filter.h b/include/linux/filter.h index f078d2b1cff6..6a1658308612 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -406,7 +406,8 @@ struct bpf_prog { u16 jited:1, /* Is our filter JIT'ed? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + xdp_adjust_head:1; /* Adjusting pkt head? 
*/ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ @@ -440,6 +441,7 @@ struct bpf_skb_data_end { struct xdp_buff { void *data; void *data_end; + void *data_hard_start; }; /* compute the linear packet data range [data, data_end) which @@ -595,7 +597,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); -bool bpf_helper_changes_skb_data(void *func); +bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6123d9b8e828..0eb0e87dbe9f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -424,6 +424,12 @@ union bpf_attr { * @len: length of header to be pushed in front * @flags: Flags (unused for now) * Return: 0 on success or negative error + * + * int bpf_xdp_adjust_head(xdp_md, delta) + * Adjust the xdp_md.data by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -469,7 +475,8 @@ union bpf_attr { FN(csum_update), \ FN(set_hash_invalid), \ FN(get_numa_node_id), \ - FN(skb_change_head), + FN(skb_change_head), \ + FN(xdp_adjust_head), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -576,6 +583,8 @@ struct bpf_sock { __u32 protocol; }; +#define XDP_PACKET_HEADROOM 256 + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. 
Unknown return codes will result diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bdcc9f4ba767..83e0d153b0b4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1143,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) return prog; } -bool __weak bpf_helper_changes_skb_data(void *func) +bool __weak bpf_helper_changes_pkt_data(void *func) { return false; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 88f609f1c0c3..4819ec9d95f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -579,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_xdp_adjust_head) + prog->xdp_adjust_head = 1; if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5b14f85f45c6..d28f9a3380a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1216,7 +1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } - changes_data = bpf_helper_changes_skb_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(fn->func); memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; diff --git a/net/core/filter.c b/net/core/filter.c index b751202e12f8..b1461708a977 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2234,7 +2234,28 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_skb_data(void *func) +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + void *data = xdp->data + offset; + + if (unlikely(data < xdp->data_hard_start || + data > xdp->data_end - ETH_HLEN)) + return -EINVAL; + + xdp->data = data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || @@ -2244,7 +2265,8 @@ bool bpf_helper_changes_skb_data(void *func) func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace) + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head) return true; return false; @@ -2670,6 +2692,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_xdp_adjust_head: + return &bpf_xdp_adjust_head_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 9c1645727b8fa90d07256fdfcc45bf831242a3ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 8 Dec 2016 20:49:32 +0000 Subject: timekeeping: Force unsigned clocksource to nanoseconds conversion The clocksource delta to nanoseconds conversion is using signed math, but the delta is unsigned. This makes the conversion space smaller than necessary and in case of a multiplication overflow the conversion can become negative.
The conversion is done with scaled math: s64 nsec_delta = ((s64)clkdelta * clk->mult) >> clk->shift; Shifting a signed integer right obviously preserves the sign, which has interesting consequences: - Time jumps backwards - __iter_div_u64_rem() which is used in one of the calling code paths will take forever to piecewise calculate the seconds/nanoseconds part. This has been reported by several people with different scenarios: David observed that when stopping a VM with a debugger: "It was essentially the stopped by debugger case. I forget exactly why, but the guest was being explicitly stopped from outside, it wasn't just scheduling lag. I think it was something in the vicinity of 10 minutes stopped." When lifting the stop the machine went dead. The stopped by debugger case is not really interesting, but nevertheless it would be a good thing not to die completely. But this was also observed on a live system by Liav: "When the OS is too overloaded, delta will get a high enough value for the msb of the sum delta * tkr->mult + tkr->xtime_nsec to be set, and so after the shift the nsec variable will gain a value similar to 0xffffffffff000000." Unfortunately this has been reintroduced recently with commit 6bd58f09e1d8 ("time: Add cycles to nanoseconds translation"). It had been fixed a year ago already in commit 35a4933a8959 ("time: Avoid signed overflow in timekeeping_get_ns()"). Though it's not surprising that the issue has been reintroduced because the function itself and the whole call chain use s64 for the result and the propagation of it. The change in this recent commit is subtle: s64 nsec; - nsec = (d * m + n) >> s; + nsec = d * m + n; + nsec >>= s; d being of type cycle_t adds another level of obfuscation. This wouldn't have happened if the previous change to unsigned computation had made the 'nsec' variable u64 right away and a follow up patch had cleaned up the whole call chain. There have been patches submitted which basically did a revert of the above patch leaving everything else unchanged as signed. Back to square one. This spawned an admittedly pointless discussion about potential users which rely on the unsigned behaviour until someone pointed out that it had been fixed before. The changelogs of said patches added further confusion as they ultimately made false claims about the consequences for potential users which expect signed results. Despite delta being cycle_t, aka. u64, it's very well possible to hand in a signed negative value and the signed computation will happily return the correct result. But nobody actually sat down and analyzed the code which was added as user after the probably unintended signed conversion. Though in sensitive code like this it's better to analyze it properly and make sure that nothing relies on this than hunting the subtle wreckage half a year later. After analyzing all call chains it turns out that no caller can hand in a negative value (which actually would work due to the s64 cast) and rely on the signed math to do the right thing. Change the conversion function to unsigned math. The conversion of all call chains is done in a follow up patch. This solves the starvation issue, which was caused by the negative result, but it does not solve the underlying problem. It merely postpones it. When the timekeeper update is deferred long enough that the unsigned multiplication overflows, then time going backwards is observable again.
Neither does it solve the issue of clocksources with a small counter width which will wrap around possibly several times and cause random time stamps to be generated. But those are usually not found on systems used for virtualization, so this is likely a non-issue. I took the liberty to claim authorship for this simply because analyzing all callsites and writing the changelog took substantially more time than just making the simple s/s64/u64/ change and ignoring the rest. Fixes: 6bd58f09e1d8 ("time: Add cycles to nanoseconds translation") Reported-by: David Gibson Reported-by: Liav Rehana Signed-off-by: Thomas Gleixner Reviewed-by: David Gibson Acked-by: Peter Zijlstra (Intel) Cc: Parit Bhargava Cc: Laurent Vivier Cc: "Christopher S. Hall" Cc: Chris Metcalf Cc: Richard Cochran Cc: John Stultz Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20161208204228.688545601@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b2286e94c934..bfe589e929e8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -299,10 +299,10 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, +static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, cycle_t delta) { - s64 nsec; + u64 nsec; nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; -- cgit v1.2.3 From acc89612a70e370a5640fd77a83f15b7b94d85e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 8 Dec 2016 20:49:34 +0000 Subject: timekeeping: Make the conversion call chain consistently unsigned Propagating an unsigned value through signed variables and functions makes absolutely no sense and is just prone to (re)introduce subtle signed vs. unsigned issues as happened recently. Clean it up. Signed-off-by: Thomas Gleixner Reviewed-by: David Gibson Acked-by: Peter Zijlstra (Intel) Cc: Parit Bhargava Cc: Laurent Vivier Cc: "Christopher S.
Hall" Cc: Chris Metcalf Cc: Richard Cochran Cc: Liav Rehana Cc: John Stultz Link: http://lkml.kernel.org/r/20161208204228.765843099@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bfe589e929e8..5244821643a4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -311,7 +311,7 @@ static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) +static inline u64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; @@ -319,8 +319,8 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) return timekeeping_delta_to_ns(tkr, delta); } -static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, - cycle_t cycles) +static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) { cycle_t delta; @@ -652,7 +652,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) { struct clocksource *clock = tk->tkr_mono.clock; cycle_t cycle_now, delta; - s64 nsec; + u64 nsec; cycle_now = tk->tkr_mono.read(clock); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); @@ -681,7 +681,7 @@ int __getnstimeofday64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; - s64 nsecs = 0; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -721,7 +721,7 @@ ktime_t ktime_get(void) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; - s64 nsecs; + u64 nsecs; WARN_ON(timekeeping_suspended); @@ -764,7 +764,7 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; - s64 nsecs; + u64 nsecs; WARN_ON(timekeeping_suspended); @@ -808,7 +808,7 @@ ktime_t ktime_get_raw(void) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; - s64 nsecs; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -833,8 +833,8 @@ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; - s64 nsec; unsigned int seq; + u64 nsec; WARN_ON(timekeeping_suspended); @@ -922,8 +922,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) unsigned long seq; ktime_t base_raw; ktime_t base_real; - s64 nsec_raw; - s64 nsec_real; + u64 nsec_raw; + u64 nsec_real; cycle_t now; WARN_ON_ONCE(timekeeping_suspended); @@ -1081,7 +1081,7 @@ int get_device_system_crosststamp(int (*get_time_fn) cycle_t cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; - s64 nsec_real, nsec_raw; + u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned long seq; bool do_interp; @@ -1394,7 +1394,7 @@ void getrawmonotonic64(struct timespec64 *ts) struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts64; unsigned long seq; - s64 nsecs; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); -- cgit v1.2.3 From cbd99e3b289e43000c29aa4aa9b94b394cdc68bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 8 Dec 2016 20:49:36 +0000 Subject: timekeeping: Get rid of pointless typecasts cycle_t is defined as u64, so casting it to u64 is a pointless and confusing exercise. 
cycle_t should simply go away and be replaced with a plain u64 to avoid further confusion. Signed-off-by: Thomas Gleixner Reviewed-by: David Gibson Acked-by: Peter Zijlstra (Intel) Cc: Parit Bhargava Cc: Laurent Vivier Cc: "Christopher S. Hall" Cc: Chris Metcalf Cc: Richard Cochran Cc: Liav Rehana Cc: John Stultz Link: http://lkml.kernel.org/r/20161208204228.844699737@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5244821643a4..82e1b5cbebbb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -258,10 +258,9 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ - tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; - tk->raw_interval = - ((u64) interval * clock->mult) >> clock->shift; + tk->raw_interval = (interval * clock->mult) >> clock->shift; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { -- cgit v1.2.3 From c029a2bec66e42e57538cb65e28618baf6a4b311 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 8 Dec 2016 20:49:38 +0000 Subject: timekeeping: Use mul_u64_u32_shr() instead of open coding it The resume code must deal with a clocksource delta which is potentially big enough to overflow the 64bit mult. Replace the open coded handling with the proper function. Signed-off-by: Thomas Gleixner Reviewed-by: David Gibson Acked-by: Peter Zijlstra (Intel) Cc: Parit Bhargava Cc: Laurent Vivier Cc: "Christopher S. Hall" Cc: Chris Metcalf Cc: Richard Cochran Cc: Liav Rehana Cc: John Stultz Link: http://lkml.kernel.org/r/20161208204228.921674404@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 82e1b5cbebbb..da233cdf89b0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1644,7 +1644,7 @@ void timekeeping_resume(void) struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; - cycle_t cycle_now, cycle_delta; + cycle_t cycle_now; sleeptime_injected = false; read_persistent_clock64(&ts_new); @@ -1670,27 +1670,11 @@ void timekeeping_resume(void) cycle_now = tk->tkr_mono.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->tkr_mono.cycle_last) { - u64 num, max = ULLONG_MAX; - u32 mult = clock->mult; - u32 shift = clock->shift; - s64 nsec = 0; - - cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, - tk->tkr_mono.mask); - - /* - * "cycle_delta * mutl" may cause 64 bits overflow, if the - * suspended time is too long. 
In that case we need do the - * 64 bits math carefully */ - do_div(max, mult); - if (cycle_delta > max) { - num = div64_u64(cycle_delta, max); - nsec = (((u64) max * mult) >> shift) * num; - cycle_delta -= num * max; - } - nsec += ((u64) cycle_delta * mult) >> shift; + u64 nsec, cyc_delta; + cyc_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, + tk->tkr_mono.mask); + nsec = mul_u64_u32_shr(cyc_delta, clock->mult, clock->shift); ts_delta = ns_to_timespec64(nsec); sleeptime_injected = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { -- cgit v1.2.3 From 8cf868affdc459beee1a941df0cfaba1673740e3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Nov 2016 13:03:21 -0500 Subject: tracing: Have the reg function allow to fail Some tracepoints have a registration function that is called when the tracepoint is enabled. There may be cases where the registration function must fail (for example, can't allocate enough memory). In this case, the tracepoint should also fail to register, otherwise the user would not know why the tracepoint is not working. Cc: David Howells Cc: Seiji Aguchi Cc: Anton Blanchard Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/powerpc/include/asm/trace.h | 4 ++-- arch/powerpc/platforms/powernv/opal-tracepoints.c | 6 ++++-- arch/powerpc/platforms/pseries/lpar.c | 6 ++++-- arch/x86/include/asm/trace/exceptions.h | 2 +- arch/x86/include/asm/trace/irq_vectors.h | 2 +- arch/x86/kernel/tracepoint.c | 3 ++- drivers/i2c/i2c-core.c | 3 ++- include/linux/tracepoint-defs.h | 2 +- include/linux/tracepoint.h | 2 +- include/trace/events/i2c.h | 2 +- kernel/trace/trace_benchmark.c | 3 ++- kernel/trace/trace_benchmark.h | 2 +- kernel/tracepoint.c | 12 +++++++++--- samples/trace_events/trace-events-sample.c | 3 ++- samples/trace_events/trace-events-sample.h | 2 +- 15 files changed, 34 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index 32e36b16773f..c05cef6ee06c 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -54,7 +54,7 @@ DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit, ); #ifdef CONFIG_PPC_PSERIES -extern void hcall_tracepoint_regfunc(void); +extern int hcall_tracepoint_regfunc(void); extern void hcall_tracepoint_unregfunc(void); TRACE_EVENT_FN_COND(hcall_entry, @@ -104,7 +104,7 @@ TRACE_EVENT_FN_COND(hcall_exit, #endif #ifdef CONFIG_PPC_POWERNV -extern void opal_tracepoint_regfunc(void); +extern int opal_tracepoint_regfunc(void); extern void opal_tracepoint_unregfunc(void); TRACE_EVENT_FN(opal_entry, diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c index 1e496b780efd..3c447002edff 100644 --- a/arch/powerpc/platforms/powernv/opal-tracepoints.c +++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c @@ -6,9 +6,10 @@ #ifdef HAVE_JUMP_LABEL struct static_key opal_tracepoint_key = STATIC_KEY_INIT; -void opal_tracepoint_regfunc(void) +int opal_tracepoint_regfunc(void) { static_key_slow_inc(&opal_tracepoint_key); + return 0; } void opal_tracepoint_unregfunc(void) @@ -25,9 +26,10 @@ void opal_tracepoint_unregfunc(void) /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ extern long opal_tracepoint_refcount; -void opal_tracepoint_regfunc(void) +int opal_tracepoint_regfunc(void) { opal_tracepoint_refcount++; + return 0; } void opal_tracepoint_unregfunc(void) diff --git
a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index aa35245d8d6d..c0423ce3955c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -661,9 +661,10 @@ EXPORT_SYMBOL(arch_free_page); #ifdef HAVE_JUMP_LABEL struct static_key hcall_tracepoint_key = STATIC_KEY_INIT; -void hcall_tracepoint_regfunc(void) +int hcall_tracepoint_regfunc(void) { static_key_slow_inc(&hcall_tracepoint_key); + return 0; } void hcall_tracepoint_unregfunc(void) @@ -680,9 +681,10 @@ void hcall_tracepoint_unregfunc(void) /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ extern long hcall_tracepoint_refcount; -void hcall_tracepoint_regfunc(void) +int hcall_tracepoint_regfunc(void) { hcall_tracepoint_refcount++; + return 0; } void hcall_tracepoint_unregfunc(void) diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2fbc66c7885b..2422b14c50a7 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -6,7 +6,7 @@ #include -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_exceptions, diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 38a09a13a9bc..32dd6a9e343c 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -6,7 +6,7 @@ #include -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_irq_vector, diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1c113db9ed57..15515132bf0d 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -34,7 +34,7 @@ static void switch_idt(void *arg) local_irq_restore(flags); } -void trace_irq_vector_regfunc(void) +int trace_irq_vector_regfunc(void) { mutex_lock(&irq_vector_mutex); if (!trace_irq_vector_refcount) { @@ -44,6 +44,7 @@ void trace_irq_vector_regfunc(void) } trace_irq_vector_refcount++; mutex_unlock(&irq_vector_mutex); + return 0; } void trace_irq_vector_unregfunc(void) diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index b432b64e307a..6a2b995d7fc4 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -77,9 +77,10 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); static struct static_key i2c_trace_msg = STATIC_KEY_INIT_FALSE; static bool is_registered; -void i2c_transfer_trace_reg(void) +int i2c_transfer_trace_reg(void) { static_key_slow_inc(&i2c_trace_msg); + return 0; } void i2c_transfer_trace_unreg(void) diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 4ac89acb6136..a03192052066 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ struct static_key key; - void (*regfunc)(void); + int (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; }; diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index be586c632a0c..f72fcfe0e66a 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -81,7 +81,7 @@ static inline void tracepoint_synchronize_unregister(void) } #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -extern void syscall_regfunc(void); +extern int 
syscall_regfunc(void); extern void syscall_unregfunc(void); #endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */ diff --git a/include/trace/events/i2c.h b/include/trace/events/i2c.h index fe17187df65d..4abb8eab34d3 100644 --- a/include/trace/events/i2c.h +++ b/include/trace/events/i2c.h @@ -20,7 +20,7 @@ /* * drivers/i2c/i2c-core.c */ -extern void i2c_transfer_trace_reg(void); +extern int i2c_transfer_trace_reg(void); extern void i2c_transfer_trace_unreg(void); /* diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 0f109c4130d3..f76d0416dd83 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -164,11 +164,12 @@ static int benchmark_event_kthread(void *arg) * When the benchmark tracepoint is enabled, it calls this * function and the thread that calls the tracepoint is created. */ -void trace_benchmark_reg(void) +int trace_benchmark_reg(void) { bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); WARN_ON(!bm_event_thread); + return 0; } /* diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index 3c1df1df4e29..ebdbfc2f2a64 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -6,7 +6,7 @@ #include -extern void trace_benchmark_reg(void); +extern int trace_benchmark_reg(void); extern void trace_benchmark_unreg(void); #define BENCHMARK_EVENT_STRLEN 128 diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d0639d917899..1f9a31f934a4 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -194,9 +194,13 @@ static int tracepoint_add_func(struct tracepoint *tp, struct tracepoint_func *func, int prio) { struct tracepoint_func *old, *tp_funcs; + int ret; - if (tp->regfunc && !static_key_enabled(&tp->key)) - tp->regfunc(); + if (tp->regfunc && !static_key_enabled(&tp->key)) { + ret = tp->regfunc(); + if (ret < 0) + return ret; + } tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); @@ -529,7 +533,7 @@ EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; -void syscall_regfunc(void) +int syscall_regfunc(void) { struct task_struct *p, *t; @@ -541,6 +545,8 @@ void syscall_regfunc(void) read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; + + return 0; } void syscall_unregfunc(void) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 880a7d1d27d2..30e282d33d4d 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -79,7 +79,7 @@ static int simple_thread_fn(void *arg) static DEFINE_MUTEX(thread_mutex); -void foo_bar_reg(void) +int foo_bar_reg(void) { pr_info("Starting thread for foo_bar_fn\n"); /* @@ -90,6 +90,7 @@ void foo_bar_reg(void) mutex_lock(&thread_mutex); simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn"); mutex_unlock(&thread_mutex); + return 0; } void foo_bar_unreg(void) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index d6b75bb495b3..76a75ab7a608 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -354,7 +354,7 @@ TRACE_EVENT_CONDITION(foo_bar_with_cond, TP_printk("foo %s %d", __get_str(foo), __entry->bar) ); -void foo_bar_reg(void); +int foo_bar_reg(void); void foo_bar_unreg(void); /* -- cgit v1.2.3 From 1dd349ab74d278cee50fc24dc26c023f3b149642 Mon Sep 17 00:00:00 2001 
From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Nov 2016 13:17:25 -0500 Subject: tracing: Do not start benchmark on boot up Trace events are enabled very early on boot up via the boot command line parameter. The benchmark tool creates a new thread to perform the trace event benchmarking. But at start up, it is called before scheduling is set up, and because it creates a new thread before the init thread is created, this crashes the kernel. Have the benchmark fail to register when started via the kernel command line. Also, since the registering of a tracepoint can now handle failure cases, return -ENOMEM instead of warning if the thread cannot be created. Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index f76d0416dd83..2bc7dc3e8ff8 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -166,9 +166,18 @@ static int benchmark_event_kthread(void *arg) */ int trace_benchmark_reg(void) { + if (system_state != SYSTEM_RUNNING) { + pr_warning("trace benchmark cannot be started via kernel command line\n"); + return -EBUSY; + } + bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); - WARN_ON(!bm_event_thread); + if (!bm_event_thread) { + pr_warning("trace benchmark failed to create kernel thread\n"); + return -ENOMEM; + } + return 0; } @@ -183,6 +192,7 @@ void trace_benchmark_unreg(void) return; kthread_stop(bm_event_thread); + bm_event_thread = NULL; strcpy(bm_str, "START"); bm_total = 0; -- cgit v1.2.3 From 989a0a3d248192b6f5d16cc2aea95faed89bb7ce Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Nov 2016 13:54:57 -0500 Subject: tracing: Have system enable return error if one of the events fail If one of the events within a system fails to enable when "1" is written to the system "enable" file, it should return an error. Note, some events may still be enabled, but the user should know that something did go wrong. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d35fc2b0d304..93116549a284 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -702,6 +702,7 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, struct trace_event_call *call; const char *name; int ret = -EINVAL; + int eret = 0; list_for_each_entry(file, &tr->events, list) { @@ -725,9 +726,17 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, if (event && strcmp(event, name) != 0) continue; - ftrace_event_enable_disable(file, set); + ret = ftrace_event_enable_disable(file, set); - ret = 0; + /* + * Save the first error and return that. Some events + * may still have been enabled, but let the user + * know that something went wrong. + */ + if (ret && !eret) + eret = ret; + + ret = eret; } return ret; -- cgit v1.2.3 From 9c1f6bb8c88ab6c2779bdaf12d4f3407d336f085 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Nov 2016 17:48:07 -0500 Subject: tracing: Allow benchmark to be enabled at early_initcall() The trace event start up selftests fail when the trace benchmark is enabled, because it is disabled during boot. It really only needs to be disabled before scheduling is set up, as it creates a thread.
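Reduced to its essentials, the gating used in the diff below is an early_initcall() flipping a flag that the registration function checks. A sketch with hypothetical names (my_reg, my_allow_reg); only trace_benchmark_reg() in the actual diff is real:

static bool ok_to_run;

int my_reg(void)		/* hypothetical tracepoint reg function */
{
	if (!ok_to_run)		/* boot command line enables events before initcalls run */
		return -EBUSY;
	/* safe to kthread_run() a worker thread here */
	return 0;
}

static __init int my_allow_reg(void)
{
	ok_to_run = true;	/* early_initcall()s run once kthread creation works */
	return 0;
}
early_initcall(my_allow_reg);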
Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 2bc7dc3e8ff8..e3b488825ae3 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -21,6 +21,8 @@ static u64 bm_stddev; static unsigned int bm_avg; static unsigned int bm_std; +static bool ok_to_run; + /* * This gets called in a loop recording the time it took to write * the tracepoint. What it writes is the time statistics of the last @@ -166,7 +168,7 @@ static int benchmark_event_kthread(void *arg) */ int trace_benchmark_reg(void) { - if (system_state != SYSTEM_RUNNING) { + if (!ok_to_run) { pr_warning("trace benchmark cannot be started via kernel command line\n"); return -EBUSY; } @@ -207,3 +209,12 @@ void trace_benchmark_unreg(void) bm_avg = 0; bm_stddev = 0; } + +static __init int ok_to_run_trace_benchmark(void) +{ + ok_to_run = true; + + return 0; +} + +early_initcall(ok_to_run_trace_benchmark); -- cgit v1.2.3 From 656c7f0d2d2b3237a31b105d5ed217a65350104f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 8 Dec 2016 12:40:18 -0500 Subject: tracing: Replace kmap with copy_from_user() in trace_marker writing Instead of using get_user_pages_fast() and kmap_atomic() when writing to the trace_marker file, just allocate enough space on the ring buffer directly, and write into it via copy_from_user(). Writing into the trace_marker file used to allocate a temporary buffer to perform the copy_from_user(), as we didn't want to write into the ring buffer if the copy failed. But as a trace_marker write is supposed to be extremely fast, and allocating memory causes other tracepoints to trigger, Peter Zijlstra suggested using get_user_pages_fast() and kmap_atomic() to keep the user space pages in memory and read them directly. But Henrik Austad had issues with this because it required taking the mm->mmap_sem, causing long delays with the write. Instead, just allocate the space in the ring buffer and use copy_from_user() directly. If it faults, return -EFAULT and write "" into the ring buffer. Link: http://lkml.kernel.org/r/20161208124018.72dd0f86@gandalf.local.home Cc: Ingo Molnar Cc: Henrik Austad Cc: Peter Zijlstra Updates: d696b58ca2c3ca "tracing: Do not allocate buffer for trace_marker" Suggested-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 139 ++++++++++++++------------------------------------- 1 file changed, 37 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60416bf7c591..6f420d7b703b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5738,61 +5738,6 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static inline int lock_user_pages(const char __user *ubuf, size_t cnt, - struct page **pages, void **map_page, - int *offset) -{ - unsigned long addr = (unsigned long)ubuf; - int nr_pages = 1; - int ret; - int i; - - /* - * Userspace is injecting traces into the kernel trace buffer. - * We want to be as non intrusive as possible. - * To do so, we do not want to allocate any special buffers - * or take any locks, but instead write the userspace data - * straight into the ring buffer. - * - * First we need to pin the userspace buffer into memory, - * which, most likely it is, because it just referenced it. - * But there's no guarantee that it is.
By using get_user_pages_fast() - * and kmap_atomic/kunmap_atomic() we can get access to the - * pages directly. We then write the data directly into the - * ring buffer. - */ - - /* check if we cross pages */ - if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) - nr_pages = 2; - - *offset = addr & (PAGE_SIZE - 1); - addr &= PAGE_MASK; - - ret = get_user_pages_fast(addr, nr_pages, 0, pages); - if (ret < nr_pages) { - while (--ret >= 0) - put_page(pages[ret]); - return -EFAULT; - } - - for (i = 0; i < nr_pages; i++) - map_page[i] = kmap_atomic(pages[i]); - - return nr_pages; -} - -static inline void unlock_user_pages(struct page **pages, - void **map_page, int nr_pages) -{ - int i; - - for (i = nr_pages - 1; i >= 0; i--) { - kunmap_atomic(map_page[i]); - put_page(pages[i]); - } -} - static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -5802,14 +5747,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; - struct page *pages[2]; - void *map_page[2]; - int nr_pages = 1; + const char faulted[] = ""; ssize_t written; - int offset; int size; int len; +/* Used in tracing_mark_raw_write() as well */ +#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ + if (tracing_disabled) return -EINVAL; @@ -5821,30 +5766,31 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, &offset); - if (nr_pages < 0) - return nr_pages; - local_save_flags(irq_flags); - size = sizeof(*entry) + cnt + 2; /* possible \n added */ + size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ + + /* If less than "", then make sure we can still add that */ + if (cnt < FAULTED_SIZE) + size += FAULTED_SIZE - cnt; + buffer = tr->trace_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); - if (!event) { + if (unlikely(!event)) /* Ring buffer disabled, return as if not open for write */ - written = -EBADF; - goto out_unlock; - } + return -EBADF; entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; - if (nr_pages == 2) { - len = PAGE_SIZE - offset; - memcpy(&entry->buf, map_page[0] + offset, len); - memcpy(&entry->buf[len], map_page[1], cnt - len); + len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + if (len) { + memcpy(&entry->buf, faulted, FAULTED_SIZE); + cnt = FAULTED_SIZE; + written = -EFAULT; } else - memcpy(&entry->buf, map_page[0] + offset, cnt); + written = cnt; + len = cnt; if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; @@ -5854,12 +5800,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); - written = cnt; - - *fpos += written; - - out_unlock: - unlock_user_pages(pages, map_page, nr_pages); + if (written > 0) + *fpos += written; return written; } @@ -5875,15 +5817,14 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, struct ring_buffer_event *event; struct ring_buffer *buffer; struct raw_data_entry *entry; + const char faulted[] = ""; unsigned long irq_flags; - struct page *pages[2]; - void *map_page[2]; - int nr_pages = 1; ssize_t written; - int offset; int size; int len; +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + if (tracing_disabled) return -EINVAL; @@ -5899,38 +5840,32 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, BUILD_BUG_ON(TRACE_BUF_SIZE >= 
PAGE_SIZE); - nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, &offset); - if (nr_pages < 0) - return nr_pages; - local_save_flags(irq_flags); size = sizeof(*entry) + cnt; + if (cnt < FAULT_SIZE_ID) + size += FAULT_SIZE_ID - cnt; + buffer = tr->trace_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, irq_flags, preempt_count()); - if (!event) { + if (!event) /* Ring buffer disabled, return as if not open for write */ - written = -EBADF; - goto out_unlock; - } + return -EBADF; entry = ring_buffer_event_data(event); - if (nr_pages == 2) { - len = PAGE_SIZE - offset; - memcpy(&entry->id, map_page[0] + offset, len); - memcpy(((char *)&entry->id) + len, map_page[1], cnt - len); + len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + if (len) { + entry->id = -1; + memcpy(&entry->buf, faulted, FAULTED_SIZE); + written = -EFAULT; } else - memcpy(&entry->id, map_page[0] + offset, cnt); + written = cnt; __buffer_unlock_commit(buffer, event); - written = cnt; - - *fpos += written; - - out_unlock: - unlock_user_pages(pages, map_page, nr_pages); + if (written > 0) + *fpos += written; return written; } -- cgit v1.2.3 From 794de08a16cf1fc1bf785dc48f66d36218cf6d88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 8 Dec 2016 20:54:49 -0500 Subject: fgraph: Handle a case where a tracer ignores set_graph_notrace Both the wakeup and irqsoff tracers can use the function graph tracer when the display-graph option is set. The problem is that they ignore the notrace file, and record the entry of functions that would be ignored by the function_graph tracer. This causes the trace->depth to be recorded into the ring buffer. The set_graph_notrace uses a trick by adding a large negative number to the trace->depth when a graph function is to be ignored. On trace output, the graph function uses the depth to record a stack of functions. But since the depth is negative, it accesses the array with a negative number and causes an out of bounds access that can cause a kernel oops or corrupt data. Have the print functions handle cases where a tracer still records functions even when they are in set_graph_notrace. Also add warnings if the depth is below zero before accessing the array. Note, the function graph logic will still prevent the return of these functions from being recorded, which means that they will be left hanging without a return. For example: # echo '*spin*' > set_graph_notrace # echo 1 > options/display-graph # echo wakeup > current_tracer # cat trace [...] _raw_spin_lock() { preempt_count_add() { do_raw_spin_lock() { update_rq_clock(); Where it should look like: _raw_spin_lock() { preempt_count_add(); do_raw_spin_lock(); } update_rq_clock(); Cc: stable@vger.kernel.org Cc: Namhyung Kim Fixes: 29ad23b00474 ("ftrace: Add set_graph_notrace filter") Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8e1a115439fa..566f7327c3aa 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -842,6 +842,10 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data = per_cpu_ptr(data->cpu_data, cpu); + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + /* * Comments display at + 1 to depth. 
Since * this is a leaf function, keep the comments @@ -850,7 +854,8 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data->depth = call->depth - 1; /* No need to keep this function around for this depth */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = 0; } @@ -880,11 +885,16 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); cpu_data->depth = call->depth; /* Save this function pointer to see if the exit matches */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = call->func; } @@ -1114,7 +1124,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, */ cpu_data->depth = trace->depth - 1; - if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (trace->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(trace->depth < 0)) { if (cpu_data->enter_funcs[trace->depth] != trace->func) func_match = 0; cpu_data->enter_funcs[trace->depth] = 0; -- cgit v1.2.3 From 1a41442864e35bff859582fe9c5d051d0b1040ba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 8 Dec 2016 19:28:28 -0500 Subject: tracing/fgraph: Have wakeup and irqsoff tracers ignore graph functions too Currently both the wakeup and irqsoff tracers do not handle set_graph_notrace well. The ftrace infrastructure will ignore the return paths of all functions leaving them hanging without an end: # echo '*spin*' > set_graph_notrace # cat trace [...] _raw_spin_lock() { preempt_count_add() { do_raw_spin_lock() { update_rq_clock(); Where the '*spin*' functions should have looked like this: _raw_spin_lock() { preempt_count_add(); do_raw_spin_lock(); } update_rq_clock(); Instead, have the wakeup and irqsoff tracers ignore the functions that are set by set_graph_notrace, like the function_graph tracer does. Move the logic in the function_graph tracer into a header to allow wakeup and irqsoff tracers to use it as well. Cc: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 11 +++++++++++ kernel/trace/trace_functions_graph.c | 14 +++++++------- kernel/trace/trace_irqsoff.c | 12 ++++++++++++ kernel/trace/trace_sched_wakeup.c | 12 ++++++++++++ 4 files changed, 42 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 37602e722336..c2234494f40c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -846,6 +846,17 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ + +extern unsigned int fgraph_max_depth; + +static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) +{ + /* trace it when it is-nested-in or is a function enabled.
*/ + return !(trace->depth || ftrace_graph_addr(trace->func)) || + (trace->depth < 0) || + (fgraph_max_depth && trace->depth >= fgraph_max_depth); +} + #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 566f7327c3aa..d56123cdcc89 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -65,7 +65,7 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 -static unsigned int max_depth; +unsigned int fgraph_max_depth; static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -384,10 +384,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (!ftrace_trace_task(tr)) return 0; - /* trace it when it is-nested-in or is a function enabled. */ - if ((!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) || (trace->depth < 0) || - (max_depth && trace->depth >= max_depth)) + if (ftrace_graph_ignore_func(trace)) + return 0; + + if (ftrace_graph_ignore_irqs()) return 0; /* @@ -1500,7 +1500,7 @@ graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - max_depth = val; + fgraph_max_depth = val; *ppos += cnt; @@ -1514,7 +1514,7 @@ graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ int n; - n = sprintf(buf, "%d\n", max_depth); + n = sprintf(buf, "%d\n", fgraph_max_depth); return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03cdff84d026..86654d7e1afe 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -175,6 +175,18 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) int ret; int pc; + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + if (!func_prolog_dec(tr, &data, &flags)) return 0; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 1bf2324dc682..5d0bb025bb21 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -239,6 +239,18 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) unsigned long flags; int pc, ret = 0; + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. 
+ */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + if (!func_prolog_preempt_disable(tr, &data, &pc)) return 0; -- cgit v1.2.3 From f519a3f1c6b7a990e5aed37a8f853c6ecfdee945 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 8 Dec 2016 17:56:53 +0100 Subject: sched/core: Fix find_idlest_group() for fork During fork, the utilization of a task is initialized once the rq has been selected, because the current utilization level of the rq is used to set the utilization of the forked task. As the task's utilization is still 0 at this step of the fork sequence, it doesn't make sense to look for some spare capacity that can fit the task's utilization. Furthermore, I can see perf regressions for the test: hackbench -P -g 1 because the least loaded policy is always bypassed and tasks are not spread during fork. With this patch and the fix below, we are back to the same performance as v4.8. The fix below is only a temporary one used for the test until a smarter solution is found, because we can't simply remove the test, which is useful for other benchmarks | @@ -5708,13 +5708,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t | | avg_cost = this_sd->avg_scan_cost; | | - /* | - * Due to large variance we need a large fuzz factor; hackbench in | - * particularly is sensitive here. | - */ | - if ((avg_idle / 512) < avg_cost) | - return -1; | - | time = local_clock(); | | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { Tested-by: Matt Fleming Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Acked-by: Morten Rasmussen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: kernellwp@gmail.com Cc: umgwanakikbuti@gmail.com Cc: yuyang.du@intel.comc Link: http://lkml.kernel.org/r/1481216215-24651-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 18d9e75f1f6e..ebb815f6bda7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5473,13 +5473,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * utilized systems if we require spare_capacity > task_util(p), * so we allow for some task stuffing by using * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + if (this_spare > task_util(p) / 2 && imbalance*this_spare > 100*most_spare) return NULL; else if (most_spare > task_util(p) / 2) return most_spare_sg; +skip_spare: if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; -- cgit v1.2.3 From 6b94780e45c17b83e3e75f8aaca5a328db583c74 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 8 Dec 2016 17:56:54 +0100 Subject: sched/core: Use load_avg for selecting idlest group find_idlest_group() only compares the runnable_load_avg when looking for the least loaded group. But on fork-intensive use cases like hackbench, where tasks block quickly after the fork, this can lead to selecting the same CPU instead of other CPUs, which have similar runnable load but a lower load_avg. When the runnable_load_avg values of 2 CPUs are close, we now take into account the amount of blocked load as a 2nd selection factor. There are now 3 zones for the runnable_load of the rq: - [0 ..
(runnable_load - imbalance)]: Select the new rq which has significantly less runnable_load - [(runnable_load - imbalance) .. (runnable_load + imbalance)]: The runnable loads are close, so we use load_avg to choose between the 2 rqs - [(runnable_load + imbalance) .. ULONG_MAX]: Keep the current rq which has significantly less runnable_load The scale factor that is currently used for comparing runnable_load doesn't work well with small values. As an example, the use of a scaling factor fails as soon as this_runnable_load == 0, because we always select the local rq even if min_runnable_load is only 1, which doesn't really make sense because they are just the same. So instead of a scaling factor, we use an absolute margin for runnable_load to detect CPUs with similar runnable_load, and we keep using the scaling factor for blocked load. For use cases like hackbench, this enables the scheduler to select different CPUs during the fork sequence and to spread tasks across the system. Tests have been done on a Hikey board (ARM-based octo cores) for several kernels. The result below gives min, max, avg and stdev values of 18 runs with each configuration. The patches depend on the "no missing update_rq_clock()" work. hackbench -P -g 1 ea86cb4b7621 7dc603c9028e v4.8 v4.8+patches min 0.049 0.050 0.051 0.048 avg 0.057 0.057(0%) 0.057(0%) 0.055(+5%) max 0.066 0.068 0.070 0.063 stdev +/-9% +/-9% +/-8% +/-9% More performance numbers here: https://lkml.kernel.org/r/20161203214707.GI20785@codeblueprint.co.uk Tested-by: Matt Fleming Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: kernellwp@gmail.com Cc: umgwanakikbuti@gmail.com Cc: yuyang.du@intel.comc Link: http://lkml.kernel.org/r/1481216215-24651-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ebb815f6bda7..6559d197e08a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5405,16 +5405,20 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; if (sd_flag & SD_BALANCE_WAKE) load_idx = sd->wake_idx; do { - unsigned long load, avg_load, spare_cap, max_spare_cap; + unsigned long load, avg_load, runnable_load; + unsigned long spare_cap, max_spare_cap; int local_group; int i; @@ -5431,6 +5435,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * the group containing the CPU with most spare capacity.
*/ avg_load = 0; + runnable_load = 0; max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { @@ -5440,7 +5445,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, else load = target_load(i, load_idx); - avg_load += load; + runnable_load += load; + + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); spare_cap = capacity_spare_wake(i, p); @@ -5449,14 +5456,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; if (local_group) { - this_load = avg_load; + this_runnable_load = runnable_load; + this_avg_load = avg_load; this_spare = max_spare_cap; } else { - if (avg_load < min_load) { - min_load = avg_load; + if (min_runnable_load > (runnable_load + imbalance)) { + /* + * The runnable load is significantly smaller + * so we can pick this new cpu + */ + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + /* + * The runnable loads are close so take the + * blocked load into account through avg_load. + */ + min_avg_load = avg_load; idlest = group; } @@ -5482,14 +5506,23 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, goto skip_spare; if (this_spare > task_util(p) / 2 && - imbalance*this_spare > 100*most_spare) + imbalance_scale*this_spare > 100*most_spare) return NULL; - else if (most_spare > task_util(p) / 2) + + if (most_spare > task_util(p) / 2) return most_spare_sg; skip_spare: - if (!idlest || 100*this_load < imbalance*min_load) + if (!idlest) + return NULL; + + if (min_runnable_load > (this_runnable_load + imbalance)) return NULL; + + if ((this_runnable_load < (min_runnable_load + imbalance)) && + (100*this_avg_load < imbalance_scale*min_avg_load)) + return NULL; + return idlest; } -- cgit v1.2.3 From c59f29cb144a6a0dfac16ede9dc8eafc02dc56ca Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Fri, 9 Dec 2016 21:50:17 +0530 Subject: tracing: Use SOFTIRQ_OFFSET for softirq detection for more accurate results The 's' flag is supposed to indicate that a softirq is running. This can be detected by testing the preempt_count with SOFTIRQ_OFFSET. The current code tests the preempt_count with SOFTIRQ_MASK, which would be true even when softirqs are disabled but not serving a softirq. Link: http://lkml.kernel.org/r/1481300417-3564-1-git-send-email-pkondeti@codeaurora.org Signed-off-by: Pavankumar Kondeti Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6f420d7b703b..970aafe80b49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1949,7 +1949,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | (test_preempt_need_resched() ?
TRACE_FLAG_PREEMPT_RESCHED : 0); } -- cgit v1.2.3 From 99e6f6e8134b0c9d5d82eb2af1068a57199125e4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 7 Dec 2016 14:31:33 +0100 Subject: tracing/rb: Init the CPU mask on allocation Before commit b32614c03413 ("tracing/rb: Convert to hotplug state machine") the allocated cpumask was initialized to the mask of ONLINE or POSSIBLE CPUs. After the CPU hotplug changes, the buffer initialisation moved to trace_rb_cpu_prepare(), but I forgot to initially set the cpumask to zero. This is done now. Link: http://lkml.kernel.org/r/20161207133133.hzkcqfllxcdi3joz@linutronix.de Fixes: b32614c03413 ("tracing/rb: Convert to hotplug state machine") Reported-by: Tetsuo Handa Tested-by: Tetsuo Handa Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f17476f9d7f4..7edfd41d506c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1303,7 +1303,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer) return NULL; - if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); -- cgit v1.2.3 From f18f97ac43d72ae84fe012dd4f19fe3f8f901469 Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Fri, 9 Dec 2016 15:19:37 +0100 Subject: tracing/kprobes: Add a helper method to return number of probe hits The number of probe hits is stored in a percpu variable and therefore can't be read directly. Add a helper method trace_kprobe_nhit() that performs the required calculation. It will be used in a follow-up commit that changes the kprobe selftests to verify the number of probe hits.
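For readers unfamiliar with percpu counters, the general pattern the helper follows looks like this (an illustrative sketch with a hypothetical counter name, not the exact kernel code; the real helper appears in the diff below):

/* Sum a percpu counter across all possible CPUs; each CPU owns its
 * own slot, so a plain read of the variable would only see one CPU's
 * share of the total.
 */
static unsigned long sum_percpu_counter(unsigned long __percpu *counter)
{
	unsigned long total = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		total += *per_cpu_ptr(counter, cpu);

	return total;
}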
Link: http://lkml.kernel.org/r/1481293178-3128-1-git-send-email-marcin.nowakowski@imgtec.com Acked-by: Masami Hiramatsu Signed-off-by: Marcin Nowakowski Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index eb6c9f1d3a93..a2af1bceca29 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -73,6 +73,17 @@ static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) return !!strchr(trace_kprobe_symbol(tk), ':'); } +static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) +{ + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); + + return nhit; +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -882,14 +893,10 @@ static const struct file_operations kprobe_events_ops = { static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; - unsigned long nhit = 0; - int cpu; - - for_each_possible_cpu(cpu) - nhit += *per_cpu_ptr(tk->nhit, cpu); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), nhit, + trace_event_name(&tk->tp.call), + trace_kprobe_nhit(tk), tk->rp.kp.nmissed); return 0; -- cgit v1.2.3 From d4d7ccc834fe2401084e01fb043ad70ac410b19d Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Fri, 9 Dec 2016 15:19:38 +0100 Subject: kprobes/trace: Fix kprobe selftest for newer gcc Commit 265a5b7ee3eb ("kprobes/trace: Fix kprobe selftest for gcc 4.6") has added __used attribute to kprobe_trace_selftest_target to ensure that the method is listed in kallsyms table. However, even though the method remains in the kernel image, the actual call is optimized away as there are no side effects and the return value is never checked. Add a return value check and a 'noinline' attribute to ensure that an inlined copy of the method is not used by the caller. Also add checks that verify that the kprobe was really hit, as at the moment the tests show positive results despite the test method being optimized away. Finally, add __init annotations to find_trace_probe_file() and kprobe_trace_selftest_target() as they are only called from within an __init method. Link: http://lkml.kernel.org/r/1481293178-3128-2-git-send-email-marcin.nowakowski@imgtec.com Acked-by: Masami Hiramatsu Signed-off-by: Marcin Nowakowski Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a2af1bceca29..a133ecd741e4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1361,18 +1361,18 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST - /* * The "__used" keeps gcc from removing the function symbol - * from the kallsyms table. + * from the kallsyms table. 
'noinline' makes sure that there + * isn't an inlined version used by the test method below */ -static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, - int a4, int a5, int a6) +static __used __init noinline int +kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) { return a1 + a2 + a3 + a4 + a5 + a6; } -static struct trace_event_file * +static struct __init trace_event_file * find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { struct trace_event_file *file; @@ -1450,12 +1450,25 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); + /* + * Not expecting an error here, the check is only to prevent the + * optimizer from removing the call to target() as otherwise there + * are no side-effects and the call is never performed. + */ + if (ret != 21) + warn++; + /* Disable trace points before removing it */ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting test probe.\n"); warn++; } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe hits\n"); + warn++; + } + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); @@ -1469,6 +1482,11 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting 2nd test probe.\n"); warn++; } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe2 hits\n"); + warn++; + } + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); -- cgit v1.2.3 From c0b942a76361e08fc9fb17989e0f266e64ff0688 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Mon, 12 Dec 2016 16:40:39 -0800 Subject: kthread: add __printf attributes When commit fbae2d44aa1d ("kthread: add kthread_create_worker*()") introduced some kthread_create_...() functions taking printf-like parameters, it added __printf attributes to some of them (e.g. kthread_create_worker()). Nevertheless, some new functions were forgotten (they have been detected thanks to the -Wmissing-format-attribute warning flag). Add the missing __printf attributes to the newly-introduced functions in order to detect formatting issues at build time with the -Wformat flag.
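To illustrate what the attribute buys (a hypothetical caller, for illustration only, not part of this patch): once __printf(3, 4) is applied to kthread_create_worker_on_cpu(), gcc's -Wformat checking catches format/argument mismatches at compile time:

/* Hypothetical caller: with the __printf(3, 4) annotation in place,
 * gcc -Wformat warns that "%s" expects a char * while an int is
 * passed; without the annotation the mismatch would go unnoticed
 * until run time.
 */
static void example(int cpu)
{
	kthread_create_worker_on_cpu(cpu, 0, "kworker/%s", cpu);
}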
Link: http://lkml.kernel.org/r/20161126193543.22672-1-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Reviewed-by: Petr Mladek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 2 +- kernel/kthread.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c1c3e63d52c1..4fec8b775895 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -175,7 +175,7 @@ __printf(2, 3) struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...); -struct kthread_worker * +__printf(3, 4) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[], ...); diff --git a/kernel/kthread.c b/kernel/kthread.c index 956495f0efaf..2318fba86277 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -261,7 +261,8 @@ static void create_kthread(struct kthread_create_info *create) } } -static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), +static __printf(4, 0) +struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) @@ -635,7 +636,7 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); -static struct kthread_worker * +static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { -- cgit v1.2.3 From 3fb4afd9a504c2386b8435028d43283216bf588e Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskiy Date: Mon, 12 Dec 2016 16:40:42 -0800 Subject: prctl: remove one-shot limitation for changing exe link This limitation came with the reason to remove "another way for malicious code to obscure a compromised program and masquerade as a benign process" by allowing "security-concious program can use this prctl once during its early initialization to ensure the prctl cannot later be abused for this purpose": http://marc.info/?l=linux-kernel&m=133160684517468&w=2 This explanation doesn't look sufficient. The only thing the "exe" link indicates is the file used for execve, which is basically nothing and not reliable immediately after the process has returned from the execve system call. Moreover, to use this feature, all the mappings to the previous exe file have to be unmapped and all the new exe file permissions must be satisfied. This means that changing the exe link is very similar to calling execve on the binary. The need to remove this limitation comes from migration of an NFS mount point, which is not accessible during restore and is replaced by another file system. Because of this, the exe link has to be changed twice. [akpm@linux-foundation.org: fix up comment] Link: http://lkml.kernel.org/r/20160927153755.9337.69650.stgit@localhost.localdomain Signed-off-by: Stanislav Kinsburskiy Acked-by: Oleg Nesterov Acked-by: Cyrill Gorcunov Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Michal Hocko Cc: Kees Cook Cc: Andy Lutomirski Cc: John Stultz Cc: Matt Helsley Cc: Pavel Emelyanov Cc: Vlastimil Babka Cc: Eric W.
Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 +++++- kernel/sys.c | 10 ---------- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7551d3e2ab70..0e90f2973719 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -540,7 +540,11 @@ static inline int get_dumpable(struct mm_struct *mm) /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ -#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +/* + * This one-shot flag is dropped due to necessity of changing exe once again + * on NFS restore + */ +//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ diff --git a/kernel/sys.c b/kernel/sys.c index 89d5be418157..fd6f50809b6e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1696,16 +1696,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) fput(exe_file); } - /* - * The symlink can be changed only once, just to disallow arbitrary - * transitions malicious software might bring in. This means one - * could make a snapshot over all processes running and monitor - * /proc/pid/exe changes to notice unusual activity if needed. - */ - err = -EPERM; - if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit; - err = 0; /* set the new file, lockless */ get_file(exe.file); -- cgit v1.2.3 From 0f110a9b956c1678b53986b003d59794604807ba Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 12 Dec 2016 16:44:14 -0800 Subject: kernel/fork: use vfree_atomic() to free thread stack vfree() is going to use a sleeping lock. The thread stack is freed in atomic context, therefore we must use vfree_atomic() here. Link: http://lkml.kernel.org/r/1479474236-4139-6-git-send-email-hch@lst.de Signed-off-by: Andrey Ryabinin Signed-off-by: Christoph Hellwig Cc: Joel Fernandes Cc: Jisheng Zhang Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7ffa16033ded..00492b22adfe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -229,7 +229,7 @@ static inline void free_thread_stack(struct task_struct *tsk) } local_irq_restore(flags); - vfree(tsk->stack); + vfree_atomic(tsk->stack); return; } #endif -- cgit v1.2.3 From 4ca5ede07c9871c13ae422c96d6d08dbd0df5eda Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 12 Dec 2016 16:45:35 -0800 Subject: hung_task: decrement sysctl_hung_task_warnings only if it is positive Since sysctl_hung_task_warnings == -1 is allowed (infinite warnings), commit 48a6d64edadb ("hung_task: allow hung_task_panic when hung_task_warnings is 0") should decrement it only when it is not -1.
This prevents the kernel from ceasing warnings after the first 4294967295 ;) Signed-off-by: Tetsuo Handa Cc: John Siddle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2b59c82cc3e1..40c07e4fa116 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -106,7 +106,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * complain: */ if (sysctl_hung_task_warnings) { - sysctl_hung_task_warnings--; + if (sysctl_hung_task_warnings > 0) + sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", t->comm, t->pid, timeout); pr_err(" %s %s %.*s\n", -- cgit v1.2.3 From 4a998e322abc935e95efc1a8108e6102be636a43 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:40 -0800 Subject: printk/NMI: fix up handling of the full nmi log buffer vsnprintf() adds the trailing '\0' but it does not count it into the number of printed characters. The result is that there is one byte less space for the real characters in the buffer. The broken check for the free space might cause us to repeatedly try to print 1 character into the buffer, never reach the full buffer, and never count the messages as missed. Also vsnprintf() returns the number of characters that would be printed if the buffer was big enough. As a result, s->len might be bigger than the size of the buffer[*]. And the printk() function might return a bigger len than it really printed. Both problems are fixed by using vscnprintf() instead. Note that I thought about increasing the number of missed messages even when the message was shrunken. But it made the code even more complicated. I think that it is not worth it. Shrunken messages are usually easy to recognize. And it should be a corner case. [*] The overflown s->len value is crazy and unexpected. I "made a mistake" and reported this situation as an internal error when I fixed handling of PR_CONT headers in some other patch. Link: http://lkml.kernel.org/r/20161208174912.GA17042@linux.suse Signed-off-by: Petr Mladek Cc: Sergey Senozhatsky Cc: Chris Mason Cc: David Sterba Cc: Jason Wessel Cc: Josef Bacik Cc: Joe Perches Cc: Jaroslav Kysela Cc: Steven Rostedt Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 16bab471c7e2..152533edc56f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args) again: len = atomic_read(&s->len); - if (len >= sizeof(s->buffer)) { + /* The trailing '\0' is not counted into len. */ + if (len >= sizeof(s->buffer) - 1) { atomic_inc(&nmi_message_lost); return 0; } @@ -79,7 +80,7 @@ again: if (!len) smp_rmb(); - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); /* * Do it once again if the buffer has been flushed in the meantime. -- cgit v1.2.3 From 22c2c7b2ef7864389b1b75f9fd604da14b21e2c2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:44 -0800 Subject: printk/NMI: handle continuous lines and missing newline Commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") added back the KERN_CONT message header.
As a result it might appear in the middle of the line when the parts are squashed via the temporary NMI buffer. A reasonable solution seems to be to split the text in the NMI temporary buffer not only by newlines but also by the message headers. Another solution would be to filter out KERN_CONT when writing to the temporary buffer. But this would complicate the lockless handling. Also, it would not solve problems with a missing newline that existed even before the KERN_CONT stuff. This patch moves the temporary buffer handling into a separate function. I played with it and it seems that using the char pointers makes the code easier to read. Also it prints the final newline as a continuation line. Finally, it moves handling of the s->len overflow into the paranoid check. And it allows us to recover from the disaster. Link: http://lkml.kernel.org/r/1478695291-12169-2-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Cc: Joe Perches Cc: Steven Rostedt Cc: Jason Wessel Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 78 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 152533edc56f..f011aaef583c 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -114,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len) } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, - int start, int end) +/* printk part of the temporary buffer line by line */ +static int printk_nmi_flush_buffer(const char *start, size_t len) { - const char *buf = s->buffer + start; + const char *c, *end; + bool header; + + c = start; + end = start + len; + header = true; + + /* Print line by line. */ + while (c < end) { + if (*c == '\n') { + printk_nmi_flush_line(start, c - start + 1); + start = ++c; + header = true; + continue; + } + + /* Handle continuous lines or missing new line. */ + if ((c + 1 < end) && printk_get_level(c)) { + if (header) { + c = printk_skip_level(c); + continue; + } + + printk_nmi_flush_line(start, c - start); + start = c++; + header = true; + continue; + } + + header = false; + c++; + } - printk_nmi_flush_line(buf, (end - start) + 1); + /* Check if there was a partial line. Ignore pure header. */ + if (start < end && !header) { + static const char newline[] = KERN_CONT "\n"; + + printk_nmi_flush_line(start, end - start); + printk_nmi_flush_line(newline, strlen(newline)); + } + + return len; } /* @@ -136,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work) __RAW_SPIN_LOCK_INITIALIZER(read_lock); struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); unsigned long flags; - size_t len, size; - int i, last_i; + size_t len; + int i; /* * The lock has two functions. First, one reader has to flush all * ... @@ more: /* * This is just a paranoid check that nobody has manipulated * the buffer an unexpected way. If we printed something then - * @len must only increase. + * @len must only increase. Also it should never overflow the + * buffer size.
*/ - if (i && i >= len) { + if ((i && i >= len) || len > sizeof(s->buffer)) { const char *msg = "printk_nmi_flush: internal error\n"; printk_nmi_flush_line(msg, strlen(msg)); + len = 0; } if (!len) @@ -168,22 +205,7 @@ more: /* Make sure that data has been written up to the @len */ smp_rmb(); - - size = min(len, sizeof(s->buffer)); - last_i = i; - - /* Print line by line. */ - for (; i < size; i++) { - if (s->buffer[i] == '\n') { - printk_nmi_flush_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < size) { - printk_nmi_flush_seq_line(s, last_i, size - 1); - printk_nmi_flush_line("\n", strlen("\n")); - } + i += printk_nmi_flush_buffer(s->buffer + i, len - i); /* * Check that nothing has got added in the meantime and truncate -- cgit v1.2.3 From 497957576cf8a2150d723aedd74ea60b5d498bfe Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:47 -0800 Subject: printk/kdb: handle more message headers Commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") allows defining more message headers for a single message. The motivation is that continuous lines might get mixed. Therefore it makes sense to define the right log level for every piece of a cont line. This patch introduces printk_skip_headers(), which skips all headers, and uses it in the kdb code instead of printk_skip_level(). This approach helps to fix other printk_skip_level() users independently. Link: http://lkml.kernel.org/r/1478695291-12169-3-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Joe Perches Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Wessel Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 8 ++++++++ kernel/debug/kdb/kdb_io.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index eac1af8502bb..a0859e169bc3 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -31,6 +31,14 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +static inline const char *printk_skip_headers(const char *buffer) +{ + while (printk_get_level(buffer)) + buffer = printk_skip_level(buffer); + + return buffer; +} + #define CONSOLE_EXT_LOG_MAX 8192 /* printk's without a loglevel use this.. */ diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index fc1ef736253c..98c9011eac78 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -697,7 +697,7 @@ kdb_printit: * Write to all consoles. */ retlen = strlen(kdb_buffer); - cp = (char *) printk_skip_level(kdb_buffer); + cp = (char *) printk_skip_headers(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { -- cgit v1.2.3 From d06505b2a95efba25dc635b64a481e3197e9369a Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 1 Nov 2016 23:31:50 +0100 Subject: Remove last traces of ikconfig.h The build system stopped generating ikconfig.h in v2.6.8. Remove an entry for it in dontdiff. There's also a reference to it in a small comment. Remove that comment too, as it is of little help in any case.
Signed-off-by: Paul Bolle Signed-off-by: Jiri Kosina --- Documentation/dontdiff | 1 - kernel/Makefile | 2 -- 2 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 5385cba941d2..a23edccd2059 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -139,7 +139,6 @@ hpet_example hugepage-mmap hugepage-shm ihex2fw -ikconfig.h* inat-tables.c initramfs_list int16.c diff --git a/kernel/Makefile b/kernel/Makefile index eb26e12c6c2a..eaee9de224bd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,8 +115,6 @@ obj-$(CONFIG_HAS_IOMEM) += memremap.o $(obj)/configs.o: $(obj)/config_data.h -# config_data.h contains the same information as ikconfig.h but gzipped. -# Info from config_data can be extracted from /proc/config* targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) -- cgit v1.2.3 From 55a6f170a413cd8dc7a3a52e5a326e1a87579b4f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 29 Nov 2016 16:53:23 -0500 Subject: audit: move kaudit thread start from auditd registration to kaudit init (#2) Richard made this change some time ago but Eric backed it out because the rest of the supporting code wasn't ready. In order to move the netlink multicast send to kauditd_thread we need to ensure the kauditd_thread is always running, so restore commit 6ff5e459 ("audit: move kaudit thread start from auditd registration to kaudit init"). Signed-off-by: Richard Guy Briggs [PM: brought forward and merged based on Richard's old patch] Signed-off-by: Paul Moore --- kernel/audit.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a8a91bd2b2a9..d4c78ba5c4f9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -832,16 +832,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; - /* As soon as there's any sign of userspace auditd, - * start kauditd to talk to it */ - if (!kauditd_task) { - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; - } - } seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); @@ -1190,6 +1180,10 @@ static int __init audit_init(void) audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) + return PTR_ERR(kauditd_task); + skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); audit_initialized = AUDIT_INITIALIZED; -- cgit v1.2.3 From 6c9255645350ce2aecb7c3cd032d0e93d4a7a71a Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:23 -0500 Subject: audit: fixup audit_init() Make sure everything is initialized before we start the kauditd_thread and don't emit the "initialized" record until everything is finished. We also panic with a descriptive message if we can't start the kauditd_thread. Signed-off-by: Paul Moore --- kernel/audit.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d4c78ba5c4f9..b61642b1934f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1180,21 +1180,23 @@ static int __init audit_init(void) audit_default ? 
"enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) - return PTR_ERR(kauditd_task); - skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) { + int err = PTR_ERR(kauditd_task); + panic("audit: failed to start the kauditd thread (%d)\n", err); + } + + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + return 0; } __initcall(audit_init); -- cgit v1.2.3 From 4aa83872d346806d9b54768aa0d1329050542bad Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:24 -0500 Subject: audit: queue netlink multicast sends just like we do for unicast sends Sending audit netlink multicast messages is bad for all the same reasons that sending audit netlink unicast messages is bad, so this patch reworks things so that we don't do the multicast send in audit_log_end(), we do it from the dedicated kauditd_thread thread just as we do for unicast messages. See the GitHub issues below for more information/history: * https://github.com/linux-audit/audit-kernel/issues/23 * https://github.com/linux-audit/audit-kernel/issues/22 Signed-off-by: Paul Moore --- kernel/audit.c | 70 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b61642b1934f..801247a6c9e5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -511,26 +511,46 @@ static void flush_hold_queue(void) static int kauditd_thread(void *dummy) { + struct sk_buff *skb; + struct nlmsghdr *nlh; + set_freezable(); while (!kthread_should_stop()) { - struct sk_buff *skb; - flush_hold_queue(); skb = skb_dequeue(&audit_skb_queue); - if (skb) { - if (!audit_backlog_limit || - (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)) - wake_up(&audit_backlog_wait); + nlh = nlmsg_hdr(skb); + + /* if nlh->nlmsg_len is zero then we haven't attempted + * to send the message to userspace yet, if nlmsg_len + * is non-zero then we have attempted to send it to + * the multicast listeners as well as auditd; keep + * trying to send to auditd but don't repeat the + * multicast send */ + if (nlh->nlmsg_len == 0) { + nlh->nlmsg_len = skb->len; + kauditd_send_multicast_skb(skb, GFP_KERNEL); + + /* see the note in kauditd_send_multicast_skb + * regarding the nlh->nlmsg_len value and why + * it differs between the multicast and unicast + * clients */ + nlh->nlmsg_len -= NLMSG_HDRLEN; + } + if (audit_pid) kauditd_send_skb(skb); else audit_printk_skb(skb); - continue; + } else { + /* we have flushed the backlog so wake everyone up who + * is blocked and go to sleep until we have something + * in the backlog again */ + wake_up(&audit_backlog_wait); + wait_event_freezable(kauditd_wait, + skb_queue_len(&audit_skb_queue)); } - - wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); } return 0; } @@ -1969,10 +1989,10 @@ out: * audit_log_end - end one audit record * @ab: the audit_buffer * - * netlink_unicast() cannot be called inside an irq context because it blocks - * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed - * on a queue and a 
tasklet is scheduled to remove them from the queue outside - * the irq context. May be called in any context. + * We can not do a netlink send inside an irq context because it blocks (last + * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a + * queue and a tasklet is scheduled to remove them from the queue outside the + * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { @@ -1981,28 +2001,8 @@ void audit_log_end(struct audit_buffer *ab) if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - - nlh->nlmsg_len = ab->skb->len; - kauditd_send_multicast_skb(ab->skb, ab->gfp_mask); - - /* - * The original kaudit unicast socket sends up messages with - * nlmsg_len set to the payload length rather than the entire - * message length. This breaks the standard set by netlink. - * The existing auditd daemon assumes this breakage. Fixing - * this would require co-ordinating a change in the established - * protocol between the kaudit kernel subsystem and the auditd - * userspace code. - */ - nlh->nlmsg_len -= NLMSG_HDRLEN; - - if (audit_pid) { - skb_queue_tail(&audit_skb_queue, ab->skb); - wake_up_interruptible(&kauditd_wait); - } else { - audit_printk_skb(ab->skb); - } + skb_queue_tail(&audit_skb_queue, ab->skb); + wake_up_interruptible(&kauditd_wait); ab->skb = NULL; } audit_buffer_free(ab); -- cgit v1.2.3 From af8b824f283de5acc9b9ae8dbb60e4adacff721b Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:24 -0500 Subject: audit: rename the queues and kauditd related functions The audit queue names can be shortened and the record sending helpers associated with the kauditd task could be named better, do these small cleanups now to make life easier once we start reworking the queues and kauditd code. Signed-off-by: Paul Moore --- kernel/audit.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 801247a6c9e5..6ac1df116c0b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -138,9 +138,9 @@ static DEFINE_SPINLOCK(audit_freelist_lock); static int audit_freelist_count; static LIST_HEAD(audit_freelist); -static struct sk_buff_head audit_skb_queue; +static struct sk_buff_head audit_queue; /* queue of skbs to send to auditd when/if it comes back */ -static struct sk_buff_head audit_skb_hold_queue; +static struct sk_buff_head audit_hold_queue; static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); @@ -377,8 +377,8 @@ static void audit_hold_skb(struct sk_buff *skb) { if (audit_default && (!audit_backlog_limit || - skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)) - skb_queue_tail(&audit_skb_hold_queue, skb); + skb_queue_len(&audit_hold_queue) < audit_backlog_limit)) + skb_queue_tail(&audit_hold_queue, skb); else kfree_skb(skb); } @@ -387,7 +387,7 @@ static void audit_hold_skb(struct sk_buff *skb) * For one reason or another this nlh isn't getting delivered to the userspace * audit daemon, just send it to printk. 
*/ -static void audit_printk_skb(struct sk_buff *skb) +static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); @@ -402,7 +402,7 @@ static void audit_printk_skb(struct sk_buff *skb) audit_hold_skb(skb); } -static void kauditd_send_skb(struct sk_buff *skb) +static void kauditd_send_unicast_skb(struct sk_buff *skb) { int err; int attempts = 0; @@ -493,13 +493,13 @@ static void flush_hold_queue(void) if (!audit_default || !audit_pid) return; - skb = skb_dequeue(&audit_skb_hold_queue); + skb = skb_dequeue(&audit_hold_queue); if (likely(!skb)) return; while (skb && audit_pid) { - kauditd_send_skb(skb); - skb = skb_dequeue(&audit_skb_hold_queue); + kauditd_send_unicast_skb(skb); + skb = skb_dequeue(&audit_hold_queue); } /* @@ -518,7 +518,7 @@ static int kauditd_thread(void *dummy) while (!kthread_should_stop()) { flush_hold_queue(); - skb = skb_dequeue(&audit_skb_queue); + skb = skb_dequeue(&audit_queue); if (skb) { nlh = nlmsg_hdr(skb); @@ -540,16 +540,16 @@ static int kauditd_thread(void *dummy) } if (audit_pid) - kauditd_send_skb(skb); + kauditd_send_unicast_skb(skb); else - audit_printk_skb(skb); + kauditd_printk_skb(skb); } else { /* we have flushed the backlog so wake everyone up who * is blocked and go to sleep until we have something * in the backlog again */ wake_up(&audit_backlog_wait); wait_event_freezable(kauditd_wait, - skb_queue_len(&audit_skb_queue)); + skb_queue_len(&audit_queue)); } } return 0; @@ -865,7 +865,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_skb_queue); + s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time_master; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); @@ -1200,8 +1200,8 @@ static int __init audit_init(void) audit_default ? 
"enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); - skb_queue_head_init(&audit_skb_queue); - skb_queue_head_init(&audit_skb_hold_queue); + skb_queue_head_init(&audit_queue); + skb_queue_head_init(&audit_hold_queue); audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; @@ -1357,7 +1357,7 @@ static long wait_for_auditd(long sleep_time) DECLARE_WAITQUEUE(wait, current); if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { + skb_queue_len(&audit_queue) > audit_backlog_limit) { add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); sleep_time = schedule_timeout(sleep_time); @@ -1406,7 +1406,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } while (audit_backlog_limit - && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { + && skb_queue_len(&audit_queue) > audit_backlog_limit + reserve) { if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; @@ -1419,7 +1419,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } if (audit_rate_check() && printk_ratelimit()) pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", - skb_queue_len(&audit_skb_queue), + skb_queue_len(&audit_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); audit_backlog_wait_time = 0; @@ -2001,7 +2001,7 @@ void audit_log_end(struct audit_buffer *ab) if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { - skb_queue_tail(&audit_skb_queue, ab->skb); + skb_queue_tail(&audit_queue, ab->skb); wake_up_interruptible(&kauditd_wait); ab->skb = NULL; } -- cgit v1.2.3 From c6480207fdf7b61de216ee23e93eac0a6878fa74 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:25 -0500 Subject: audit: rework the audit queue handling The audit record backlog queue has always been a bit of a mess, and the moving the multicast send into kauditd_thread() from audit_log_end() only makes things worse. This patch attempts to fix the backlog queue with a better design that should hold up better under load and have less of a performance impact at syscall invocation time. While it looks like there is a log going on in this patch, the main change is the move from a single backlog queue to three queues: * A queue for holding records generated from audit_log_end() that haven't been consumed by kauditd_thread() (audit_queue). * A queue for holding records that have been sent via multicast but had a temporary failure when sending via unicast and need a resend (audit_retry_queue). * A queue for holding records that haven't been sent via unicast because no one is listening (audit_hold_queue). Special care is taken in this patch to ensure that the proper record ordering is preserved, e.g. we send everything in the hold queue first, then the retry queue, and finally the main queue. 
Signed-off-by: Paul Moore --- kernel/audit.c | 347 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 226 insertions(+), 121 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6ac1df116c0b..f4056bc331fc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -138,11 +138,18 @@ static DEFINE_SPINLOCK(audit_freelist_lock); static int audit_freelist_count; static LIST_HEAD(audit_freelist); +/* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; -/* queue of skbs to send to auditd when/if it comes back */ +/* queue msgs due to temporary unicast send problems */ +static struct sk_buff_head audit_retry_queue; +/* queue msgs waiting for new auditd connection */ static struct sk_buff_head audit_hold_queue; + +/* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); + +/* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, @@ -364,25 +371,6 @@ static int audit_set_failure(u32 state) return audit_do_config_change("audit_failure", &audit_failure, state); } -/* - * Queue skbs to be sent to auditd when/if it comes back. These skbs should - * already have been sent via prink/syslog and so if these messages are dropped - * it is not a huge concern since we already passed the audit_log_lost() - * notification and stuff. This is just nice to get audit messages during - * boot before auditd is running or messages generated while auditd is stopped. - * This only holds messages is audit_default is set, aka booting with audit=1 - * or building your kernel that way. - */ -static void audit_hold_skb(struct sk_buff *skb) -{ - if (audit_default && - (!audit_backlog_limit || - skb_queue_len(&audit_hold_queue) < audit_backlog_limit)) - skb_queue_tail(&audit_hold_queue, skb); - else - kfree_skb(skb); -} - /* * For one reason or another this nlh isn't getting delivered to the userspace * audit daemon, just send it to printk. @@ -398,58 +386,114 @@ static void kauditd_printk_skb(struct sk_buff *skb) else audit_log_lost("printk limit exceeded"); } +} + +/** + * kauditd_hold_skb - Queue an audit record, waiting for auditd + * @skb: audit record + * + * Description: + * Queue the audit record, waiting for an instance of auditd. When this + * function is called we haven't given up yet on sending the record, but things + * are not looking good. The first thing we want to do is try to write the + * record via printk and then see if we want to try and hold on to the record + * and queue it, if we have room. If we want to hold on to the record, but we + * don't have room, record a record lost message. + */ +static void kauditd_hold_skb(struct sk_buff *skb) +{ + /* at this point it is uncertain if we will ever send this to auditd so + * try to send the message via printk before we go any further */ + kauditd_printk_skb(skb); + + /* can we just silently drop the message? 
*/ + if (!audit_default) { + kfree_skb(skb); + return; + } + + /* if we have room, queue the message */ + if (!audit_backlog_limit || + skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_hold_queue, skb); + return; + } - audit_hold_skb(skb); + /* we have no other options - drop the message */ + audit_log_lost("kauditd hold queue overflow"); + kfree_skb(skb); } -static void kauditd_send_unicast_skb(struct sk_buff *skb) +/** + * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd + * @skb: audit record + * + * Description: + * Not as serious as kauditd_hold_skb() as we still have a connected auditd, + * but for some reason we are having problems sending it audit records so + * queue the given record and attempt to resend. + */ +static void kauditd_retry_skb(struct sk_buff *skb) { - int err; - int attempts = 0; -#define AUDITD_RETRIES 5 + /* NOTE: because records should only live in the retry queue for a + * short period of time, before either being sent or moved to the hold + * queue, we don't currently enforce a limit on this queue */ + skb_queue_tail(&audit_retry_queue, skb); +} + +/** + * auditd_reset - Disconnect the auditd connection + * + * Description: + * Break the auditd/kauditd connection and move all the records in the retry + * queue into the hold queue in case auditd reconnects. + */ +static void auditd_reset(void) +{ + struct sk_buff *skb; + + /* break the connection */ + audit_pid = 0; + audit_sock = NULL; + + /* flush all of the retry queue to the hold queue */ + while ((skb = skb_dequeue(&audit_retry_queue))) + kauditd_hold_skb(skb); +} + +/** + * kauditd_send_unicast_skb - Send a record via unicast to auditd + * @skb: audit record + */ +static int kauditd_send_unicast_skb(struct sk_buff *skb) +{ + int rc; -restart: - /* take a reference in case we can't send it and we want to hold it */ + /* get an extra skb reference in case we fail to send */ skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); - if (err < 0) { - pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n", - audit_pid, err); - if (audit_pid) { - if (err == -ECONNREFUSED || err == -EPERM - || ++attempts >= AUDITD_RETRIES) { - char s[32]; - - snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid); - audit_log_lost(s); - audit_pid = 0; - audit_sock = NULL; - } else { - pr_warn("re-scheduling(#%d) write to audit_pid=%d\n", - attempts, audit_pid); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - goto restart; - } - } - /* we might get lucky and get this in the next auditd */ - audit_hold_skb(skb); - } else - /* drop the extra reference if sent ok */ + rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); + if (rc >= 0) { consume_skb(skb); + rc = 0; + } + + return rc; } /* - * kauditd_send_multicast_skb - send the skb to multicast userspace listeners + * kauditd_send_multicast_skb - Send a record to any multicast listeners + * @skb: audit record * + * Description: * This function doesn't consume an skb as might be expected since it has to * copy it anyways. 
*/ -static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) +static void kauditd_send_multicast_skb(struct sk_buff *skb) { - struct sk_buff *copy; - struct audit_net *aunet = net_generic(&init_net, audit_net_id); - struct sock *sock = aunet->nlsk; + struct sk_buff *copy; + struct audit_net *aunet = net_generic(&init_net, audit_net_id); + struct sock *sock = aunet->nlsk; + struct nlmsghdr *nlh; if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; @@ -464,94 +508,155 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) * no reason for new multicast clients to continue with this * non-compliance. */ - copy = skb_copy(skb, gfp_mask); + copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; + nlh = nlmsg_hdr(copy); + nlh->nlmsg_len = skb->len; - nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask); + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } -/* - * flush_hold_queue - empty the hold queue if auditd appears - * - * If auditd just started, drain the queue of messages already - * sent to syslog/printk. Remember loss here is ok. We already - * called audit_log_lost() if it didn't go out normally. so the - * race between the skb_dequeue and the next check for audit_pid - * doesn't matter. +/** + * kauditd_wake_condition - Return true when it is time to wake kauditd_thread * - * If you ever find kauditd to be too slow we can get a perf win - * by doing our own locking and keeping better track if there - * are messages in this queue. I don't see the need now, but - * in 5 years when I want to play with this again I'll see this - * note and still have no friggin idea what i'm thinking today. + * Description: + * This function is for use by the wait_event_freezable() call in + * kauditd_thread(). 
*/ -static void flush_hold_queue(void) +static int kauditd_wake_condition(void) { - struct sk_buff *skb; - - if (!audit_default || !audit_pid) - return; - - skb = skb_dequeue(&audit_hold_queue); - if (likely(!skb)) - return; + static int pid_last = 0; + int rc; + int pid = audit_pid; - while (skb && audit_pid) { - kauditd_send_unicast_skb(skb); - skb = skb_dequeue(&audit_hold_queue); - } + /* wake on new messages or a change in the connected auditd */ + rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last); + if (rc) + pid_last = pid; - /* - * if auditd just disappeared but we - * dequeued an skb we need to drop ref - */ - consume_skb(skb); + return rc; } static int kauditd_thread(void *dummy) { + int rc; + int auditd = 0; + int reschedule = 0; struct sk_buff *skb; struct nlmsghdr *nlh; +#define UNICAST_RETRIES 5 +#define AUDITD_BAD(x,y) \ + ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES) + + /* NOTE: we do invalidate the auditd connection flag on any sending + * errors, but we only "restore" the connection flag at specific places + * in the loop in order to help ensure proper ordering of audit + * records */ + set_freezable(); while (!kthread_should_stop()) { - flush_hold_queue(); + /* NOTE: possible area for future improvement is to look at + * the hold and retry queues, since only this thread + * has access to these queues we might be able to do + * our own queuing and skip some/all of the locking */ + + /* NOTE: it might be a fun experiment to split the hold and + * retry queue handling to another thread, but the + * synchronization issues and other overhead might kill + * any performance gains */ + + /* attempt to flush the hold queue */ + while (auditd && (skb = skb_dequeue(&audit_hold_queue))) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + /* requeue to the same spot */ + skb_queue_head(&audit_hold_queue, skb); + + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + auditd_reset(); + reschedule = 0; + } + } else + /* we were able to send successfully */ + reschedule = 0; + } + + /* attempt to flush the retry queue */ + while (auditd && (skb = skb_dequeue(&audit_retry_queue))) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + kauditd_hold_skb(skb); + auditd_reset(); + reschedule = 0; + } else + /* temporary problem (we hope), queue + * to the same spot and retry */ + skb_queue_head(&audit_retry_queue, skb); + } else + /* we were able to send successfully */ + reschedule = 0; + } + /* standard queue processing, try to be as quick as possible */ +quick_loop: skb = skb_dequeue(&audit_queue); if (skb) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ nlh = nlmsg_hdr(skb); - - /* if nlh->nlmsg_len is zero then we haven't attempted - * to send the message to userspace yet, if nlmsg_len - * is non-zero then we have attempted to send it to - * the multicast listeners as well as auditd; keep - * trying to send to auditd but don't repeat the - * multicast send */ - if (nlh->nlmsg_len == 0) { - nlh->nlmsg_len = skb->len; - kauditd_send_multicast_skb(skb, GFP_KERNEL); - - /* see the note in kauditd_send_multicast_skb - * regarding the nlh->nlmsg_len value and why - * it differs between the multicast and unicast - * clients */ - nlh->nlmsg_len -= NLMSG_HDRLEN; - } - - if (audit_pid) - kauditd_send_unicast_skb(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* attempt to send to any multicast listeners */ + kauditd_send_multicast_skb(skb); + + /* 
attempt to send to auditd, queue on failure */ + if (auditd) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + auditd_reset(); + reschedule = 0; + } + + /* move to the retry queue */ + kauditd_retry_skb(skb); + } else + /* everything is working so go fast! */ + goto quick_loop; + } else if (reschedule) + /* we are currently having problems, move to + * the retry queue */ + kauditd_retry_skb(skb); else - kauditd_printk_skb(skb); + /* dump the message via printk and hold it */ + kauditd_hold_skb(skb); } else { - /* we have flushed the backlog so wake everyone up who - * is blocked and go to sleep until we have something - * in the backlog again */ + /* we have flushed the backlog so wake everyone */ wake_up(&audit_backlog_wait); - wait_event_freezable(kauditd_wait, - skb_queue_len(&audit_queue)); + + /* if everything is okay with auditd (if present), go + * to sleep until there is something new in the queue + * or we have a change in the connected auditd; + * otherwise simply reschedule to give things a chance + * to recover */ + if (reschedule) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } else + wait_event_freezable(kauditd_wait, + kauditd_wake_condition()); + + /* update the auditd connection status */ + auditd = (audit_pid ? 1 : 0); } } + return 0; } @@ -616,6 +721,7 @@ static int audit_send_reply_thread(void *arg) kfree(reply); return 0; } + /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) @@ -1171,10 +1277,8 @@ static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); struct sock *sock = aunet->nlsk; - if (sock == audit_sock) { - audit_pid = 0; - audit_sock = NULL; - } + if (sock == audit_sock) + auditd_reset(); RCU_INIT_POINTER(aunet->nlsk, NULL); synchronize_net(); @@ -1201,6 +1305,7 @@ static int __init audit_init(void) register_pernet_subsys(&audit_net_ops); skb_queue_head_init(&audit_queue); + skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; -- cgit v1.2.3 From 3197542482df22c2a131d4a813280bd7c54cedf5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:25 -0500 Subject: audit: rework audit_log_start() The backlog queue handling in audit_log_start() is a little odd with some questionable design decisions, this patch attempts to rectify this with the following changes: * Never make auditd wait, ignore any backlog limits as we need auditd awake so it can drain the backlog queue. * When we hit a backlog limit and start dropping records, don't wake all the tasks sleeping on the backlog, that's silly. Instead, let kauditd_thread() take care of waking everyone once it has had a chance to drain the backlog queue. * Don't keep a global backlog timeout countdown, make it per-task. A per-task timer means we won't have all the sleeping tasks waking at the same time and hammering on an already stressed backlog queue. Signed-off-by: Paul Moore --- kernel/audit.c | 92 +++++++++++++++++++++++----------------------------------- 1 file changed, 36 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f4056bc331fc..e23ce6ce101f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -107,7 +107,6 @@ static u32 audit_rate_limit; * When set to zero, this means unlimited. 
*/ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) -static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ @@ -345,7 +344,7 @@ static int audit_set_backlog_limit(u32 limit) static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", - &audit_backlog_wait_time_master, timeout); + &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) @@ -973,7 +972,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time_master; + s.backlog_wait_time = audit_backlog_wait_time; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -1454,24 +1453,6 @@ static inline void audit_get_stamp(struct audit_context *ctx, } } -/* - * Wait for auditd to drain the queue a little - */ -static long wait_for_auditd(long sleep_time) -{ - DECLARE_WAITQUEUE(wait, current); - - if (audit_backlog_limit && - skb_queue_len(&audit_queue) > audit_backlog_limit) { - add_wait_queue_exclusive(&audit_backlog_wait, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); - sleep_time = schedule_timeout(sleep_time); - remove_wait_queue(&audit_backlog_wait, &wait); - } - - return sleep_time; -} - /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) @@ -1490,12 +1471,9 @@ static long wait_for_auditd(long sleep_time) struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { - struct audit_buffer *ab = NULL; - struct timespec t; - unsigned int uninitialized_var(serial); - int reserve = 5; /* Allow atomic callers to go up to five - entries over the normal backlog limit */ - unsigned long timeout_start = jiffies; + struct audit_buffer *ab; + struct timespec t; + unsigned int uninitialized_var(serial); if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1503,38 +1481,40 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; - if (gfp_mask & __GFP_DIRECT_RECLAIM) { - if (audit_pid && audit_pid == current->tgid) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; - else - reserve = 0; - } - - while (audit_backlog_limit - && skb_queue_len(&audit_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { - long sleep_time; - - sleep_time = timeout_start + audit_backlog_wait_time - jiffies; - if (sleep_time > 0) { - sleep_time = wait_for_auditd(sleep_time); - if (sleep_time > 0) - continue; + /* don't ever fail/sleep on auditd since we need auditd to drain the + * queue; also, when we are checking for auditd, compare PIDs using + * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() using + * a PID anchored in the caller's namespace */ + if (!(audit_pid && audit_pid == task_tgid_vnr(current))) { + long sleep_time = audit_backlog_wait_time; + + while (audit_backlog_limit && + (skb_queue_len(&audit_queue) > audit_backlog_limit)) { + /* wake kauditd to try and flush the queue */ + wake_up_interruptible(&kauditd_wait); + + /* sleep if we are allowed and we haven't exhausted our + * backlog wait limit */ + if ((gfp_mask & __GFP_DIRECT_RECLAIM) && + (sleep_time > 0)) { + DECLARE_WAITQUEUE(wait, current); + + 
add_wait_queue_exclusive(&audit_backlog_wait, + &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + sleep_time = schedule_timeout(sleep_time); + remove_wait_queue(&audit_backlog_wait, &wait); + } else { + if (audit_rate_check() && printk_ratelimit()) + pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", + skb_queue_len(&audit_queue), + audit_backlog_limit); + audit_log_lost("backlog limit exceeded"); + return NULL; } } - if (audit_rate_check() && printk_ratelimit()) - pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", - skb_queue_len(&audit_queue), - audit_backlog_limit); - audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = 0; - wake_up(&audit_backlog_wait); - return NULL; } - if (!reserve && !audit_backlog_wait_time) - audit_backlog_wait_time = audit_backlog_wait_time_master; - ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); @@ -1542,9 +1522,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_log_format(ab, "audit(%lu.%03lu:%u): ", t.tv_sec, t.tv_nsec/1000000, serial); + return ab; } -- cgit v1.2.3 From e1d166212894d9d959a601c4802882b877bb420a Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:25 -0500 Subject: audit: wake up kauditd_thread after auditd registers This patch was suggested by Richard Briggs back in 2015; see the link to the mail archive below. Unfortunately, that patch is no longer even remotely valid due to other changes to the code. * https://www.redhat.com/archives/linux-audit/2015-October/msg00075.html Suggested-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e23ce6ce101f..0572e5dcfda7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1009,6 +1009,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = new_pid; audit_nlk_portid = NETLINK_CB(skb).portid; audit_sock = skb->sk; + wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); -- cgit v1.2.3 From 6c54e7899693dee3db67ea996e9be0e10f67920f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:26 -0500 Subject: audit: handle a clean auditd shutdown with grace When auditd stops cleanly it sets 'auditd_pid' to 0 with an AUDIT_SET message; in this case we should reset our backlog queues via the auditd_reset() function. This patch also adds an 'auditd_pid' check to the top of kauditd_send_unicast_skb() so we can fail quicker.
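For context, a rough userspace sketch of the deregistration message a cleanly exiting auditd sends; this is illustrative only, not part of the patch, and the netlink socket setup is assumed to exist elsewhere:

	/* Hypothetical helper: a clean auditd shutdown deregisters itself by
	 * sending AUDIT_SET with pid == 0, which now triggers auditd_reset(). */
	#include <linux/audit.h>
	#include <linux/netlink.h>
	#include <string.h>
	#include <sys/socket.h>

	static int auditd_deregister(int nlsock)
	{
		struct {
			struct nlmsghdr nlh;
			struct audit_status s;
		} req;

		memset(&req, 0, sizeof(req));
		req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.s));
		req.nlh.nlmsg_type = AUDIT_SET;
		req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		req.s.mask = AUDIT_STATUS_PID;
		req.s.pid = 0;	/* pid == 0 signals a clean shutdown */

		return send(nlsock, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
	}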
Signed-off-by: Paul Moore --- kernel/audit.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0572e5dcfda7..b447a6b1fdc8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -468,6 +468,10 @@ static int kauditd_send_unicast_skb(struct sk_buff *skb) { int rc; + /* if we know nothing is connected, don't even try the netlink call */ + if (!audit_pid) + return -ECONNREFUSED; + /* get an extra skb reference in case we fail to send */ skb_get(skb); rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); @@ -1009,6 +1013,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = new_pid; audit_nlk_portid = NETLINK_CB(skb).portid; audit_sock = skb->sk; + if (!new_pid) + auditd_reset(); wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { -- cgit v1.2.3 From a09cfa470817ac086cf68418da13a2b91c2744ef Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:26 -0500 Subject: audit: don't ever sleep on a command record/message Sleeping on a command record/message in audit_log_start() could keep something, e.g. auditd, from doing something important, e.g. a clean shutdown, which could present problems on a heavily loaded system. This patch allows tasks to bypass any queue restrictions if they are logging a command record/message. Signed-off-by: Paul Moore --- kernel/audit.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b447a6b1fdc8..f20eee0db7e6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1488,11 +1488,19 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; - /* don't ever fail/sleep on auditd since we need auditd to drain the - * queue; also, when we are checking for auditd, compare PIDs using - * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() using - * a PID anchored in the caller's namespace */ + /* don't ever fail/sleep on these two conditions: + * 1. auditd generated record - since we need auditd to drain the + * queue; also, when we are checking for auditd, compare PIDs using + * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() + * using a PID anchored in the caller's namespace + * 2. audit command message - record types 1000 through 1099 inclusive + * are command messages/records used to manage the kernel subsystem + * and the audit userspace, blocking on these messages could cause + * problems under load so don't do it (note: not all of these + * command types are valid as record types, but it is quicker to + * just check two ints than a series of ints in a if/switch stmt) */ + if (!((audit_pid && audit_pid == task_tgid_vnr(current)) || + (type >= 1000 && type <= 1099))) { long sleep_time = audit_backlog_wait_time; while (audit_backlog_limit && -- cgit v1.2.3 From 533c7b69c764ad5febb3e716899f43a75564fcab Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 13 Dec 2016 10:03:01 -0500 Subject: audit: use proper refcount locking on audit_sock Resetting audit_sock appears to be racy. audit_sock was being copied and dereferenced without using a refcount on the source sock. Bump the refcount on the underlying sock when we store a reference in audit_sock and release it when we reset audit_sock. audit_sock modification needs the audit_cmd_mutex.
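Distilled, the handoff pattern looks like the sketch below; auditd_set() is an illustrative name, not a helper introduced by this patch:

	/* Sketch of the refcounted handoff; callers hold audit_cmd_mutex. */
	static void auditd_set(struct sock *sk)
	{
		if (audit_sock)
			sock_put(audit_sock);	/* drop the reference on the old sock */
		if (sk)
			sock_hold(sk);		/* take a reference before publishing */
		audit_sock = sk;
	}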
See: https://lkml.org/lkml/2016/11/26/232 Thanks to Eric Dumazet and Cong Wang for ideas on how to fix it. Signed-off-by: Richard Guy Briggs Reviewed-by: Cong Wang [PM: fixed the comment block text formatting for auditd_reset()] Signed-off-by: Paul Moore --- kernel/audit.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f20eee0db7e6..41017685f9f2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -445,15 +445,20 @@ static void kauditd_retry_skb(struct sk_buff *skb) * * Description: * Break the auditd/kauditd connection and move all the records in the retry - * queue into the hold queue in case auditd reconnects. + * queue into the hold queue in case auditd reconnects. The audit_cmd_mutex + * must be held when calling this function. */ static void auditd_reset(void) { struct sk_buff *skb; /* break the connection */ + if (audit_sock) { + sock_put(audit_sock); + audit_sock = NULL; + } audit_pid = 0; - audit_sock = NULL; + audit_nlk_portid = 0; /* flush all of the retry queue to the hold queue */ while ((skb = skb_dequeue(&audit_retry_queue))) @@ -579,7 +584,9 @@ static int kauditd_thread(void *dummy) auditd = 0; if (AUDITD_BAD(rc, reschedule)) { + mutex_lock(&audit_cmd_mutex); auditd_reset(); + mutex_unlock(&audit_cmd_mutex); reschedule = 0; } } else @@ -594,7 +601,9 @@ static int kauditd_thread(void *dummy) auditd = 0; if (AUDITD_BAD(rc, reschedule)) { kauditd_hold_skb(skb); + mutex_lock(&audit_cmd_mutex); auditd_reset(); + mutex_unlock(&audit_cmd_mutex); reschedule = 0; } else /* temporary problem (we hope), queue @@ -623,7 +632,9 @@ quick_loop: if (rc) { auditd = 0; if (AUDITD_BAD(rc, reschedule)) { + mutex_lock(&audit_cmd_mutex); auditd_reset(); + mutex_unlock(&audit_cmd_mutex); reschedule = 0; } @@ -1010,11 +1021,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); - audit_pid = new_pid; - audit_nlk_portid = NETLINK_CB(skb).portid; - audit_sock = skb->sk; - if (!new_pid) + if (new_pid) { + if (audit_sock) + sock_put(audit_sock); + audit_pid = new_pid; + audit_nlk_portid = NETLINK_CB(skb).portid; + sock_hold(skb->sk); + audit_sock = skb->sk; + } else { auditd_reset(); + } wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { @@ -1283,8 +1299,10 @@ static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); struct sock *sock = aunet->nlsk; + mutex_lock(&audit_cmd_mutex); if (sock == audit_sock) auditd_reset(); + mutex_unlock(&audit_cmd_mutex); RCU_INIT_POINTER(aunet->nlsk, NULL); synchronize_net(); -- cgit v1.2.3 From 4d1f0fb096aedea7bb5489af93498a82e467c480 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 14 Dec 2016 15:04:04 -0800 Subject: kernel/watchdog: use nmi registers snapshot in hardlockup handler The NMI handler doesn't call set_irq_regs(); it's set only by normal IRQ. Thus get_irq_regs() returns NULL or a stale registers snapshot with IP/SP pointing to the code interrupted by the IRQ which was interrupted by the NMI. NULL isn't a problem: in this case the watchdog calls dump_stack() and prints the full stack trace including the NMI. But if we're stuck in an IRQ handler then the NMI watchdog will print a stack trace without the IRQ part at all. This patch uses the registers snapshot passed into the NMI handler as an argument: these registers point exactly to the instruction interrupted by the NMI.
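A minimal sketch of the fixed reporting path; report_hardlockup() is an illustrative extraction, not a function added by the patch:

	/* @regs comes straight from the perf NMI callback, so it points at
	 * the instruction the NMI interrupted - no get_irq_regs() needed. */
	static void report_hardlockup(struct pt_regs *regs, int this_cpu)
	{
		pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
		if (regs)
			show_regs(regs);	/* exact NMI-time context */
		else
			dump_stack();		/* fall back to the current stack */
	}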
Fixes: 55537871ef66 ("kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup") Link: http://lkml.kernel.org/r/146771764784.86724.6006627197118544150.stgit@buzz Signed-off-by: Konstantin Khlebnikov Cc: Jiri Kosina Cc: Ulrich Obergfell Cc: Aaron Tomlin Cc: [4.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9acb29f280ec..6d1020c03d41 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -344,7 +344,6 @@ static void watchdog_overflow_callback(struct perf_event *event, */ if (is_hardlockup()) { int this_cpu = smp_processor_id(); - struct pt_regs *regs = get_irq_regs(); /* only print hardlockups once */ if (__this_cpu_read(hard_watchdog_warn) == true) -- cgit v1.2.3 From c7be96af89d4b53211862d8599b2430e8900ed92 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 14 Dec 2016 15:04:10 -0800 Subject: signals: avoid unnecessary taking of sighand->siglock When running a certain database workload on a high-end system with many CPUs, it was found that spinlock contention in the sigprocmask syscalls became a significant portion of the overall CPU cycles, as shown below.

  9.30%  9.30%  905387  dataserver  /proc/kcore  0x7fff8163f4d2  [k] _raw_spin_lock_irq
         |
         ---_raw_spin_lock_irq
            |
            |--99.34%-- __set_current_blocked
            |           sigprocmask
            |           sys_rt_sigprocmask
            |           system_call_fastpath
            |           |
            |           |--50.63%-- __swapcontext
            |           |           |
            |           |           |--99.91%-- upsleepgeneric
            |           |
            |           |--49.36%-- __setcontext
            |                       ktskRun

Looking further into the swapcontext function in glibc, it was found that the function always calls sigprocmask() without checking if there are changes in the signal mask. A check was added to the __set_current_blocked() function to avoid taking the sighand->siglock spinlock if there is no change in the signal mask. This will prevent unneeded spinlock contention when many threads are trying to call sigprocmask(). With this patch applied, the spinlock contention in sigprocmask() was gone. Link: http://lkml.kernel.org/r/1474979209-11867-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Waiman Long Acked-by: Oleg Nesterov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Stas Sergeev Cc: Scott J Norton Cc: Douglas Hatch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 17 +++++++++++++++++ kernel/signal.c | 7 +++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index b63f63eaa39c..5308304993be 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -97,6 +97,23 @@ static inline int sigisemptyset(sigset_t *set) } } +static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) +{ + switch (_NSIG_WORDS) { + case 4: + return (set1->sig[3] == set2->sig[3]) && + (set1->sig[2] == set2->sig[2]) && + (set1->sig[1] == set2->sig[1]) && + (set1->sig[0] == set2->sig[0]); + case 2: + return (set1->sig[1] == set2->sig[1]) && + (set1->sig[0] == set2->sig[0]); + case 1: + return set1->sig[0] == set2->sig[0]; + } + return 0; +} + #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS diff --git a/kernel/signal.c b/kernel/signal.c index 29a410780aa9..ae60996fedff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; + /* + * In case the signal mask hasn't changed, there is nothing we need + * to do.
The current->blocked shouldn't be modified by other task. */ + if (sigequalsets(&tsk->blocked, newset)) + return; + spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, newset); spin_unlock_irq(&tsk->sighand->siglock); -- cgit v1.2.3 From 760c6a9139c37e16502362b22656d0cc4e840e8f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 14 Dec 2016 15:04:14 -0800 Subject: coredump: clarify "unsafe core_pattern" warning I was amused to find the "unsafe core_pattern" warning while having these lines in /etc/sysctl.conf:

  fs.suid_dumpable=2
  kernel.core_pattern=/core/core-%e-%p-%E
  kernel.core_uses_pid=0

Turns out the kernel is formally right. Default core_pattern is just "core", which doesn't qualify as a secure path while setting fs.suid_dumpable. Hint admins about the solution, clarify the sysctl names, delete unnecessary '\' characters (string literals are concatenated regardless), and reformat for easier grepping. Link: http://lkml.kernel.org/r/20161029152124.GA1258@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 39b3368f6de6..1475d2545b7e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void) #ifdef CONFIG_COREDUMP if (suid_dumpable == SUID_DUMP_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING "Unsafe core_pattern used with "\ - "suid_dumpable=2. Pipe handler or fully qualified "\ - "core dump path required.\n"); + printk(KERN_WARNING +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); } #endif } -- cgit v1.2.3 From 401721ecd1dcb0a428aa5d6832ee05ffbdbffbbe Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 14 Dec 2016 15:04:20 -0800 Subject: kexec: export the value of phys_base instead of symbol address Currently in x86_64, the symbol address of phys_base is exported to vmcoreinfo. Dave Anderson complained this is really useless for his Crash implementation. In the user-space utilities Crash and Makedumpfile, which the exported vmcore information is mainly used for, the value of phys_base is needed to convert the virtual address of an exported kernel symbol to a physical address. Take init_level4_pgt especially: if we want to access and walk the page table to look up the PA corresponding to a VA, we first need to calculate page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base; Now in Crash and Makedumpfile, we have to analyze the vmcore ELF program headers to get the value of phys_base. As Dave said, it would be preferable if it were readily available in vmcoreinfo rather than depending upon the PT_LOAD semantics. Hence this patch changes to exporting the value of phys_base instead of its virtual address. And people also complained that the KERNEL_IMAGE_SIZE export is x86_64 only and should be moved into the arch-dependent function arch_crash_save_vmcoreinfo(). Do the moving in this patch. Link: http://lkml.kernel.org/r/1478568596-30060-2-git-send-email-bhe@redhat.com Signed-off-by: Baoquan He Cc: Thomas Garnier Cc: Baoquan He Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Eric Biederman Cc: Xunlei Pang Cc: HATAYAMA Daisuke Cc: Kees Cook Cc: Eugene Surovegin Cc: Dave Young Cc: AKASHI Takahiro Cc: Atsushi Kumagai Cc: Dave Anderson Cc: Pratyush Anand Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 3 ++- kernel/kexec_core.c | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 5a294e48b185..307b1f4543de 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { - VMCOREINFO_SYMBOL(phys_base); + VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA @@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 561675589511..8ad3a29eb728 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1467,9 +1467,6 @@ static int __init crash_save_vmcoreinfo_init(void) #endif VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_X86 - VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); #endif -- cgit v1.2.3 From 8e53c073a41795365eb0e47ae23441dc01c70511 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 14 Dec 2016 15:04:23 -0800 Subject: kexec: add cond_resched into kimage_alloc_crash_control_pages A soft lookup will occur when I run trinity in syscall kexec_load. the corresponding stack information is as follows. BUG: soft lockup - CPU#6 stuck for 22s! [trinity-c6:13859] Kernel panic - not syncing: softlockup: hung tasks CPU: 6 PID: 13859 Comm: trinity-c6 Tainted: G O L ----V------- 3.10.0-327.28.3.35.zhongjiang.x86_64 #1 Hardware name: Huawei Technologies Co., Ltd. Tecal BH622 V2/BC01SRSA0, BIOS RMIBV386 06/30/2014 Call Trace: dump_stack+0x19/0x1b panic+0xd8/0x214 watchdog_timer_fn+0x1cc/0x1e0 __hrtimer_run_queues+0xd2/0x260 hrtimer_interrupt+0xb0/0x1e0 ? call_softirq+0x1c/0x30 local_apic_timer_interrupt+0x37/0x60 smp_apic_timer_interrupt+0x3f/0x60 apic_timer_interrupt+0x6d/0x80 ? kimage_alloc_control_pages+0x80/0x270 ? kmem_cache_alloc_trace+0x1ce/0x1f0 ? do_kimage_alloc_init+0x1f/0x90 kimage_alloc_init+0x12a/0x180 SyS_kexec_load+0x20a/0x260 system_call_fastpath+0x16/0x1b the first time allocation of control pages may take too much time because crash_res.end can be set to a higher value. we need to add cond_resched to avoid the issue. The patch have been tested and above issue is not appear. Link: http://lkml.kernel.org/r/1481164674-42775-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Acked-by: "Eric W. 
Biederman" Cc: Xunlei Pang Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8ad3a29eb728..5617cc412444 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, while (hole_end <= crashk_res.end) { unsigned long i; + cond_resched(); + if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ -- cgit v1.2.3 From 7560ef39dc0bfebba8f43766d8bb4c1ec5eb66fc Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 14 Dec 2016 15:04:26 -0800 Subject: sysctl: add KERN_CONT to deprecated_sysctl_warning() Do not break lines while printk()ing values. kernel: warning: process `tomoyo_file_tes' used the deprecated sysctl system call with kernel: 3. kernel: 5. kernel: 56. kernel: Link: http://lkml.kernel.org/r/1480814833-4976-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6eb99c17dbd8..ece4b177052b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen) "warning: process `%s' used the deprecated sysctl " "system call with ", current->comm); for (i = 0; i < nlen; i++) - printk("%d.", name[i]); - printk("\n"); + printk(KERN_CONT "%d.", name[i]); + printk(KERN_CONT "\n"); } return; } -- cgit v1.2.3 From 9a29d0fbc2d9ad99fb8a981ab72548cc360e9d4c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Dec 2016 15:05:38 -0800 Subject: relay: check array offset before using it Smatch complains that we started using the array offset before we checked that it was valid. Fixes: 017c59c042d0 ('relay: Use per CPU constructs for the relay channel buffer pointers') Link: http://lkml.kernel.org/r/20161013084947.GC16198@mwanda Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index da79a109dbeb..8f18d314a96a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan, { struct rchan_buf *buf; - if (!chan) + if (!chan || cpu >= NR_CPUS) return; buf = *per_cpu_ptr(chan->buf, cpu); - if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) + if (!buf || subbufs_consumed > chan->n_subbufs) return; if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) -- cgit v1.2.3 From db862358a4a96f52d3b0c713c703828f90d97de9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 14 Dec 2016 15:05:46 -0800 Subject: kcov: add more missing includes It is fragile that some definitions acquired via transitive dependencies, as shown in below: atomic_* () ENOMEM/EN* () EXPORT_SYMBOL () device_initcall () preempt_* () Include them to prevent possible issues. 
Link: http://lkml.kernel.org/r/1481163221-40170-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Suggested-by: Mark Rutland Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Mark Rutland Cc: James Morse Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 3cbb0c879705..cc2fa35ca480 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1,11 +1,16 @@ #define pr_fmt(fmt) "kcov: " fmt #define DISABLE_BRANCH_PROFILING +#include <linux/atomic.h> #include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/init.h> #include <linux/mm.h> +#include <linux/preempt.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/spinlock.h> -- cgit v1.2.3 From 2d13bb6494c807bcf3f78af0e96c0b8615a94385 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 14 Dec 2016 15:05:49 -0800 Subject: kernel/debug/debug_core.c: more properly delay for secondary CPUs We've got a delay loop waiting for secondary CPUs. That loop uses loops_per_jiffy. However, loops_per_jiffy doesn't actually mean how many tight loops make up a jiffy on all architectures. It is quite common to see things like this in the boot log: Calibrating delay loop (skipped), value calculated using timer frequency.. 48.00 BogoMIPS (lpj=24000) In my case I was seeing lots of cases where other CPUs timed out entering the debugger only to print their stack crawls shortly after the kdb> prompt was written. Elsewhere in kgdb we already use udelay(), so that should be safe enough to use to implement our timeout. We'll delay 1 ms for 1000 times, which should give us a full second of delay (just like the old code wanted) but allow us to notice that we're done every 1 ms. [akpm@linux-foundation.org: simplifications, per Daniel] Link: http://lkml.kernel.org/r/1477091361-2039-1-git-send-email-dianders@chromium.org Signed-off-by: Douglas Anderson Reviewed-by: Daniel Thompson Cc: Jason Wessel Cc: Brian Norris Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/debug/debug_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0874e2edd275..79517e5549f1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -598,11 +598,11 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - time_left = loops_per_jiffy * HZ; + time_left = MSEC_PER_SEC; while (kgdb_do_roundup && --time_left && (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != online_cpus) - cpu_relax(); + udelay(1000); if (!time_left) pr_crit("Timed out waiting for secondary CPUs.\n"); -- cgit v1.2.3 From d1bd8ead126668a2d6c42d97cc3664e95b3fa1dc Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Dec 2016 15:05:52 -0800 Subject: kdb: remove unused kdb_event handling The kdb_event state variable is only set but never checked in the kernel code. http://www.spinics.net/lists/kdb/msg01733.html suggests that this variable affected WARN_CONSOLE_UNLOCKED() in the original implementation. But this check never went upstream. The semantics are unclear and racy. The value is updated after the kdb_printf_lock is acquired and after it is released. It should be symmetric at minimum. The value should be manipulated either inside or outside the locked area. Fortunately, it seems that the original function is gone and we could simply remove the state variable.
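Schematically, the removed pattern (a restatement of the hunks below, not new code) shows the asymmetry:

	spin_lock_irqsave(&kdb_printf_lock, flags);
	atomic_inc(&kdb_event);		/* bumped inside the locked region */
	/* ... print ... */
	spin_unlock_irqrestore(&kdb_printf_lock, flags);
	atomic_dec(&kdb_event);		/* dropped after the lock is released */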
Link: http://lkml.kernel.org/r/1480412276-16690-2-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Suggested-by: Daniel Thompson Cc: Jason Wessel Cc: Peter Zijlstra Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdb.h | 1 - kernel/debug/kdb/kdb_io.c | 2 -- kernel/debug/kdb/kdb_main.c | 1 - 3 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 410decacff8f..eb706188dc23 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -77,7 +77,6 @@ extern int kdb_poll_idx; * number whenever the kernel debugger is entered. */ extern int kdb_initial_cpu; -extern atomic_t kdb_event; /* Types and messages used for dynamically added kdb shell commands */ diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 98c9011eac78..46f477bebe0c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -576,7 +576,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) KDB_STATE_SET(PRINTF_LOCK); spin_lock_irqsave(&kdb_printf_lock, flags); got_printf_lock = 1; - atomic_inc(&kdb_event); } else { __acquire(kdb_printf_lock); } @@ -851,7 +850,6 @@ kdb_print_out: got_printf_lock = 0; spin_unlock_irqrestore(&kdb_printf_lock, flags); KDB_STATE_CLEAR(PRINTF_LOCK); - atomic_dec(&kdb_event); } else { __release(kdb_printf_lock); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2a20c0dfdafc..ca183919d302 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -60,7 +60,6 @@ int kdb_grep_trailing; * Kernel debugger state flags */ int kdb_flags; -atomic_t kdb_event; /* * kdb_lock protects updates to kdb_initial_cpu. Used to -- cgit v1.2.3 From d5d8d3d0d4adcc3aec6e2e0fb656165014a712b7 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Dec 2016 15:05:55 -0800 Subject: kdb: properly synchronize vkdb_printf() calls with other CPUs kdb_printf_lock does not prevent other CPUs from entering the critical section because it is ignored when KDB_STATE_PRINTF_LOCK is set. The problematic situation might look like:

   CPU0                                        CPU1

   vkdb_printf()
     if (!KDB_STATE(PRINTF_LOCK))
       KDB_STATE_SET(PRINTF_LOCK);
       spin_lock_irqsave(&kdb_printf_lock, flags);

                                               vkdb_printf()
                                                 if (!KDB_STATE(PRINTF_LOCK))

BANG: The PRINTF_LOCK state is set and CPU1 is entering the critical section without spinning on the lock. The problem is that the code tries to implement locking using two state variables that are not handled atomically. Well, we need custom locking because we want to allow reentering the critical section on the very same CPU. Let's use the solution from Peter Zijlstra that was proposed for a similar scenario, see https://lkml.kernel.org/r/20161018171513.734367391@infradead.org This patch uses the same trick with cmpxchg(). The only difference is that we want to handle only recursion from the same context and therefore we disable interrupts. In addition, KDB_STATE_PRINTF_LOCK is removed. In fact, we are not able to set it in a non-racy way.
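The core of the scheme, pulled out of the hunks below for readability (a sketch, not an additional change):

	/* A CPU "owns" the printf path; the same CPU may re-enter,
	 * every other CPU spins until the owner releases it. */
	static int kdb_printf_cpu = -1;
	unsigned long flags;
	int this_cpu, old_cpu;

	local_irq_save(flags);
	this_cpu = smp_processor_id();
	for (;;) {
		old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu);
		if (old_cpu == -1 || old_cpu == this_cpu)
			break;	/* acquired, or recursing on this CPU */
		cpu_relax();
	}
	/* ... critical section ... */
	smp_store_release(&kdb_printf_cpu, old_cpu);	/* -1 unless recursive */
	local_irq_restore(flags);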
Link: http://lkml.kernel.org/r/1480412276-16690-3-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Daniel Thompson Cc: Jason Wessel Cc: Peter Zijlstra Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_io.c | 30 +++++++++++++----------------- kernel/debug/kdb/kdb_private.h | 1 - 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 46f477bebe0c..daa76154fb1b 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -555,16 +555,16 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int colcount; int logging, saved_loglevel = 0; int saved_trap_printk; - int got_printf_lock = 0; int retlen = 0; int fnd, len; + int this_cpu, old_cpu; + static int kdb_printf_cpu = -1; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; struct console *c = console_drivers; - static DEFINE_SPINLOCK(kdb_printf_lock); unsigned long uninitialized_var(flags); - preempt_disable(); + local_irq_save(flags); saved_trap_printk = kdb_trap_printk; kdb_trap_printk = 0; @@ -572,12 +572,13 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) * But if any cpu goes recursive in kdb, just print the output, * even if it is interleaved with any other text. */ - if (!KDB_STATE(PRINTF_LOCK)) { - KDB_STATE_SET(PRINTF_LOCK); - spin_lock_irqsave(&kdb_printf_lock, flags); - got_printf_lock = 1; - } else { - __acquire(kdb_printf_lock); + this_cpu = smp_processor_id(); + for (;;) { + old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu); + if (old_cpu == -1 || old_cpu == this_cpu) + break; + + cpu_relax(); } diag = kdbgetintenv("LINES", &linecount); @@ -846,15 +847,10 @@ kdb_print_out: suspend_grep = 0; /* end of what may have been a recursive call */ if (logging) console_loglevel = saved_loglevel; - if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { - got_printf_lock = 0; - spin_unlock_irqrestore(&kdb_printf_lock, flags); - KDB_STATE_CLEAR(PRINTF_LOCK); - } else { - __release(kdb_printf_lock); - } + /* kdb_printf_cpu locked the code above. */ + smp_store_release(&kdb_printf_cpu, old_cpu); kdb_trap_printk = saved_trap_printk; - preempt_enable(); + local_irq_restore(flags); return retlen; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 75014d7f4568..fc224fbcf954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -132,7 +132,6 @@ extern int kdb_state; #define KDB_STATE_PAGER 0x00000400 /* pager is available */ #define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching * back to initial cpu */ -#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ #define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ #define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ #define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been -- cgit v1.2.3 From 34aaff40b42148b23dcde40152480e25c7d2d759 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Dec 2016 15:05:58 -0800 Subject: kdb: call vkdb_printf() from vprintk_default() only when wanted kdb_trap_printk allows passing normal printk() messages to kdb via vkdb_printf(). For example, it is used to get a backtrace using the classic show_stack(), see kdb_show_stack(). vkdb_printf() tries to avoid a potential infinite loop by disabling the trap.
But this approach is racy, for example:

   CPU1                                        CPU2

   vkdb_printf()
     // assume that kdb_trap_printk == 0
     saved_trap_printk = kdb_trap_printk;
     kdb_trap_printk = 0;

                                               kdb_show_stack()
                                                 kdb_trap_printk++;

   Problem1: Now, a nested printk() on CPU1 calls vkdb_printf()
     even when it should have been disabled. It will not cause
     a deadlock but...

     // using the outdated saved value: 0
     kdb_trap_printk = saved_trap_printk;

                                                 kdb_trap_printk--;

   Problem2: Now, kdb_trap_printk == -1 and will stay like this.
     It means that all messages will get passed to kdb from now on.

This patch removes the racy saved_trap_printk handling. Instead, the recursion is prevented by a check for the locked CPU. The solution is still kind of racy. A non-related printk(), from another process, might get trapped by vkdb_printf(). And the wanted printk() might not get trapped because kdb_printf_cpu is assigned. But this problem existed even with the original code. A proper solution would be to get_cpu() before setting kdb_trap_printk and trap messages only from this CPU. I am not sure if it is worth the effort, though. In fact, the race is very theoretical. When kdb is running any of the commands that use kdb_trap_printk there is a single active CPU and the other CPUs should be in a holding pen inside kgdb_cpu_enter(). The only time this is violated is when there is a timeout waiting for the other CPUs to report to the holding pen. Finally, note that the situation is a bit schizophrenic. vkdb_printf() explicitly allows recursion but only from KDB code that calls kdb_printf() directly. On the other hand, the generic printk() recursion is not allowed because it might cause an infinite loop. This is why we could not hide the decision inside vkdb_printf() easily. Link: http://lkml.kernel.org/r/1480412276-16690-4-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Daniel Thompson Cc: Jason Wessel Cc: Peter Zijlstra Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdb.h | 1 + kernel/debug/kdb/kdb_io.c | 9 ++------- kernel/printk/printk.c | 3 ++- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index eb706188dc23..68bd88223417 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -161,6 +161,7 @@ enum kdb_msgsrc { }; extern int kdb_trap_printk; +extern int kdb_printf_cpu; extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list args); extern __printf(1, 2) int kdb_printf(const char *, ...); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index daa76154fb1b..e74be38245ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -30,6 +30,7 @@ char kdb_prompt_str[CMD_BUFLEN]; int kdb_trap_printk; +int kdb_printf_cpu = -1; static int kgdb_transition_check(char *buffer) { @@ -554,24 +555,19 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int linecount; int colcount; int logging, saved_loglevel = 0; - int saved_trap_printk; int retlen = 0; int fnd, len; int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; struct console *c = console_drivers; unsigned long uninitialized_var(flags); - local_irq_save(flags); - saved_trap_printk = kdb_trap_printk; - kdb_trap_printk = 0; - /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, * even if it is interleaved with any other text.
*/ + local_irq_save(flags); this_cpu = smp_processor_id(); for (;;) { old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu); @@ -849,7 +845,6 @@ kdb_print_out: console_loglevel = saved_loglevel; /* kdb_printf_cpu locked the code above. */ smp_store_release(&kdb_printf_cpu, old_cpu); - kdb_trap_printk = saved_trap_printk; local_irq_restore(flags); return retlen; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 577f2288d19f..a3ce35e0fa1e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1926,7 +1926,8 @@ int vprintk_default(const char *fmt, va_list args) int r; #ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); return r; } -- cgit v1.2.3 From b6f8a92c9ca835b4a079ecee8433d0d110398448 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 14 Dec 2016 15:06:13 -0800 Subject: posix-timers: give lazy compilers some help optimizing code away The OpenRISC compiler (so far) fails to optimize away a large portion of code containing a reference to posix_timer_event in alarmtimer.c when CONFIG_POSIX_TIMERS is unset. Let's give it a direct clue to let the build succeed. This fixes [linux-next:master 6682/7183] alarmtimer.c:undefined reference to `posix_timer_event' reported by kbuild test robot. Signed-off-by: Nicolas Pitre Cc: Thomas Gleixner Cc: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/alarmtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9b08ca391aed..3921cf7fea8e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (posix_timer_event(ptr, 0) != 0) + if (IS_ENABLED(CONFIG_POSIX_TIMERS) && + posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; } -- cgit v1.2.3 From 249e52e35580fcfe5dad53a7dcd7c1252788749c Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 14 Dec 2016 15:06:21 -0800 Subject: kernel/watchdog.c: move shared definitions to nmi.h Patch series "Clean up watchdog handlers", v2. This is an attempt to clean up the watchdog handlers. Right now, kernel/watchdog.c implements both softlockup and hardlockup detectors. Softlockup code is generic. Hardlockup code is arch specific. Some architectures don't use hardlockup detectors. They use their own watchdog detectors. To make all these combinations work, we have numerous #ifdefs in kernel/watchdog.c. We are trying here to make these handlers independent of each other. Also provide an interface for architectures to implement their own handlers. watchdog_nmi_enable and watchdog_nmi_disable will be defined as weak such that architectures can override their definitions. Thanks to Don Zickus for his suggestions. Here are our previous discussions: http://www.spinics.net/lists/sparclinux/msg16543.html http://www.spinics.net/lists/sparclinux/msg16441.html This patch (of 3): Move shared macros and definitions to nmi.h so that watchdog.c, the new file watchdog_hld.c, or any other architecture-specific handler can use those definitions.
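To illustrate the weak-symbol arrangement described above (a sketch; the arch-side body is hypothetical):

	/* kernel/watchdog.c: weak no-op hooks used when an architecture
	 * provides nothing of its own. */
	int __weak watchdog_nmi_enable(unsigned int cpu)
	{
		return 0;
	}

	void __weak watchdog_nmi_disable(unsigned int cpu)
	{
	}

	/* arch code (a separate object file): a strong definition of the
	 * same symbol wins at link time, e.g. */
	int watchdog_nmi_enable(unsigned int cpu)
	{
		/* arm this architecture's own NMI/watchdog source for @cpu */
		return 0;
	}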
Link: http://lkml.kernel.org/r/1478034826-43888-2-git-send-email-babu.moger@oracle.com Signed-off-by: Babu Moger Acked-by: Don Zickus Cc: Ingo Molnar Cc: Jiri Kosina Cc: Andi Kleen Cc: Yaowei Bai Cc: Aaron Tomlin Cc: Ulrich Obergfell Cc: Tejun Heo Cc: Hidehiro Kawai Cc: Josh Hunt Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 24 ++++++++++++++++++++++++ kernel/watchdog.c | 28 ++++------------------------ 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a78c35cff1ae..aacca824a6ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -7,6 +7,23 @@ #include #include +/* + * The run state of the lockup detectors is controlled by the content of the + * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - + * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. + * + * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' + * are variables that are only used as an 'interface' between the parameters + * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The + * 'watchdog_thresh' variable is handled differently because its value is not + * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' + * is equal zero. + */ +#define NMI_WATCHDOG_ENABLED_BIT 0 +#define SOFT_WATCHDOG_ENABLED_BIT 1 +#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) +#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * @@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long watchdog_enabled; extern unsigned long *watchdog_cpumask_bits; +#ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 +#endif +extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6d1020c03d41..d21049496e26 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -27,29 +27,12 @@ #include #include -/* - * The run state of the lockup detectors is controlled by the content of the - * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. - * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. 
- */ -#define NMI_WATCHDOG_ENABLED_BIT 0 -#define SOFT_WATCHDOG_ENABLED_BIT 1 -#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) -#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) - static DEFINE_MUTEX(watchdog_proc_mutex); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; #else -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif int __read_mostly nmi_watchdog_enabled; int __read_mostly soft_watchdog_enabled; @@ -59,9 +42,6 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#else -#define sysctl_softlockup_all_cpu_backtrace 0 -#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -289,7 +269,7 @@ void touch_softlockup_watchdog_sync(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static bool is_hardlockup(void) +bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -- cgit v1.2.3 From 73ce0511c43686095efd2f65ef564aab952e07bc Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 14 Dec 2016 15:06:24 -0800 Subject: kernel/watchdog.c: move hardlockup detector to separate file Separate hardlockup code from watchdog.c and move it to watchdog_hld.c. It is mostly straightforward. Remove everything inside the CONFIG_HARDLOCKUP_DETECTOR blocks. This code will go to the file watchdog_hld.c. Also update the Makefile accordingly. Link: http://lkml.kernel.org/r/1478034826-43888-3-git-send-email-babu.moger@oracle.com Signed-off-by: Babu Moger Acked-by: Don Zickus Cc: Ingo Molnar Cc: Jiri Kosina Cc: Andi Kleen Cc: Yaowei Bai Cc: Aaron Tomlin Cc: Ulrich Obergfell Cc: Tejun Heo Cc: Hidehiro Kawai Cc: Josh Hunt Cc: "David S.
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/watchdog.c | 241 +++----------------------------------------------- kernel/watchdog_hld.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 239 insertions(+), 230 deletions(-) create mode 100644 kernel/watchdog_hld.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index eb26e12c6c2a..314e7d62f5f0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d21049496e26..d4b0fa01cae3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,7 +24,6 @@ #include #include -#include #include static DEFINE_MUTEX(watchdog_proc_mutex); @@ -80,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); -static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -#endif static unsigned long soft_lockup_nmi_warn; -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -#ifdef CONFIG_HARDLOCKUP_DETECTOR -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; -static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -244,30 +202,12 @@ void touch_all_softlockup_watchdogs(void) wq_watchdog_touch(-1); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -void touch_nmi_watchdog(void) -{ - /* - * Using __raw here because some code paths have - * preemption enabled. If preemption is enabled - * then interrupts should be enabled too, in which - * case we shouldn't have to worry about the watchdog - * going off. 
- */ - raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -#endif - void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(watchdog_touch_ts, 0); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ bool is_hardlockup(void) { @@ -279,7 +219,6 @@ bool is_hardlockup(void) __this_cpu_write(hrtimer_interrupts_saved, hrint); return false; } -#endif static int is_softlockup(unsigned long touch_ts) { @@ -293,77 +232,22 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -#ifdef CONFIG_HARDLOCKUP_DETECTOR - -static struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -/* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - /* Ensure the watchdog never gets throttled */ - event->hw.interrupts = 0; - - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) - trigger_allbutself_cpu_backtrace(); - - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; -} -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_nmi_enable(unsigned int cpu); -static void watchdog_nmi_disable(unsigned int cpu); +/* + * These two functions are mostly architecture specific + * defining them as weak here. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} static int watchdog_enable_all_cpus(void); static void watchdog_disable_all_cpus(void); @@ -556,109 +440,6 @@ static void watchdog(unsigned int cpu) watchdog_nmi_disable(cpu); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long cpu0_err; - -static int watchdog_nmi_enable(unsigned int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* is it already setup and enabled? 
*/ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); - - if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } - - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - - /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Shutting down hard lockup detector on all cpus\n"); - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - -static void watchdog_nmi_disable(unsigned int cpu) -{ - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - if (event) { - perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); - } - if (cpu == 0) { - /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; - } -} - -#else -static int watchdog_nmi_enable(unsigned int cpu) { return 0; } -static void watchdog_nmi_disable(unsigned int cpu) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static struct smp_hotplug_thread watchdog_threads = { .store = &softlockup_watchdog, .thread_should_run = watchdog_should_run, diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c new file mode 100644 index 000000000000..84016c8aee6b --- /dev/null +++ b/kernel/watchdog_hld.c @@ -0,0 +1,227 @@ +/* + * Detect hard lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks + * to those contributors as well. 
+ */ + +#define pr_fmt(fmt) "NMI watchdog: " fmt + +#include +#include +#include +#include + +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +void touch_nmi_watchdog(void) +{ + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + raw_cpu_write(watchdog_nmi_touch, true); + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +static void watchdog_overflow_callback(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (__this_cpu_read(watchdog_nmi_touch) == true) { + __this_cpu_write(watchdog_nmi_touch, false); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__this_cpu_read(hard_watchdog_warn) == true) + return; + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + + __this_cpu_write(hard_watchdog_warn, true); + return; + } + + __this_cpu_write(hard_watchdog_warn, false); + return; +} + +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. 
+ */ +static unsigned long cpu0_err; + +int watchdog_nmi_enable(unsigned int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + + if (!IS_ERR(event)) { + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); + goto out_save; + } + + /* + * Disable the hard lockup detector if _any_ CPU fails to set up + * set up the hardware perf event. The watchdog() function checks + * the NMI_WATCHDOG_ENABLED bit periodically. + * + * The barriers are for syncing up watchdog_enabled across all the + * cpus, as clear_bit() does not use barriers. + */ + smp_mb__before_atomic(); + clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); + smp_mb__after_atomic(); + + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + pr_warn("disabled (cpu%i): hardware events not enabled\n", + cpu); + else + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); + + pr_info("Shutting down hard lockup detector on all cpus\n"); + + return PTR_ERR(event); + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +void watchdog_nmi_disable(unsigned int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ + cpu0_err = 0; + } +} -- cgit v1.2.3 From 5b56d49fc31dbb0487e14ead790fc81ca9fb2c99 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Dec 2016 15:06:52 -0800 Subject: mm: add locked parameter to get_user_pages_remote() Patch series "mm: unexport __get_user_pages_unlocked()". This patch series continues the cleanup of get_user_pages*() functions taking advantage of the fact we can now pass gup_flags as we please. It firstly adds an additional 'locked' parameter to get_user_pages_remote() to allow for its callers to utilise VM_FAULT_RETRY functionality. This is necessary as the invocation of __get_user_pages_unlocked() in process_vm_rw_single_vec() makes use of this and no other existing higher level function would allow it to do so. 
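[ For illustration only -- a minimal sketch of how a caller can opt in to VM_FAULT_RETRY via the new 'locked' parameter, assuming the post-patch signature; tsk, mm, addr, gup_flags and page stand in for hypothetical caller state and are not taken from the patch:

	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages_remote(tsk, mm, addr, 1, gup_flags,
				    &page, NULL, &locked);
	/*
	 * If the fault path had to retry (VM_FAULT_RETRY), it dropped
	 * mmap_sem and cleared *locked, so we must not unlock it again.
	 */
	if (locked)
		up_read(&mm->mmap_sem);

Passing NULL instead keeps the old non-retrying behaviour, which is what the mechanical call-site conversions below do. ]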
Secondly, existing callers of __get_user_pages_unlocked() are replaced with the appropriate higher-level replacement - get_user_pages_unlocked() if the current task and memory descriptor are referenced, or get_user_pages_remote() if other task/memory descriptors are referenced (having acquired mmap_sem). This patch (of 2): Add an int *locked parameter to get_user_pages_remote() to allow VM_FAULT_RETRY faulting behaviour similar to get_user_pages_[un]locked(). Taking into account the previous adjustments to get_user_pages*() functions allowing for the passing of gup_flags, we are now in a position where __get_user_pages_unlocked() need only be exported for its ability to allow VM_FAULT_RETRY behaviour; this adjustment allows us to subsequently unexport __get_user_pages_unlocked() as well as allowing for future flexibility in the use of get_user_pages_remote(). [sfr@canb.auug.org.au: merge fix for get_user_pages_remote API change] Link: http://lkml.kernel.org/r/20161122210511.024ec341@canb.auug.org.au Link: http://lkml.kernel.org/r/20161027095141.2569-2-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Jan Kara Cc: Hugh Dickins Cc: Dave Hansen Cc: Rik van Riel Cc: Mel Gorman Cc: Paolo Bonzini Cc: Radim Krcmar Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/infiniband/core/umem_odp.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 2 +- fs/exec.c | 2 +- include/linux/mm.h | 2 +- kernel/events/uprobes.c | 4 ++-- mm/gup.c | 12 ++++++++---- mm/memory.c | 2 +- security/tomoyo/domain.c | 2 +- 10 files changed, 18 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 7d066a91d778..6f5dfabb3096 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -759,7 +759,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - flags, pvec + pinned, NULL); + flags, pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 107ddf51065e..d068af2ec3a3 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, flags, - pvec + pinned, NULL); + pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1f0fe3217f23..6b079a31dced 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - flags, local_page_list, NULL); + flags, local_page_list, NULL, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9815e45c23c4..f3726ba12aa6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); ret = get_user_pages_remote(NULL, mm, vaddr,
1, flags, page, - NULL); + NULL, NULL); up_read(&mm->mmap_sem); } diff --git a/fs/exec.c b/fs/exec.c index 923c57d96899..eac60886b0ab 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, - &page, NULL); + &page, NULL, NULL); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index a92c8d73aeaf..cc154454675a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1274,7 +1274,7 @@ extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + struct vm_area_struct **vmas, int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..215871bda3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma); + &vma, NULL); if (ret <= 0) return ret; @@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * essentially a kernel access to the memory. */ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, - NULL); + NULL, NULL); if (result < 0) return result; diff --git a/mm/gup.c b/mm/gup.c index e50178c58b97..b64c907aa4f0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, + locked, true, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's. We also - * obviously don't pass FOLL_REMOTE in here. + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. 
*/ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/memory.c b/mm/memory.c index c264f7cd3e47..3a6a1239c42b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3919,7 +3919,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma); + gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 682b73af7766..838ffa78cfda 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). */ if (get_user_pages_remote(current, bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL) <= 0) + FOLL_FORCE, &page, NULL, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; -- cgit v1.2.3 From 5aa068ea4082b39eafc356c27c9ecd155b0895f6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 25 Oct 2016 11:27:31 -0700 Subject: printk: remove games with previous record flags The record logging code looks at the previous record flags in various ways, and they are all wrong. You can't use the previous record flags to determine anything about the next record, because they may simply not be related. In particular, the reason the previous record was a continuation record may well be exactly _because_ the new record was printed by a different process, which is why the previous record was flushed. So all those games are simply wrong, and make the code hard to understand (because the code fundamentally does not make sense). So remove it. Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 109 +++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a3ce35e0fa1e..50b2703d6d6a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -356,7 +356,6 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; -static enum log_flags syslog_prev; static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ @@ -370,7 +369,6 @@ static u32 log_next_idx; /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; -static enum log_flags console_prev; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; @@ -639,27 +637,15 @@ static void append_char(char **pp, char *e, char c) } static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq, - enum log_flags prev_flags) + struct printk_log *msg, u64 seq) { u64 ts_usec = msg->ts_nsec; - char cont = '-'; do_div(ts_usec, 1000); - /* - * If we couldn't merge continuation line fragments during the print, - * export the stored flags to allow an optional external merge of the - * records. Merging the records isn't always neccessarily correct, like - * when we hit a race during printing. In most cases though, it produces - * better readable output. 'c' in the record flags mark the first - * fragment of a line, '+' the following. - */ - if (msg->flags & LOG_CONT) - cont = (prev_flags & LOG_CONT) ?
'+' : 'c'; - return scnprintf(buf, size, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, seq, ts_usec, cont); + (msg->facility << 3) | msg->level, seq, ts_usec, + msg->flags & LOG_CONT ? 'c' : '-'); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -714,7 +700,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct devkmsg_user { u64 seq; u32 idx; - enum log_flags prev; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; @@ -824,12 +809,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, msg = log_from_idx(user->idx); len = msg_print_ext_header(user->buf, sizeof(user->buf), - msg, user->seq, user->prev); + msg, user->seq); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, log_dict(msg), msg->dict_len, log_text(msg), msg->text_len); - user->prev = msg->flags; user->idx = log_next(user->idx); user->seq++; raw_spin_unlock_irq(&logbuf_lock); @@ -1210,26 +1194,12 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) return len; } -static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, - bool syslog, char *buf, size_t size) +static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; - bool prefix = true; - bool newline = true; size_t len = 0; - if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) - prefix = false; - - if (msg->flags & LOG_CONT) { - if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) - prefix = false; - - if (!(msg->flags & LOG_NEWLINE)) - newline = false; - } - do { const char *next = memchr(text, '\n', text_size); size_t text_len; @@ -1247,22 +1217,17 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, text_len + 1 >= size - len) break; - if (prefix) - len += print_prefix(msg, syslog, buf + len); + len += print_prefix(msg, syslog, buf + len); memcpy(buf + len, text, text_len); len += text_len; - if (next || newline) - buf[len++] = '\n'; + buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - if (prefix) - len += print_prefix(msg, syslog, NULL); + len += print_prefix(msg, syslog, NULL); len += text_len; - if (next || newline) - len++; + len++; } - prefix = true; text = next; } while (text); @@ -1288,7 +1253,6 @@ static int syslog_print(char __user *buf, int size) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; - syslog_prev = 0; syslog_partial = 0; } if (syslog_seq == log_next_seq) { @@ -1298,13 +1262,11 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, - LOG_LINE_MAX + PREFIX_MAX); + n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; - syslog_prev = msg->flags; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -1347,7 +1309,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; - enum log_flags prev; /* * Find first record that fits, including all following records, @@ -1355,12 +1316,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear) */ seq = clear_seq; idx = clear_idx; - prev = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len += msg_print_text(msg, prev, 
true, NULL, 0); - prev = msg->flags; + len += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; } @@ -1368,12 +1327,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; - prev = 0; while (len > size && seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len -= msg_print_text(msg, prev, true, NULL, 0); - prev = msg->flags; + len -= msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; } @@ -1386,7 +1343,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct printk_log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, prev, true, text, + textlen = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; @@ -1394,7 +1351,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) } idx = log_next(idx); seq++; - prev = msg->flags; raw_spin_unlock_irq(&logbuf_lock); if (copy_to_user(buf + len, text, textlen)) @@ -1407,7 +1363,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* messages are gone, move to next one */ seq = log_first_seq; idx = log_first_idx; - prev = 0; } } } @@ -1508,7 +1463,6 @@ int do_syslog(int type, char __user *buf, int len, int source) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; - syslog_prev = 0; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1521,16 +1475,14 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { u64 seq = syslog_seq; u32 idx = syslog_idx; - enum log_flags prev = syslog_prev; error = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - error += msg_print_text(msg, prev, true, NULL, 0); + error += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } error -= syslog_partial; } @@ -1712,7 +1664,7 @@ static size_t cont_print_text(char *text, size_t size) size_t textlen = 0; size_t len; - if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { + if (cont.cons == 0) { textlen += print_time(cont.ts_nsec, text); size -= textlen; } @@ -1981,11 +1933,9 @@ static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; static u32 console_idx; -static enum log_flags syslog_prev; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; -static enum log_flags console_prev; static struct cont { size_t len; size_t cons; @@ -1997,15 +1947,15 @@ static char *log_dict(const struct printk_log *msg) { return NULL; } static struct printk_log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq, - enum log_flags prev_flags) { return 0; } + struct printk_log *msg, + u64 seq) { return 0; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } static void call_console_drivers(int level, const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } @@ -2382,7 +2332,6 @@ again: /* messages are gone, move to first 
one */ console_seq = log_first_seq; console_idx = log_first_idx; - console_prev = 0; } else { len = 0; } @@ -2407,16 +2356,14 @@ skip: * will properly dump everything later. */ msg->flags &= ~LOG_NOCONS; - console_prev = msg->flags; goto skip; } - len += msg_print_text(msg, console_prev, false, - text + len, sizeof(text) - len); + len += msg_print_text(msg, false, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), - msg, console_seq, console_prev); + msg, console_seq); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, log_dict(msg), msg->dict_len, @@ -2424,7 +2371,6 @@ skip: } console_idx = log_next(console_idx); console_seq++; - console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ @@ -2719,7 +2665,6 @@ void register_console(struct console *newcon) raw_spin_lock_irqsave(&logbuf_lock, flags); console_seq = syslog_seq; console_idx = syslog_idx; - console_prev = syslog_prev; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the @@ -3075,7 +3020,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, goto out; msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, 0, syslog, line, size); + l = msg_print_text(msg, syslog, line, size); dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; @@ -3144,7 +3089,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, u32 idx; u64 next_seq; u32 next_idx; - enum log_flags prev; size_t l = 0; bool ret = false; @@ -3167,27 +3111,23 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, prev, true, NULL, 0); + l += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - prev = 0; while (l > size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l -= msg_print_text(msg, prev, true, NULL, 0); + l -= msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } /* last message in next interation */ @@ -3198,10 +3138,9 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, prev, syslog, buf + l, size - l); + l += msg_print_text(msg, syslog, buf + l, size - l); idx = log_next(idx); seq++; - prev = msg->flags; } dumper->next_seq = next_seq; -- cgit v1.2.3 From 5c2992ee7fd8a29d04125dc0aa3522784c5fa5eb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 25 Oct 2016 11:27:31 -0700 Subject: printk: remove console flushing special cases for partial buffered lines It actively hurts proper merging, and makes for a lot of special cases. There was a good(ish) reason for doing it originally, but it's getting too painful to maintain. And most of the original reasons for it are long gone. So instead of having special code to flush partial lines to the console (as opposed to the record buffers), do _all_ the console writing from the record buffer, and be done with it. 
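[ For illustration only -- a rough sketch of the resulting model, simplified from the console_unlock() loop in the diff below (the text[] buffer and logbuf locking are elided); with the special cases gone, the console is fed exclusively from committed records:

	for (;;) {
		struct printk_log *msg;
		size_t len;
		int level;

		if (console_seq == log_next_seq)
			break;			/* nothing committed yet */

		msg = log_from_idx(console_idx);
		level = msg->level;
		len = msg_print_text(msg, false, text, sizeof(text));
		console_idx = log_next(console_idx);
		console_seq++;
		call_console_drivers(level, NULL, 0, text, len);
	}

Partial pr_cont() fragments now sit in the cont buffer until cont_flush() commits them via log_store(); nothing is pushed to the console behind the record buffer's back. ]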
If an oops happens (or some other synchronous event), we will flush the partial lines due to the oops printing activity, so this does not affect that. It does mean that if you have a completely hung machine, a partial preceding line may not have been printed out. That was some of the original reason for this complexity, in fact, back when we used to test for the historical i386 "halt" instruction problem by doing pr_info("Checking 'hlt' instruction... "); if (!boot_cpu_data.hlt_works_ok) { pr_cont("disabled\n"); return; } halt(); halt(); halt(); halt(); pr_cont("OK\n"); and that model no longer works (if the 'hlt' instruction kills the machine, the partial line won't have been flushed, so you won't even see it). Of course, that was also back in the days when people actually had textual console output rather than a graphical splash-screen at bootup. How times change.. Cc: Sergey Senozhatsky Cc: Joe Perches Cc: Steven Rostedt Tested-by: Petr Mladek Tested-by: Geert Uytterhoeven Tested-by: Mark Rutland Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 108 +++---------------------------------------------- 1 file changed, 5 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 50b2703d6d6a..b3c454b733da 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1583,46 +1583,25 @@ static inline void printk_delay(void) static struct cont { char buf[LOG_LINE_MAX]; size_t len; /* length == 0 means unused buffer */ - size_t cons; /* bytes written to console */ struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ - bool flushed:1; /* buffer sealed and committed */ } cont; static void cont_flush(void) { - if (cont.flushed) - return; if (cont.len == 0) return; - if (cont.cons) { - /* - * If a fragment of this line was directly flushed to the - * console; wait for the console to pick up the rest of the - * line. LOG_NOCONS suppresses a duplicated output. - */ - log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.flushed = true; - } else { - /* - * If no fragment of this line ever reached the console, - * just submit it to the store and free the buffer. - */ - log_store(cont.facility, cont.level, cont.flags, 0, - NULL, 0, cont.buf, cont.len); - cont.len = 0; - } + + log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, + NULL, 0, cont.buf, cont.len); + cont.len = 0; } static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { - if (cont.len && cont.flushed) - return false; - /* * If ext consoles are present, flush and skip in-kernel * continuation. See nr_ext_console_drivers definition.
Also, if @@ -1639,8 +1618,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * cont.owner = current; cont.ts_nsec = local_clock(); cont.flags = flags; - cont.cons = 0; - cont.flushed = false; } memcpy(cont.buf + cont.len, text, len); @@ -1659,34 +1636,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * return true; } -static size_t cont_print_text(char *text, size_t size) -{ - size_t textlen = 0; - size_t len; - - if (cont.cons == 0) { - textlen += print_time(cont.ts_nsec, text); - size -= textlen; - } - - len = cont.len - cont.cons; - if (len > 0) { - if (len+1 > size) - len = size-1; - memcpy(text + textlen, cont.buf + cont.cons, len); - textlen += len; - cont.cons = cont.len; - } - - if (cont.flushed) { - if (cont.flags & LOG_NEWLINE) - text[textlen++] = '\n'; - /* got everything, release buffer */ - cont.len = 0; - } - return textlen; -} - static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { /* @@ -1957,7 +1906,6 @@ static void call_console_drivers(int level, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } -static size_t cont_print_text(char *text, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } /* Still needs to be defined for users */ @@ -2221,42 +2169,6 @@ static inline int can_use_console(void) return cpu_online(raw_smp_processor_id()) || have_callable_console(); } -static void console_cont_flush(char *text, size_t size) -{ - unsigned long flags; - size_t len; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - - if (!cont.len) - goto out; - - if (suppress_message_printing(cont.level)) { - cont.cons = cont.len; - if (cont.flushed) - cont.len = 0; - goto out; - } - - /* - * We still queue earlier records, likely because the console was - * busy. The earlier ones need to be printed before this one, we - * did not flush any fragment so far, so just let it queue up. - */ - if (console_seq < log_next_seq && !cont.cons) - goto out; - - len = cont_print_text(text, size); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, NULL, 0, text, len); - start_critical_timings(); - local_irq_restore(flags); - return; -out: - raw_spin_unlock_irqrestore(&logbuf_lock, flags); -} - /** * console_unlock - unlock the console system * @@ -2310,9 +2222,6 @@ again: return; } - /* flush buffered message fragment immediately to console */ - console_cont_flush(text, sizeof(text)); - for (;;) { struct printk_log *msg; size_t ext_len = 0; @@ -2341,8 +2250,7 @@ skip: msg = log_from_idx(console_idx); level = msg->level; - if ((msg->flags & LOG_NOCONS) || - suppress_message_printing(level)) { + if (suppress_message_printing(level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and */ console_idx = log_next(console_idx); console_seq++; - /* - * We will get here again when we register a new - * CON_PRINTBUFFER console. Clear the flag so we - * will properly dump everything later. - */ - msg->flags &= ~LOG_NOCONS; goto skip; } -- cgit v1.2.3 From c1a9eeb938b5433947e5ea22f89baff3182e7075 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Dec 2016 12:10:37 +0100 Subject: tick/broadcast: Prevent NULL pointer dereference When a dysfunctional timer, e.g.
dummy timer, is installed, the tick core tries to set up the broadcast timer. If no broadcast device is installed, the kernel crashes with a NULL pointer dereference in tick_broadcast_setup_oneshot() because the function has no sanity check. Reported-by: Mason Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: Anna-Maria Gleixner Cc: Richard Cochran Cc: Sebastian Andrzej Siewior Cc: Daniel Lezcano Cc: Peter Zijlstra Cc: Sebastian Frias Cc: Thibaud Cornic Cc: Robin Murphy Link: http://lkml.kernel.org/r/1147ef90-7877-e4d2-bb2b-5c4fa8d3144b@free.fr --- kernel/time/tick-broadcast.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f6aae7977824..d2a20e83ebae 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -871,6 +871,9 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { int cpu = smp_processor_id(); + if (!bc) + return; + /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = clockevent_state_periodic(bc); -- cgit v1.2.3 From c0af52437254fda8b0cdbaae5a9b6d9327f1fcd5 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 14 Dec 2016 16:01:12 -0200 Subject: genirq/affinity: Fix node generation from cpumask Commit 34c3d9819fda ("genirq/affinity: Provide smarter irq spreading infrastructure") introduced a better IRQ spreading mechanism, taking account of the available NUMA nodes in the machine. The problem is that the algorithm for retrieving the nodemask iterates "linearly" based on the number of online nodes - some architectures present a non-linear node distribution among the nodemask, like PowerPC. If this is the case, the algorithm leads to a wrong node count and therefore to a bad/incomplete IRQ affinity distribution. For example, this problem was found in a machine with 128 CPUs and two nodes, namely nodes 0 and 8 (instead of 0 and 1, if it was linearly distributed). This led to a wrong affinity distribution which then led to a bad mq allocation for the nvme driver. Finally, we take the opportunity to fix a comment regarding the affinity distribution when we have _more_ nodes than vectors. Fixes: 34c3d9819fda ("genirq/affinity: Provide smarter irq spreading infrastructure") Reported-by: Gabriel Krisman Bertazi Signed-off-by: Guilherme G.
Piccoli Reviewed-by: Christoph Hellwig Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Gavin Shan Cc: linux-pci@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: hch@lst.de Link: http://lkml.kernel.org/r/1481738472-2671-1-git-send-email-gpiccoli@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 9be9bda7c1f9..4544b115f5eb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -37,10 +37,10 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) { - int n, nodes; + int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ - for (n = 0, nodes = 0; n < num_online_nodes(); n++) { + for_each_online_node(n) { if (cpumask_intersects(mask, cpumask_of_node(n))) { node_set(n, *nodemsk); nodes++; @@ -82,7 +82,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); /* - * If the number of nodes in the mask is less than or equal the + * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. */ if (affv <= nodes) { -- cgit v1.2.3 From 512f09801b356c54baef62543e51169f03b2e642 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 15 Dec 2016 10:00:57 -0500 Subject: cpu/hotplug: Clarify description of __cpuhp_setup_state() return value When invoked with the CPUHP_AP_ONLINE_DYN state, __cpuhp_setup_state() is expected to return a positive value, which is the hotplug state that the routine assigns. Signed-off-by: Boris Ostrovsky Cc: linux-pm@vger.kernel.org Cc: viresh.kumar@linaro.org Cc: bigeasy@linutronix.de Cc: rjw@rjwysocki.net Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1481814058-4799-2-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 217fd2e7f435..5339aca811d2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1586,7 +1586,11 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * @startup: startup callback function * @teardown: teardown callback function * - * Returns 0 if successful, otherwise a proper error code + * Returns: + * On success: + * Positive state number if @state is CPUHP_AP_ONLINE_DYN + * 0 for all other states + * On failure: proper (negative) error code */ int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, -- cgit v1.2.3 From 8fa9a697ab083af0f4a2807869752685ed8a7a6a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 15 Dec 2016 13:53:58 +0100 Subject: printk: Remove no longer used second struct cont MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_PRINTK=n: kernel/printk/printk.c:1893: warning: ‘cont’ defined but not used Note that there are actually two different struct cont definitions and objects: the first one is used if CONFIG_PRINTK=y, the second one became unused by removing console_cont_flush().
Fixes: 5c2992ee7fd8 ("printk: remove console flushing special cases for partial buffered lines") Signed-off-by: Geert Uytterhoeven Acked-by: Petr Mladek [ I do the occasional "allnoconfig" builds, but apparently not often enough - Linus ] Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b3c454b733da..bc2e220ed2b0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1885,12 +1885,6 @@ static u32 console_idx; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; -static struct cont { - size_t len; - size_t cons; - u8 level; - bool flushed:1; -} cont; static char *log_text(const struct printk_log *msg) { return NULL; } static char *log_dict(const struct printk_log *msg) { return NULL; } static struct printk_log *log_from_idx(u32 idx) { return NULL; } -- cgit v1.2.3 From a08dd0da5307ba01295c8383923e51e7997c3576 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Dec 2016 01:30:06 +0100 Subject: bpf: fix regression on verifier pruning wrt map lookups Commit 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") introduced a regression where existing programs stopped loading due to reaching the verifier's maximum complexity limit, whereas prior to this commit they were loading just fine; the affected program has roughly 2k instructions. What was found is that state pruning couldn't be performed effectively anymore due to mismatches of the verifier's register state, in particular in the id tracking. It doesn't mean that 57a09bf0a416 is incorrect per se, but rather that verifier needs to perform a lot more work for the same program with regards to involved map lookups. Since commit 57a09bf0a416 is only about tracking registers with type PTR_TO_MAP_VALUE_OR_NULL, the id is only needed to follow registers until they are promoted through pattern matching with a NULL check to either PTR_TO_MAP_VALUE or UNKNOWN_VALUE type. After that point, the id becomes irrelevant for the transitioned types. For UNKNOWN_VALUE, id is already reset to 0 via mark_reg_unknown_value(), but not so for PTR_TO_MAP_VALUE where id is becoming stale. It's even transferred further into other types that don't make use of it. Among others, one example is where UNKNOWN_VALUE is set on function call return with RET_INTEGER return type. states_equal() will then fall through the memcmp() on register state; note that the second memcmp() uses offsetofend(), so the id is part of that since d2a4dd37f6b4 ("bpf: fix state equivalence"). But the bisect pointed already to 57a09bf0a416, where we really reach beyond complexity limit. What I found was that states_equal() often failed in this case due to id mismatches in spilled regs with registers in type PTR_TO_MAP_VALUE. Unlike non-spilled regs, spilled regs just perform a memcmp() on their reg state and don't have any other optimizations in place, therefore also id was relevant in this case for making a pruning decision. We can safely reset id to 0 as well when converting to PTR_TO_MAP_VALUE. For the affected program, it resulted in a ~17 fold reduction of complexity and let the program load fine again. Selftest suite also runs fine. The only other place where env->id_gen is used currently is through direct packet access, but for these cases id is long living, thus a different scenario. 
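[ For illustration only -- a hedged C-level sketch of the pattern the id tracking serves; 'map' and 'key' are hypothetical, and the register comments reflect the verifier view described above:

	value = bpf_map_lookup_elem(map, &key);	/* R0: PTR_TO_MAP_VALUE_OR_NULL, id=N */
	copy = value;				/* copy shares id=N with R0 */
	if (!value)				/* NULL check: every register carrying */
		return 0;			/* id=N is promoted on each branch */
	*value = 42;				/* now PTR_TO_MAP_VALUE; id=N is stale,
						   so resetting it to 0 lets otherwise
						   identical states prune */
]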
Also, the current logic in mark_map_regs() is not fully correct when marking the NULL branch with UNKNOWN_VALUE. We need to cache the destination reg's id in any case. Otherwise, once we marked that reg as UNKNOWN_VALUE, its id is reset and any subsequent registers that hold the original id and are of type PTR_TO_MAP_VALUE_OR_NULL won't be marked UNKNOWN_VALUE anymore, since mark_map_reg() reuses the uncached regs[regno].id that was just overridden. Note, we don't need to cache it outside of mark_map_regs(), since it's called once on this_branch and the other time on other_branch, which are two independent verifier states. A test case for this is added here, too. Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") Signed-off-by: Daniel Borkmann Acked-by: Thomas Graf Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 11 ++++++++--- tools/testing/selftests/bpf/test_verifier.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d28f9a3380a9..81e267bc4640 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1970,6 +1970,11 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { reg->type = type; + /* We don't need id from this point onwards anymore, thus we + * should better reset it, so that state pruning has chances + * to take effect. + */ + reg->id = 0; if (type == UNKNOWN_VALUE) mark_reg_unknown_value(regs, regno); } @@ -1982,16 +1987,16 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs; + u32 id = regs[regno].id; int i; for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, regs[regno].id, type); + mark_map_reg(regs, i, id, type); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, - regs[regno].id, type); + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type); } } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 0103bf2e0c0d..072dc63dc957 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2660,6 +2660,34 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, + { + "multiple registers share map_lookup_elem bad reg type", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 0), + BPF_MOV64_IMM(BPF_REG_1, 3), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .result = REJECT, + .errstr = "R3 invalid mem access 'inv'", + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, { "invalid map access from else condition", .insns = { -- cgit v1.2.3 From
aafe6ae9cee32df85eb5e8bb6dd1d918e6807b09 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 18 Dec 2016 01:52:57 +0100 Subject: bpf: dynamically allocate digest scratch buffer Geert rightfully complained that 7bd509e311f4 ("bpf: add prog_digest and expose it via fdinfo/netlink") added a too large allocation of the variable 'raw' from the bss section, and that it should instead be done dynamically: # ./scripts/bloat-o-meter kernel/bpf/core.o.1 kernel/bpf/core.o.2 add/remove: 3/0 grow/shrink: 0/0 up/down: 33291/0 (33291) function old new delta raw - 32832 +32832 [...] Since this is only relevant during the program creation path, which can be considered a slow path anyway, let's allocate it dynamically and not be implicitly dependent on the verifier mutex. Move bpf_prog_calc_digest() to the beginning of replace_map_fd_with_map_ptr(); error handling also stays straightforward. Reported-by: Geert Uytterhoeven Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 +- include/linux/filter.h | 14 +++++++++++--- kernel/bpf/core.c | 27 ++++++++++++++++----------- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 6 ++++-- 5 files changed, 33 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8796ff03f472..201eb483c46f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,7 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); -void bpf_prog_calc_digest(struct bpf_prog *fp); +int bpf_prog_calc_digest(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index af8a1804cac6..702314253797 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -57,9 +57,6 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -/* Maximum BPF program size in bytes. */ -#define MAX_BPF_SIZE (BPF_MAXINSNS * sizeof(struct bpf_insn)) - /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ @@ -517,6 +514,17 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, return BPF_PROG_RUN(prog, xdp); } +static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) +{ + return prog->len * sizeof(struct bpf_insn); +} + +static inline u32 bpf_prog_digest_scratch_size(const struct bpf_prog *prog) +{ + return round_up(bpf_prog_insn_size(prog) + + sizeof(__be64) + 1, SHA_MESSAGE_BYTES); +} + static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 83e0d153b0b4..75c08b8068d6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,28 +136,29 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } -#define SHA_BPF_RAW_SIZE \ - round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) - -/* Called under verifier mutex.
*/ -void bpf_prog_calc_digest(struct bpf_prog *fp) +int bpf_prog_calc_digest(struct bpf_prog *fp) { const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); - static u32 ws[SHA_WORKSPACE_WORDS]; - static u8 raw[SHA_BPF_RAW_SIZE]; - struct bpf_insn *dst = (void *)raw; + u32 raw_size = bpf_prog_digest_scratch_size(fp); + u32 ws[SHA_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; + struct bpf_insn *dst; bool was_ld_map; - u8 *todo = raw; + u8 *raw, *todo; __be32 *result; __be64 *bits; + raw = vmalloc(raw_size); + if (!raw) + return -ENOMEM; + sha_init(fp->digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ + dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -177,12 +178,13 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) } } - psize = fp->len * sizeof(struct bpf_insn); - memset(&raw[psize], 0, sizeof(raw) - psize); + psize = bpf_prog_insn_size(fp); + memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; bsize = round_up(psize, SHA_MESSAGE_BYTES); blocks = bsize / SHA_MESSAGE_BYTES; + todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); } else { @@ -199,6 +201,9 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) result = (__force __be32 *)fp->digest; for (i = 0; i < SHA_DIGEST_WORDS; i++) result[i] = cpu_to_be32(fp->digest[i]); + + vfree(raw); + return 0; } static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4819ec9d95f6..35d674c1f12e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -811,7 +811,7 @@ static int bpf_prog_load(union bpf_attr *attr) err = -EFAULT; if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), - prog->len * sizeof(struct bpf_insn)) != 0) + bpf_prog_insn_size(prog)) != 0) goto free_prog; prog->orig_prog = NULL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 81e267bc4640..64b7b1abe087 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2931,6 +2931,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i, j, err; + err = bpf_prog_calc_digest(env->prog); + if (err) + return err; + for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { @@ -3178,8 +3182,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } - bpf_prog_calc_digest(env->prog); - ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From 5ccb071e97fbd9ffe623a0d3977cc6d013bee93c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 18 Dec 2016 01:52:58 +0100 Subject: bpf: fix overflow in prog accounting Commit aaac3ba95e4c ("bpf: charge user for creation of BPF maps and programs") made a wrong assumption by charging against prog->pages. Unlike map->pages, prog->pages are still subject to change when we need to expand the program through bpf_prog_realloc(). This can happen, for example, during the verification stage, when we need to expand and rewrite parts of the program. Should the required space cross a page boundary, then prog->pages is no longer the same as the original value that we passed to bpf_prog_charge_memlock(). Thus, we'll hit a wrap-around during bpf_prog_uncharge_memlock() when prog is freed eventually.
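To make the arithmetic concrete, here is a hedged userspace sketch (not part of the patch; the variable names are made up) of how the uncharge underflows once prog->pages has grown after the initial charge:

	#include <stdio.h>

	int main(void)
	{
		unsigned long locked_vm = 0;		/* stand-in for user->locked_vm */
		unsigned long pages_at_charge = 1;	/* prog->pages at load time */
		unsigned long pages_at_free = 2;	/* prog->pages after bpf_prog_realloc() */

		locked_vm += pages_at_charge;		/* bpf_prog_charge_memlock() */
		locked_vm -= pages_at_free;		/* bpf_prog_uncharge_memlock() */
		printf("locked_vm = %lu\n", locked_vm);	/* wraps around to ULONG_MAX */
		return 0;
	}

Any later charge then compares against this huge wrapped value and fails with -EPERM.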
I noticed this when, despite having unlimited memlock, programs suddenly refused to load with an EPERM error due to insufficient memlock. There are two ways to fix this issue. One would be to add a cached variable to struct bpf_prog that takes a snapshot of prog->pages at the time of charging. The other approach is to also account for resizes. I chose to go with the latter for a couple of reasons: i) we want accounting to be more accurate rather than further fooling the limits, and ii) adding yet another page counter on struct bpf_prog would also be a waste just for this purpose. We also want to charge as early as possible to avoid going into the verifier just to find out later on that we crossed limits. The only place that needs to be fixed is bpf_prog_realloc(), since that is the only place where we expand the program; we try to account for the needed delta there, and should we fail, the call sites check the outcome anyway. On cBPF to eBPF migrations, we don't grab a reference to the user as they are charged differently. With that in place, my test case worked fine. Fixes: aaac3ba95e4c ("bpf: charge user for creation of BPF maps and programs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 11 +++++++++++ kernel/bpf/core.c | 16 +++++++++++++--- kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++-------- 3 files changed, 52 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 201eb483c46f..f74ae68086dc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -238,6 +238,8 @@ struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); +int __bpf_prog_charge(struct user_struct *user, u32 pages); +void __bpf_prog_uncharge(struct user_struct *user, u32 pages); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); @@ -318,6 +320,15 @@ static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog) { return ERR_PTR(-EOPNOTSUPP); } + +static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) +{ + return 0; +} + +static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +{ +} #endif /* CONFIG_BPF_SYSCALL */ /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 75c08b8068d6..1eb4f1303756 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -105,19 +105,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; + u32 pages, delta; + int ret; BUG_ON(fp_old == NULL); size = round_up(size, PAGE_SIZE); - if (size <= fp_old->pages * PAGE_SIZE) + pages = size / PAGE_SIZE; + if (pages <= fp_old->pages) return fp_old; + delta = pages - fp_old->pages; + ret = __bpf_prog_charge(fp_old->aux->user, delta); + if (ret) + return NULL; + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); - if (fp != NULL) { + if (fp == NULL) { + __bpf_prog_uncharge(fp_old->aux->user, delta); + } else { kmemcheck_annotate_bitfield(fp, meta); memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); - fp->pages = size / PAGE_SIZE; + fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index
35d674c1f12e..e89acea22ecf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -615,19 +615,39 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +int __bpf_prog_charge(struct user_struct *user, u32 pages) +{ + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + unsigned long user_bufs; + + if (user) { + user_bufs = atomic_long_add_return(pages, &user->locked_vm); + if (user_bufs > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + } + + return 0; +} + +void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +{ + if (user) + atomic_long_sub(pages, &user->locked_vm); +} + static int bpf_prog_charge_memlock(struct bpf_prog *prog) { struct user_struct *user = get_current_user(); - unsigned long memlock_limit; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + int ret; - atomic_long_add(prog->pages, &user->locked_vm); - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(prog->pages, &user->locked_vm); + ret = __bpf_prog_charge(user, prog->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } + prog->aux->user = user; return 0; } @@ -636,7 +656,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) { struct user_struct *user = prog->aux->user; - atomic_long_sub(prog->pages, &user->locked_vm); + __bpf_prog_uncharge(user, prog->pages); free_uid(user); } -- cgit v1.2.3 From 6760bf2ddde8ad64f8205a651223a93de3a35494 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 18 Dec 2016 01:52:59 +0100 Subject: bpf: fix mark_reg_unknown_value for spilled regs on map value marking Martin reported a verifier issue that hit the BUG_ON() for his test case in the mark_reg_unknown_value() function: [ 202.861380] kernel BUG at kernel/bpf/verifier.c:467! [...] [ 203.291109] Call Trace: [ 203.296501] [] mark_map_reg+0x45/0x50 [ 203.308225] [] mark_map_regs+0x78/0x90 [ 203.320140] [] do_check+0x226d/0x2c90 [ 203.331865] [] bpf_check+0x48b/0x780 [ 203.343403] [] bpf_prog_load+0x27e/0x440 [ 203.355705] [] ? handle_mm_fault+0x11af/0x1230 [ 203.369158] [] ? security_capable+0x48/0x60 [ 203.382035] [] SyS_bpf+0x124/0x960 [ 203.393185] [] ? __do_page_fault+0x276/0x490 [ 203.406258] [] entry_SYSCALL_64_fastpath+0x13/0x94 This issue got uncovered after the fix in a08dd0da5307 ("bpf: fix regression on verifier pruning wrt map lookups"). The reason it wasn't noticed before is that, as mentioned in a08dd0da5307, mark_map_regs() was doing the id matching incorrectly based on the uncached regs[regno].id. So, in the first loop, we walked all regs, and as soon as we found regno == i, this reg's id was cleared by the call to mark_reg_unknown_value(), so that every subsequent register was probed against an id of 0 (which, in combination with the PTR_TO_MAP_VALUE_OR_NULL type, is an invalid condition that no other register state can hold), and therefore wasn't type transitioned as in the spilled register case for the second loop. Now since that got fixed, it turned out that 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") used mark_reg_unknown_value() incorrectly for the spilled regs, thus hitting the BUG_ON() in some cases due to regno >= MAX_BPF_REG. Although spilled regs have the same representation as the non-spilled regs in the verifier state, that is, struct bpf_reg_state, they are semantically different.
In other words, there can be up to 64 (MAX_BPF_STACK / BPF_REG_SIZE) spilled regs on the stack; for example, register R could have been spilled by the program to stack locations X, Y, and Z, and in mark_map_regs() we need to scan these stack slots of type STACK_SPILL for potential registers that we have to transition from PTR_TO_MAP_VALUE_OR_NULL. Therefore, depending on the location, the spilled_regs regno can be a lot higher than MAX_BPF_REG's value, since we operate on the stack instead. The reset in mark_reg_unknown_value() itself is just fine; only the BUG_ON() was inappropriate here. Fix it by making a __mark_reg_unknown_value() version that can be called from mark_map_reg() generically; we know for the non-spilled case that the regno is always < MAX_BPF_REG anyway. Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") Reported-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 64b7b1abe087..83ed2f8f6f22 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -462,14 +462,19 @@ static void init_reg_state(struct bpf_reg_state *regs) regs[BPF_REG_1].type = PTR_TO_CTX; } -static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) +static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { - BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; regs[regno].id = 0; regs[regno].imm = 0; } +static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + __mark_reg_unknown_value(regs, regno); +} + static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; @@ -1976,7 +1981,7 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, */ reg->id = 0; if (type == UNKNOWN_VALUE) - mark_reg_unknown_value(regs, regno); + __mark_reg_unknown_value(regs, regno); } } -- cgit v1.2.3 From 297e765e390a2ac996000b5f7228cbd84d995174 Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Tue, 13 Dec 2016 11:40:57 +0100 Subject: uprobes: Fix uprobes on MIPS, allow for a cache flush after ixol breakpoint creation Commit 72e6ae285a1d ('ARM: 8043/1: uprobes need icache flush after xol write') introduced an arch-specific method to ensure all caches are flushed appropriately after an instruction is written to an XOL page. However, when the XOL area is created and the out-of-line breakpoint instruction is copied, caches are not flushed at all and stale data may be found in the icache. Replace the simple copy_to_page() with arch_uprobe_copy_ixol() to allow the arch to ensure all caches are updated accordingly. This change fixes uprobes on MIPS InterAptiv (tested on Creator Ci40).
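For context, the generic weak fallback in kernel/events/uprobes.c that architectures such as ARM and MIPS override looks roughly like this (quoted from memory of kernels of that era; treat it as a sketch rather than the exact source):

	void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
					  void *src, unsigned long len)
	{
		/* Initialize the slot */
		copy_to_page(page, vaddr, src, len);

		/*
		 * We probably need flush_icache_user_range() but it needs vma.
		 * This should work on most of architectures by default. If
		 * architecture has to touch physical memory, it can define
		 * its own version of the function.
		 */
		flush_dcache_page(page);
	}

With this patch, XOL-area creation goes through the same hook, so an architecture override (e.g. on MIPS) can flush its icache as needed.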
Signed-off-by: Marcin Nowakowski Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Victor Kamensky Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/1481625657-22850-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..b5916b4969a5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1194,7 +1194,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); + arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; -- cgit v1.2.3 From 7b8589cc29e7c35dcfd2d5138979f17b48f90110 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Mon, 19 Dec 2016 16:22:48 -0800 Subject: ima: on soft reboot, save the measurement list The TPM PCRs are only reset on a hard reboot. In order to validate a TPM's quote after a soft reboot (e.g. kexec -e), the IMA measurement list of the running kernel must be saved and restored on boot. This patch uses the kexec buffer passing mechanism to pass the serialized IMA binary_runtime_measurements to the next kernel. Link: http://lkml.kernel.org/r/1480554346-29071-7-git-send-email-zohar@linux.vnet.ibm.com Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar Acked-by: "Eric W. Biederman" Acked-by: Dmitry Kasatkin Cc: Andreas Steffen Cc: Josh Sklar Cc: Dave Young Cc: Vivek Goyal Cc: Baoquan He Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Stewart Smith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ima.h | 12 ++++ kernel/kexec_file.c | 4 ++ security/integrity/ima/ima.h | 1 + security/integrity/ima/ima_fs.c | 2 +- security/integrity/ima/ima_kexec.c | 117 +++++++++++++++++++++++++++++++++++++ 5 files changed, 135 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ima.h b/include/linux/ima.h index 0eb7c2e7f0d6..7f6952f8d6aa 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -11,6 +11,7 @@ #define _LINUX_IMA_H #include +#include struct linux_binprm; #ifdef CONFIG_IMA @@ -23,6 +24,10 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); extern void ima_post_path_mknod(struct dentry *dentry); +#ifdef CONFIG_IMA_KEXEC +extern void ima_add_kexec_buffer(struct kimage *image); +#endif + #else static inline int ima_bprm_check(struct linux_binprm *bprm) { @@ -62,6 +67,13 @@ static inline void ima_post_path_mknod(struct dentry *dentry) #endif /* CONFIG_IMA */ +#ifndef CONFIG_IMA_KEXEC +struct kimage; + +static inline void ima_add_kexec_buffer(struct kimage *image) +{} +#endif + #ifdef CONFIG_IMA_APPRAISE extern void ima_inode_post_setattr(struct dentry *dentry); extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 0c2df7f73792..b56a558e406d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -132,6 +133,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int
initrd_fd, return ret; image->kernel_buf_len = size; + /* IMA needs to pass the measurement list to the next kernel. */ + ima_add_kexec_buffer(image); + /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index ea1dcc452911..139dec67dcbf 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -143,6 +143,7 @@ void ima_print_digest(struct seq_file *m, u8 *digest, u32 size); struct ima_template_desc *ima_template_desc_current(void); int ima_restore_measurement_entry(struct ima_template_entry *entry); int ima_restore_measurement_list(loff_t bufsize, void *buf); +int ima_measurements_show(struct seq_file *m, void *v); unsigned long ima_get_binary_runtime_size(void); int ima_init_template(void); diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 3df46906492d..10bea0125fa1 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -116,7 +116,7 @@ void ima_putc(struct seq_file *m, void *data, int datalen) * [eventdata length] * eventdata[n]=template specific data */ -static int ima_measurements_show(struct seq_file *m, void *v) +int ima_measurements_show(struct seq_file *m, void *v) { /* the list never shrinks, so we don't need a lock here */ struct ima_queue_entry *qe = v; diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index 36afd0fe9747..2c4824ac1ce1 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -10,8 +10,125 @@ * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ +#include +#include +#include #include "ima.h" +#ifdef CONFIG_IMA_KEXEC +static int ima_dump_measurement_list(unsigned long *buffer_size, void **buffer, + unsigned long segment_size) +{ + struct ima_queue_entry *qe; + struct seq_file file; + struct ima_kexec_hdr khdr = { + .version = 1, .buffer_size = 0, .count = 0}; + int ret = 0; + + /* segment size can't change between kexec load and execute */ + file.buf = vmalloc(segment_size); + if (!file.buf) { + ret = -ENOMEM; + goto out; + } + + file.size = segment_size; + file.read_pos = 0; + file.count = sizeof(khdr); /* reserved space */ + + list_for_each_entry_rcu(qe, &ima_measurements, later) { + if (file.count < file.size) { + khdr.count++; + ima_measurements_show(&file, qe); + } else { + ret = -EINVAL; + break; + } + } + + if (ret < 0) + goto out; + + /* + * fill in reserved space with some buffer details + * (eg. version, buffer size, number of measurements) + */ + khdr.buffer_size = file.count; + memcpy(file.buf, &khdr, sizeof(khdr)); + print_hex_dump(KERN_DEBUG, "ima dump: ", DUMP_PREFIX_NONE, + 16, 1, file.buf, + file.count < 100 ? file.count : 100, true); + + *buffer_size = file.count; + *buffer = file.buf; +out: + if (ret == -EINVAL) + vfree(file.buf); + return ret; +} + +/* + * Called during kexec_file_load so that IMA can add a segment to the kexec + * image for the measurement list for the next kernel. + * + * This function assumes that kexec_mutex is held. 
+ */ +void ima_add_kexec_buffer(struct kimage *image) +{ + struct kexec_buf kbuf = { .image = image, .buf_align = PAGE_SIZE, + .buf_min = 0, .buf_max = ULONG_MAX, + .top_down = true }; + unsigned long binary_runtime_size; + + /* use more understandable variable names than defined in kbuf */ + void *kexec_buffer = NULL; + size_t kexec_buffer_size; + size_t kexec_segment_size; + int ret; + + /* + * Reserve an extra half page of memory for additional measurements + * added during the kexec load. + */ + binary_runtime_size = ima_get_binary_runtime_size(); + if (binary_runtime_size >= ULONG_MAX - PAGE_SIZE) + kexec_segment_size = ULONG_MAX; + else + kexec_segment_size = ALIGN(ima_get_binary_runtime_size() + + PAGE_SIZE / 2, PAGE_SIZE); + if ((kexec_segment_size == ULONG_MAX) || + ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages / 2)) { + pr_err("Binary measurement list too large.\n"); + return; + } + + ima_dump_measurement_list(&kexec_buffer_size, &kexec_buffer, + kexec_segment_size); + if (!kexec_buffer) { + pr_err("Not enough memory for the kexec measurement buffer.\n"); + return; + } + + kbuf.buffer = kexec_buffer; + kbuf.bufsz = kexec_buffer_size; + kbuf.memsz = kexec_segment_size; + ret = kexec_add_buffer(&kbuf); + if (ret) { + pr_err("Error passing over kexec measurement buffer.\n"); + return; + } + + ret = arch_ima_add_kexec_buffer(image, kbuf.mem, kexec_segment_size); + if (ret) { + pr_err("Error passing over kexec measurement buffer.\n"); + return; + } + + pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n", + kbuf.mem); +} +#endif /* IMA_KEXEC */ + /* * Restore the measurement list from the previous kernel. */ -- cgit v1.2.3 From 4983f0ab7ffaad1e534b21975367429736475205 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Mon, 19 Dec 2016 16:23:09 -0800 Subject: kcov: make kcov work properly with KASLR enabled Subtract KASLR offset from the kernel addresses reported by kcov. Tested on x86_64 and AArch64 (Hikey LeMaker). Link: http://lkml.kernel.org/r/1481417456-28826-3-git-send-email-alex.popov@linux.com Signed-off-by: Alexander Popov Cc: Catalin Marinas Cc: Will Deacon Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Rob Herring Cc: Kefeng Wang Cc: AKASHI Takahiro Cc: Jon Masters Cc: David Daney Cc: Ganapatrao Kulkarni Cc: Dmitry Vyukov Cc: Nicolai Stange Cc: James Morse Cc: Andrey Ryabinin Cc: Andrey Konovalov Cc: Alexander Popov Cc: syzkaller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index cc2fa35ca480..85e5546cd791 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -19,6 +19,7 @@ #include #include #include +#include /* * kcov descriptor (one per opened debugfs file). @@ -73,6 +74,11 @@ void notrace __sanitizer_cov_trace_pc(void) if (mode == KCOV_MODE_TRACE) { unsigned long *area; unsigned long pos; + unsigned long ip = _RET_IP_; + +#ifdef CONFIG_RANDOMIZE_BASE + ip -= kaslr_offset(); +#endif /* * There is some code that runs in interrupts but for which @@ -86,7 +92,7 @@ void notrace __sanitizer_cov_trace_pc(void) /* The first word is number of subsequent PCs. */ pos = READ_ONCE(area[0]) + 1; if (likely(pos < t->kcov_size)) { - area[pos] = _RET_IP_; + area[pos] = ip; WRITE_ONCE(area[0], pos); } } -- cgit v1.2.3 From c00d2c7e89880036f288a764599b2b8b87c0a364 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 20 Dec 2016 07:04:57 -0500 Subject: move aio compat to fs/aio.c ... 
and fix the minor buglet in compat io_setup() - the native one kills the ioctx as cleanup when put_user() fails. Get rid of the bogus compat_... in the !CONFIG_AIO case, while we are at it - they should simply fail with ENOSYS, same as the native counterparts. Signed-off-by: Al Viro --- fs/aio.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/compat.c | 75 ----------------------------------------- include/linux/aio.h | 5 --- kernel/sys_ni.c | 3 ++ 4 files changed, 98 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/fs/aio.c b/fs/aio.c index 8edf253484af..8c79e1a53af9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1367,6 +1367,39 @@ out: return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctx32p); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || nr_events == 0)) { + pr_debug("EINVAL: ctx %lu nr_events %u\n", + ctx, nr_events); + goto out; + } + + ioctx = ioctx_alloc(nr_events); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + /* truncating is ok because it's a user address */ + ret = put_user((u32)ioctx->user_id, ctx32p); + if (ret) + kill_ioctx(current->mm, ioctx, NULL); + percpu_ref_put(&ioctx->users); + } + +out: + return ret; +} +#endif + /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not @@ -1591,8 +1624,8 @@ out_put_req: return ret; } -long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat) +static long do_io_submit(aio_context_t ctx_id, long nr, + struct iocb __user *__user *iocbpp, bool compat) { struct kioctx *ctx; long ret = 0; @@ -1662,6 +1695,44 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, return do_io_submit(ctx_id, nr, iocbpp, 0); } +#ifdef CONFIG_COMPAT +static inline long +copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +{ + compat_uptr_t uptr; + int i; + + for (i = 0; i < nr; ++i) { + if (get_user(uptr, ptr32 + i)) + return -EFAULT; + if (put_user(compat_ptr(uptr), ptr64 + i)) + return -EFAULT; + } + return 0; +} + +#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, u32 __user *, iocb) +{ + struct iocb __user * __user *iocb64; + long ret; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (nr > MAX_AIO_SUBMITS) + nr = MAX_AIO_SUBMITS; + + iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); + ret = copy_iocb(nr, iocb, iocb64); + if (!ret) + ret = do_io_submit(ctx_id, nr, iocb64, 1); + return ret; +} +#endif + /* lookup_kiocb * Finds a given iocb for cancellation.
*/ @@ -1761,3 +1832,25 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, } return ret; } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout) +{ + struct timespec t; + struct timespec __user *ut = NULL; + + if (timeout) { + if (compat_get_timespec(&t, timeout)) + return -EFAULT; + + ut = compat_alloc_user_space(sizeof(*ut)); + if (copy_to_user(ut, &t, sizeof(t))) + return -EFAULT; + } + return sys_io_getevents(ctx_id, min_nr, nr, events, ut); +} +#endif diff --git a/fs/compat.c b/fs/compat.c index 543b48c29ac3..3f4908c28698 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -487,45 +487,6 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, return compat_sys_fcntl64(fd, cmd, arg); } -COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p) -{ - long ret; - aio_context_t ctx64; - - mm_segment_t oldfs = get_fs(); - if (unlikely(get_user(ctx64, ctx32p))) - return -EFAULT; - - set_fs(KERNEL_DS); - /* The __user pointer cast is valid because of the set_fs() */ - ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64); - set_fs(oldfs); - /* truncating is ok because it's a user address */ - if (!ret) - ret = put_user((u32) ctx64, ctx32p); - return ret; -} - -COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, - compat_long_t, min_nr, - compat_long_t, nr, - struct io_event __user *, events, - struct compat_timespec __user *, timeout) -{ - struct timespec t; - struct timespec __user *ut = NULL; - - if (timeout) { - if (compat_get_timespec(&t, timeout)) - return -EFAULT; - - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t)) ) - return -EFAULT; - } - return sys_io_getevents(ctx_id, min_nr, nr, events, ut); -} - /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? 
VERIFY_WRITE : VERIFY_READ) @@ -602,42 +563,6 @@ out: return ret; } -static inline long -copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) -{ - compat_uptr_t uptr; - int i; - - for (i = 0; i < nr; ++i) { - if (get_user(uptr, ptr32 + i)) - return -EFAULT; - if (put_user(compat_ptr(uptr), ptr64 + i)) - return -EFAULT; - } - return 0; -} - -#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) - -COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, - int, nr, u32 __user *, iocb) -{ - struct iocb __user * __user *iocb64; - long ret; - - if (unlikely(nr < 0)) - return -EINVAL; - - if (nr > MAX_AIO_SUBMITS) - nr = MAX_AIO_SUBMITS; - - iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); - ret = copy_iocb(nr, iocb, iocb64); - if (!ret) - ret = do_io_submit(ctx_id, nr, iocb64, 1); - return ret; -} - struct compat_ncp_mount_data { compat_int_t version; compat_uint_t ncp_fd; diff --git a/include/linux/aio.h b/include/linux/aio.h index 9eb42dbc5582..fdd0a343f455 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -14,14 +14,9 @@ typedef int (kiocb_cancel_fn)(struct kiocb *); /* prototypes */ #ifdef CONFIG_AIO extern void exit_aio(struct mm_struct *mm); -extern long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else static inline void exit_aio(struct mm_struct *mm) { } -static inline long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user * __user *iocbpp, - bool compat) { return 0; } static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 635482e60ca3..8acef8576ce9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -150,6 +150,9 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(compat_sys_io_setup); +cond_syscall(compat_sys_io_submit); +cond_syscall(compat_sys_io_getevents); cond_syscall(sys_sysfs); cond_syscall(sys_syslog); cond_syscall(sys_process_vm_readv); -- cgit v1.2.3 From e3ba730702af370563f66cb610b71aa0ca67955e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Dec 2016 10:15:20 +0100 Subject: fsnotify: Remove fsnotify_duplicate_mark() There are only two call sites of fsnotify_duplicate_mark(). Those are in kernel/audit_tree.c and both are bogus. The vfsmount pointer is unused for the audit tree, the inode pointer and group get set in fsnotify_add_mark_locked() later anyway, and mask and free_mark are already set in alloc_chunk(). In fact, calling fsnotify_duplicate_mark() is actively harmful because the following fsnotify_add_mark_locked() will leak a group reference by overwriting the group pointer. So just remove the two calls to fsnotify_duplicate_mark() and the function.
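A hedged reconstruction of the leaking sequence in the pre-patch audit_tree code (the comments are editorial, not from the commit):

	fsnotify_duplicate_mark(&new->mark, entry);
		/* takes a reference via fsnotify_get_group(old->group)
		 * and sets new->mark.group = entry->group */
	fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1);
		/* fsnotify_add_mark_locked() assigns mark->group itself,
		 * overwriting the pointer, so the reference taken above
		 * is never dropped */

Passing entry->group and entry->inode directly to fsnotify_add_mark(), as the patch does, makes the duplication step unnecessary.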
Signed-off-by: Jan Kara [PM: line wrapping to fit in 80 chars] Signed-off-by: Paul Moore --- fs/notify/mark.c | 12 ------------ include/linux/fsnotify_backend.h | 2 -- kernel/audit_tree.c | 8 ++++---- 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d3fea0bd89e2..6043306e8e21 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -510,18 +510,6 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) } } -void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) -{ - assert_spin_locked(&old->lock); - new->inode = old->inode; - new->mnt = old->mnt; - if (old->group) - fsnotify_get_group(old->group); - new->group = old->group; - new->mask = old->mask; - new->free_mark = old->free_mark; -} - /* * Nothing fancy, just initialize lists and locks and counters. */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7268ed076be8..ce77caa2bb10 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -324,8 +324,6 @@ extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(str extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode); /* find (and take a reference) to a mark associated with group and vfsmount */ extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt); -/* copy the values from old into new */ -extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old); /* set the ignored_mask of a mark */ extern void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask); /* set the mask of a mark (might pin the object into memory */ diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 055f11b0a50f..b4b58400531f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -258,8 +258,8 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - fsnotify_duplicate_mark(&new->mark, entry); - if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { + if (fsnotify_add_mark(&new->mark, + entry->group, entry->inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -395,8 +395,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return -ENOENT; } - fsnotify_duplicate_mark(chunk_entry, old_entry); - if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { + if (fsnotify_add_mark(chunk_entry, + old_entry->group, old_entry->inode, NULL, 1)) { spin_unlock(&old_entry->lock); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); -- cgit v1.2.3 From 7c0f6ba682b9c7632072ffbedf8d328c8f3c42ba Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Dec 2016 11:46:01 -0800 Subject: Replace <asm/uaccess.h> with <linux/uaccess.h> globally This was entirely automated, using the script by Al: PATT='^[[:blank:]]*#[[:blank:]]*include[[:blank:]]*<asm/uaccess.h>' sed -i -e "s!$PATT!#include <linux/uaccess.h>!" \ $(git grep -l "$PATT"|grep -v ^include/linux/uaccess.h) to do the replacement at the end of the merge window.
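The net effect on each affected file is a single include-line change, e.g. (illustrative hunk, same for every file in the list below):

	-#include <asm/uaccess.h>
	+#include <linux/uaccess.h>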
Requested-by: Al Viro Signed-off-by: Linus Torvalds --- arch/alpha/boot/misc.c | 2 +- arch/alpha/kernel/irq.c | 2 +- arch/alpha/kernel/osf_sys.c | 2 +- arch/alpha/kernel/process.c | 2 +- arch/alpha/kernel/ptrace.c | 2 +- arch/alpha/kernel/setup.c | 2 +- arch/alpha/kernel/signal.c | 2 +- arch/alpha/kernel/srm_env.c | 2 +- arch/alpha/kernel/srmcons.c | 2 +- arch/alpha/kernel/time.c | 2 +- arch/alpha/kernel/traps.c | 2 +- arch/alpha/lib/csum_partial_copy.c | 2 +- arch/alpha/math-emu/math.c | 2 +- arch/alpha/mm/init.c | 2 +- arch/arm/common/bL_switcher_dummy_if.c | 2 +- arch/arm/kernel/swp_emulate.c | 2 +- arch/arm/kvm/arm.c | 2 +- arch/arm/kvm/guest.c | 2 +- arch/arm/mach-iop13xx/irq.c | 2 +- arch/arm/mach-ixp4xx/common.c | 2 +- arch/arm/mach-rpc/dma.c | 2 +- arch/arm/plat-iop/time.c | 2 +- arch/arm64/include/asm/word-at-a-time.h | 2 +- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/probes/kprobes.c | 2 +- arch/arm64/kernel/signal32.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/lib/clear_user.S | 2 +- arch/arm64/lib/copy_from_user.S | 2 +- arch/arm64/lib/copy_in_user.S | 2 +- arch/arm64/lib/copy_to_user.S | 2 +- arch/arm64/mm/cache.S | 2 +- arch/arm64/xen/hypercall.S | 2 +- arch/avr32/kernel/avr32_ksyms.c | 2 +- arch/avr32/kernel/ptrace.c | 2 +- arch/avr32/kernel/signal.c | 2 +- arch/avr32/mm/cache.c | 2 +- arch/blackfin/kernel/bfin_dma.c | 2 +- arch/blackfin/kernel/kgdb_test.c | 2 +- arch/blackfin/kernel/module.c | 2 +- arch/c6x/mm/init.c | 2 +- arch/cris/arch-v10/drivers/eeprom.c | 2 +- arch/cris/arch-v10/drivers/sync_serial.c | 2 +- arch/cris/arch-v10/kernel/ptrace.c | 2 +- arch/cris/arch-v10/kernel/signal.c | 2 +- arch/cris/arch-v10/kernel/traps.c | 2 +- arch/cris/arch-v10/lib/usercopy.c | 2 +- arch/cris/arch-v10/mm/fault.c | 2 +- arch/cris/arch-v32/drivers/cryptocop.c | 2 +- arch/cris/arch-v32/kernel/ptrace.c | 2 +- arch/cris/arch-v32/kernel/signal.c | 2 +- arch/cris/arch-v32/kernel/traps.c | 2 +- arch/cris/arch-v32/lib/usercopy.c | 2 +- arch/cris/kernel/crisksyms.c | 2 +- arch/cris/kernel/process.c | 2 +- arch/cris/kernel/profile.c | 2 +- arch/cris/kernel/ptrace.c | 2 +- arch/cris/kernel/sys_cris.c | 2 +- arch/cris/kernel/traps.c | 2 +- arch/frv/include/asm/futex.h | 2 +- arch/frv/kernel/irq.c | 2 +- arch/frv/kernel/pm-mb93093.c | 2 +- arch/frv/kernel/pm.c | 2 +- arch/frv/kernel/process.c | 2 +- arch/frv/kernel/ptrace.c | 2 +- arch/frv/kernel/signal.c | 2 +- arch/frv/kernel/sys_frv.c | 2 +- arch/frv/kernel/sysctl.c | 2 +- arch/frv/kernel/traps.c | 2 +- arch/frv/kernel/uaccess.c | 2 +- arch/frv/mm/dma-alloc.c | 2 +- arch/frv/mm/extable.c | 2 +- arch/h8300/boot/compressed/misc.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/h8300/kernel/signal.c | 2 +- arch/hexagon/kernel/hexagon_ksyms.c | 2 +- arch/hexagon/kernel/signal.c | 2 +- arch/hexagon/mm/uaccess.c | 2 +- arch/hexagon/mm/vm_fault.c | 2 +- arch/ia64/kernel/brl_emu.c | 2 +- arch/ia64/kernel/crash_dump.c | 2 +- arch/ia64/kernel/init_task.c | 2 +- arch/ia64/kernel/irq.c | 2 +- arch/ia64/kernel/kprobes.c | 2 +- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/ia64/kernel/ptrace.c | 2 +- arch/ia64/kernel/salinfo.c | 2 +- arch/ia64/kernel/signal.c | 2 +- arch/ia64/kernel/sys_ia64.c | 2 +- arch/ia64/kernel/traps.c | 2 +- arch/ia64/kernel/unaligned.c | 2 +- arch/ia64/kernel/unwind.c | 2 +- arch/ia64/lib/csum_partial_copy.c | 2 +- arch/ia64/mm/extable.c | 2 +- arch/ia64/mm/init.c | 2 +- arch/ia64/sn/kernel/sn2/sn_hwperf.c | 2 +- arch/ia64/sn/kernel/sn2/sn_proc_fs.c 
| 2 +- arch/ia64/sn/kernel/tiocx.c | 2 +- arch/m32r/kernel/align.c | 2 +- arch/m32r/kernel/irq.c | 2 +- arch/m32r/kernel/m32r_ksyms.c | 2 +- arch/m32r/kernel/process.c | 2 +- arch/m32r/kernel/ptrace.c | 2 +- arch/m32r/kernel/signal.c | 2 +- arch/m32r/kernel/sys_m32r.c | 2 +- arch/m32r/kernel/traps.c | 2 +- arch/m32r/lib/csum_partial_copy.c | 2 +- arch/m32r/lib/usercopy.c | 2 +- arch/m32r/mm/extable.c | 2 +- arch/m32r/mm/fault-nommu.c | 2 +- arch/m68k/bvme6000/rtc.c | 2 +- arch/m68k/kernel/process.c | 2 +- arch/m68k/kernel/ptrace.c | 2 +- arch/m68k/kernel/signal.c | 2 +- arch/m68k/kernel/sys_m68k.c | 2 +- arch/m68k/kernel/traps.c | 2 +- arch/m68k/lib/uaccess.c | 2 +- arch/m68k/mac/misc.c | 2 +- arch/m68k/mm/init.c | 2 +- arch/m68k/mm/motorola.c | 2 +- arch/m68k/mm/sun3mmu.c | 2 +- arch/m68k/mvme16x/rtc.c | 2 +- arch/m68k/sun3/mmu_emu.c | 2 +- arch/metag/kernel/irq.c | 2 +- arch/mips/alchemy/common/power.c | 2 +- arch/mips/dec/kn01-berr.c | 2 +- arch/mips/include/asm/checksum.h | 2 +- arch/mips/include/asm/compat-signal.h | 2 +- arch/mips/include/asm/r4kcache.h | 2 +- arch/mips/include/asm/termios.h | 2 +- arch/mips/jazz/jazzdma.c | 2 +- arch/mips/kernel/branch.c | 2 +- arch/mips/kernel/cpu-probe.c | 2 +- arch/mips/kernel/crash_dump.c | 2 +- arch/mips/kernel/irq.c | 2 +- arch/mips/kernel/kgdb.c | 2 +- arch/mips/kernel/linux32.c | 2 +- arch/mips/kernel/mips-mt-fpaff.c | 2 +- arch/mips/kernel/mips-r2-to-r6-emul.c | 2 +- arch/mips/kernel/mips_ksyms.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/mips/kernel/ptrace.c | 2 +- arch/mips/kernel/ptrace32.c | 2 +- arch/mips/kernel/signal32.c | 2 +- arch/mips/kernel/signal_n32.c | 2 +- arch/mips/kernel/syscall.c | 2 +- arch/mips/kernel/traps.c | 2 +- arch/mips/kernel/unaligned.c | 2 +- arch/mips/math-emu/cp1emu.c | 2 +- arch/mips/math-emu/dsemul.c | 2 +- arch/mips/mm/extable.c | 2 +- arch/mips/mm/sc-debugfs.c | 2 +- arch/mips/oprofile/op_model_loongson3.c | 2 +- arch/mips/sgi-ip22/ip28-berr.c | 2 +- arch/mips/sgi-ip27/ip27-berr.c | 2 +- arch/mips/sgi-ip32/ip32-berr.c | 2 +- arch/mips/sibyte/common/sb_tbprof.c | 2 +- arch/mn10300/kernel/fpu.c | 2 +- arch/mn10300/kernel/mn10300_ksyms.c | 2 +- arch/mn10300/kernel/process.c | 2 +- arch/mn10300/kernel/ptrace.c | 2 +- arch/mn10300/kernel/setup.c | 2 +- arch/mn10300/kernel/signal.c | 2 +- arch/mn10300/kernel/sys_mn10300.c | 2 +- arch/mn10300/lib/checksum.c | 2 +- arch/mn10300/mm/cache-smp.c | 2 +- arch/mn10300/mm/cache.c | 2 +- arch/mn10300/mm/extable.c | 2 +- arch/mn10300/mm/init.c | 2 +- arch/mn10300/mm/misalignment.c | 2 +- arch/mn10300/proc-mn2ws0050/proc-init.c | 2 +- arch/nios2/kernel/traps.c | 2 +- arch/openrisc/kernel/or32_ksyms.c | 2 +- arch/openrisc/kernel/process.c | 2 +- arch/openrisc/kernel/signal.c | 2 +- arch/openrisc/kernel/traps.c | 2 +- arch/openrisc/mm/fault.c | 2 +- arch/parisc/kernel/asm-offsets.c | 2 +- arch/parisc/kernel/parisc_ksyms.c | 2 +- arch/parisc/kernel/pci-dma.c | 2 +- arch/parisc/kernel/perf.c | 2 +- arch/parisc/kernel/ptrace.c | 2 +- arch/parisc/kernel/signal.c | 2 +- arch/parisc/kernel/signal32.c | 2 +- arch/parisc/kernel/sys_parisc.c | 2 +- arch/parisc/kernel/time.c | 2 +- arch/parisc/kernel/unaligned.c | 2 +- arch/parisc/kernel/unwind.c | 2 +- arch/parisc/lib/checksum.c | 2 +- arch/powerpc/include/asm/asm-prototypes.h | 2 +- arch/powerpc/kernel/align.c | 2 +- arch/powerpc/kernel/crash_dump.c | 2 +- arch/powerpc/kernel/hw_breakpoint.c | 2 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/kprobes.c | 2 +- arch/powerpc/kernel/module.c | 2 +- 
arch/powerpc/kernel/nvram_64.c | 2 +- arch/powerpc/kernel/pci_32.c | 2 +- arch/powerpc/kernel/proc_powerpc.c | 2 +- arch/powerpc/kernel/ptrace.c | 2 +- arch/powerpc/kernel/ptrace32.c | 2 +- arch/powerpc/kernel/rtas-proc.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/kernel/rtas_flash.c | 2 +- arch/powerpc/kernel/rtasd.c | 2 +- arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/signal.c | 2 +- arch/powerpc/kernel/signal_32.c | 2 +- arch/powerpc/kernel/signal_64.c | 2 +- arch/powerpc/kernel/sys_ppc32.c | 2 +- arch/powerpc/kernel/syscalls.c | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/kernel/traps.c | 2 +- arch/powerpc/kernel/vecemu.c | 2 +- arch/powerpc/kvm/book3s.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/book3s_pr_papr.c | 2 +- arch/powerpc/kvm/book3s_rtas.c | 2 +- arch/powerpc/kvm/book3s_xics.c | 2 +- arch/powerpc/kvm/booke.c | 2 +- arch/powerpc/kvm/mpic.c | 2 +- arch/powerpc/kvm/powerpc.c | 2 +- arch/powerpc/lib/checksum_wrappers.c | 2 +- arch/powerpc/lib/code-patching.c | 2 +- arch/powerpc/lib/sstep.c | 2 +- arch/powerpc/lib/usercopy_64.c | 2 +- arch/powerpc/math-emu/fabs.c | 2 +- arch/powerpc/math-emu/fadd.c | 2 +- arch/powerpc/math-emu/fadds.c | 2 +- arch/powerpc/math-emu/fcmpo.c | 2 +- arch/powerpc/math-emu/fcmpu.c | 2 +- arch/powerpc/math-emu/fctiw.c | 2 +- arch/powerpc/math-emu/fctiwz.c | 2 +- arch/powerpc/math-emu/fdiv.c | 2 +- arch/powerpc/math-emu/fdivs.c | 2 +- arch/powerpc/math-emu/fmadd.c | 2 +- arch/powerpc/math-emu/fmadds.c | 2 +- arch/powerpc/math-emu/fmr.c | 2 +- arch/powerpc/math-emu/fmsub.c | 2 +- arch/powerpc/math-emu/fmsubs.c | 2 +- arch/powerpc/math-emu/fmul.c | 2 +- arch/powerpc/math-emu/fmuls.c | 2 +- arch/powerpc/math-emu/fnabs.c | 2 +- arch/powerpc/math-emu/fneg.c | 2 +- arch/powerpc/math-emu/fnmadd.c | 2 +- arch/powerpc/math-emu/fnmadds.c | 2 +- arch/powerpc/math-emu/fnmsub.c | 2 +- arch/powerpc/math-emu/fnmsubs.c | 2 +- arch/powerpc/math-emu/fre.c | 2 +- arch/powerpc/math-emu/fres.c | 2 +- arch/powerpc/math-emu/frsp.c | 2 +- arch/powerpc/math-emu/frsqrte.c | 2 +- arch/powerpc/math-emu/frsqrtes.c | 2 +- arch/powerpc/math-emu/fsel.c | 2 +- arch/powerpc/math-emu/fsqrt.c | 2 +- arch/powerpc/math-emu/fsqrts.c | 2 +- arch/powerpc/math-emu/fsub.c | 2 +- arch/powerpc/math-emu/fsubs.c | 2 +- arch/powerpc/math-emu/lfd.c | 2 +- arch/powerpc/math-emu/lfs.c | 2 +- arch/powerpc/math-emu/math.c | 2 +- arch/powerpc/math-emu/math_efp.c | 2 +- arch/powerpc/math-emu/mcrfs.c | 2 +- arch/powerpc/math-emu/mffs.c | 2 +- arch/powerpc/math-emu/mtfsb0.c | 2 +- arch/powerpc/math-emu/mtfsb1.c | 2 +- arch/powerpc/math-emu/mtfsf.c | 2 +- arch/powerpc/math-emu/mtfsfi.c | 2 +- arch/powerpc/math-emu/stfd.c | 2 +- arch/powerpc/math-emu/stfiwx.c | 2 +- arch/powerpc/math-emu/stfs.c | 2 +- arch/powerpc/mm/40x_mmu.c | 2 +- arch/powerpc/mm/fsl_booke_mmu.c | 2 +- arch/powerpc/mm/hash_utils_64.c | 2 +- arch/powerpc/mm/init_64.c | 2 +- arch/powerpc/mm/subpage-prot.c | 2 +- arch/powerpc/platforms/cell/spufs/coredump.c | 2 +- arch/powerpc/platforms/cell/spufs/file.c | 2 +- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- arch/powerpc/platforms/cell/spufs/syscalls.c | 2 +- arch/powerpc/platforms/chrp/nvram.c | 2 +- arch/powerpc/platforms/powernv/opal-elog.c | 2 +- arch/powerpc/platforms/powernv/opal-lpc.c | 2 +- arch/powerpc/platforms/powernv/opal-prd.c | 2 +- arch/powerpc/platforms/pseries/cmm.c | 2 +- arch/powerpc/platforms/pseries/dlpar.c | 2 +- arch/powerpc/platforms/pseries/dtl.c | 2 +- 
arch/powerpc/platforms/pseries/lparcfg.c | 2 +- arch/powerpc/platforms/pseries/nvram.c | 2 +- arch/powerpc/platforms/pseries/reconfig.c | 2 +- arch/powerpc/platforms/pseries/scanlog.c | 2 +- arch/powerpc/sysdev/scom.c | 2 +- arch/powerpc/sysdev/tsi108_pci.c | 2 +- arch/s390/appldata/appldata_base.c | 2 +- arch/s390/boot/compressed/misc.c | 2 +- arch/s390/crypto/prng.c | 2 +- arch/s390/include/asm/checksum.h | 2 +- arch/s390/include/asm/idals.h | 2 +- arch/s390/include/asm/mmu_context.h | 2 +- arch/s390/kernel/compat_linux.c | 2 +- arch/s390/kernel/compat_signal.c | 2 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/dis.c | 2 +- arch/s390/kernel/kprobes.c | 2 +- arch/s390/kernel/ptrace.c | 2 +- arch/s390/kernel/signal.c | 2 +- arch/s390/kernel/sys_s390.c | 2 +- arch/s390/kernel/time.c | 2 +- arch/s390/kernel/traps.c | 2 +- arch/s390/kvm/interrupt.c | 2 +- arch/s390/mm/init.c | 2 +- arch/score/include/asm/checksum.h | 2 +- arch/score/kernel/ptrace.c | 2 +- arch/score/kernel/traps.c | 2 +- arch/score/lib/checksum_copy.c | 2 +- arch/sh/boards/mach-landisk/gio.c | 2 +- arch/sh/boot/compressed/misc.c | 2 +- arch/sh/include/asm/mmu_context.h | 2 +- arch/sh/kernel/cpu/init.c | 2 +- arch/sh/kernel/cpu/shmobile/cpuidle.c | 2 +- arch/sh/kernel/cpu/shmobile/pm.c | 2 +- arch/sh/kernel/crash_dump.c | 2 +- arch/sh/kernel/io_trapped.c | 2 +- arch/sh/kernel/irq.c | 2 +- arch/sh/kernel/kprobes.c | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sh/kernel/process_64.c | 2 +- arch/sh/kernel/ptrace_32.c | 2 +- arch/sh/kernel/ptrace_64.c | 2 +- arch/sh/kernel/setup.c | 2 +- arch/sh/kernel/sh_ksyms_64.c | 2 +- arch/sh/kernel/signal_32.c | 2 +- arch/sh/kernel/signal_64.c | 2 +- arch/sh/kernel/sys_sh.c | 2 +- arch/sh/kernel/sys_sh32.c | 2 +- arch/sh/kernel/traps_64.c | 2 +- arch/sh/math-emu/math.c | 2 +- arch/sh/mm/cache-debugfs.c | 2 +- arch/sh/mm/cache-sh3.c | 2 +- arch/sh/mm/cache-sh5.c | 2 +- arch/sh/mm/cache-sh7705.c | 2 +- arch/sh/mm/extable_32.c | 2 +- arch/sh/mm/extable_64.c | 2 +- arch/sh/mm/nommu.c | 2 +- arch/sh/mm/pmb.c | 2 +- arch/sh/mm/tlb-sh3.c | 2 +- arch/sh/mm/tlbex_64.c | 2 +- arch/sh/mm/tlbflush_64.c | 2 +- arch/sh/oprofile/backtrace.c | 2 +- arch/sparc/include/asm/checksum_32.h | 2 +- arch/sparc/include/asm/checksum_64.h | 2 +- arch/sparc/kernel/apc.c | 2 +- arch/sparc/kernel/irq_64.c | 2 +- arch/sparc/kernel/kprobes.c | 2 +- arch/sparc/kernel/mdesc.c | 2 +- arch/sparc/kernel/pci.c | 2 +- arch/sparc/kernel/pcic.c | 2 +- arch/sparc/kernel/pmc.c | 2 +- arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- arch/sparc/kernel/ptrace_32.c | 2 +- arch/sparc/kernel/ptrace_64.c | 2 +- arch/sparc/kernel/signal32.c | 2 +- arch/sparc/kernel/signal_32.c | 2 +- arch/sparc/kernel/signal_64.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/kernel/sys_sparc32.c | 2 +- arch/sparc/kernel/sys_sparc_32.c | 2 +- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/sparc/kernel/time_64.c | 2 +- arch/sparc/kernel/traps_64.c | 2 +- arch/sparc/kernel/unaligned_32.c | 2 +- arch/sparc/kernel/unaligned_64.c | 2 +- arch/sparc/kernel/uprobes.c | 2 +- arch/sparc/kernel/visemul.c | 2 +- arch/sparc/kernel/windows.c | 2 +- arch/sparc/math-emu/math_32.c | 2 +- arch/sparc/math-emu/math_64.c | 2 +- arch/sparc/mm/extable.c | 2 +- arch/sparc/mm/init_64.c | 2 +- arch/tile/kernel/process.c | 2 +- arch/tile/kernel/single_step.c | 2 +- arch/tile/kernel/unaligned.c | 2 +- arch/um/drivers/harddog_kern.c | 2 +- arch/um/drivers/hostaudio_kern.c | 2 +- arch/um/drivers/mconsole_kern.c | 2 +- arch/um/drivers/mmapper_kern.c | 2 
+- arch/um/drivers/random.c | 2 +- arch/um/kernel/exec.c | 2 +- arch/um/kernel/exitcode.c | 2 +- arch/um/kernel/process.c | 2 +- arch/um/kernel/ptrace.c | 2 +- arch/um/kernel/syscall.c | 2 +- arch/x86/entry/common.c | 2 +- arch/x86/ia32/ia32_aout.c | 2 +- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/include/asm/asm-prototypes.h | 2 +- arch/x86/include/asm/checksum_64.h | 2 +- arch/x86/include/asm/xen/page.h | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/mcheck/mce-severity.c | 2 +- arch/x86/kernel/crash_dump_32.c | 2 +- arch/x86/kernel/doublefault.c | 2 +- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/kernel/kprobes/opt.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/test_nx.c | 2 +- arch/x86/kernel/tls.c | 2 +- arch/x86/kernel/vm86_32.c | 2 +- arch/x86/lib/usercopy_32.c | 2 +- arch/x86/math-emu/errors.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- arch/x86/math-emu/get_address.c | 2 +- arch/x86/math-emu/load_store.c | 2 +- arch/x86/math-emu/reg_ld_str.c | 2 +- arch/x86/mm/extable.c | 2 +- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pageattr.c | 2 +- arch/x86/um/ptrace_32.c | 2 +- arch/x86/um/ptrace_64.c | 2 +- arch/x86/um/signal.c | 2 +- arch/x86/um/tls_32.c | 2 +- arch/x86/xen/p2m.c | 2 +- arch/xtensa/include/asm/checksum.h | 2 +- arch/xtensa/include/asm/segment.h | 2 +- arch/xtensa/kernel/asm-offsets.c | 2 +- arch/xtensa/kernel/irq.c | 2 +- arch/xtensa/kernel/process.c | 2 +- arch/xtensa/kernel/ptrace.c | 2 +- arch/xtensa/kernel/signal.c | 2 +- arch/xtensa/kernel/stacktrace.c | 2 +- arch/xtensa/kernel/syscall.c | 2 +- arch/xtensa/kernel/traps.c | 2 +- arch/xtensa/kernel/xtensa_ksyms.c | 2 +- arch/xtensa/platforms/iss/console.c | 2 +- arch/xtensa/platforms/iss/simdisk.c | 2 +- block/ioctl.c | 2 +- block/partitions/ibm.c | 2 +- block/scsi_ioctl.c | 2 +- drivers/acpi/acpi_video.c | 2 +- drivers/acpi/battery.c | 2 +- drivers/acpi/osl.c | 2 +- drivers/acpi/proc.c | 2 +- drivers/acpi/processor_thermal.c | 2 +- drivers/acpi/processor_throttling.c | 2 +- drivers/acpi/thermal.c | 2 +- drivers/atm/adummy.c | 2 +- drivers/atm/atmtcp.c | 2 +- drivers/atm/eni.c | 2 +- drivers/atm/firestream.c | 2 +- drivers/atm/fore200e.c | 2 +- drivers/atm/he.c | 2 +- drivers/atm/horizon.c | 2 +- drivers/atm/idt77105.c | 2 +- drivers/atm/idt77252.c | 2 +- drivers/atm/iphase.c | 2 +- drivers/atm/nicstar.c | 2 +- drivers/atm/suni.c | 2 +- drivers/atm/uPD98402.c | 2 +- drivers/atm/zatm.c | 2 +- drivers/base/memory.c | 2 +- drivers/block/DAC960.c | 2 +- drivers/block/amiflop.c | 2 +- drivers/block/brd.c | 2 +- drivers/block/cciss.c | 2 +- drivers/block/cryptoloop.c | 2 +- drivers/block/hd.c | 2 +- drivers/block/loop.c | 2 +- drivers/block/nbd.c | 2 +- drivers/block/paride/pcd.c | 2 +- drivers/block/paride/pd.c | 2 +- drivers/block/paride/pf.c | 2 +- drivers/block/paride/pg.c | 2 +- drivers/block/paride/pt.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/swim3.c | 2 +- drivers/block/sx8.c | 2 +- drivers/block/umem.c | 2 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/agp/compat_ioctl.c | 2 +- drivers/char/agp/frontend.c | 2 +- drivers/char/applicom.c | 2 +- drivers/char/bfin-otp.c | 2 +- drivers/char/ds1620.c | 2 +- drivers/char/dtlk.c | 2 +- drivers/char/generic_nvram.c | 2 +- drivers/char/hangcheck-timer.c | 2 +- drivers/char/hw_random/core.c | 2 +- drivers/char/ipmi/ipmi_watchdog.c | 2 +- drivers/char/lp.c | 2 +- drivers/char/mbcs.c | 2 +- drivers/char/mmtimer.c | 2 +- drivers/char/mwave/3780i.c | 2 +- 
drivers/char/mwave/mwavedd.h | 2 +- drivers/char/nsc_gpio.c | 2 +- drivers/char/nwbutton.c | 2 +- drivers/char/nwflash.c | 2 +- drivers/char/pc8736x_gpio.c | 2 +- drivers/char/pcmcia/cm4040_cs.c | 2 +- drivers/char/pcmcia/synclink_cs.c | 2 +- drivers/char/random.c | 2 +- drivers/char/raw.c | 2 +- drivers/char/scx200_gpio.c | 2 +- drivers/char/sonypi.c | 2 +- drivers/char/tlclk.c | 2 +- drivers/char/toshiba.c | 2 +- drivers/char/xilinx_hwicap/xilinx_hwicap.c | 2 +- drivers/cpufreq/ia64-acpi-cpufreq.c | 2 +- drivers/cpuidle/governors/ladder.c | 2 +- drivers/dio/dio.c | 2 +- drivers/edac/edac_device.c | 2 +- drivers/edac/edac_mc.c | 2 +- drivers/edac/edac_pci.c | 2 +- drivers/i2c/i2c-core.c | 2 +- drivers/ide/hpt366.c | 2 +- drivers/ide/ide-disk.c | 2 +- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-iops.c | 2 +- drivers/ide/ide-probe.c | 2 +- drivers/ide/ide-proc.c | 2 +- drivers/infiniband/core/ucm.c | 2 +- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/core/uverbs_main.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 2 +- drivers/infiniband/ulp/iser/iscsi_iser.c | 2 +- drivers/input/input-compat.c | 2 +- drivers/input/misc/atlas_btns.c | 2 +- drivers/input/mouse/amimouse.c | 2 +- drivers/input/mouse/atarimouse.c | 2 +- drivers/input/mouse/trackpoint.c | 2 +- drivers/input/serio/hp_sdc.c | 2 +- drivers/input/serio/q40kbd.c | 2 +- drivers/input/serio/serport.c | 2 +- drivers/input/tablet/aiptek.c | 2 +- drivers/input/tablet/gtco.c | 2 +- drivers/isdn/capi/kcapi.c | 2 +- drivers/isdn/hardware/avm/b1.c | 2 +- drivers/isdn/hardware/avm/b1dma.c | 2 +- drivers/isdn/hardware/avm/c4.c | 2 +- drivers/isdn/hardware/eicon/capimain.c | 2 +- drivers/isdn/hardware/eicon/divamnt.c | 2 +- drivers/isdn/hardware/eicon/divasi.c | 2 +- drivers/isdn/hardware/eicon/divasmain.c | 2 +- drivers/isdn/hardware/eicon/divasproc.c | 2 +- drivers/isdn/hysdn/hysdn_boot.c | 2 +- drivers/lguest/core.c | 2 +- drivers/lguest/page_tables.c | 2 +- drivers/lguest/x86/core.c | 2 +- drivers/macintosh/ans-lcd.c | 2 +- drivers/macintosh/smu.c | 2 +- drivers/macintosh/via-pmu.c | 2 +- drivers/macintosh/via-pmu68k.c | 2 +- drivers/md/dm-ioctl.c | 2 +- drivers/media/dvb-core/dmxdev.c | 2 +- drivers/media/dvb-core/dvb_demux.c | 2 +- drivers/media/dvb-core/dvb_net.c | 2 +- drivers/media/dvb-core/dvb_ringbuffer.c | 2 +- drivers/media/i2c/adv7170.c | 2 +- drivers/media/i2c/adv7175.c | 2 +- drivers/media/i2c/bt856.c | 2 +- drivers/media/i2c/bt866.c | 2 +- drivers/media/i2c/cs53l32a.c | 2 +- drivers/media/i2c/m52790.c | 2 +- drivers/media/i2c/saa6588.c | 2 +- drivers/media/i2c/saa7110.c | 2 +- drivers/media/i2c/saa7185.c | 2 +- drivers/media/i2c/tlv320aic23b.c | 2 +- drivers/media/i2c/vp27smpx.c | 2 +- drivers/media/i2c/vpx3220.c | 2 +- drivers/media/i2c/wm8739.c | 2 +- drivers/media/i2c/wm8775.c | 2 +- drivers/media/pci/ivtv/ivtv-driver.h | 2 +- drivers/media/pci/meye/meye.c | 2 +- drivers/media/pci/zoran/videocodec.c | 2 +- drivers/media/pci/zoran/zoran_driver.c | 2 +- drivers/media/platform/arv.c | 2 +- drivers/media/usb/pvrusb2/pvrusb2-ioread.c | 2 +- drivers/media/usb/pwc/pwc-ctrl.c | 2 +- drivers/media/v4l2-core/v4l2-common.c | 2 +- drivers/media/v4l2-core/v4l2-dev.c | 2 +- drivers/message/fusion/mptctl.c | 2 +- drivers/message/fusion/mptlan.h | 2 +- drivers/misc/ibmasm/ibmasmfs.c | 2 +- drivers/mmc/core/block.c | 2 +- drivers/mmc/host/android-goldfish.c | 2 +- drivers/mtd/devices/pmc551.c | 2 +- drivers/mtd/devices/slram.c | 2 +- drivers/mtd/ftl.c | 2 +- 
drivers/mtd/inftlcore.c | 2 +- drivers/mtd/inftlmount.c | 2 +- drivers/mtd/maps/sun_uflash.c | 2 +- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/nftlcore.c | 2 +- drivers/net/appletalk/ipddp.c | 2 +- drivers/net/eql.c | 2 +- drivers/net/ethernet/3com/3c509.c | 2 +- drivers/net/ethernet/3com/3c515.c | 2 +- drivers/net/ethernet/3com/3c574_cs.c | 2 +- drivers/net/ethernet/3com/3c59x.c | 2 +- drivers/net/ethernet/3com/typhoon.c | 2 +- drivers/net/ethernet/8390/axnet_cs.c | 2 +- drivers/net/ethernet/8390/ne2k-pci.c | 2 +- drivers/net/ethernet/8390/pcnet_cs.c | 2 +- drivers/net/ethernet/adaptec/starfire.c | 2 +- drivers/net/ethernet/alteon/acenic.c | 2 +- drivers/net/ethernet/amd/amd8111e.c | 2 +- drivers/net/ethernet/amd/nmclan_cs.c | 2 +- drivers/net/ethernet/broadcom/b44.c | 2 +- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 2 +- drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- drivers/net/ethernet/dec/tulip/de2104x.c | 2 +- drivers/net/ethernet/dec/tulip/de4x5.c | 2 +- drivers/net/ethernet/dec/tulip/dmfe.c | 2 +- drivers/net/ethernet/dec/tulip/tulip_core.c | 2 +- drivers/net/ethernet/dec/tulip/uli526x.c | 2 +- drivers/net/ethernet/dec/tulip/winbond-840.c | 2 +- drivers/net/ethernet/dec/tulip/xircom_cb.c | 2 +- drivers/net/ethernet/dlink/dl2k.h | 2 +- drivers/net/ethernet/dlink/sundance.c | 2 +- drivers/net/ethernet/fealnx.c | 2 +- drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c | 2 +- drivers/net/ethernet/freescale/fs_enet/mac-fcc.c | 2 +- drivers/net/ethernet/freescale/fs_enet/mac-fec.c | 2 +- drivers/net/ethernet/freescale/fs_enet/mac-scc.c | 2 +- drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 2 +- drivers/net/ethernet/freescale/gianfar.c | 2 +- drivers/net/ethernet/freescale/gianfar.h | 2 +- drivers/net/ethernet/freescale/gianfar_ethtool.c | 2 +- drivers/net/ethernet/freescale/ucc_geth.c | 2 +- drivers/net/ethernet/freescale/ucc_geth_ethtool.c | 2 +- drivers/net/ethernet/fujitsu/fmvj18x_cs.c | 2 +- drivers/net/ethernet/ibm/emac/core.c | 2 +- drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c | 2 +- drivers/net/ethernet/natsemi/natsemi.c | 2 +- drivers/net/ethernet/natsemi/ns83820.c | 2 +- drivers/net/ethernet/packetengines/hamachi.c | 2 +- drivers/net/ethernet/packetengines/yellowfin.c | 2 +- drivers/net/ethernet/realtek/8139cp.c | 2 +- drivers/net/ethernet/sgi/ioc3-eth.c | 2 +- drivers/net/ethernet/sis/sis900.c | 2 +- drivers/net/ethernet/smsc/epic100.c | 2 +- drivers/net/ethernet/smsc/smc91c92_cs.c | 2 +- drivers/net/ethernet/sun/cassini.c | 2 +- drivers/net/ethernet/sun/sungem.c | 2 +- drivers/net/ethernet/sun/sunhme.c | 2 +- drivers/net/ethernet/via/via-rhine.c | 2 +- drivers/net/ethernet/xircom/xirc2ps_cs.c | 2 +- drivers/net/fddi/skfp/skfddi.c | 2 +- drivers/net/hamradio/6pack.c | 2 +- drivers/net/hamradio/baycom_epp.c | 2 +- drivers/net/hamradio/baycom_par.c | 2 +- drivers/net/hamradio/baycom_ser_fdx.c | 2 +- drivers/net/hamradio/baycom_ser_hdx.c | 2 +- drivers/net/hamradio/bpqether.c | 2 +- drivers/net/hamradio/dmascc.c | 2 +- drivers/net/hamradio/hdlcdrv.c | 2 +- drivers/net/hamradio/mkiss.c | 2 +- drivers/net/hamradio/scc.c | 2 +- drivers/net/hamradio/yam.c | 2 +- drivers/net/hippi/rrunner.c | 2 +- drivers/net/irda/irtty-sir.c | 2 +- drivers/net/irda/kingsun-sir.c | 2 +- drivers/net/irda/ks959-sir.c | 2 +- drivers/net/irda/ksdazzle-sir.c | 2 +- drivers/net/irda/mcs7780.c | 2 +- drivers/net/irda/vlsi_ir.c | 2 +- drivers/net/loopback.c | 2 +- drivers/net/phy/davicom.c | 2 +- 
drivers/net/phy/icplus.c | 2 +- drivers/net/phy/lxt.c | 2 +- drivers/net/phy/qsemi.c | 2 +- drivers/net/ppp/ppp_async.c | 2 +- drivers/net/ppp/ppp_synctty.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- drivers/net/ppp/pppox.c | 2 +- drivers/net/sb1000.c | 2 +- drivers/net/slip/slhc.c | 2 +- drivers/net/slip/slip.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/usb/catc.c | 2 +- drivers/net/usb/kaweth.c | 2 +- drivers/net/usb/pegasus.c | 2 +- drivers/net/usb/rtl8150.c | 2 +- drivers/net/wan/dlci.c | 2 +- drivers/net/wan/dscc4.c | 2 +- drivers/net/wan/farsync.c | 2 +- drivers/net/wan/hd64570.c | 2 +- drivers/net/wan/hd64572.c | 2 +- drivers/net/wan/lapbether.c | 2 +- drivers/net/wan/lmc/lmc_main.c | 2 +- drivers/net/wan/lmc/lmc_media.c | 2 +- drivers/net/wan/sbni.c | 2 +- drivers/net/wan/sdla.c | 2 +- drivers/net/wireless/atmel/atmel.c | 2 +- drivers/net/wireless/intel/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_geo.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_module.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_rx.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_tx.c | 2 +- drivers/net/wireless/intersil/hostap/hostap_hw.c | 2 +- drivers/net/wireless/intersil/hostap/hostap_main.c | 2 +- drivers/net/wireless/intersil/prism54/isl_38xx.c | 2 +- drivers/net/wireless/intersil/prism54/isl_ioctl.c | 2 +- drivers/net/wireless/ray_cs.c | 2 +- drivers/net/wireless/wl3501_cs.c | 2 +- drivers/nubus/proc.c | 2 +- drivers/oprofile/event_buffer.c | 2 +- drivers/oprofile/oprofilefs.c | 2 +- drivers/parisc/ccio-dma.c | 2 +- drivers/parisc/ccio-rm-dma.c | 2 +- drivers/parisc/eisa_eeprom.c | 2 +- drivers/parisc/eisa_enumerator.c | 2 +- drivers/parisc/led.c | 2 +- drivers/parisc/pdc_stable.c | 2 +- drivers/parport/daisy.c | 2 +- drivers/parport/ieee1284_ops.c | 2 +- drivers/parport/parport_gsc.c | 2 +- drivers/parport/probe.c | 2 +- drivers/parport/procfs.c | 2 +- drivers/pci/hotplug/acpiphp_ibm.c | 2 +- drivers/pci/hotplug/cpqphp_core.c | 2 +- drivers/pci/hotplug/cpqphp_nvram.c | 2 +- drivers/pci/hotplug/pci_hotplug_core.c | 2 +- drivers/pci/proc.c | 2 +- drivers/pci/syscall.c | 2 +- drivers/platform/x86/sony-laptop.c | 2 +- drivers/platform/x86/thinkpad_acpi.c | 2 +- drivers/pnp/interface.c | 2 +- drivers/pnp/pnpbios/proc.c | 2 +- drivers/s390/block/dasd_devmap.c | 2 +- drivers/s390/block/dasd_eckd.c | 2 +- drivers/s390/block/dasd_eer.c | 2 +- drivers/s390/block/dasd_erp.c | 2 +- drivers/s390/block/dasd_genhd.c | 2 +- drivers/s390/block/dasd_ioctl.c | 2 +- drivers/s390/block/dasd_proc.c | 2 +- drivers/s390/block/xpram.c | 2 +- drivers/s390/char/con3215.c | 2 +- drivers/s390/char/keyboard.c | 2 +- drivers/s390/char/monreader.c | 2 +- drivers/s390/char/monwriter.c | 2 +- drivers/s390/char/sclp_rw.c | 2 +- drivers/s390/char/sclp_tty.c | 2 +- drivers/s390/char/sclp_vt220.c | 2 +- drivers/s390/char/tape_char.c | 2 +- drivers/s390/char/tty3270.c | 2 +- drivers/s390/char/vmcp.c | 2 +- drivers/s390/char/vmlogrdr.c | 2 +- drivers/s390/char/vmur.c | 2 +- drivers/s390/char/zcore.c | 2 +- drivers/s390/cio/blacklist.c | 2 +- drivers/s390/crypto/zcrypt_api.c | 2 +- drivers/s390/crypto/zcrypt_cex2a.c | 2 +- drivers/s390/crypto/zcrypt_pcixcc.c | 2 +- drivers/s390/net/netiucv.c | 2 +- drivers/sbus/char/display7seg.c | 2 +- drivers/sbus/char/envctrl.c | 2 +- drivers/sbus/char/flash.c | 2 +- drivers/sbus/char/jsflash.c | 2 +- drivers/sbus/char/openprom.c | 2 +- drivers/scsi/3w-9xxx.c | 2 +- drivers/scsi/3w-sas.c | 2 +- drivers/scsi/3w-xxxx.c | 2 +- drivers/scsi/aacraid/aachba.c | 2 +- 
drivers/scsi/aacraid/commctrl.c | 2 +- drivers/scsi/arcmsr/arcmsr_hba.c | 2 +- drivers/scsi/bfa/bfad.c | 2 +- drivers/scsi/dpt_i2o.c | 2 +- drivers/scsi/gdth.c | 2 +- drivers/scsi/hptiop.c | 2 +- drivers/scsi/ips.h | 2 +- drivers/scsi/megaraid.c | 2 +- drivers/scsi/megaraid/megaraid_mm.h | 2 +- drivers/scsi/megaraid/megaraid_sas_base.c | 2 +- drivers/scsi/osst.c | 2 +- drivers/scsi/qla2xxx/qla_sup.c | 2 +- drivers/scsi/scsi_ioctl.c | 2 +- drivers/scsi/scsi_proc.c | 2 +- drivers/scsi/sd.c | 2 +- drivers/scsi/sr.c | 2 +- drivers/scsi/sr_ioctl.c | 2 +- drivers/scsi/st.c | 2 +- drivers/tty/amiserial.c | 2 +- drivers/tty/hvc/hvc_console.c | 2 +- drivers/tty/hvc/hvcs.c | 2 +- drivers/tty/hvc/hvsi.c | 2 +- drivers/tty/moxa.c | 2 +- drivers/tty/mxser.c | 2 +- drivers/tty/n_hdlc.c | 2 +- drivers/tty/n_r3964.c | 2 +- drivers/tty/serial/icom.c | 2 +- drivers/tty/serial/serial_core.c | 2 +- drivers/tty/synclink.c | 2 +- drivers/tty/synclink_gt.c | 2 +- drivers/tty/synclinkmp.c | 2 +- drivers/tty/tty_ioctl.c | 2 +- drivers/tty/vt/consolemap.c | 2 +- drivers/tty/vt/selection.c | 2 +- drivers/tty/vt/vc_screen.c | 2 +- drivers/tty/vt/vt_ioctl.c | 2 +- drivers/usb/atm/usbatm.c | 2 +- drivers/usb/core/hub.c | 2 +- drivers/usb/gadget/legacy/inode.c | 2 +- drivers/usb/host/uhci-hcd.c | 2 +- drivers/usb/misc/ftdi-elan.c | 2 +- drivers/usb/misc/idmouse.c | 2 +- drivers/usb/misc/ldusb.c | 2 +- drivers/usb/misc/legousbtower.c | 2 +- drivers/usb/mon/mon_bin.c | 2 +- drivers/usb/mon/mon_stat.c | 2 +- drivers/usb/mon/mon_text.c | 2 +- drivers/usb/musb/musb_debugfs.c | 2 +- drivers/video/console/newport_con.c | 2 +- drivers/video/fbdev/68328fb.c | 2 +- drivers/video/fbdev/hitfb.c | 2 +- drivers/video/fbdev/hpfb.c | 2 +- drivers/video/fbdev/mx3fb.c | 2 +- drivers/video/fbdev/q40fb.c | 2 +- drivers/video/fbdev/sm501fb.c | 2 +- drivers/video/fbdev/stifb.c | 2 +- drivers/video/fbdev/w100fb.c | 2 +- drivers/zorro/proc.c | 2 +- fs/9p/vfs_file.c | 2 +- fs/afs/proc.c | 2 +- fs/aio.c | 2 +- fs/anon_inodes.c | 2 +- fs/bfs/inode.c | 2 +- fs/binfmt_aout.c | 2 +- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/block_dev.c | 2 +- fs/cifs/cifs_debug.c | 2 +- fs/cifs/cifssmb.c | 2 +- fs/cifs/connect.c | 2 +- fs/cifs/transport.c | 2 +- fs/compat.c | 2 +- fs/compat_ioctl.c | 2 +- fs/configfs/file.c | 2 +- fs/coredump.c | 2 +- fs/dcache.c | 2 +- fs/dcookies.c | 2 +- fs/dlm/dlm_internal.h | 2 +- fs/efs/efs.h | 2 +- fs/eventpoll.c | 2 +- fs/exec.c | 2 +- fs/ext2/ioctl.c | 2 +- fs/ext2/super.c | 2 +- fs/ext4/extents.c | 2 +- fs/ext4/ioctl.c | 2 +- fs/ext4/super.c | 2 +- fs/fcntl.c | 2 +- fs/fhandle.c | 2 +- fs/filesystems.c | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/glock.c | 2 +- fs/gfs2/inode.c | 2 +- fs/gfs2/sys.c | 2 +- fs/gfs2/util.c | 2 +- fs/gfs2/xattr.c | 2 +- fs/hfs/hfs_fs.h | 2 +- fs/hfsplus/ioctl.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/jbd2/journal.c | 2 +- fs/jfs/ioctl.c | 2 +- fs/jfs/jfs_debug.c | 2 +- fs/jfs/super.c | 2 +- fs/libfs.c | 2 +- fs/locks.c | 2 +- fs/namei.c | 2 +- fs/ncpfs/dir.c | 2 +- fs/ncpfs/file.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/ncpfs/ioctl.c | 2 +- fs/ncpfs/mmap.c | 2 +- fs/ncpfs/ncplib_kernel.h | 2 +- fs/ncpfs/sock.c | 2 +- fs/ncpfs/symlink.c | 2 +- fs/nfs/direct.c | 2 +- fs/nfs/file.c | 2 +- fs/nfs/getroot.c | 2 +- fs/nfs/inode.c | 2 +- fs/nfs/super.c | 2 +- fs/nfs/write.c | 2 +- fs/nfsd/fault_inject.c | 2 +- fs/nfsd/vfs.c | 2 +- fs/ntfs/file.c | 2 +- fs/ocfs2/cluster/masklog.c | 2 +- fs/ocfs2/cluster/tcp.c | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 2 +- fs/ocfs2/stack_user.c | 2 +- fs/open.c | 2 +- 
fs/openpromfs/inode.c | 2 +- fs/pipe.c | 2 +- fs/proc/base.c | 2 +- fs/proc/generic.c | 2 +- fs/proc/inode.c | 2 +- fs/proc/kcore.c | 2 +- fs/proc/kmsg.c | 2 +- fs/proc/nommu.c | 2 +- fs/proc/page.c | 2 +- fs/proc/proc_net.c | 2 +- fs/proc/proc_tty.c | 2 +- fs/proc/root.c | 2 +- fs/proc/task_mmu.c | 2 +- fs/proc/vmcore.c | 2 +- fs/ramfs/file-nommu.c | 2 +- fs/ramfs/inode.c | 2 +- fs/read_write.c | 2 +- fs/readdir.c | 2 +- fs/select.c | 2 +- fs/seq_file.c | 2 +- fs/stat.c | 2 +- fs/ufs/inode.c | 2 +- fs/ufs/super.c | 2 +- fs/utimes.c | 2 +- fs/xattr.c | 2 +- fs/xfs/xfs_ioctl32.c | 2 +- fs/xfs/xfs_linux.h | 2 +- include/asm-generic/termios-base.h | 2 +- include/asm-generic/termios.h | 2 +- include/drm/drmP.h | 2 +- include/linux/isdnif.h | 2 +- include/linux/pagemap.h | 2 +- include/linux/poll.h | 2 +- include/net/checksum.h | 2 +- include/net/sctp/sctp.h | 2 +- include/rdma/ib_verbs.h | 2 +- init/init_task.c | 2 +- kernel/capability.c | 2 +- kernel/compat.c | 2 +- kernel/configs.c | 2 +- kernel/cpuset.c | 2 +- kernel/exit.c | 2 +- kernel/extable.c | 2 +- kernel/fork.c | 2 +- kernel/futex_compat.c | 2 +- kernel/groups.c | 2 +- kernel/kmod.c | 2 +- kernel/kprobes.c | 2 +- kernel/locking/lockdep_proc.c | 2 +- kernel/module.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/power/user.c | 2 +- kernel/printk/printk.c | 2 +- kernel/profile.c | 2 +- kernel/signal.c | 2 +- kernel/sys.c | 2 +- kernel/sysctl.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/itimer.c | 2 +- kernel/time/posix-cpu-timers.c | 2 +- kernel/time/posix-timers.c | 2 +- kernel/time/time.c | 2 +- kernel/time/timer.c | 2 +- kernel/time/timer_list.c | 2 +- kernel/time/timer_stats.c | 2 +- kernel/uid16.c | 2 +- lib/extable.c | 2 +- lib/kstrtox.c | 2 +- mm/memcontrol.c | 2 +- mm/memory.c | 2 +- mm/mempolicy.c | 2 +- mm/mincore.c | 2 +- mm/mmap.c | 2 +- mm/mprotect.c | 2 +- mm/nommu.c | 2 +- mm/shmem.c | 2 +- mm/util.c | 2 +- mm/vmalloc.c | 2 +- net/802/fc.c | 2 +- net/802/hippi.c | 2 +- net/8021q/vlan.c | 2 +- net/ax25/af_ax25.c | 2 +- net/ax25/ax25_addr.c | 2 +- net/ax25/ax25_dev.c | 2 +- net/ax25/ax25_ds_in.c | 2 +- net/ax25/ax25_ds_subr.c | 2 +- net/ax25/ax25_ds_timer.c | 2 +- net/ax25/ax25_iface.c | 2 +- net/ax25/ax25_in.c | 2 +- net/ax25/ax25_ip.c | 2 +- net/ax25/ax25_out.c | 2 +- net/ax25/ax25_route.c | 2 +- net/ax25/ax25_std_in.c | 2 +- net/ax25/ax25_std_subr.c | 2 +- net/ax25/ax25_std_timer.c | 2 +- net/ax25/ax25_subr.c | 2 +- net/ax25/ax25_timer.c | 2 +- net/ax25/ax25_uid.c | 2 +- net/bridge/br_device.c | 2 +- net/bridge/br_ioctl.c | 2 +- net/bridge/br_netfilter_hooks.c | 2 +- net/bridge/br_netfilter_ipv6.c | 2 +- net/bridge/netfilter/ebtables.c | 2 +- net/compat.c | 2 +- net/core/datagram.c | 2 +- net/core/dev.c | 2 +- net/core/filter.c | 2 +- net/core/gen_estimator.c | 2 +- net/core/rtnetlink.c | 2 +- net/core/scm.c | 2 +- net/core/skbuff.c | 2 +- net/core/sock.c | 2 +- net/core/utils.c | 2 +- net/decnet/dn_dev.c | 2 +- net/decnet/dn_fib.c | 2 +- net/decnet/dn_table.c | 2 +- net/decnet/sysctl_net_decnet.c | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/devinet.c | 2 +- net/ipv4/fib_frontend.c | 2 +- net/ipv4/fib_semantics.c | 2 +- net/ipv4/fib_trie.c | 2 +- net/ipv4/icmp.c | 2 +- net/ipv4/igmp.c | 2 +- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_options.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/ipconfig.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv4/netfilter/arp_tables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/tcp.c | 2 +- 
net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/datagram.c | 2 +- net/ipv6/icmp.c | 2 +- net/ipv6/ip6_flowlabel.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6mr.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- net/ipv6/route.c | 2 +- net/ipv6/sit.c | 2 +- net/ipv6/udp.c | 2 +- net/ipx/af_ipx.c | 2 +- net/irda/af_irda.c | 2 +- net/irda/ircomm/ircomm_tty.c | 2 +- net/irda/ircomm/ircomm_tty_ioctl.c | 2 +- net/irda/irda_device.c | 2 +- net/irda/irnet/irnet.h | 2 +- net/lapb/lapb_iface.c | 2 +- net/lapb/lapb_in.c | 2 +- net/lapb/lapb_out.c | 2 +- net/lapb/lapb_subr.c | 2 +- net/lapb/lapb_timer.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- net/netfilter/nfnetlink.c | 2 +- net/netlink/af_netlink.c | 2 +- net/packet/af_packet.c | 2 +- net/rose/af_rose.c | 2 +- net/rose/rose_route.c | 2 +- net/sctp/ipv6.c | 2 +- net/socket.c | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/cache.c | 2 +- net/sunrpc/svcsock.c | 2 +- net/sunrpc/sysctl.c | 2 +- net/unix/af_unix.c | 2 +- net/x25/af_x25.c | 2 +- net/x25/x25_link.c | 2 +- net/xfrm/xfrm_state.c | 2 +- net/xfrm/xfrm_user.c | 2 +- security/keys/keyctl.c | 2 +- security/keys/process_keys.c | 2 +- security/keys/request_key_auth.c | 2 +- security/keys/user_defined.c | 2 +- sound/oss/dmasound/dmasound_atari.c | 2 +- sound/oss/dmasound/dmasound_core.c | 2 +- sound/oss/dmasound/dmasound_paula.c | 2 +- sound/oss/dmasound/dmasound_q40.c | 2 +- sound/oss/msnd.c | 2 +- sound/oss/os.h | 2 +- sound/oss/swarm_cs4297a.c | 2 +-
1088 files changed, 1088 insertions(+), 1088 deletions(-) (limited to 'kernel')

diff --git a/arch/alpha/boot/misc.c b/arch/alpha/boot/misc.c
index 3ff9a957a25c..1b568ed74f95 100644
--- a/arch/alpha/boot/misc.c
+++ b/arch/alpha/boot/misc.c
@@ -21,7 +21,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #define memzero(s,n) memset ((s),0,(n))
 #define puts srm_printk
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index 2d6efcff3bf3..2f26ae74b61a 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -26,7 +26,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 volatile unsigned long irq_err_count;
 DEFINE_PER_CPU(unsigned long, irq_pmi_count);
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 56e427c7aa3c..54d8616644e2 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -39,7 +39,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index b483156698d5..bca963a4aa48 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -31,7 +31,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index 04abdec7f496..bc4d2cdcf21d 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -16,7 +16,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 4811e54069fc..491e6a604e82 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -53,7 +53,7 @@ static struct notifier_block alpha_panic_block = {
 	INT_MAX /* try to do it first */
 };
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 8dbfb15f1745..17308f925306 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -22,7 +22,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
#include diff --git a/arch/alpha/kernel/srm_env.c b/arch/alpha/kernel/srm_env.c index ffe996a54fad..705ae12acd15 100644 --- a/arch/alpha/kernel/srm_env.c +++ b/arch/alpha/kernel/srm_env.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #define BASE_DIR "srm_environment" /* Subdir in /proc/ */ diff --git a/arch/alpha/kernel/srmcons.c b/arch/alpha/kernel/srmcons.c index 72b59511e59a..e9c45b65a905 100644 --- a/arch/alpha/kernel/srmcons.c +++ b/arch/alpha/kernel/srmcons.c @@ -18,7 +18,7 @@ #include #include -#include +#include static DEFINE_SPINLOCK(srmcons_callback_lock); diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 5b6202a825ff..992000e3d9e4 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 74aceead06e9..3328af7c2776 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index b4ff3b683bcd..5dfb7975895f 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -11,7 +11,7 @@ #include #include -#include +#include #define ldq_u(x,y) \ diff --git a/arch/alpha/math-emu/math.c b/arch/alpha/math-emu/math.c index 58c2669a1dd4..fa5ae0ad8983 100644 --- a/arch/alpha/math-emu/math.c +++ b/arch/alpha/math-emu/math.c @@ -3,7 +3,7 @@ #include #include -#include +#include #include "sfp-util.h" #include diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index a1bea91df56a..0542e973c73d 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/common/bL_switcher_dummy_if.c b/arch/arm/common/bL_switcher_dummy_if.c index 6053f64c3752..4c10c6452678 100644 --- a/arch/arm/common/bL_switcher_dummy_if.c +++ b/arch/arm/common/bL_switcher_dummy_if.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include static ssize_t bL_switcher_write(struct file *file, const char __user *buf, diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c index c3fe769d7558..853221f81104 100644 --- a/arch/arm/kernel/swp_emulate.c +++ b/arch/arm/kernel/swp_emulate.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include /* * Error-checking SWP macros implemented using ldrex{b}/strex{b} diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 8f92efa8460e..11676787ad49 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -33,7 +33,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" -#include +#include #include #include #include diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 9aca92074f85..fa6182a40941 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-iop13xx/irq.c b/arch/arm/mach-iop13xx/irq.c index c702cc4092de..bd9b43c8004e 100644 --- a/arch/arm/mach-iop13xx/irq.c +++ b/arch/arm/mach-iop13xx/irq.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 26874f608ca9..0f08f611c1a6 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -34,7 +34,7 @@ #include 
#include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-rpc/dma.c b/arch/arm/mach-rpc/dma.c index 6d3517dc4772..fb48f3141fb4 100644 --- a/arch/arm/mach-rpc/dma.c +++ b/arch/arm/mach-rpc/dma.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/plat-iop/time.c b/arch/arm/plat-iop/time.c index 101e8f2c7abe..ed8d129d4bea 100644 --- a/arch/arm/plat-iop/time.c +++ b/arch/arm/plat-iop/time.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h index 2b79b8a89457..b0d708ff7f4e 100644 --- a/arch/arm64/include/asm/word-at-a-time.h +++ b/arch/arm64/include/asm/word-at-a-time.h @@ -16,7 +16,7 @@ #ifndef __ASM_WORD_AT_A_TIME_H #define __ASM_WORD_AT_A_TIME_H -#include +#include #ifndef __AARCH64EB__ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 04de188a36c9..fde04f029ec3 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #define CREATE_TRACE_POINTS diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 4f0d76339414..a7504f40d7ee 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 1decd2b2c730..f0593c92279b 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index b7063de792f7..c747a0fc5d7d 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include struct compat_sigcontext { diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 3f9e15722473..b37446a8ffdb 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index d7150e30438a..add4a1334085 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,7 +17,7 @@ */ #include -#include +#include .text diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index cfe13396085b..fd6cd05593f9 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -17,7 +17,7 @@ #include #include -#include +#include /* * Copy from user space to a kernel buffer (alignment handled by the hardware) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 718b1c4e2f85..d828540ded6f 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -19,7 +19,7 @@ #include #include -#include +#include /* * Copy from user space to user space (alignment handled by the hardware) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index e99e31c9acac..3e6ae2663b82 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -17,7 +17,7 @@ #include #include -#include +#include /* * Copy to user space from a kernel buffer (alignment handled by the hardware) diff --git a/arch/arm64/mm/cache.S 
b/arch/arm64/mm/cache.S index da9576932322..17f422a4dc55 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -23,7 +23,7 @@ #include #include #include -#include +#include /* * flush_icache_range(start,end) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index b41aff25426d..47cf3f9d89ff 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,7 +49,7 @@ #include #include -#include +#include #include diff --git a/arch/avr32/kernel/avr32_ksyms.c b/arch/avr32/kernel/avr32_ksyms.c index 7c6cf14f0985..0d05fd095468 100644 --- a/arch/avr32/kernel/avr32_ksyms.c +++ b/arch/avr32/kernel/avr32_ksyms.c @@ -12,7 +12,7 @@ #include #include -#include +#include /* * GCC functions diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c index 4aedcab7cd4b..a89b893279bb 100644 --- a/arch/avr32/kernel/ptrace.c +++ b/arch/avr32/kernel/ptrace.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 8f1c63b9b983..b5fcc4914fe4 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include diff --git a/arch/avr32/mm/cache.c b/arch/avr32/mm/cache.c index 85d635cd7b28..d9476825fc43 100644 --- a/arch/avr32/mm/cache.c +++ b/arch/avr32/mm/cache.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/blackfin/kernel/bfin_dma.c b/arch/blackfin/kernel/bfin_dma.c index 4a32f2dd5ddc..9d3eb0cf8ccc 100644 --- a/arch/blackfin/kernel/bfin_dma.c +++ b/arch/blackfin/kernel/bfin_dma.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/blackfin/kernel/kgdb_test.c b/arch/blackfin/kernel/kgdb_test.c index 18ab004aea1c..b8b785dc4e3b 100644 --- a/arch/blackfin/kernel/kgdb_test.c +++ b/arch/blackfin/kernel/kgdb_test.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c index 4489efc52883..0188c933b155 100644 --- a/arch/blackfin/kernel/module.c +++ b/arch/blackfin/kernel/module.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include /* Transfer the section to the L1 memory */ int diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 63f5560d6eb2..4cc72b0d1c1d 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -18,7 +18,7 @@ #include #include -#include +#include /* * ZERO_PAGE is a special page that is used for zero-initialized diff --git a/arch/cris/arch-v10/drivers/eeprom.c b/arch/cris/arch-v10/drivers/eeprom.c index c903a9e53a47..33558d270a53 100644 --- a/arch/cris/arch-v10/drivers/eeprom.c +++ b/arch/cris/arch-v10/drivers/eeprom.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include "i2c.h" #define D(x) diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c index 0f3983241e60..9ac75d68f184 100644 --- a/arch/cris/arch-v10/drivers/sync_serial.c +++ b/arch/cris/arch-v10/drivers/sync_serial.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/cris/arch-v10/kernel/ptrace.c b/arch/cris/arch-v10/kernel/ptrace.c index bfddfb99401f..eca94c7d56e7 100644 --- a/arch/cris/arch-v10/kernel/ptrace.c +++ b/arch/cris/arch-v10/kernel/ptrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v10/kernel/signal.c 
b/arch/cris/arch-v10/kernel/signal.c index 7122d9773b13..db30c98e4926 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #define DEBUG_SIG 0 diff --git a/arch/cris/arch-v10/kernel/traps.c b/arch/cris/arch-v10/kernel/traps.c index 7001beda716c..96d004fe9740 100644 --- a/arch/cris/arch-v10/kernel/traps.c +++ b/arch/cris/arch-v10/kernel/traps.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include diff --git a/arch/cris/arch-v10/lib/usercopy.c b/arch/cris/arch-v10/lib/usercopy.c index b964c667aced..1ba7cc000dfc 100644 --- a/arch/cris/arch-v10/lib/usercopy.c +++ b/arch/cris/arch-v10/lib/usercopy.c @@ -8,7 +8,7 @@ * Pieces used from memcpy, originally by Kenny Ranerup long time ago. */ -#include +#include /* Asm:s have been tweaked (within the domain of correctness) to give satisfactory results for "gcc version 2.96 20000427 (experimental)". diff --git a/arch/cris/arch-v10/mm/fault.c b/arch/cris/arch-v10/mm/fault.c index ed60588f8467..75210cbe61ce 100644 --- a/arch/cris/arch-v10/mm/fault.c +++ b/arch/cris/arch-v10/mm/fault.c @@ -11,7 +11,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index 0068fd411a84..ae6903d7fdbe 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c index fe1f9cf7b391..c366bc05466a 100644 --- a/arch/cris/arch-v32/kernel/ptrace.c +++ b/arch/cris/arch-v32/kernel/ptrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 150d1d76c29d..816bf2ca93ef 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include extern unsigned long cris_signal_return_page; diff --git a/arch/cris/arch-v32/kernel/traps.c b/arch/cris/arch-v32/kernel/traps.c index 8bbe09c93132..d79666aefd71 100644 --- a/arch/cris/arch-v32/kernel/traps.c +++ b/arch/cris/arch-v32/kernel/traps.c @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/lib/usercopy.c b/arch/cris/arch-v32/lib/usercopy.c index f0f335d8aa79..05e58dab800d 100644 --- a/arch/cris/arch-v32/lib/usercopy.c +++ b/arch/cris/arch-v32/lib/usercopy.c @@ -8,7 +8,7 @@ * Pieces used from memcpy, originally by Kenny Ranerup long time ago. */ -#include +#include /* Asm:s have been tweaked (within the domain of correctness) to give satisfactory results for "gcc version 3.2.1 Axis release R53/1.53-v32". 
diff --git a/arch/cris/kernel/crisksyms.c b/arch/cris/kernel/crisksyms.c index 31b4bd288cad..3166d1cf2f84 100644 --- a/arch/cris/kernel/crisksyms.c +++ b/arch/cris/kernel/crisksyms.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index b78498eb079b..50a7dd451456 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/kernel/profile.c b/arch/cris/kernel/profile.c index cd9f15b92f8f..ad56b37f8e11 100644 --- a/arch/cris/kernel/profile.c +++ b/arch/cris/kernel/profile.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #define SAMPLE_BUFFER_SIZE 8192 diff --git a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c index fd3427e563c5..806b764059d5 100644 --- a/arch/cris/kernel/ptrace.c +++ b/arch/cris/kernel/ptrace.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/kernel/sys_cris.c b/arch/cris/kernel/sys_cris.c index 7aa036ec78ff..8febb032fdd7 100644 --- a/arch/cris/kernel/sys_cris.c +++ b/arch/cris/kernel/sys_cris.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include asmlinkage long diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c index da4c72401e27..b2a312a7afc6 100644 --- a/arch/cris/kernel/traps.c +++ b/arch/cris/kernel/traps.c @@ -20,7 +20,7 @@ #endif #include -#include +#include #include extern void arch_enable_nmi(void); diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 4bea27f50a7a..2e1da71e27a4 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -5,7 +5,7 @@ #include #include -#include +#include extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c index 2239346fa3db..93513e4ccd2b 100644 --- a/arch/frv/kernel/irq.c +++ b/arch/frv/kernel/irq.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/frv/kernel/pm-mb93093.c b/arch/frv/kernel/pm-mb93093.c index eaa7b582ef52..8358e34a3fad 100644 --- a/arch/frv/kernel/pm-mb93093.c +++ b/arch/frv/kernel/pm-mb93093.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c index ac767d94a880..051ccecbf7f1 100644 --- a/arch/frv/kernel/pm.c +++ b/arch/frv/kernel/pm.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 5d40aeb7712e..b306241c4ef2 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c index 3987ff88dab0..49768401ce0f 100644 --- a/arch/frv/kernel/ptrace.c +++ b/arch/frv/kernel/ptrace.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 82d5e914dc15..bf6e07a7a1b1 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #define DEBUG_SIG 0 diff --git a/arch/frv/kernel/sys_frv.c b/arch/frv/kernel/sys_frv.c index 9c4980825bbb..f80cc8b9bd45 100644 --- a/arch/frv/kernel/sys_frv.c +++ b/arch/frv/kernel/sys_frv.c @@ -25,7 
+25,7 @@ #include #include -#include +#include asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, diff --git a/arch/frv/kernel/sysctl.c b/arch/frv/kernel/sysctl.c index f4dfae2c75ad..b54a64971cf1 100644 --- a/arch/frv/kernel/sysctl.c +++ b/arch/frv/kernel/sysctl.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include static const char frv_cache_wback[] = "wback"; static const char frv_cache_wthru[] = "wthru"; diff --git a/arch/frv/kernel/traps.c b/arch/frv/kernel/traps.c index a6d105d61b26..31221fb4348e 100644 --- a/arch/frv/kernel/traps.c +++ b/arch/frv/kernel/traps.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/frv/kernel/uaccess.c b/arch/frv/kernel/uaccess.c index 374f88d6cc00..8b360b4222a5 100644 --- a/arch/frv/kernel/uaccess.c +++ b/arch/frv/kernel/uaccess.c @@ -11,7 +11,7 @@ #include #include -#include +#include /*****************************************************************************/ /* diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c index 7a73aaeae3ac..e701aa9e6a14 100644 --- a/arch/frv/mm/dma-alloc.c +++ b/arch/frv/mm/dma-alloc.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include static int map_page(unsigned long va, unsigned long pa, pgprot_t prot) diff --git a/arch/frv/mm/extable.c b/arch/frv/mm/extable.c index 8863d6c1df6e..9a641c1b085a 100644 --- a/arch/frv/mm/extable.c +++ b/arch/frv/mm/extable.c @@ -4,7 +4,7 @@ #include #include -#include +#include extern const void __memset_end, __memset_user_error_lr, __memset_user_error_handler; extern const void __memcpy_end, __memcpy_user_error_lr, __memcpy_user_error_handler; diff --git a/arch/h8300/boot/compressed/misc.c b/arch/h8300/boot/compressed/misc.c index 9f64fe8f29ff..a947dbb4fd91 100644 --- a/arch/h8300/boot/compressed/misc.c +++ b/arch/h8300/boot/compressed/misc.c @@ -9,7 +9,7 @@ * Adapted for h8300 by Yoshinori Sato 2006 */ -#include +#include /* * gzip declarations diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index dee41256922c..891974a11704 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 7138303cbbf2..d784f7117f9a 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -41,7 +41,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/hexagon/kernel/hexagon_ksyms.c b/arch/hexagon/kernel/hexagon_ksyms.c index c041d8ecb1e2..af9dec4c28eb 100644 --- a/arch/hexagon/kernel/hexagon_ksyms.c +++ b/arch/hexagon/kernel/hexagon_ksyms.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include /* Additional functions */ EXPORT_SYMBOL(__clear_user_hexagon); diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index b039a624c170..c6b22b9945a7 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/hexagon/mm/uaccess.c b/arch/hexagon/mm/uaccess.c index 34127261c2b7..ec90afdb3ad0 100644 --- a/arch/hexagon/mm/uaccess.c +++ b/arch/hexagon/mm/uaccess.c @@ -23,7 +23,7 @@ * we implement here as subroutines. 
*/ #include -#include +#include #include /* diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index bd7c251e2bce..de863d6d802b 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c index 0b286ca164f9..8682df6263d6 100644 --- a/arch/ia64/kernel/brl_emu.c +++ b/arch/ia64/kernel/brl_emu.c @@ -9,7 +9,7 @@ #include #include -#include +#include #include extern char ia64_set_b1, ia64_set_b2, ia64_set_b3, ia64_set_b4, ia64_set_b5; diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c index c8c9298666fb..9c12b794e774 100644 --- a/arch/ia64/kernel/crash_dump.c +++ b/arch/ia64/kernel/crash_dump.c @@ -11,7 +11,7 @@ #include #include -#include +#include /** * copy_oldmem_page - copy one page from "oldmem" diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index 0eaa89f3defd..fa8ee64adac2 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include static struct signal_struct init_signals = INIT_SIGNALS(init_signals); diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index de4fc00dea98..2ff1df7b14ea 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -17,7 +17,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index c7c51445c3be..45ff27e9edbb 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -33,7 +33,7 @@ #include #include -#include +#include extern void jprobe_inst_return(void); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 2436ad5f92c1..677a86826771 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_PERFMON diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index aae6c4dc7ae7..52deab683ba1 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 36f660da8124..0b1153e610ea 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_PERFMON #include diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index aaf74f36cfa1..d194d5c83d32 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -48,7 +48,7 @@ #include #include -#include +#include MODULE_AUTHOR("Jesse Barnes "); MODULE_DESCRIPTION("/proc interface to IA-64 SAL features"); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index b3a124da71e5..5db52c6813c4 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 41e33f84c185..a09c12230bc5 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -18,7 +18,7 @@ #include #include -#include +#include unsigned long arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len, diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 77edd68c5161..095bfaff82d0 100644 --- 
a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include fpswa_interface_t *fpswa_interface; diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index 7f0d31656b4d..9cd01c2200ee 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include extern int die_if_kernel(char *str, struct pt_regs *regs, long err); diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c index 8f66195999e7..9704e2cd9878 100644 --- a/arch/ia64/kernel/unwind.c +++ b/arch/ia64/kernel/unwind.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include "entry.h" #include "unwind_i.h" diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c index 118daf5a0632..42f7678ef6ad 100644 --- a/arch/ia64/lib/csum_partial_copy.c +++ b/arch/ia64/lib/csum_partial_copy.c @@ -11,7 +11,7 @@ #include #include -#include +#include /* * XXX Fixme: those 2 inlines are meant for debugging and will go away diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c index 8f70bb2d0c37..4edb816aba9a 100644 --- a/arch/ia64/mm/extable.c +++ b/arch/ia64/mm/extable.c @@ -5,7 +5,7 @@ * David Mosberger-Tang */ -#include +#include void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 1841ef69183d..bb4610faca84 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index b9992571c036..4c3b84d8406a 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c index 7aab87f48060..29cf8f8c08e9 100644 --- a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c +++ b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c @@ -9,7 +9,7 @@ #ifdef CONFIG_PROC_FS #include #include -#include +#include #include static int partition_id_show(struct seq_file *s, void *p) diff --git a/arch/ia64/sn/kernel/tiocx.c b/arch/ia64/sn/kernel/tiocx.c index e35f6485c1fd..32d0380eb72e 100644 --- a/arch/ia64/sn/kernel/tiocx.c +++ b/arch/ia64/sn/kernel/tiocx.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/align.c b/arch/m32r/kernel/align.c index ab871ccd33f8..ec51e5b34860 100644 --- a/arch/m32r/kernel/align.c +++ b/arch/m32r/kernel/align.c @@ -5,7 +5,7 @@ */ #include -#include +#include static int get_reg(struct pt_regs *regs, int nr) { diff --git a/arch/m32r/kernel/irq.c b/arch/m32r/kernel/irq.c index c7272b894283..5537f7397297 100644 --- a/arch/m32r/kernel/irq.c +++ b/arch/m32r/kernel/irq.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include /* * do_IRQ handles all normal device IRQs (the special diff --git a/arch/m32r/kernel/m32r_ksyms.c b/arch/m32r/kernel/m32r_ksyms.c index 23f26f4adfff..d763f0bd2106 100644 --- a/arch/m32r/kernel/m32r_ksyms.c +++ b/arch/m32r/kernel/m32r_ksyms.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index a88b1f01e91f..e0568bee60c0 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c 
@@ -29,7 +29,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index c145605a981f..a68acb9fa515 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 1c81e24fd006..1ed597041fba 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #define DEBUG_SIG 0 diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c index c3fdd632fba7..f34957032504 100644 --- a/arch/m32r/kernel/sys_m32r.c +++ b/arch/m32r/kernel/sys_m32r.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index a7a424f852e4..c3c5fdfae920 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/arch/m32r/lib/csum_partial_copy.c b/arch/m32r/lib/csum_partial_copy.c index 5596f3df833f..b3cd59c12b8e 100644 --- a/arch/m32r/lib/csum_partial_copy.c +++ b/arch/m32r/lib/csum_partial_copy.c @@ -22,7 +22,7 @@ #include #include -#include +#include /* * Copy while checksumming, otherwise like csum_partial diff --git a/arch/m32r/lib/usercopy.c b/arch/m32r/lib/usercopy.c index 82abd159dbef..fd03f2731f20 100644 --- a/arch/m32r/lib/usercopy.c +++ b/arch/m32r/lib/usercopy.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include unsigned long __generic_copy_to_user(void __user *to, const void *from, unsigned long n) diff --git a/arch/m32r/mm/extable.c b/arch/m32r/mm/extable.c index 1743f23d49a3..40ccf80d29cf 100644 --- a/arch/m32r/mm/extable.c +++ b/arch/m32r/mm/extable.c @@ -3,7 +3,7 @@ */ #include -#include +#include int fixup_exception(struct pt_regs *regs) { diff --git a/arch/m32r/mm/fault-nommu.c b/arch/m32r/mm/fault-nommu.c index 80f18cc6f547..e22d5ddae5cb 100644 --- a/arch/m32r/mm/fault-nommu.c +++ b/arch/m32r/mm/fault-nommu.c @@ -22,7 +22,7 @@ #include /* For unblank_screen() */ #include -#include +#include #include #include #include diff --git a/arch/m68k/bvme6000/rtc.c b/arch/m68k/bvme6000/rtc.c index f7984f44ff0f..d53c9b301f84 100644 --- a/arch/m68k/bvme6000/rtc.c +++ b/arch/m68k/bvme6000/rtc.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include /* diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 4ba1ae7345c3..aaf28f8e342d 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c index 1bc10e62b9af..9cd86d7343a6 100644 --- a/arch/m68k/kernel/ptrace.c +++ b/arch/m68k/kernel/ptrace.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 58507edbdf1d..8ead291a902a 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -46,7 +46,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 9aa01adb407f..98a2daaae30c 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include diff --git 
a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 6c9ca24830e9..558f38402737 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/lib/uaccess.c b/arch/m68k/lib/uaccess.c index 35d1442dee89..a76b73abaf64 100644 --- a/arch/m68k/lib/uaccess.c +++ b/arch/m68k/lib/uaccess.c @@ -5,7 +5,7 @@ */ #include -#include +#include unsigned long __generic_copy_from_user(void *to, const void __user *from, unsigned long n) diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 0fb54a90eac2..c6d351f5bd79 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index b09a3cb29b68..9c1e656b1f8f 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 8f37fdd80be9..7cb72dbc2eaa 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index 269f81158a33..b5b7d53f7283 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/mvme16x/rtc.c b/arch/m68k/mvme16x/rtc.c index 1cdc73268188..8f00847a0e4b 100644 --- a/arch/m68k/mvme16x/rtc.c +++ b/arch/m68k/mvme16x/rtc.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include /* diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c index 3f258e230ba5..0f95134e9b85 100644 --- a/arch/m68k/sun3/mmu_emu.c +++ b/arch/m68k/sun3/mmu_emu.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/metag/kernel/irq.c b/arch/metag/kernel/irq.c index 3074b64793e6..c9939604a38f 100644 --- a/arch/metag/kernel/irq.c +++ b/arch/metag/kernel/irq.c @@ -13,7 +13,7 @@ #include #include -#include +#include #ifdef CONFIG_4KSTACKS union irq_ctx { diff --git a/arch/mips/alchemy/common/power.c b/arch/mips/alchemy/common/power.c index 921ed30b440c..303257b697c2 100644 --- a/arch/mips/alchemy/common/power.c +++ b/arch/mips/alchemy/common/power.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include /* diff --git a/arch/mips/dec/kn01-berr.c b/arch/mips/dec/kn01-berr.c index 44d8a87a8a68..e9d2db480aeb 100644 --- a/arch/mips/dec/kn01-berr.c +++ b/arch/mips/dec/kn01-berr.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h index bce1ce53149a..7749daf2a465 100644 --- a/arch/mips/include/asm/checksum.h +++ b/arch/mips/include/asm/checksum.h @@ -18,7 +18,7 @@ #include -#include +#include /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/mips/include/asm/compat-signal.h b/arch/mips/include/asm/compat-signal.h index 64e0b9343b8c..4c6176467146 100644 --- a/arch/mips/include/asm/compat-signal.h +++ b/arch/mips/include/asm/compat-signal.h @@ -8,7 +8,7 @@ #include #include -#include +#include static inline int __copy_conv_sigset_to_user(compat_sigset_t __user *d, const sigset_t *s) diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h index 667ca3c467b7..b42b513007a2 100644 --- 
a/arch/mips/include/asm/r4kcache.h +++ b/arch/mips/include/asm/r4kcache.h @@ -20,7 +20,7 @@ #include #include #include -#include /* for segment_eq() */ +#include /* for segment_eq() */ extern void (*r4k_blast_dcache)(void); extern void (*r4k_blast_icache)(void); diff --git a/arch/mips/include/asm/termios.h b/arch/mips/include/asm/termios.h index 6245b68a69a8..ce2d72e34274 100644 --- a/arch/mips/include/asm/termios.h +++ b/arch/mips/include/asm/termios.h @@ -9,7 +9,7 @@ #ifndef _ASM_TERMIOS_H #define _ASM_TERMIOS_H -#include +#include #include /* diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index db6f5afff4ff..1900f39588ae 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index 12c718181e5e..ae037a304ee4 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* * Calculate and return exception PC in case of branch delay slot diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index dd3175442c9e..07718bb5fc9d 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include /* Hardware capabilities */ unsigned int elf_hwcap __read_mostly; diff --git a/arch/mips/kernel/crash_dump.c b/arch/mips/kernel/crash_dump.c index 6fe7790e5868..77ee99a2d0aa 100644 --- a/arch/mips/kernel/crash_dump.c +++ b/arch/mips/kernel/crash_dump.c @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include static void *kdump_buf_page; diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c index f25f7eab7307..f8f5836eb3c1 100644 --- a/arch/mips/kernel/irq.c +++ b/arch/mips/kernel/irq.c @@ -23,7 +23,7 @@ #include #include -#include +#include /* * 'what should we do if we get a hw irq event on an illegal vector'. 
diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c index de63d36af895..1f4bd222ba76 100644 --- a/arch/mips/kernel/kgdb.c +++ b/arch/mips/kernel/kgdb.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include static struct hard_trap_info { unsigned char tt; /* Trap type code for MIPS R3xxx and R4xxx */ diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 50fb62544df7..0352f742d077 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 789d7bf4fef3..a12904ea9f65 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include /* * CPU mask used to set process affinity for MT VPEs/TCs with FPUs diff --git a/arch/mips/kernel/mips-r2-to-r6-emul.c b/arch/mips/kernel/mips-r2-to-r6-emul.c index bd09853aecdf..ef2ca28a028b 100644 --- a/arch/mips/kernel/mips-r2-to-r6-emul.c +++ b/arch/mips/kernel/mips-r2-to-r6-emul.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #ifdef CONFIG_64BIT #define ADDIU "daddiu " diff --git a/arch/mips/kernel/mips_ksyms.c b/arch/mips/kernel/mips_ksyms.c index e2b6ab74643d..93aeec705a6e 100644 --- a/arch/mips/kernel/mips_ksyms.c +++ b/arch/mips/kernel/mips_ksyms.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 9514e5f2209f..5142b1dfe8a7 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index a92994d60e91..c8ba26072132 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 5fcbdcd7abd0..4f0998525626 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 97b7c51b8251..84165f2b31ff 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include "signal-common.h" diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index a7bc38430500..b672cebb4a1a 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 53a7ef9a8f32..833f82210528 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 3905003dfe2b..6c7f9d7e92b3 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -61,7 +61,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index f1c308dbbc4a..7ed98354fe9d 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -89,7 +89,7 @@ 
#include #include #include -#include +#include #define STR(x) __STR(x) #define __STR(x) #x diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index f8b7bf836437..a298ac93edcc 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/math-emu/dsemul.c b/arch/mips/math-emu/dsemul.c index 4a094f7acb3d..c4469ff4a996 100644 --- a/arch/mips/math-emu/dsemul.c +++ b/arch/mips/math-emu/dsemul.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include /** * struct emuframe - The 'emulation' frame structure diff --git a/arch/mips/mm/extable.c b/arch/mips/mm/extable.c index e474fa2efed4..81bc8a34a83f 100644 --- a/arch/mips/mm/extable.c +++ b/arch/mips/mm/extable.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include int fixup_exception(struct pt_regs *regs) { diff --git a/arch/mips/mm/sc-debugfs.c b/arch/mips/mm/sc-debugfs.c index 01f1154cdb0c..7e945e310b44 100644 --- a/arch/mips/mm/sc-debugfs.c +++ b/arch/mips/mm/sc-debugfs.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/oprofile/op_model_loongson3.c b/arch/mips/oprofile/op_model_loongson3.c index 85f3ee4ab456..40660392006f 100644 --- a/arch/mips/oprofile/op_model_loongson3.c +++ b/arch/mips/oprofile/op_model_loongson3.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include "op_impl.h" diff --git a/arch/mips/sgi-ip22/ip28-berr.c b/arch/mips/sgi-ip22/ip28-berr.c index 712cc0f6a58d..9960a8302eac 100644 --- a/arch/mips/sgi-ip22/ip28-berr.c +++ b/arch/mips/sgi-ip22/ip28-berr.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include static unsigned int count_be_is_fixup; diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c index 692778da9e76..2e0edb385656 100644 --- a/arch/mips/sgi-ip27/ip27-berr.c +++ b/arch/mips/sgi-ip27/ip27-berr.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include static void dump_hub_information(unsigned long errst0, unsigned long errst1) { diff --git a/arch/mips/sgi-ip32/ip32-berr.c b/arch/mips/sgi-ip32/ip32-berr.c index afc1cadbba37..ba8f46d80ab8 100644 --- a/arch/mips/sgi-ip32/ip32-berr.c +++ b/arch/mips/sgi-ip32/ip32-berr.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/sibyte/common/sb_tbprof.c b/arch/mips/sibyte/common/sb_tbprof.c index 059e28c8fd97..99c720be72d2 100644 --- a/arch/mips/sibyte/common/sb_tbprof.c +++ b/arch/mips/sibyte/common/sb_tbprof.c @@ -54,7 +54,7 @@ #define K_INT_PERF_CNT K_BCM1480_INT_PERF_CNT #endif -#include +#include #define SBPROF_TB_MAJOR 240 diff --git a/arch/mn10300/kernel/fpu.c b/arch/mn10300/kernel/fpu.c index 064fa194de2b..2578b7ae7dd5 100644 --- a/arch/mn10300/kernel/fpu.c +++ b/arch/mn10300/kernel/fpu.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ -#include +#include #include #include #include diff --git a/arch/mn10300/kernel/mn10300_ksyms.c b/arch/mn10300/kernel/mn10300_ksyms.c index f9eb9753a404..ec6c4f8f93a6 100644 --- a/arch/mn10300/kernel/mn10300_ksyms.c +++ b/arch/mn10300/kernel/mn10300_ksyms.c @@ -9,7 +9,7 @@ * 2 of the Licence, or (at your option) any later version. 
*/ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index cbede4e88dee..e5def2217f72 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/kernel/ptrace.c b/arch/mn10300/kernel/ptrace.c index 5bd58514e739..976020f469c1 100644 --- a/arch/mn10300/kernel/ptrace.c +++ b/arch/mn10300/kernel/ptrace.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/kernel/setup.c b/arch/mn10300/kernel/setup.c index 2ad7f32fa122..1b3d80d8a171 100644 --- a/arch/mn10300/kernel/setup.c +++ b/arch/mn10300/kernel/setup.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index cd8cb1d1176b..2f3cb5734235 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "sigframe.h" diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c index 815f1355fad4..f999981e55c0 100644 --- a/arch/mn10300/kernel/sys_mn10300.c +++ b/arch/mn10300/kernel/sys_mn10300.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> asmlinkage long old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, diff --git a/arch/mn10300/lib/checksum.c b/arch/mn10300/lib/checksum.c index b6580f5d89ee..0f569151ef11 100644 --- a/arch/mn10300/lib/checksum.c +++ b/arch/mn10300/lib/checksum.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "internal.h" diff --git a/arch/mn10300/mm/cache-smp.c b/arch/mn10300/mm/cache-smp.c index 2d23b9eeee62..e80996064d3d 100644 --- a/arch/mn10300/mm/cache-smp.c +++ b/arch/mn10300/mm/cache-smp.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "cache-smp.h" diff --git a/arch/mn10300/mm/cache.c b/arch/mn10300/mm/cache.c index 0a1f0aa92ebc..0b925cce2b83 100644 --- a/arch/mn10300/mm/cache.c +++ b/arch/mn10300/mm/cache.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "cache-smp.h" diff --git a/arch/mn10300/mm/extable.c b/arch/mn10300/mm/extable.c index 25e5485ab87d..305de461cb8f 100644 --- a/arch/mn10300/mm/extable.c +++ b/arch/mn10300/mm/extable.c @@ -10,7 +10,7 @@ */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fixup_exception(struct pt_regs *regs) { diff --git a/arch/mn10300/mm/init.c b/arch/mn10300/mm/init.c index 97a1ec0beeec..8ce677d5575e 100644 --- a/arch/mn10300/mm/init.c +++ b/arch/mn10300/mm/init.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/mm/misalignment.c b/arch/mn10300/mm/misalignment.c index b9920b1edd5a..31d04da85743 100644 --- a/arch/mn10300/mm/misalignment.c +++ b/arch/mn10300/mm/misalignment.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/proc-mn2ws0050/proc-init.c b/arch/mn10300/proc-mn2ws0050/proc-init.c index 950cc8dbb284..25b1b453c515 100644 --- a/arch/mn10300/proc-mn2ws0050/proc-init.c +++ b/arch/mn10300/proc-mn2ws0050/proc-init.c @@ -16,7 +16,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 81f7da7b1d55..72ed30a93c85 100644 ---
a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_SPINLOCK(die_lock); diff --git a/arch/openrisc/kernel/or32_ksyms.c b/arch/openrisc/kernel/or32_ksyms.c index 83ccf7c0c58d..86e31cf1de1d 100644 --- a/arch/openrisc/kernel/or32_ksyms.c +++ b/arch/openrisc/kernel/or32_ksyms.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 277123bb4bf8..d7990df9025a 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index c82be69b43c6..265f10fb3930 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DEBUG_SIG 0 diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 3d3f6062f49c..a4574cb4b0fb 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index e94cd225e816..b1a7435e786a 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c index 78d30d2ea2d8..1c4fe61a592b 100644 --- a/arch/parisc/kernel/asm-offsets.c +++ b/arch/parisc/kernel/asm-offsets.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_64BIT #define FRAME_SIZE 128 diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index 3cad8aadc69e..7484b3d11e0d 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -43,7 +43,7 @@ EXPORT_SYMBOL(__xchg64); EXPORT_SYMBOL(__cmpxchg_u64); #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> EXPORT_SYMBOL(lclear_user); EXPORT_SYMBOL(lstrnlen_user); diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index b6298a85e8ae..697c53543a4d 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -33,7 +33,7 @@ #include #include /* get_order */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* for purge_tlb_*() macros */ static struct proc_dir_entry * proc_gsc_root __read_mostly = NULL; diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index 6eabce62463b..e282a5131d77 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -48,7 +48,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index e02d7b4d2b69..f8b6959d2d97 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 2264f68f3c2f..e58925ac64d1 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c index c342b2e17492..70aaabb8b3cb 100644 --- a/arch/parisc/kernel/signal32.c +++ b/arch/parisc/kernel/signal32.c @@ -31,7 +31,7 @@ #include #include -#include <asm/uaccess.h>
+#include <linux/uaccess.h> #include "signal32.h" diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index a81e177cac7b..bf3294171230 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -23,7 +23,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 4215f5596c8b..037d81f00520 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -28,7 +28,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index 2b65c0177778..0a21067ac0a3 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index e278a87f43cc..1b73690477c5 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/lib/checksum.c b/arch/parisc/lib/checksum.c index ae66d31f9ecf..ba6384da6ade 100644 --- a/arch/parisc/lib/checksum.c +++ b/arch/parisc/lib/checksum.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define addc(_t,_r) \ __asm__ __volatile__ ( \ diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 81592562e0f8..ba47c70712f9 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index 033f3385fa49..8d58c61908f7 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index cfa0f81a5bb0..d10ad258d41a 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef DEBUG diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 03d089b3ed72..4d3aa05e28be 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * Stores the breakpoints currently in use on each breakpoint address diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 3c05c311e35e..a018f5cae899 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -55,7 +55,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index ad108b842669..735ff3d3f77d 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 30b89d5cbb03..3f7ba0f5bf29 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -22,7 +22,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include
#include diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34d2c595de23..d5e2b8309939 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 678f87a63645..41c86c6b6e4d 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #undef DEBUG diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c index c30612aad68e..56548bf6231f 100644 --- a/arch/powerpc/kernel/proc_powerpc.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index b1ec62f2cc31..e4744ff38a17 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -34,7 +34,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index 1e887f3a61a6..f37eb53de1a1 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index c82eed97bd22..df56dfc4b681 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 6a3e5de544ce..112cc3b2ee1a 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index db2b482af658..f6f6a8a5103a 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define MODULE_VERS "1.0" diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index a26a02006576..2bf1f9b5b34b 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -22,7 +22,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 5fe79182f0fa..7fcf1f7f01c1 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index bbe77aed198d..3a3671172436 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 27aa913ac91d..97bb1385e771 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -37,7 +37,7 @@ #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 96698fdf93b4..c83c115858c1 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -27,7
@@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 8a285876aef8..15f216d022e2 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -44,7 +44,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 644cce3d8dce..de04c9fbb5cd 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index be9751f1cb2a..19397e2a8bf5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -64,7 +64,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 4239aaf74886..e6cc56b61d01 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c index c4bfadb2606b..2d8f6d8ccafc 100644 --- a/arch/powerpc/kernel/vecemu.c +++ b/arch/powerpc/kernel/vecemu.c @@ -7,7 +7,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Functions in vector.S */ extern void vaddfp(vector128 *dst, vector128 *a, vector128 *b); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index b6952dd23152..019f008775b9 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8dcbe37a4dac..66b2a35be424 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -39,7 +39,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 826c541a12af..1482961ceb4d 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index 02176fd52f84..f102616febc7 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -17,7 +17,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index ef27fbd5d9c5..20528701835b 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -11,7 +11,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 3bdc639157c1..20dff102a06f 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -14,7 +14,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index df3f2706d3e5..0514cbd4e533 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -30,7 +30,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index ed38f8114118..fe312c160d97 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -29,7 +29,7 @@
#include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index efd1183a6b16..cd892dec7cb6 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c index 08e3a3356c40..a0cb63fb76a1 100644 --- a/arch/powerpc/lib/checksum_wrappers.c +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *err_ptr) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index d5edbeb8eb82..c1746df0f88e 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -13,7 +13,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int patch_instruction(unsigned int *addr, unsigned int instr) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 9c78a9c102c3..06c7e9b88408 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/lib/usercopy_64.c b/arch/powerpc/lib/usercopy_64.c index 5eea6f3c1e03..9bd3a3dad78d 100644 --- a/arch/powerpc/lib/usercopy_64.c +++ b/arch/powerpc/lib/usercopy_64.c @@ -7,7 +7,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) { diff --git a/arch/powerpc/math-emu/fabs.c b/arch/powerpc/math-emu/fabs.c index 549baba5948f..a5e7ad1384ee 100644 --- a/arch/powerpc/math-emu/fabs.c +++ b/arch/powerpc/math-emu/fabs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fabs(u32 *frD, u32 *frB) diff --git a/arch/powerpc/math-emu/fadd.c b/arch/powerpc/math-emu/fadd.c index 0158a16e2b82..29de37e0e0da 100644 --- a/arch/powerpc/math-emu/fadd.c +++ b/arch/powerpc/math-emu/fadd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fadds.c b/arch/powerpc/math-emu/fadds.c index 5930f40a8687..7093c5b58002 100644 --- a/arch/powerpc/math-emu/fadds.c +++ b/arch/powerpc/math-emu/fadds.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fcmpo.c b/arch/powerpc/math-emu/fcmpo.c index 5bce011c2aec..5d644467221c 100644 --- a/arch/powerpc/math-emu/fcmpo.c +++ b/arch/powerpc/math-emu/fcmpo.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fcmpu.c b/arch/powerpc/math-emu/fcmpu.c index d4fb1babc6ad..0f9bf4864832 100644 --- a/arch/powerpc/math-emu/fcmpu.c +++ b/arch/powerpc/math-emu/fcmpu.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fctiw.c b/arch/powerpc/math-emu/fctiw.c index f694440ddc00..716d6da7f204 100644 --- a/arch/powerpc/math-emu/fctiw.c +++ b/arch/powerpc/math-emu/fctiw.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fctiwz.c b/arch/powerpc/math-emu/fctiwz.c index 71e782fd4fe3..7212fa7cfd36 100644 --- a/arch/powerpc/math-emu/fctiwz.c +++ b/arch/powerpc/math-emu/fctiwz.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fdiv.c
b/arch/powerpc/math-emu/fdiv.c index a29239c05e3e..e1e452069e49 100644 --- a/arch/powerpc/math-emu/fdiv.c +++ b/arch/powerpc/math-emu/fdiv.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fdivs.c b/arch/powerpc/math-emu/fdivs.c index 526bc261275f..5511e2d1c3ad 100644 --- a/arch/powerpc/math-emu/fdivs.c +++ b/arch/powerpc/math-emu/fdivs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmadd.c b/arch/powerpc/math-emu/fmadd.c index 8c3f20aa5a95..2b6fae0bc8c2 100644 --- a/arch/powerpc/math-emu/fmadd.c +++ b/arch/powerpc/math-emu/fmadd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmadds.c b/arch/powerpc/math-emu/fmadds.c index 794fb31e59d1..aff35f24a236 100644 --- a/arch/powerpc/math-emu/fmadds.c +++ b/arch/powerpc/math-emu/fmadds.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmr.c b/arch/powerpc/math-emu/fmr.c index bd55384b8196..f6347911f6a3 100644 --- a/arch/powerpc/math-emu/fmr.c +++ b/arch/powerpc/math-emu/fmr.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fmr(u32 *frD, u32 *frB) diff --git a/arch/powerpc/math-emu/fmsub.c b/arch/powerpc/math-emu/fmsub.c index 626f6fed84ac..1fb26cebe04e 100644 --- a/arch/powerpc/math-emu/fmsub.c +++ b/arch/powerpc/math-emu/fmsub.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmsubs.c b/arch/powerpc/math-emu/fmsubs.c index 3425bc899760..f73965453e05 100644 --- a/arch/powerpc/math-emu/fmsubs.c +++ b/arch/powerpc/math-emu/fmsubs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmul.c b/arch/powerpc/math-emu/fmul.c index 2c1929779892..ffd31b549290 100644 --- a/arch/powerpc/math-emu/fmul.c +++ b/arch/powerpc/math-emu/fmul.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmuls.c b/arch/powerpc/math-emu/fmuls.c index f5ad5c9c77d0..21aee431ca9d 100644 --- a/arch/powerpc/math-emu/fmuls.c +++ b/arch/powerpc/math-emu/fmuls.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fnabs.c b/arch/powerpc/math-emu/fnabs.c index a7d34f3d9499..af877a53d264 100644 --- a/arch/powerpc/math-emu/fnabs.c +++ b/arch/powerpc/math-emu/fnabs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fnabs(u32 *frD, u32 *frB) diff --git a/arch/powerpc/math-emu/fneg.c b/arch/powerpc/math-emu/fneg.c index 1e988cd9c6cc..8417d174758c 100644 --- a/arch/powerpc/math-emu/fneg.c +++ b/arch/powerpc/math-emu/fneg.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fneg(u32 *frD, u32 *frB) diff --git a/arch/powerpc/math-emu/fnmadd.c b/arch/powerpc/math-emu/fnmadd.c index e817bc5453ef..6316ef0e0874 100644 --- a/arch/powerpc/math-emu/fnmadd.c +++ b/arch/powerpc/math-emu/fnmadd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fnmadds.c b/arch/powerpc/math-emu/fnmadds.c index 4db4b7d9ba8d..9ffe037df2b9 100644 --- a/arch/powerpc/math-emu/fnmadds.c +++ b/arch/powerpc/math-emu/fnmadds.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fnmsub.c b/arch/powerpc/math-emu/fnmsub.c index f65979fa770e..f97a9cfb54ea 100644 --- a/arch/powerpc/math-emu/fnmsub.c +++ b/arch/powerpc/math-emu/fnmsub.c @@
-1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fnmsubs.c b/arch/powerpc/math-emu/fnmsubs.c index 9021dacc03b8..7fa1217bd930 100644 --- a/arch/powerpc/math-emu/fnmsubs.c +++ b/arch/powerpc/math-emu/fnmsubs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fre.c b/arch/powerpc/math-emu/fre.c index 49ccf2cc6a5a..b621a790aa67 100644 --- a/arch/powerpc/math-emu/fre.c +++ b/arch/powerpc/math-emu/fre.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fre(void *frD, void *frB) { diff --git a/arch/powerpc/math-emu/fres.c b/arch/powerpc/math-emu/fres.c index 10ecbd08b79e..211c30d0145f 100644 --- a/arch/powerpc/math-emu/fres.c +++ b/arch/powerpc/math-emu/fres.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fres(void *frD, void *frB) diff --git a/arch/powerpc/math-emu/frsp.c b/arch/powerpc/math-emu/frsp.c index ddcc14664b1a..3e3bc73e27ae 100644 --- a/arch/powerpc/math-emu/frsp.c +++ b/arch/powerpc/math-emu/frsp.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/frsqrte.c b/arch/powerpc/math-emu/frsqrte.c index 1d0a3a0fd0e6..7c2ce43750dc 100644 --- a/arch/powerpc/math-emu/frsqrte.c +++ b/arch/powerpc/math-emu/frsqrte.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int frsqrte(void *frD, void *frB) diff --git a/arch/powerpc/math-emu/frsqrtes.c b/arch/powerpc/math-emu/frsqrtes.c index 7e838e380314..269951a8c650 100644 --- a/arch/powerpc/math-emu/frsqrtes.c +++ b/arch/powerpc/math-emu/frsqrtes.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int frsqrtes(void *frD, void *frB) { diff --git a/arch/powerpc/math-emu/fsel.c b/arch/powerpc/math-emu/fsel.c index 1b0c14498032..32b62c6c7f48 100644 --- a/arch/powerpc/math-emu/fsel.c +++ b/arch/powerpc/math-emu/fsel.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fsqrt.c b/arch/powerpc/math-emu/fsqrt.c index a55fc7d49983..0e2a34b616dc 100644 --- a/arch/powerpc/math-emu/fsqrt.c +++ b/arch/powerpc/math-emu/fsqrt.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fsqrts.c b/arch/powerpc/math-emu/fsqrts.c index 31dccbfc39ff..420cf19b5fd4 100644 --- a/arch/powerpc/math-emu/fsqrts.c +++ b/arch/powerpc/math-emu/fsqrts.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fsub.c b/arch/powerpc/math-emu/fsub.c index 02c5dff458ba..feedd705cf62 100644 --- a/arch/powerpc/math-emu/fsub.c +++ b/arch/powerpc/math-emu/fsub.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fsubs.c b/arch/powerpc/math-emu/fsubs.c index 5d9b18c35e07..74190514063e 100644 --- a/arch/powerpc/math-emu/fsubs.c +++ b/arch/powerpc/math-emu/fsubs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/lfd.c b/arch/powerpc/math-emu/lfd.c index 79ac76d596c3..d998a50740a0 100644 --- a/arch/powerpc/math-emu/lfd.c +++ b/arch/powerpc/math-emu/lfd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/lfs.c b/arch/powerpc/math-emu/lfs.c index 434ed27be8db..1ee10b83d7e3 100644 --- a/arch/powerpc/math-emu/lfs.c +++ b/arch/powerpc/math-emu/lfs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/math.c
b/arch/powerpc/math-emu/math.c index ab151f040502..76ee2e5dba65 100644 --- a/arch/powerpc/math-emu/math.c +++ b/arch/powerpc/math-emu/math.c @@ -5,7 +5,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/math_efp.c b/arch/powerpc/math-emu/math_efp.c index 28337c9709ae..581f404caa1d 100644 --- a/arch/powerpc/math-emu/math_efp.c +++ b/arch/powerpc/math-emu/math_efp.c @@ -22,7 +22,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define FP_EX_BOOKE_E500_SPE diff --git a/arch/powerpc/math-emu/mcrfs.c b/arch/powerpc/math-emu/mcrfs.c index e948d5708e2b..8e8e72397ebc 100644 --- a/arch/powerpc/math-emu/mcrfs.c +++ b/arch/powerpc/math-emu/mcrfs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mffs.c b/arch/powerpc/math-emu/mffs.c index 5526cf96ede5..e00fdc22a0bc 100644 --- a/arch/powerpc/math-emu/mffs.c +++ b/arch/powerpc/math-emu/mffs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mtfsb0.c b/arch/powerpc/math-emu/mtfsb0.c index bc985585bca8..5ed3e7d5063e 100644 --- a/arch/powerpc/math-emu/mtfsb0.c +++ b/arch/powerpc/math-emu/mtfsb0.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mtfsb1.c b/arch/powerpc/math-emu/mtfsb1.c index fe6ed5ac85b3..602aa16eda81 100644 --- a/arch/powerpc/math-emu/mtfsb1.c +++ b/arch/powerpc/math-emu/mtfsb1.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mtfsf.c b/arch/powerpc/math-emu/mtfsf.c index 44b0fc8214f4..b0d5593ad357 100644 --- a/arch/powerpc/math-emu/mtfsf.c +++ b/arch/powerpc/math-emu/mtfsf.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mtfsfi.c b/arch/powerpc/math-emu/mtfsfi.c index fd2acc26813b..5df30541a784 100644 --- a/arch/powerpc/math-emu/mtfsfi.c +++ b/arch/powerpc/math-emu/mtfsfi.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/stfd.c b/arch/powerpc/math-emu/stfd.c index 33a165c8df0f..6baeaec134a2 100644 --- a/arch/powerpc/math-emu/stfd.c +++ b/arch/powerpc/math-emu/stfd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int stfd(void *frS, void *ea) diff --git a/arch/powerpc/math-emu/stfiwx.c b/arch/powerpc/math-emu/stfiwx.c index f15a35f67e2c..9da7c5d1a872 100644 --- a/arch/powerpc/math-emu/stfiwx.c +++ b/arch/powerpc/math-emu/stfiwx.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int stfiwx(u32 *frS, void *ea) diff --git a/arch/powerpc/math-emu/stfs.c b/arch/powerpc/math-emu/stfs.c index 6122147356d1..62bd25264fb5 100644 --- a/arch/powerpc/math-emu/stfs.c +++ b/arch/powerpc/math-emu/stfs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index 31a5d42df8c9..61ac468c87c6 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index 139dec421e57..080d49b26c3a 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -48,7 +48,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 8410b4bb36ed..80334937e14f 100644 ---
a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a000c3585390..93abf8a9813d 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -51,7 +51,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index d5543514c1df..5c096c01e8bd 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 85c85eb3e245..e5a891ae80ee 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -30,7 +30,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "spufs.h" diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 3a147122bc98..a35e2c29d7ee 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "spufs.h" #include "sputrace.h" diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 5364d4a54249..d8af9bc0489f 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "spufs.h" diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index a87200a535fa..0d290ea83dc1 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -5,7 +5,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "spufs.h" diff --git a/arch/powerpc/platforms/chrp/nvram.c b/arch/powerpc/platforms/chrp/nvram.c index 9ef8cc3378d0..c3ede2c365c3 100644 --- a/arch/powerpc/platforms/chrp/nvram.c +++ b/arch/powerpc/platforms/chrp/nvram.c @@ -13,7 +13,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c index f2344cbd2f46..ecd6d9177d13 100644 --- a/arch/powerpc/platforms/powernv/opal-elog.c +++ b/arch/powerpc/platforms/powernv/opal-elog.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include struct elog_obj { diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index e4169d68cb32..4886eb8b6381 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include static int opal_lpc_chip_id = -1; diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index e315e704cca7..2d6ee1c5ad85 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /** diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 972328829387..4839db385bb0 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -37,7 +37,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include
#include diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 76caa4a45ccd..5cb2e4beffc5 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include static struct workqueue_struct *pseries_hp_wq; diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 39049e4884fb..6b04e3f0f982 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index e6397976060e..779fc2a1c8f7 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 79aef8c1c5b3..69cedc1b3b8a 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index cc66c49f07aa..e5bf1e84047f 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "of_helpers.h" diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index 7d28cabf1206..c47585a78b69 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c index 6f5a8d177c42..d0e9f178a324 100644 --- a/arch/powerpc/sysdev/scom.c +++ b/arch/powerpc/sysdev/scom.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> const struct scom_controller *scom_controller; EXPORT_SYMBOL_GPL(scom_controller); diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index 53a16aa4d384..5692dd569b9b 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index f587c4811faf..5a8dfa22da7c 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/s390/boot/compressed/misc.c b/arch/s390/boot/compressed/misc.c index 4da604ebf6fd..8515dd5a5663 100644 --- a/arch/s390/boot/compressed/misc.c +++ b/arch/s390/boot/compressed/misc.c @@ -6,7 +6,7 @@ * Author(s): Martin Schwidefsky */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 1113389d0a39..daf9bb063aaa 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index d7f100c53f07..12bf4fef2a68 100644 ---
b/arch/s390/include/asm/checksum.h @@ -11,7 +11,7 @@ #ifndef _S390_CHECKSUM_H #define _S390_CHECKSUM_H -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index a7b2d7504049..280b60a0bcd4 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define IDA_SIZE_LOG 12 /* 11 for 2k , 12 for 4k */ #define IDA_BLOCK_SIZE (1L<<IDA_SIZE_LOG) -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 0f9cd90c11af..96df4547377a 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -51,7 +51,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 6f2a6ab13cb5..362350cc485c 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "compat_linux.h" diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index aa12de72fd47..79f8ae933520 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index c74c59236f44..9f017cf417f6 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -22,7 +22,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index fdb40424acfe..84e0557b16fe 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include DEFINE_PER_CPU(struct kprobe *, current_kprobe); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index b81ab8882e2e..7447ba509c30 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "entry.h" diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 9f241d1efeda..62a4c263e887 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "entry.h" diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index f145490cce54..b7af452978ca 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "entry.h" /* diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 867d0a057046..ec76315c9ee5 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index d0539f76fd24..283ad7840335 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "entry.h" diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index af13f1a135b6..6843dd5a1cba 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git
a/arch/s390/mm/init.c b/arch/s390/mm/init.c index b3e9d18f2ec6..b67454ad8408 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/score/include/asm/checksum.h b/arch/score/include/asm/checksum.h index 539d9fd45d21..0338927f4826 100644 --- a/arch/score/include/asm/checksum.h +++ b/arch/score/include/asm/checksum.h @@ -2,7 +2,7 @@ #define _ASM_SCORE_CHECKSUM_H #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c index 4f7314d5f334..8b75e54816c1 100644 --- a/arch/score/kernel/ptrace.c +++ b/arch/score/kernel/ptrace.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * retrieve the contents of SCORE userspace general registers diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c index 5cea1e750cec..d948a6818961 100644 --- a/arch/score/kernel/traps.c +++ b/arch/score/kernel/traps.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> unsigned long exception_handlers[32]; diff --git a/arch/score/lib/checksum_copy.c b/arch/score/lib/checksum_copy.c index 9b770b30e8a5..39b99ef61804 100644 --- a/arch/score/lib/checksum_copy.c +++ b/arch/score/lib/checksum_copy.c @@ -25,7 +25,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> unsigned int csum_partial_copy(const char *src, char *dst, int len, unsigned int sum) diff --git a/arch/sh/boards/mach-landisk/gio.c b/arch/sh/boards/mach-landisk/gio.c index 8132dff078fb..32c317f5d991 100644 --- a/arch/sh/boards/mach-landisk/gio.c +++ b/arch/sh/boards/mach-landisk/gio.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index 208a9753ab38..ae1dfdb0013b 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -11,7 +11,7 @@ * Modified to use standard LinuxSH BIOS by Greg Banks 7Jul2000 */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/include/asm/mmu_context.h b/arch/sh/include/asm/mmu_context.h index 9f417feaf6e8..35ffdd081d26 100644 --- a/arch/sh/include/asm/mmu_context.h +++ b/arch/sh/include/asm/mmu_context.h @@ -10,7 +10,7 @@ #ifdef __KERNEL__ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/kernel/cpu/init.c b/arch/sh/kernel/cpu/init.c index c8b3be1b54e6..c4f01c5c8736 100644 --- a/arch/sh/kernel/cpu/init.c +++ b/arch/sh/kernel/cpu/init.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/kernel/cpu/shmobile/cpuidle.c b/arch/sh/kernel/cpu/shmobile/cpuidle.c index 53b8eeb1db20..c32e66079f7c 100644 --- a/arch/sh/kernel/cpu/shmobile/cpuidle.c +++ b/arch/sh/kernel/cpu/shmobile/cpuidle.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static unsigned long cpuidle_mode[] = { SUSP_SH_SLEEP, /* regular sleep mode */ diff --git a/arch/sh/kernel/cpu/shmobile/pm.c b/arch/sh/kernel/cpu/shmobile/pm.c index ac37b7234f85..fba2be5d72e9 100644 --- a/arch/sh/kernel/cpu/shmobile/pm.c +++ b/arch/sh/kernel/cpu/shmobile/pm.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c index 569e7b171c01..b33be505361e 100644 --- a/arch/sh/kernel/crash_dump.c +++ b/arch/sh/kernel/crash_dump.c @@ -7,7 +7,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /** * copy_oldmem_page - copy one page from
"oldmem" diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c index f8ce36286cea..4d4e7a2a774b 100644 --- a/arch/sh/kernel/io_trapped.c +++ b/arch/sh/kernel/io_trapped.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 6c0378c0b8b5..bc3591125df7 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index 83acbf3f6de8..1653ff64b103 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index ee12e9451874..51741850a715 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index 9d3e9916555d..e0b271bffd6a 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index c1a6b89bfe70..1aabfd356b35 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index 5cea973a65b2..c49d0d05a215 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index e7b49d81053e..3a44c753b642 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/sh_ksyms_64.c b/arch/sh/kernel/sh_ksyms_64.c index 26a0774f5272..6ee3740e009e 100644 --- a/arch/sh/kernel/sh_ksyms_64.c +++ b/arch/sh/kernel/sh_ksyms_64.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index f7c3d5c25caf..5128d3001ee5 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index d8a3f0d22809..7b77f1812434 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c index 8c6a350df751..6576e5ee1fc3 100644 --- a/arch/sh/kernel/sys_sh.c +++ b/arch/sh/kernel/sys_sh.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index b66d1c62eb19..d5287d76809c 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/kernel/traps_64.c 
b/arch/sh/kernel/traps_64.c index d208c27ccc67..00835edb6e20 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c index 04aa55fa8c75..5078cb809750 100644 --- a/arch/sh/math-emu/math.c +++ b/arch/sh/math-emu/math.c @@ -14,7 +14,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/mm/cache-debugfs.c b/arch/sh/mm/cache-debugfs.c index 777e50f33c00..4eb9d43578b4 100644 --- a/arch/sh/mm/cache-debugfs.c +++ b/arch/sh/mm/cache-debugfs.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/mm/cache-sh3.c b/arch/sh/mm/cache-sh3.c index e37523f65195..031634f273fa 100644 --- a/arch/sh/mm/cache-sh3.c +++ b/arch/sh/mm/cache-sh3.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c index d1bffbcd9d52..d94dadedf74f 100644 --- a/arch/sh/mm/cache-sh5.c +++ b/arch/sh/mm/cache-sh5.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include extern void __weak sh4__flush_region_init(void); diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c index 7729cca727eb..6cd2aa395817 100644 --- a/arch/sh/mm/cache-sh7705.c +++ b/arch/sh/mm/cache-sh7705.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/mm/extable_32.c b/arch/sh/mm/extable_32.c index c1cf4463d09d..9cfcbb5848e4 100644 --- a/arch/sh/mm/extable_32.c +++ b/arch/sh/mm/extable_32.c @@ -5,7 +5,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fixup_exception(struct pt_regs *regs) { diff --git a/arch/sh/mm/extable_64.c b/arch/sh/mm/extable_64.c index f05499688d88..96edaff8c983 100644 --- a/arch/sh/mm/extable_64.c +++ b/arch/sh/mm/extable_64.c @@ -12,7 +12,7 @@ */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> extern unsigned long copy_user_memcpy, copy_user_memcpy_end; extern void __copy_user_fixup(void); diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c index 36312d254faf..82f8197b93f6 100644 --- a/arch/sh/mm/nommu.c +++ b/arch/sh/mm/nommu.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * Nothing too terribly exciting here ..
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index 7160c9fd6fe3..7b2cc490ebb7 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/mm/tlb-sh3.c b/arch/sh/mm/tlb-sh3.c index 6554fb439f0e..5c66665bff8b 100644 --- a/arch/sh/mm/tlb-sh3.c +++ b/arch/sh/mm/tlb-sh3.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/mm/tlbex_64.c b/arch/sh/mm/tlbex_64.c index 8557548fc53e..8ff966dd0c74 100644 --- a/arch/sh/mm/tlbex_64.c +++ b/arch/sh/mm/tlbex_64.c @@ -36,7 +36,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index f33fdd2558e8..bd0715d5dca4 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/oprofile/backtrace.c b/arch/sh/oprofile/backtrace.c index 9c88dcd56e86..c7695f99c8c3 100644 --- a/arch/sh/oprofile/backtrace.c +++ b/arch/sh/oprofile/backtrace.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/include/asm/checksum_32.h b/arch/sparc/include/asm/checksum_32.h index eff748c871ec..e25af5fc99fd 100644 --- a/arch/sparc/include/asm/checksum_32.h +++ b/arch/sparc/include/asm/checksum_32.h @@ -16,7 +16,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) diff --git a/arch/sparc/include/asm/checksum_64.h b/arch/sparc/include/asm/checksum_64.h index 0395d75322e9..96a5ed58cea6 100644 --- a/arch/sparc/include/asm/checksum_64.h +++ b/arch/sparc/include/asm/checksum_64.h @@ -16,7 +16,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c index 742f6c4436bf..9ebc37e7d64c 100644 --- a/arch/sparc/kernel/apc.c +++ b/arch/sparc/kernel/apc.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 34a7930b76ef..3bebf395252c 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/kprobes.c b/arch/sparc/kernel/kprobes.c index b0377db12d83..2d13a4fc0384 100644 --- a/arch/sparc/kernel/kprobes.c +++ b/arch/sparc/kernel/kprobes.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* We do not have hardware single-stepping on sparc64.
* So we implement software single-stepping with breakpoint diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 8a6982dfd733..c0765bbf60ea 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 9c1878f4fa9f..015e55a7495d 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index 24384e1dc33d..a38787b84322 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "kernel.h" diff --git a/arch/sparc/kernel/pmc.c b/arch/sparc/kernel/pmc.c index 97d123107ecb..f12b23f7b515 100644 --- a/arch/sparc/kernel/pmc.c +++ b/arch/sparc/kernel/pmc.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index b7780a5bef11..48ffc3e7d1dd 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -28,7 +28,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 47ff5588e521..d249ca10b203 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -33,7 +33,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c index a331fdc11a2c..eca3dc76793c 100644 --- a/arch/sparc/kernel/ptrace_32.c +++ b/arch/sparc/kernel/ptrace_32.c @@ -23,7 +23,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "kernel.h" diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 96494b2ef41f..901063c1cf7e 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -31,7 +31,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 91cc2f4ae4d9..b4096bb665b2 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 9c0c8fd0b292..62c3e255ae7c 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index c782c9b716db..965d50e833e7 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -25,7 +25,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 8182f7caf5b1..0ce347f8e4cc 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 022c30c72ebd..bca44f3e6b86 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -44,7 +44,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/sys_sparc_32.c
b/arch/sparc/kernel/sys_sparc_32.c index 646988d4c1a3..fb7b185ee941 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "systbls.h" diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index fe8b8ee8e660..884c70331345 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c index c69b21e51efc..807f7e2ce014 100644 --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@ -45,7 +45,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "entry.h" diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 496fa926e1e0..4bc10e44d1ca 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c index 32b61d1b6379..d20d4e3fd129 100644 --- a/arch/sparc/kernel/unaligned_32.c +++ b/arch/sparc/kernel/unaligned_32.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/kernel/unaligned_64.c b/arch/sparc/kernel/unaligned_64.c index 52c00d90d4b4..cda7fd367c4f 100644 --- a/arch/sparc/kernel/unaligned_64.c +++ b/arch/sparc/kernel/unaligned_64.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/uprobes.c b/arch/sparc/kernel/uprobes.c index b68314050602..d852ae56ddc1 100644 --- a/arch/sparc/kernel/uprobes.c +++ b/arch/sparc/kernel/uprobes.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Compute the address of the breakpoint instruction and return it. * diff --git a/arch/sparc/kernel/visemul.c b/arch/sparc/kernel/visemul.c index c096c624ac4d..c4ac58e483a4 100644 --- a/arch/sparc/kernel/visemul.c +++ b/arch/sparc/kernel/visemul.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* OPF field of various VIS instructions.
*/ diff --git a/arch/sparc/kernel/windows.c b/arch/sparc/kernel/windows.c index 87bab0a3857a..435a467b0595 100644 --- a/arch/sparc/kernel/windows.c +++ b/arch/sparc/kernel/windows.c @@ -11,7 +11,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "kernel.h" diff --git a/arch/sparc/math-emu/math_32.c b/arch/sparc/math-emu/math_32.c index 5ce8f2f64604..4d7e0fff054f 100644 --- a/arch/sparc/math-emu/math_32.c +++ b/arch/sparc/math-emu/math_32.c @@ -68,7 +68,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sfp-util_32.h" #include diff --git a/arch/sparc/math-emu/math_64.c b/arch/sparc/math-emu/math_64.c index 034aadbff036..9647051853d3 100644 --- a/arch/sparc/math-emu/math_64.c +++ b/arch/sparc/math-emu/math_64.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "sfp-util_64.h" diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c index a61c349448e1..768a11e6bd4f 100644 --- a/arch/sparc/mm/extable.c +++ b/arch/sparc/mm/extable.c @@ -3,7 +3,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 37aa537b3ad8..5d2f91511c60 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 9f37106ef93a..c84c54a1ac55 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_HARDWALL #include #endif diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c index 862973074bf9..de3eae813e52 100644 --- a/arch/tile/kernel/single_step.c +++ b/arch/tile/kernel/single_step.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c index 4fe78c5b8394..f229e979584e 100644 --- a/arch/tile/kernel/unaligned.c +++ b/arch/tile/kernel/unaligned.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/um/drivers/harddog_kern.c b/arch/um/drivers/harddog_kern.c index 3282787bbcfb..6d381279b362 100644 --- a/arch/um/drivers/harddog_kern.c +++ b/arch/um/drivers/harddog_kern.c @@ -45,7 +45,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mconsole.h" MODULE_LICENSE("GPL"); diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index 3a4b58730f5f..12bdb5996bf5 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -9,7 +9,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 8a6b57108ac2..8a4c72af3bc0 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c index 62145c276167..3645fcb2a787 100644 --- a/arch/um/drivers/mmapper_kern.c +++ b/arch/um/drivers/mmapper_kern.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These are set in mmapper_init, which is called at boot time */ diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index dd16c902ff70..05523f14d7b2 100644 --- a/arch/um/drivers/random.c +++
b/arch/um/drivers/random.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 0d7103c9eff3..770ec07b6a6a 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c index 41ebbfebb333..546302e3b7fb 100644 --- a/arch/um/kernel/exitcode.c +++ b/arch/um/kernel/exitcode.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * If read and write race, the read will still atomically read a valid diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 034b42c7ab40..078630d6448c 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 6a826cbb15c4..bc2a516c190f 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -7,7 +7,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include void user_enable_single_step(struct task_struct *child) diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index c1d0ae069b53..6258676bed85 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include long old_mmap(unsigned long addr, unsigned long len, diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bdd9cc59d20f..b83c61cfd154 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define CREATE_TRACE_POINTS diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index cb26f18d43af..7c0a711989d2 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -27,7 +27,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index cb13c0564ea7..95c0b4ae09b0 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 719cd702b0a4..47956c6a4fd8 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 44b8762fa0c7..830b19dbfa31 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -1,5 +1,5 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index c020ee75dce7..08e7efb2c140 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -8,7 +8,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /** diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index f5fb840b43e8..33cbd3db97b9 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -7,7 +7,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 643818a7688b..45d44c173cf9 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@
-234,7 +234,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c7efbcfbeda6..87cc9ab7a13c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mce-internal.h" diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 11891ca7b716..538fedea9b3f 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -10,7 +10,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static void *kdump_buf_page; diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index f6dfd9334b67..b2f7207ba86c 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -3,7 +3,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d9d8d16b69db..eb3509338ae0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -56,7 +56,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3bb4c5f021f6..3d1bee9d6a72 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 37363e46b1f0..b615a1113f58 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0e63c0267f99..9cc7d5a330ef 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 27538f183c3b..a3b875c9e6af 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -13,7 +13,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include extern int rodata_test_data; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9692a5e9fdab..6c8934406dc9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -5,7 +5,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 01f30e56f99e..ec5d7545e6dc 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -47,7 +47,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 0b281217c890..1f65ff6540f0 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index 9e6545f269e5..2ccc424a57d9 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -19,7 +19,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_emu.h" #include "fpu_system.h" diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index e945fedf1de2..0203baefb5c0 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -27,7 +27,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h>
#include #include #include diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 8db26591d91a..b8ef9f9d2ffc 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -19,7 +19,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "fpu_system.h" diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index 95228ff042c0..1643054766eb 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -18,7 +18,7 @@ | other processes using the emulator while swapping is in progress. | +---------------------------------------------------------------------------*/ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index d597fe7423c9..2c98965a60ba 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -19,7 +19,7 @@ #include "fpu_emu.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index fcd06f7526de..61a7e9ea9aa1 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,5 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cf8059016ec8..928d657de829 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 963895f9af7f..af85b686a7b0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e3353c97d086..5a287e523eab 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 60a5a5a85505..2497bac56066 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -5,7 +5,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index e30202b1716e..a5c9910d234f 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -10,7 +10,7 @@ #include #define __FRAME_OFFSETS #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 49e503697022..727ed442e0a5 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -9,7 +9,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index 48e38584d5c1..5bd949da7a4a 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -6,7 +6,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 37129db76d33..276da636dd39 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -71,7 +71,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/xtensa/include/asm/checksum.h b/arch/xtensa/include/asm/checksum.h index ec35074fcb03..3ae74d7e074b 100644 --- a/arch/xtensa/include/asm/checksum.h +++ b/arch/xtensa/include/asm/checksum.h @@ -12,7 +12,7 @@ #define _XTENSA_CHECKSUM_H #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/arch/xtensa/include/asm/segment.h b/arch/xtensa/include/asm/segment.h index
a2eb547a1a75..98964ad15ca2 100644 --- a/arch/xtensa/include/asm/segment.h +++ b/arch/xtensa/include/asm/segment.h @@ -11,6 +11,6 @@ #ifndef _XTENSA_SEGMENT_H #define _XTENSA_SEGMENT_H -#include <asm/uaccess.h> +#include <linux/uaccess.h> #endif /* _XTENSA_SEGEMENT_H */ diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c index 8e10e357ee32..bcb5beb81177 100644 --- a/arch/xtensa/kernel/asm-offsets.c +++ b/arch/xtensa/kernel/asm-offsets.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int main(void) { diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c index 4ac3d23161cf..a265edd6ac37 100644 --- a/arch/xtensa/kernel/irq.c +++ b/arch/xtensa/kernel/irq.c @@ -25,7 +25,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include DECLARE_PER_CPU(unsigned long, nmi_count); diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index e0ded48561db..826d25104846 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index a651f3a628ee..32519b71d914 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> void user_enable_single_step(struct task_struct *child) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index e87adaa07ff3..c41294745731 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -22,7 +22,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/kernel/stacktrace.c b/arch/xtensa/kernel/stacktrace.c index 7538d802b65a..e7d30ee0fd48 100644 --- a/arch/xtensa/kernel/stacktrace.c +++ b/arch/xtensa/kernel/stacktrace.c @@ -14,7 +14,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #if IS_ENABLED(CONFIG_OPROFILE) || IS_ENABLED(CONFIG_PERF_EVENTS) diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index 83cf49685373..d3fd100dffc9 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -15,7 +15,7 @@ * Kevin Chea * */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index ce37d5b899fe..282bf721a4d6 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c index 4d2872fd9bb5..d159e9b9c018 100644 --- a/arch/xtensa/kernel/xtensa_ksyms.c +++ b/arch/xtensa/kernel/xtensa_ksyms.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c index c68f1e6158aa..0140a22551c8 100644 --- a/arch/xtensa/platforms/iss/console.c +++ b/arch/xtensa/platforms/iss/console.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index ede04cca30dd..02e94bb3ad3e 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define SIMDISK_MAJOR 240 diff --git a/block/ioctl.c b/block/ioctl.c index 656c8c6ed206..be7f4de3eb3c 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -8,7 +8,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int
blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) { diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 47a61474e795..14b081af8d61 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "check.h" diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index c6fee7437be4..c2b64923ab66 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 201292e67ee8..d00bc0ef87a6 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -37,7 +37,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PREFIX "ACPI: " diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 05fe9ebfb9b5..4ef1e4624b2b 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -36,7 +36,7 @@ #ifdef CONFIG_ACPI_PROCFS_POWER #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #endif #include diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index a404ff4d7151..57fb5f468ac2 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -42,7 +42,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "internal.h" diff --git a/drivers/acpi/proc.c b/drivers/acpi/proc.c index 2a358154b770..a34669cc823b 100644 --- a/drivers/acpi/proc.c +++ b/drivers/acpi/proc.c @@ -4,7 +4,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sleep.h" #include "internal.h" diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index 1fed84a092c2..59c3a5d1e600 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PREFIX "ACPI: " diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index d51ca1c05619..a12f96cc93ff 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PREFIX "ACPI: " diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 35e8fbca10ad..1d0417b87cb7 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -40,7 +40,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PREFIX "ACPI: " diff --git a/drivers/atm/adummy.c b/drivers/atm/adummy.c index f9b983ae6877..1fd25e872ece 100644 --- a/drivers/atm/adummy.c +++ b/drivers/atm/adummy.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index 480fa6ffbc09..3ef6253e1cce 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 40c2d561417b..c53a9dd1353f 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 85aaf2222587..80c2ddcfa92c 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -52,7 +52,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "firestream.h" diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index 81aaa505862c..637c3e6b0f9e 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h>
+#include <linux/uaccess.h> #include #ifdef CONFIG_SBUS diff --git a/drivers/atm/he.c b/drivers/atm/he.c index 31b513a23ae0..3617659b9184 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -71,7 +71,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index 5fc81e240c24..584aa881882b 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -45,7 +45,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c index feb023d7eebd..082aa02abc57 100644 --- a/drivers/atm/idt77105.c +++ b/drivers/atm/idt77105.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "idt77105.h" diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 074616b39f4d..471ddfd93ea8 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -45,7 +45,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index b2756765950e..8640bafeb471 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -58,7 +58,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index c7296b583787..cb28579e8a94 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -50,7 +50,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "nicstar.h" diff --git a/drivers/atm/suni.c b/drivers/atm/suni.c index 02159345566c..b0363149b2fd 100644 --- a/drivers/atm/suni.c +++ b/drivers/atm/suni.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "suni.h" diff --git a/drivers/atm/uPD98402.c b/drivers/atm/uPD98402.c index 5120a96b3a89..4fa13a807873 100644 --- a/drivers/atm/uPD98402.c +++ b/drivers/atm/uPD98402.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "uPD98402.h" diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index d3dc95484161..292dec18ffb8 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "uPD98401.h" #include "uPD98402.h" diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bb69e58c29f3..8ab8ea1253e6 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -23,7 +23,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_MUTEX(mem_sysfs_mutex); diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 0809cda93cc0..26a51be77227 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -48,7 +48,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "DAC960.h" #define DAC960_GAM_MINOR 252 diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 5fd50a284168..a328f673adfe 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -70,7 +70,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/block/brd.c b/drivers/block/brd.c index ad793f35632c..3adc32a3153b 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -23,7 +23,7 @@ #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define SECTOR_SHIFT 9 #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index db9d6bb6352d..e5c5b8eb14a9 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/block/cryptoloop.c
b/drivers/block/cryptoloop.c index 3d31761c0ed0..74e03aa537ad 100644 --- a/drivers/block/cryptoloop.c +++ b/drivers/block/cryptoloop.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "loop.h" MODULE_LICENSE("GPL"); diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 3abb121825bc..a9b48ed7a3cd 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -45,7 +45,7 @@ #define REALLY_SLOW_IO #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef __arm__ #undef HD_IRQ diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 4af818766797..f347285c67ec 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -78,7 +78,7 @@ #include #include "loop.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_index_mutex); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 99c84468f154..38c576f76d36 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 93362362aa55..5fd2d0e25567 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -139,7 +139,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_MUTEX(pcd_mutex); static DEFINE_SPINLOCK(pcd_lock); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 78a39f736c64..c3ed2fc72daa 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -155,7 +155,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV}; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include static DEFINE_MUTEX(pd_mutex); diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 7a7d977a76c5..ed93e8badf56 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -155,7 +155,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY}; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_MUTEX(pf_mutex); static DEFINE_SPINLOCK(pf_spin_lock); diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index bfbd4c852dd9..5db955fe3a94 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -166,7 +166,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> module_param(verbose, int, 0644); module_param(major, int, 0); diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 216a94fed5b4..61fc6824299a 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -150,7 +150,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3}; #include /* current, TASK_*, schedule_timeout() */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> module_param(verbose, int, 0); module_param(major, int, 0); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 95c98de92971..1b94c1ca5c5f 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -68,7 +68,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRIVER_NAME "pktcdvd" diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c264f2d284a7..aabd8e9d3035 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index ba4bfe933276..0e93ad7b8511 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #if 0 #define CARM_DEBUG diff --git
a/drivers/block/umem.c b/drivers/block/umem.c index 46f4c719fed9..c141cc3be22b 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -54,7 +54,7 @@ #include "umem.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define MM_MAXCARDS 4 diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 5d475b3a0b2e..59cca72647a6 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -282,7 +282,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* used to tell the module to turn on full debugging messages */ static bool debug; diff --git a/drivers/char/agp/compat_ioctl.c b/drivers/char/agp/compat_ioctl.c index a48e05b31593..2053f70ef66b 100644 --- a/drivers/char/agp/compat_ioctl.c +++ b/drivers/char/agp/compat_ioctl.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "agp.h" #include "compat_ioctl.h" diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c index 0f64d149c98d..f6955888e676 100644 --- a/drivers/char/agp/frontend.c +++ b/drivers/char/agp/frontend.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "agp.h" diff --git a/drivers/char/applicom.c b/drivers/char/applicom.c index 14790304b84b..e5c62dcf2c11 100644 --- a/drivers/char/applicom.c +++ b/drivers/char/applicom.c @@ -34,7 +34,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "applicom.h" diff --git a/drivers/char/bfin-otp.c b/drivers/char/bfin-otp.c index 35d46da754d7..0584025bb0c2 100644 --- a/drivers/char/bfin-otp.c +++ b/drivers/char/bfin-otp.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define stamp(fmt, args...) pr_debug("%s:%i: " fmt "\n", __func__, __LINE__, ## args) #define stampit() stamp("here i am") diff --git a/drivers/char/ds1620.c b/drivers/char/ds1620.c index 0fae5296e311..eb53cbadb68f 100644 --- a/drivers/char/ds1620.c +++ b/drivers/char/ds1620.c @@ -13,7 +13,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_PROC_FS diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c index 65a8d96c0e93..58471394beb9 100644 --- a/drivers/char/dtlk.c +++ b/drivers/char/dtlk.c @@ -59,7 +59,7 @@ #include #include #include /* for inb_p, outb_p, inb, outb, etc. */ -#include <asm/uaccess.h> /* for get_user, etc. */ +#include <linux/uaccess.h> /* for get_user, etc. */ #include /* for wait_queue */ #include /* for __init, module_{init,exit} */ #include /* for POLLIN, etc.
*/ diff --git a/drivers/char/generic_nvram.c b/drivers/char/generic_nvram.c index 073db9558379..14e728fbb8a0 100644 --- a/drivers/char/generic_nvram.c +++ b/drivers/char/generic_nvram.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_PPC_PMAC #include diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index a7c5c59675f0..4f337375252e 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -46,7 +46,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index f9766415ff10..6ce5ce8be2f2 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define RNG_MODULE_NAME "hw_random" diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 4facc7517a6a..4035495f3a86 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/lp.c b/drivers/char/lp.c index c4094c4e22c1..5b6742770656 100644 --- a/drivers/char/lp.c +++ b/drivers/char/lp.c @@ -134,7 +134,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* if you have more than 8 printers, remember to increase LP_NO */ #define LP_NO 8 diff --git a/drivers/char/mbcs.c b/drivers/char/mbcs.c index 67d426470e53..8c9216a0f62e 100644 --- a/drivers/char/mbcs.c +++ b/drivers/char/mbcs.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index 3d6c0671e996..f786b18ac500 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/mwave/3780i.c b/drivers/char/mwave/3780i.c index 972c40a19629..4a8937f80570 100644 --- a/drivers/char/mwave/3780i.c +++ b/drivers/char/mwave/3780i.c @@ -54,7 +54,7 @@ #include /* cond_resched() */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "smapi.h" #include "mwavedd.h" diff --git a/drivers/char/mwave/mwavedd.h b/drivers/char/mwave/mwavedd.h index 37e0a4926080..21cb09c7bed7 100644 --- a/drivers/char/mwave/mwavedd.h +++ b/drivers/char/mwave/mwavedd.h @@ -53,7 +53,7 @@ #include "smapi.h" #include "mwavepub.h" #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include extern int mwave_debug; diff --git a/drivers/char/nsc_gpio.c b/drivers/char/nsc_gpio.c index b07b119ae57f..2a91bf048804 100644 --- a/drivers/char/nsc_gpio.c +++ b/drivers/char/nsc_gpio.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define NAME "nsc_gpio" diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c index 0e184426db98..a5b1eb276c0b 100644 --- a/drivers/char/nwbutton.c +++ b/drivers/char/nwbutton.c @@ -16,7 +16,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/char/nwflash.c b/drivers/char/nwflash.c index dbe598de9b74..a284ae25e69a 100644 --- a/drivers/char/nwflash.c +++ b/drivers/char/nwflash.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /*****************************************************************************/ #include diff --git a/drivers/char/pc8736x_gpio.c b/drivers/char/pc8736x_gpio.c index 3f79a9fb6b1b..5f4be88c0dfc 100644 --- a/drivers/char/pc8736x_gpio.c +++ b/drivers/char/pc8736x_gpio.c @@ -20,7 +20,7 @@ #include #include
#include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DEVNAME "pc8736x_gpio" diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c index fc061f7c2bd1..d7123259143e 100644 --- a/drivers/char/pcmcia/cm4040_cs.c +++ b/drivers/char/pcmcia/cm4040_cs.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index a7dd5f4f2c5a..d136db1a10f0 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -84,7 +84,7 @@ #define PUT_USER(error,value,addr) error = put_user(value,addr) #define COPY_TO_USER(error,dest,src,size) error = copy_to_user(dest,src,size) ? -EFAULT : 0 -#include <asm/uaccess.h> +#include <linux/uaccess.h> static MGSL_PARAMS default_params = { MGSL_MODE_HDLC, /* unsigned long mode */ diff --git a/drivers/char/random.c b/drivers/char/random.c index d6876d506220..1ef26403bcc8 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -265,7 +265,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/raw.c b/drivers/char/raw.c index e83b2adc014a..293167c6e254 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> struct raw_device_data { struct block_device *binding; diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c index 0bc135b9b16f..903761bc41c9 100644 --- a/drivers/char/scx200_gpio.c +++ b/drivers/char/scx200_gpio.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 719c5b4eed39..4fa7fcd8af36 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -52,7 +52,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/char/tlclk.c b/drivers/char/tlclk.c index 100cd1de9939..572a51704e67 100644 --- a/drivers/char/tlclk.c +++ b/drivers/char/tlclk.c @@ -44,7 +44,7 @@ #include #include #include /* inb/outb */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> MODULE_AUTHOR("Sebastien Bouchard "); MODULE_LICENSE("GPL"); diff --git a/drivers/char/toshiba.c b/drivers/char/toshiba.c index f5a45d887a37..5488516da8ea 100644 --- a/drivers/char/toshiba.c +++ b/drivers/char/toshiba.c @@ -63,7 +63,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c index c07dfe5c4da3..3e6b23c3453c 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c @@ -88,7 +88,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_OF /* For open firmware.
*/ diff --git a/drivers/cpufreq/ia64-acpi-cpufreq.c b/drivers/cpufreq/ia64-acpi-cpufreq.c index 759612da4fdc..e28a31a40829 100644 --- a/drivers/cpufreq/ia64-acpi-cpufreq.c +++ b/drivers/cpufreq/ia64-acpi-cpufreq.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index fe8f08948fcb..ac321f09e717 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PROMOTION_COUNT 4 #define DEMOTION_COUNT 1 diff --git a/drivers/dio/dio.c b/drivers/dio/dio.c index 55dd88d82d6d..830184529109 100644 --- a/drivers/dio/dio.c +++ b/drivers/dio/dio.c @@ -31,7 +31,7 @@ #include #include #include /* kmalloc() */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* readb() */ struct dio_bus dio_bus = { diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index de4d5d08af9e..65cf2b9355c4 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -13,7 +13,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 5f2c717f8053..750891ea07de 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "edac_mc.h" #include "edac_module.h" diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index 4e9d5632041a..48c844a72a27 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -10,7 +10,7 @@ * */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 3de95a29024c..cf9e396d7702 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -30,7 +30,7 @@ #define pr_fmt(fmt) "i2c-core: " fmt #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 0ceae5cbd89a..4b5dc0162e67 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -130,7 +130,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define DRV_NAME "hpt366" diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 83679da0c3f0..5ceace542b77 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -31,7 +31,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 6360bbd37efe..201e43fcbc94 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -51,7 +51,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include int ide_end_rq(ide_drive_t *drive, struct request *rq, int error, diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 376f2dc410c5..210a0887dd29 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include void SELECT_MASK(ide_drive_t *drive, int mask) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 0b63facd1d87..330e319419e6 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /** diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index 97c070077774..863db44c7916 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -16,7 +16,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 579f9a7f6283..e0a995b85a2d 100644 ---
a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -46,7 +46,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 415a3185cde7..249b403b43a4 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -50,7 +50,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 09b649159e6c..700782203483 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -38,7 +38,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "uverbs.h" #include "core_priv.h" diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 813593550c4b..b3f95d453fba 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -46,7 +46,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index a2f9f29c6ab5..fd811115af49 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ipoib.h" diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 64b3d11dcf1e..9104e6b8cac9 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -62,7 +62,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/input/input-compat.c b/drivers/input/input-compat.c index d84d20b9cec0..2186f71c9fe5 100644 --- a/drivers/input/input-compat.c +++ b/drivers/input/input-compat.c @@ -9,7 +9,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "input-compat.h" #ifdef CONFIG_COMPAT diff --git a/drivers/input/misc/atlas_btns.c b/drivers/input/misc/atlas_btns.c index 638165c78e75..6423aaccc763 100644 --- a/drivers/input/misc/atlas_btns.c +++ b/drivers/input/misc/atlas_btns.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define ACPI_ATLAS_NAME "Atlas ACPI" #define ACPI_ATLAS_CLASS "Atlas" diff --git a/drivers/input/mouse/amimouse.c b/drivers/input/mouse/amimouse.c index a7fd8f22ba56..a33437c480e3 100644 --- a/drivers/input/mouse/amimouse.c +++ b/drivers/input/mouse/amimouse.c @@ -25,7 +25,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/input/mouse/atarimouse.c b/drivers/input/mouse/atarimouse.c index d1c43236b125..96f2f51604bd 100644 --- a/drivers/input/mouse/atarimouse.c +++ b/drivers/input/mouse/atarimouse.c @@ -47,7 +47,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index 354d47ecd66a..7331084973e1 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "psmouse.h" #include "trackpoint.h" diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c index 852858e5d8d0..559c99ca6592 100644 --- a/drivers/input/serio/hp_sdc.c +++ b/drivers/input/serio/hp_sdc.c @@ -79,7 +79,7 @@ # define sdc_readb(p) gsc_readb(p) # define sdc_writeb(v,p) gsc_writeb((v),(p)) #elif defined(__mc68000__) -# include <asm/uaccess.h> +#include <linux/uaccess.h> # define sdc_readb(p) in_8(p) # define sdc_writeb(v,p) out_8((p),(v)) #else diff --git a/drivers/input/serio/q40kbd.c
b/drivers/input/serio/q40kbd.c index 5a9d521510bf..d0fccc8ec259 100644 --- a/drivers/input/serio/q40kbd.c +++ b/drivers/input/serio/q40kbd.c @@ -38,7 +38,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c index d189843f3727..f8ead9f9c77e 100644 --- a/drivers/input/serio/serport.c +++ b/drivers/input/serio/serport.c @@ -13,7 +13,7 @@ * the Free Software Foundation. */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/input/tablet/aiptek.c b/drivers/input/tablet/aiptek.c index 4613f0aefd08..d67547bded3e 100644 --- a/drivers/input/tablet/aiptek.c +++ b/drivers/input/tablet/aiptek.c @@ -75,7 +75,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/drivers/input/tablet/gtco.c b/drivers/input/tablet/gtco.c index abf09ac42ce4..b796e891e2ee 100644 --- a/drivers/input/tablet/gtco.c +++ b/drivers/input/tablet/gtco.c @@ -56,7 +56,7 @@ Scott Hill shill@gtcocalcomp.com #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 823f6985b260..49d0f70c2bae 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #ifdef AVMB1_COMPAT diff --git a/drivers/isdn/hardware/avm/b1.c b/drivers/isdn/hardware/avm/b1.c index 4d9b195547c5..9fdbd99c7547 100644 --- a/drivers/isdn/hardware/avm/b1.c +++ b/drivers/isdn/hardware/avm/b1.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "avmcard.h" diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c index 19b113faeb7b..818bd8f231db 100644 --- a/drivers/isdn/hardware/avm/b1dma.c +++ b/drivers/isdn/hardware/avm/b1dma.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "avmcard.h" diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c index 5d00d72fe482..17beb2869dc1 100644 --- a/drivers/isdn/hardware/avm/c4.c +++ b/drivers/isdn/hardware/avm/c4.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/isdn/hardware/eicon/capimain.c b/drivers/isdn/hardware/eicon/capimain.c index 997d46abf5b2..be36d82004d6 100644 --- a/drivers/isdn/hardware/eicon/capimain.c +++ b/drivers/isdn/hardware/eicon/capimain.c @@ -13,7 +13,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/isdn/hardware/eicon/divamnt.c b/drivers/isdn/hardware/eicon/divamnt.c index 0de29b7b712f..72e58bf07577 100644 --- a/drivers/isdn/hardware/eicon/divamnt.c +++ b/drivers/isdn/hardware/eicon/divamnt.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "platform.h" #include "di_defs.h" diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c index 4103a8c178d7..cb88090f9cea 100644 --- a/drivers/isdn/hardware/eicon/divasi.c +++ b/drivers/isdn/hardware/eicon/divasi.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "platform.h" #include "di_defs.h" diff --git a/drivers/isdn/hardware/eicon/divasmain.c b/drivers/isdn/hardware/eicon/divasmain.c index 32f34511c416..8b7ad4f1ab01 100644 --- a/drivers/isdn/hardware/eicon/divasmain.c +++ b/drivers/isdn/hardware/eicon/divasmain.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git
a/drivers/isdn/hardware/eicon/divasproc.c b/drivers/isdn/hardware/eicon/divasproc.c index 56ce98a4e248..b57efd6ad916 100644 --- a/drivers/isdn/hardware/eicon/divasproc.c +++ b/drivers/isdn/hardware/eicon/divasproc.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "platform.h" #include "debuglib.h" diff --git a/drivers/isdn/hysdn/hysdn_boot.c b/drivers/isdn/hysdn/hysdn_boot.c index eda4741e3f2f..4a0425378f37 100644 --- a/drivers/isdn/hysdn/hysdn_boot.c +++ b/drivers/isdn/hysdn/hysdn_boot.c @@ -13,7 +13,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hysdn_defs.h" #include "hysdn_pof.h" diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 9e385b38debf..ac219045daf7 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "lg.h" diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index e3abebc912c0..0bc127e9f16a 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "lg.h" /*M:008 diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 743253fc638f..d71f6323ac00 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -45,7 +45,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "../lg.h" diff --git a/drivers/macintosh/ans-lcd.c b/drivers/macintosh/ans-lcd.c index cd35079c8c98..281fa9e6fc1f 100644 --- a/drivers/macintosh/ans-lcd.c +++ b/drivers/macintosh/ans-lcd.c @@ -11,7 +11,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index 08edb2c25b60..227869159ac0 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -47,7 +47,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define VERSION "0.7" #define AUTHOR "(c) 2005 Benjamin Herrenschmidt, IBM Corp."
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 91081dcdc272..43b8db2b5445 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -57,7 +57,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/macintosh/via-pmu68k.c b/drivers/macintosh/via-pmu68k.c index a00ee41f0573..a411c5cb77a1 100644 --- a/drivers/macintosh/via-pmu68k.c +++ b/drivers/macintosh/via-pmu68k.c @@ -38,7 +38,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Misc minor number allocated for /dev/pmu */ #define PMU_MINOR 154 diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index c72a77048b73..a5a9b17f0f7f 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DM_MSG_PREFIX "ioctl" #define DM_DRIVER_EMAIL "dm-devel@redhat.com" diff --git a/drivers/media/dvb-core/dmxdev.c b/drivers/media/dvb-core/dmxdev.c index efe55a3e80d0..0c44479b556e 100644 --- a/drivers/media/dvb-core/dmxdev.c +++ b/drivers/media/dvb-core/dmxdev.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "dmxdev.h" static int debug; diff --git a/drivers/media/dvb-core/dvb_demux.c b/drivers/media/dvb-core/dvb_demux.c index 3ad0b2cd26b1..bbbff72bbb2a 100644 --- a/drivers/media/dvb-core/dvb_demux.c +++ b/drivers/media/dvb-core/dvb_demux.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "dvb_demux.h" diff --git a/drivers/media/dvb-core/dvb_net.c b/drivers/media/dvb-core/dvb_net.c index dfc03a95df71..bc5e8cfe7ca2 100644 --- a/drivers/media/dvb-core/dvb_net.c +++ b/drivers/media/dvb-core/dvb_net.c @@ -62,7 +62,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/dvb-core/dvb_ringbuffer.c b/drivers/media/dvb-core/dvb_ringbuffer.c index 7df7fb3738a0..5c4b5a1f604f 100644 --- a/drivers/media/dvb-core/dvb_ringbuffer.c +++ b/drivers/media/dvb-core/dvb_ringbuffer.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "dvb_ringbuffer.h" diff --git a/drivers/media/i2c/adv7170.c b/drivers/media/i2c/adv7170.c index 05f1dc6c72af..fc9ec0f3679c 100644 --- a/drivers/media/i2c/adv7170.c +++ b/drivers/media/i2c/adv7170.c @@ -32,7 +32,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/adv7175.c b/drivers/media/i2c/adv7175.c index f554809a51e7..72139bdae1ca 100644 --- a/drivers/media/i2c/adv7175.c +++ b/drivers/media/i2c/adv7175.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/bt856.c b/drivers/media/i2c/bt856.c index 48176591a80d..54c627859c8e 100644 --- a/drivers/media/i2c/bt856.c +++ b/drivers/media/i2c/bt856.c @@ -32,7 +32,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/bt866.c b/drivers/media/i2c/bt866.c index bbec70c882a3..0d3f46af2545 100644 --- a/drivers/media/i2c/bt866.c +++ b/drivers/media/i2c/bt866.c @@ -32,7 +32,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/cs53l32a.c b/drivers/media/i2c/cs53l32a.c index e4b3cf49dd38..59c1a98c5a90 100644 --- a/drivers/media/i2c/cs53l32a.c +++ b/drivers/media/i2c/cs53l32a.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/m52790.c b/drivers/media/i2c/m52790.c index 81171d8e1c2c..89c28c36c5bf 100644 ---
a/drivers/media/i2c/m52790.c +++ b/drivers/media/i2c/m52790.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/saa6588.c b/drivers/media/i2c/saa6588.c index 89e458c23983..00640233a5e3 100644 --- a/drivers/media/i2c/saa6588.c +++ b/drivers/media/i2c/saa6588.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/media/i2c/saa7110.c b/drivers/media/i2c/saa7110.c index 6f49886806ee..ad456ce051f9 100644 --- a/drivers/media/i2c/saa7110.c +++ b/drivers/media/i2c/saa7110.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/saa7185.c b/drivers/media/i2c/saa7185.c index eecad2d1edce..119050e1197a 100644 --- a/drivers/media/i2c/saa7185.c +++ b/drivers/media/i2c/saa7185.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/tlv320aic23b.c b/drivers/media/i2c/tlv320aic23b.c index 2e06c06cac9b..cc6104da34ef 100644 --- a/drivers/media/i2c/tlv320aic23b.c +++ b/drivers/media/i2c/tlv320aic23b.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/vp27smpx.c b/drivers/media/i2c/vp27smpx.c index d6c23bdbcd4a..ef0d8b8e3df7 100644 --- a/drivers/media/i2c/vp27smpx.c +++ b/drivers/media/i2c/vp27smpx.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/vpx3220.c b/drivers/media/i2c/vpx3220.c index 90b693f4e2ab..ce9f09370e22 100644 --- a/drivers/media/i2c/vpx3220.c +++ b/drivers/media/i2c/vpx3220.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/wm8739.c b/drivers/media/i2c/wm8739.c index f086e5e6e844..c885def54b15 100644 --- a/drivers/media/i2c/wm8739.c +++ b/drivers/media/i2c/wm8739.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/wm8775.c b/drivers/media/i2c/wm8775.c index 5581f4db02af..45039d756753 100644 --- a/drivers/media/i2c/wm8775.c +++ b/drivers/media/i2c/wm8775.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/pci/ivtv/ivtv-driver.h b/drivers/media/pci/ivtv/ivtv-driver.h index 10cba305dbd2..6b09a9514d64 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.h +++ b/drivers/media/pci/ivtv/ivtv-driver.h @@ -53,7 +53,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c index e825bc93ea7a..24fba633c217 100644 --- a/drivers/media/pci/meye/meye.c +++ b/drivers/media/pci/meye/meye.c @@ -37,7 +37,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/pci/zoran/videocodec.c b/drivers/media/pci/zoran/videocodec.c index 13a3c07cd259..3c3cbce0f9cc 100644 --- a/drivers/media/pci/zoran/videocodec.c +++ b/drivers/media/pci/zoran/videocodec.c @@ -40,7 +40,7 @@ #ifdef CONFIG_PROC_FS #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #endif #include "videocodec.h" diff --git a/drivers/media/pci/zoran/zoran_driver.c b/drivers/media/pci/zoran/zoran_driver.c index 2170e174c335..94b9b616df98 100644 --- a/drivers/media/pci/zoran/zoran_driver.c +++ b/drivers/media/pci/zoran/zoran_driver.c @@ -66,7 +66,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git
a/drivers/media/platform/arv.c b/drivers/media/platform/arv.c index 03c5098499c4..8fe59bf6cd3f 100644 --- a/drivers/media/platform/arv.c +++ b/drivers/media/platform/arv.c @@ -34,7 +34,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/usb/pvrusb2/pvrusb2-ioread.c b/drivers/media/usb/pvrusb2/pvrusb2-ioread.c index 70b8a052eb5b..3c7ca2c2c108 100644 --- a/drivers/media/usb/pvrusb2/pvrusb2-ioread.c +++ b/drivers/media/usb/pvrusb2/pvrusb2-ioread.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define BUFFER_COUNT 32 #define BUFFER_SIZE PAGE_ALIGN(0x4000) diff --git a/drivers/media/usb/pwc/pwc-ctrl.c b/drivers/media/usb/pwc/pwc-ctrl.c index 3a1618580ed6..655cef39eb3d 100644 --- a/drivers/media/usb/pwc/pwc-ctrl.c +++ b/drivers/media/usb/pwc/pwc-ctrl.c @@ -39,7 +39,7 @@ /* Control functions for the cam; brightness, contrast, video mode, etc. */ #ifdef __KERNEL__ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #endif #include diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index 57cfe26a393f..a5ea1f517291 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -54,7 +54,7 @@ #if defined(CONFIG_SPI) #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c index 8be561ab2615..fa2124cb31bd 100644 --- a/drivers/media/v4l2-core/v4l2-dev.c +++ b/drivers/media/v4l2-core/v4l2-dev.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index 02b5f69e1a42..7b3b41368931 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -58,7 +58,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/message/fusion/mptlan.h b/drivers/message/fusion/mptlan.h index 69e9d5463564..8946e19dbfc8 100644 --- a/drivers/message/fusion/mptlan.h +++ b/drivers/message/fusion/mptlan.h @@ -70,7 +70,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Override mptbase.h by pre-defining these!
*/ diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index 520f58439080..e05c3245930a 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -76,7 +76,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "ibmasm.h" #include "remote.h" diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index bab3f07b1117..cb1698f268f1 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -43,7 +43,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "queue.h" #include "block.h" diff --git a/drivers/mmc/host/android-goldfish.c b/drivers/mmc/host/android-goldfish.c index dca5518b0139..590a8a4522be 100644 --- a/drivers/mmc/host/android-goldfish.c +++ b/drivers/mmc/host/android-goldfish.c @@ -49,7 +49,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRIVER_NAME "goldfish_mmc" diff --git a/drivers/mtd/devices/pmc551.c b/drivers/mtd/devices/pmc551.c index 220f9200fa52..cadea0620cd0 100644 --- a/drivers/mtd/devices/pmc551.c +++ b/drivers/mtd/devices/pmc551.c @@ -82,7 +82,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/mtd/devices/slram.c b/drivers/mtd/devices/slram.c index a70eb83e68f1..8087c36dc693 100644 --- a/drivers/mtd/devices/slram.c +++ b/drivers/mtd/devices/slram.c @@ -30,7 +30,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index 9fb3b0dcdac2..664d206a4cbe 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -70,7 +70,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index b66b541877f0..8db740d6eb08 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index 1388c8d7f309..8d6bb189ea8e 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/mtd/maps/sun_uflash.c b/drivers/mtd/maps/sun_uflash.c index d459aca07881..414956eca0c9 100644 --- a/drivers/mtd/maps/sun_uflash.c +++ b/drivers/mtd/maps/sun_uflash.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 8d58acf33021..df8a5ef334c0 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mtdcore.h" diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 2a47a3f0e730..ce5ccc573a9c 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -37,7 +37,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mtdcore.h" diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index 46f27de018c3..e21161353e76 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 31f89f1c6123..b8c293373ecc 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ipddp.h" /* Our stuff */ diff --git a/drivers/net/eql.c b/drivers/net/eql.c index a10ad74cc8d2..fe13bfea30ac 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -127,7
+127,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int eql_open(struct net_device *dev); static int eql_close(struct net_device *dev); diff --git a/drivers/net/ethernet/3com/3c509.c b/drivers/net/ethernet/3com/3c509.c index a7533780dddc..c7f9f2c77da7 100644 --- a/drivers/net/ethernet/3com/3c509.c +++ b/drivers/net/ethernet/3com/3c509.c @@ -88,7 +88,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/3com/3c515.c b/drivers/net/ethernet/3com/3c515.c index be5b80103bec..e7b1fa56b290 100644 --- a/drivers/net/ethernet/3com/3c515.c +++ b/drivers/net/ethernet/3com/3c515.c @@ -72,7 +72,7 @@ static int max_interrupt_work = 20; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/3com/3c574_cs.c b/drivers/net/ethernet/3com/3c574_cs.c index 9359a37fedc0..47c844cc9d27 100644 --- a/drivers/net/ethernet/3com/3c574_cs.c +++ b/drivers/net/ethernet/3com/3c574_cs.c @@ -92,7 +92,7 @@ earlier 3Com products. #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /*====================================================================*/ diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index b3560a364e53..40196f41768a 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -92,7 +92,7 @@ static int vortex_debug = 1; #include #include /* For nr_irqs only. */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Kernel compatibility defines, some common to David Hinds' PCMCIA package. This is only in the support-all-kernels source code. */ diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index a0cacbe846ba..9fe3990319ec 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -119,7 +119,7 @@ static const int multicast_filter_limit = 32; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/8390/axnet_cs.c b/drivers/net/ethernet/8390/axnet_cs.c index 1d84a0544ace..3da1fc539ef9 100644 --- a/drivers/net/ethernet/8390/axnet_cs.c +++ b/drivers/net/ethernet/8390/axnet_cs.c @@ -46,7 +46,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define AXNET_CMD 0x00 #define AXNET_DATAPORT 0x10 /* NatSemi-defined port window offset. */ diff --git a/drivers/net/ethernet/8390/ne2k-pci.c b/drivers/net/ethernet/8390/ne2k-pci.c index 07355302443d..1bdea746926c 100644 --- a/drivers/net/ethernet/8390/ne2k-pci.c +++ b/drivers/net/ethernet/8390/ne2k-pci.c @@ -54,7 +54,7 @@ static int options[MAX_UNITS]; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "8390.h" diff --git a/drivers/net/ethernet/8390/pcnet_cs.c b/drivers/net/ethernet/8390/pcnet_cs.c index 63079a6e20d9..bd0a2a14b649 100644 --- a/drivers/net/ethernet/8390/pcnet_cs.c +++ b/drivers/net/ethernet/8390/pcnet_cs.c @@ -49,7 +49,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PCNET_CMD 0x00 #define PCNET_DATAPORT 0x10 /* NatSemi-defined port window offset. */ diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 3aaad33cdbc6..c12d2618eebf 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -45,7 +45,7 @@ #include #include #include /* Processor type for cache alignment.
*/ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c index 16f0c70266bc..a1a52eb53b14 100644 --- a/drivers/net/ethernet/alteon/acenic.c +++ b/drivers/net/ethernet/alteon/acenic.c @@ -80,7 +80,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRV_NAME "acenic" diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index 11cf1e3e0295..9595f1bc535b 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -87,7 +87,7 @@ Revision History: #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #if IS_ENABLED(CONFIG_VLAN_8021Q) #define AMD8111E_VLAN_TAG_USED 1 diff --git a/drivers/net/ethernet/amd/nmclan_cs.c b/drivers/net/ethernet/amd/nmclan_cs.c index 113a3b3cc50c..b556c926557a 100644 --- a/drivers/net/ethernet/amd/nmclan_cs.c +++ b/drivers/net/ethernet/amd/nmclan_cs.c @@ -151,7 +151,7 @@ Include Files #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* ---------------------------------------------------------------------------- diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index 1df3048a3cdb..48707ed76ffc 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 3a05f9098e75..d8aff7a4b3c7 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -44,7 +44,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cpl5_cmd.h" #include "regs.h" diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 7b2224ae72f2..d76491676b51 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -50,7 +50,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "common.h" #include "cxgb3_ioctl.h" diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 66c37fac59b2..6f951877430b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -63,7 +63,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "cxgb4.h" diff --git a/drivers/net/ethernet/dec/tulip/de2104x.c b/drivers/net/ethernet/dec/tulip/de2104x.c index 90c573b8ccaf..57c17e797ae3 100644 --- a/drivers/net/ethernet/dec/tulip/de2104x.c +++ b/drivers/net/ethernet/dec/tulip/de2104x.c @@ -49,7 +49,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These identify the driver base version and may not be removed.
*/ diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c index 51fda3a6b13f..df4a871df633 100644 --- a/drivers/net/ethernet/dec/tulip/de4x5.c +++ b/drivers/net/ethernet/dec/tulip/de4x5.c @@ -472,7 +472,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_PPC_PMAC #include #endif /* CONFIG_PPC_PMAC */ diff --git a/drivers/net/ethernet/dec/tulip/dmfe.c b/drivers/net/ethernet/dec/tulip/dmfe.c index df4994919456..07e10a45beaa 100644 --- a/drivers/net/ethernet/dec/tulip/dmfe.c +++ b/drivers/net/ethernet/dec/tulip/dmfe.c @@ -90,7 +90,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_TULIP_DM910X diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index 5f1377449b8f..17e566a8b345 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_SPARC #include diff --git a/drivers/net/ethernet/dec/tulip/uli526x.c b/drivers/net/ethernet/dec/tulip/uli526x.c index e1c4133b8787..f82ebe5d89ee 100644 --- a/drivers/net/ethernet/dec/tulip/uli526x.c +++ b/drivers/net/ethernet/dec/tulip/uli526x.c @@ -40,7 +40,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define uw32(reg, val) iowrite32(val, ioaddr + (reg)) #define ur32(reg) ioread32(ioaddr + (reg)) diff --git a/drivers/net/ethernet/dec/tulip/winbond-840.c b/drivers/net/ethernet/dec/tulip/winbond-840.c index feda96d585e7..bc9bf88e5831 100644 --- a/drivers/net/ethernet/dec/tulip/winbond-840.c +++ b/drivers/net/ethernet/dec/tulip/winbond-840.c @@ -129,7 +129,7 @@ static int full_duplex[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Processor type for cache alignment. */ #include #include diff --git a/drivers/net/ethernet/dec/tulip/xircom_cb.c b/drivers/net/ethernet/dec/tulip/xircom_cb.c index 19e4ea15b504..a8de79355578 100644 --- a/drivers/net/ethernet/dec/tulip/xircom_cb.c +++ b/drivers/net/ethernet/dec/tulip/xircom_cb.c @@ -30,7 +30,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_NET_POLL_CONTROLLER #include diff --git a/drivers/net/ethernet/dlink/dl2k.h b/drivers/net/ethernet/dlink/dl2k.h index 8f4f61262d5c..5d8ae5320242 100644 --- a/drivers/net/ethernet/dlink/dl2k.h +++ b/drivers/net/ethernet/dlink/dl2k.h @@ -31,7 +31,7 @@ #include #include /* Processor type for cache alignment. */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c index eab36acfc0d1..2e5b66762e15 100644 --- a/drivers/net/ethernet/dlink/sundance.c +++ b/drivers/net/ethernet/dlink/sundance.c @@ -91,7 +91,7 @@ static char *media[MAX_UNITS]; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Processor type for cache alignment. */ #include #include diff --git a/drivers/net/ethernet/fealnx.c b/drivers/net/ethernet/fealnx.c index 6967b287b6e7..9cb436cb3745 100644 --- a/drivers/net/ethernet/fealnx.c +++ b/drivers/net/ethernet/fealnx.c @@ -88,7 +88,7 @@ static int full_duplex[MAX_UNITS] = { -1, -1, -1, -1, -1, -1, -1, -1 }; #include /* Processor type for cache alignment. */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These identify the driver base version and may not be removed.
*/ diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index d9f3a480ca1b..1f98838f32b7 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -44,7 +44,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fs_enet.h" diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c b/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c index 120c758f5d01..6e64989f8478 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c @@ -42,7 +42,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fs_enet.h" diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-fec.c b/drivers/net/ethernet/freescale/fs_enet/mac-fec.c index 777beffa1e1e..db9c0bcf54cd 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-fec.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-fec.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_8xx #include diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-scc.c b/drivers/net/ethernet/freescale/fs_enet/mac-scc.c index 15abd37d70e3..96d44cf44fe0 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-scc.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-scc.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_8xx #include diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c index a89267b94352..1582d82483ec 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c +++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "fs_enet.h" diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 756f7e763d5f..a6e7afa878be 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -93,7 +93,7 @@ #include #endif #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/freescale/gianfar.h b/drivers/net/ethernet/freescale/gianfar.h index 6e8a9c8467b9..5aa814799d70 100644 --- a/drivers/net/ethernet/freescale/gianfar.h +++ b/drivers/net/ethernet/freescale/gianfar.h @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index 56588f2e1d91..a93e0199c369 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 53c5fcf1436c..9d660888510f 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -37,7 +37,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c index 8ba636f61b50..b642990b549c 100644 --- a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c +++ b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "ucc_geth.h" diff --git a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c index 51c4abc51bf4..a69cd19a55ae 100644 ---
a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c +++ b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c @@ -54,7 +54,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /*====================================================================*/ diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 52a69c925965..5909615c27f7 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -47,7 +47,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c index d2b29b490ae0..e5d72559cca9 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c @@ -30,7 +30,7 @@ #include "ixgb.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define IXGB_ALL_RAR_ENTRIES 16 diff --git a/drivers/net/ethernet/natsemi/natsemi.c b/drivers/net/ethernet/natsemi/natsemi.c index 22b0821c1da0..90eac63f9606 100644 --- a/drivers/net/ethernet/natsemi/natsemi.c +++ b/drivers/net/ethernet/natsemi/natsemi.c @@ -51,7 +51,7 @@ #include /* Processor type for cache alignment. */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRV_NAME "natsemi" #define DRV_VERSION "2.1" diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 93c4bdc0cdca..f9d2eb9a920a 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -119,7 +119,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRV_NAME "ns83820" diff --git a/drivers/net/ethernet/packetengines/hamachi.c b/drivers/net/ethernet/packetengines/hamachi.c index 2d04679a923a..baff744b560e 100644 --- a/drivers/net/ethernet/packetengines/hamachi.c +++ b/drivers/net/ethernet/packetengines/hamachi.c @@ -160,7 +160,7 @@ static int tx_params[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Processor type for cache alignment. */ #include #include diff --git a/drivers/net/ethernet/packetengines/yellowfin.c b/drivers/net/ethernet/packetengines/yellowfin.c index 2a2ca5fa0c69..fa7770da6ef8 100644 --- a/drivers/net/ethernet/packetengines/yellowfin.c +++ b/drivers/net/ethernet/packetengines/yellowfin.c @@ -100,7 +100,7 @@ static int gx_fix; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Processor type for cache alignment. */ #include #include diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index b7c89ebcf4a2..0b3cd58093d5 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -76,7 +76,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* These identify the driver base version and may not be removed. */ static char version[] = diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index 42051ab98cf0..d390b9663dc3 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -60,7 +60,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c index 39fca6c0b68d..19a458716f1a 100644 --- a/drivers/net/ethernet/sis/sis900.c +++ b/drivers/net/ethernet/sis/sis900.c @@ -74,7 +74,7 @@ #include /* Processor type for cache alignment.
*/ #include #include -#include <asm/uaccess.h> /* User space memory access functions */ +#include <linux/uaccess.h> /* User space memory access functions */ #include "sis900.h" diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c index fe9760ffab51..55a95e1d69d6 100644 --- a/drivers/net/ethernet/smsc/epic100.c +++ b/drivers/net/ethernet/smsc/epic100.c @@ -86,7 +86,7 @@ static int rx_copybreak; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These identify the driver base version and may not be removed. */ diff --git a/drivers/net/ethernet/smsc/smc91c92_cs.c b/drivers/net/ethernet/smsc/smc91c92_cs.c index f1c75e291e55..67154621abcf 100644 --- a/drivers/net/ethernet/smsc/smc91c92_cs.c +++ b/drivers/net/ethernet/smsc/smc91c92_cs.c @@ -52,7 +52,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /*====================================================================*/ diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index e9e5ef241c6f..0e8e89f17dbb 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -99,7 +99,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define cas_page_map(x) kmap_atomic((x)) #define cas_page_unmap(x) kunmap_atomic((x)) diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index 66ecf0fcc330..d277e4107976 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -42,7 +42,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_SPARC diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index ca96408058b0..72ff05cd3ed8 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -49,7 +49,7 @@ #include #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index ba5c54249055..0a6c4e804eed 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -114,7 +114,7 @@ static const int multicast_filter_limit = 32; #include /* Processor type for cache alignment. */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These identify the driver base version and may not be removed.
*/ diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c index 3b08ec766076..f71883264cc0 100644 --- a/drivers/net/ethernet/xircom/xirc2ps_cs.c +++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c @@ -88,7 +88,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifndef MANFID_COMPAQ #define MANFID_COMPAQ 0x0138 diff --git a/drivers/net/fddi/skfp/skfddi.c b/drivers/net/fddi/skfp/skfddi.c index 3a639180e4a0..2414f1dc8ddd 100644 --- a/drivers/net/fddi/skfp/skfddi.c +++ b/drivers/net/fddi/skfp/skfddi.c @@ -88,7 +88,7 @@ static const char * const boot_msg = #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "h/types.h" #undef ADDR // undo Linux definition diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 470b3dcd54e5..922bf440e9f1 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -13,7 +13,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 78dbc44540f6..7d054697b199 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -55,7 +55,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_par.c b/drivers/net/hamradio/baycom_par.c index 072cddce9264..809dc25909d1 100644 --- a/drivers/net/hamradio/baycom_par.c +++ b/drivers/net/hamradio/baycom_par.c @@ -86,7 +86,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c index 7b916d5b14b9..ebc06822fd4d 100644 --- a/drivers/net/hamradio/baycom_ser_fdx.c +++ b/drivers/net/hamradio/baycom_ser_fdx.c @@ -82,7 +82,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/hamradio/baycom_ser_hdx.c b/drivers/net/hamradio/baycom_ser_hdx.c index f9a8976195ba..60fcf512c208 100644 --- a/drivers/net/hamradio/baycom_ser_hdx.c +++ b/drivers/net/hamradio/baycom_ser_hdx.c @@ -67,7 +67,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index 622ab3ab9e93..f62e7f325cf9 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -69,7 +69,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c index e4137c1b3df9..2479072981a1 100644 --- a/drivers/net/hamradio/dmascc.c +++ b/drivers/net/hamradio/dmascc.c @@ -40,7 +40,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "z8530.h" diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index 4bad0b894e9c..8c3633c1d078 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -58,7 +58,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 1dfe2304daa7..ece59c54a653 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -17,7 +17,7 @@ */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c index b8083161ef46..6754cd01c605 100644 --- a/drivers/net/hamradio/scc.c +++ b/drivers/net/hamradio/scc.c @@ -178,7 +178,7 @@ #include #include -#include <asm/uaccess.h>
+#include <linux/uaccess.h> #include "z8530.h" diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index aaff07c10058..b6891ada1d7b 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -68,7 +68,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c index f5a9728b89f3..dd7fc6659ad4 100644 --- a/drivers/net/hippi/rrunner.c +++ b/drivers/net/hippi/rrunner.c @@ -46,7 +46,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define rr_if_busy(dev) netif_queue_stopped(dev) #define rr_if_running(dev) netif_running(dev) diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c index 7a3f990c1935..7a20a9a4663a 100644 --- a/drivers/net/irda/irtty-sir.c +++ b/drivers/net/irda/irtty-sir.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/kingsun-sir.c b/drivers/net/irda/kingsun-sir.c index fb5d162ec7d2..24c0f169a7b1 100644 --- a/drivers/net/irda/kingsun-sir.c +++ b/drivers/net/irda/kingsun-sir.c @@ -71,7 +71,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/ks959-sir.c b/drivers/net/irda/ks959-sir.c index 8e6e0edf2440..3affded3e30d 100644 --- a/drivers/net/irda/ks959-sir.c +++ b/drivers/net/irda/ks959-sir.c @@ -123,7 +123,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/ksdazzle-sir.c b/drivers/net/irda/ksdazzle-sir.c index 37f23a189b35..741452c7ce35 100644 --- a/drivers/net/irda/ksdazzle-sir.c +++ b/drivers/net/irda/ksdazzle-sir.c @@ -87,7 +87,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/mcs7780.c b/drivers/net/irda/mcs7780.c index bca6a1e72d1d..6f6ed75b63c9 100644 --- a/drivers/net/irda/mcs7780.c +++ b/drivers/net/irda/mcs7780.c @@ -55,7 +55,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c index a0849f49bbec..ffedad2a360a 100644 --- a/drivers/net/irda/vlsi_ir.c +++ b/drivers/net/irda/vlsi_ir.c @@ -45,7 +45,7 @@ MODULE_LICENSE("GPL"); #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 6255973e3dda..1e05b7c2d157 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/phy/davicom.c b/drivers/net/phy/davicom.c index 36e3e2033eca..e28913d9ea7e 100644 --- a/drivers/net/phy/davicom.c +++ b/drivers/net/phy/davicom.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define MII_DM9161_SCR 0x10 #define MII_DM9161_SCR_INIT 0x0610 diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c index 22b51f01a94a..567280a72241 100644 --- a/drivers/net/phy/icplus.c +++ b/drivers/net/phy/icplus.c @@ -28,7 +28,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> MODULE_DESCRIPTION("ICPlus IP175C/IP101A/IP101G/IC1001 PHY drivers"); MODULE_AUTHOR("Michael Barkowski"); diff --git a/drivers/net/phy/lxt.c b/drivers/net/phy/lxt.c index b9fde1bcf0f0..8d198a1f0031 100644 --- a/drivers/net/phy/lxt.c +++ b/drivers/net/phy/lxt.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* The Level one LXT970 is used by many boards */ diff --git a/drivers/net/phy/qsemi.c b/drivers/net/phy/qsemi.c index d470db89e8dd..dbef8002bc28 100644 --- a/drivers/net/phy/qsemi.c +++ b/drivers/net/phy/qsemi.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /*
------------------------------------------------------------------------- */ /* The Quality Semiconductor QS6612 is used on the RPX CLLF */ diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c index 9c889e0303dd..feb9569e3345 100644 --- a/drivers/net/ppp/ppp_async.c +++ b/drivers/net/ppp/ppp_async.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define PPP_VERSION "2.4.2" diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index 925d3e295bac..9ae53986cb4a 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -47,7 +47,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PPP_VERSION "2.4.2" diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index f017c72bb7fd..d7e405268983 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -83,7 +83,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PPPOE_HASH_BITS 4 #define PPPOE_HASH_SIZE (1 << PPPOE_HASH_BITS) diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c index b9c8be6283d3..c0599b3b23c0 100644 --- a/drivers/net/ppp/pppox.c +++ b/drivers/net/ppp/pppox.c @@ -34,7 +34,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const struct pppox_proto *pppox_protos[PX_MAX_PROTO + 1]; diff --git a/drivers/net/sb1000.c b/drivers/net/sb1000.c index 8b8b53259783..7820fced33f6 100644 --- a/drivers/net/sb1000.c +++ b/drivers/net/sb1000.c @@ -55,7 +55,7 @@ static char version[] = "sb1000.c:v1.1.2 6/01/98 (fventuri@mediaone.net)\n"; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef SB1000_DEBUG static int sb1000_debug = SB1000_DEBUG; diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index 27ed25252aac..5782733959f0 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -75,7 +75,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index 7e933d8ff811..9841f3dc0682 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -64,7 +64,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 57e88b814700..cd8e02c94be0 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -73,7 +73,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Uncomment to enable debugging */ /* #define TUN_DEBUG 1 */ diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c index a1f2f6f1e614..3daa41bdd4ea 100644 --- a/drivers/net/usb/catc.c +++ b/drivers/net/usb/catc.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #undef DEBUG diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c index 338aed5da14d..876f02f4945e 100644 --- a/drivers/net/usb/kaweth.c +++ b/drivers/net/usb/kaweth.c @@ -54,7 +54,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #undef DEBUG diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c index 399f7ee57aea..24e803fe9a53 100644 --- a/drivers/net/usb/pegasus.c +++ b/drivers/net/usb/pegasus.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "pegasus.h" /* diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 93a1bda1c1e5..95b7bd0d7abc 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Version Information */ #define DRIVER_VERSION "v0.6.2 (2004/08/27)" diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index ae6ecf401189..65ee2a6f248c 100644 --- a/drivers/net/wan/dlci.c +++
b/drivers/net/wan/dlci.c @@ -52,7 +52,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const char version[] = "DLCI driver v0.35, 4 Jan 1997, mike.mclagan@linux.org"; diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c index 7351e5440ed7..799830ffcae2 100644 --- a/drivers/net/wan/dscc4.c +++ b/drivers/net/wan/dscc4.c @@ -95,7 +95,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c index 03696d35ee9c..33265eb50420 100644 --- a/drivers/net/wan/farsync.c +++ b/drivers/net/wan/farsync.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "farsync.h" diff --git a/drivers/net/wan/hd64570.c b/drivers/net/wan/hd64570.c index dc334c85d966..166696d2c496 100644 --- a/drivers/net/wan/hd64570.c +++ b/drivers/net/wan/hd64570.c @@ -39,7 +39,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hd64570.h" #define get_msci(port) (phy_node(port) ? MSCI1_OFFSET : MSCI0_OFFSET) diff --git a/drivers/net/wan/hd64572.c b/drivers/net/wan/hd64572.c index e92ecf1d3314..7ef49dab6855 100644 --- a/drivers/net/wan/hd64572.c +++ b/drivers/net/wan/hd64572.c @@ -39,7 +39,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hd64572.h" #define NAPI_WEIGHT 16 diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index 6676607164d6..9df9ed62beff 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c index 001b7796740d..4698450c77d1 100644 --- a/drivers/net/wan/lmc/lmc_main.c +++ b/drivers/net/wan/lmc/lmc_main.c @@ -59,7 +59,7 @@ #include /* Processor type for cache alignment. */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> //#include #define DRIVER_MAJOR_VERSION 1 diff --git a/drivers/net/wan/lmc/lmc_media.c b/drivers/net/wan/lmc/lmc_media.c index ff2e4a5654c7..cffe23bd16e1 100644 --- a/drivers/net/wan/lmc/lmc_media.c +++ b/drivers/net/wan/lmc/lmc_media.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "lmc.h" #include "lmc_var.h" diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c index 3f83be98d469..3ca3419c54a0 100644 --- a/drivers/net/wan/sbni.c +++ b/drivers/net/wan/sbni.c @@ -63,7 +63,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sbni.h" diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index 421ac5f85699..236c62538036 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -56,7 +56,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const char* version = "SDLA driver v0.30, 12 Sep 1996, mike.mclagan@linux.org"; diff --git a/drivers/net/wireless/atmel/atmel.c b/drivers/net/wireless/atmel/atmel.c index eb92d5ab7a27..e12f62356fd1 100644 --- a/drivers/net/wireless/atmel/atmel.c +++ b/drivers/net/wireless/atmel/atmel.c @@ -48,7 +48,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c index 64176090b196..356aba9d3d53 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c @@ -148,7 +148,7 @@ that only one external action is invoked at a time.
#include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_geo.c b/drivers/net/wireless/intel/ipw2x00/libipw_geo.c index 218f2a32de21..ce7eda20a68f 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_geo.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_geo.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "libipw.h" diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_module.c b/drivers/net/wireless/intel/ipw2x00/libipw_module.c index 2332075565f2..c58c5b2dcce5 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_module.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_module.c @@ -46,7 +46,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c index 1c1ec7bb9302..6df19f03355a 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_tx.c b/drivers/net/wireless/intel/ipw2x00/libipw_tx.c index e8c039879b05..048f1e3ada11 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_tx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_tx.c @@ -39,7 +39,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "libipw.h" diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index a8a9bd8e176a..544ef7adde7d 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -32,7 +32,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/wireless/intersil/hostap/hostap_main.c b/drivers/net/wireless/intersil/hostap/hostap_main.c index 1a16b8cb366e..544fc09dcb62 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_main.c +++ b/drivers/net/wireless/intersil/hostap/hostap_main.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hostap_wlan.h" #include "hostap_80211.h" diff --git a/drivers/net/wireless/intersil/prism54/isl_38xx.c b/drivers/net/wireless/intersil/prism54/isl_38xx.c index 6700387ef9ab..ce9d4db0d9ca 100644 --- a/drivers/net/wireless/intersil/prism54/isl_38xx.c +++ b/drivers/net/wireless/intersil/prism54/isl_38xx.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "prismcompat.h" diff --git a/drivers/net/wireless/intersil/prism54/isl_ioctl.c b/drivers/net/wireless/intersil/prism54/isl_ioctl.c index 48e8a978a832..334717b0a2be 100644 --- a/drivers/net/wireless/intersil/prism54/isl_ioctl.c +++ b/drivers/net/wireless/intersil/prism54/isl_ioctl.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "prismcompat.h" #include "isl_ioctl.h" diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index 4fdc7223c894..b94479441b0c 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -53,7 +53,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Warning : these stuff will slow down the driver...
*/ #define WIRELESS_SPY /* Enable spying addresses */ diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c index d9d29ab88184..acec0d9ec422 100644 --- a/drivers/net/wireless/wl3501_cs.c +++ b/drivers/net/wireless/wl3501_cs.c @@ -51,7 +51,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "wl3501.h" diff --git a/drivers/nubus/proc.c b/drivers/nubus/proc.c index 5371b374f1fe..e8f68f5732f1 100644 --- a/drivers/nubus/proc.c +++ b/drivers/nubus/proc.c @@ -25,7 +25,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include static int diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index c0cc4e7ff023..67935fbbbcab 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "oprof.h" #include "event_buffer.h" diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 134398e0231b..d77ebbfc67c9 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "oprof.h" diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 3ed6238f8f6e..553ef8a5d588 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -48,7 +48,7 @@ #include #include /* for L1_CACHE_BYTES */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/parisc/ccio-rm-dma.c b/drivers/parisc/ccio-rm-dma.c index f78f6f1aef47..1bf988010855 100644 --- a/drivers/parisc/ccio-rm-dma.c +++ b/drivers/parisc/ccio-rm-dma.c @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c index 783906fe659a..4dd9b1308128 100644 --- a/drivers/parisc/eisa_eeprom.c +++ b/drivers/parisc/eisa_eeprom.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define EISA_EEPROM_MINOR 241 diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c index 21905fef2cbf..d9bffe8d29b9 100644 --- a/drivers/parisc/eisa_enumerator.c +++ b/drivers/parisc/eisa_enumerator.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index b48243131993..ff1a332d76e4 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -49,7 +49,7 @@ #include /* HZ */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* The control of the LEDs and LCDs on PARISC-machines have to be done completely in software.
The necessary calculations are done in a work queue diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index 3651c3871d5b..055f83fddc18 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -68,7 +68,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define PDCS_VERSION "0.30" diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c index 5bed17f68ef4..d998d0ed2bec 100644 --- a/drivers/parport/daisy.c +++ b/drivers/parport/daisy.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #undef DEBUG diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c index 2e21af43d91e..c0e7d21c88c2 100644 --- a/drivers/parport/ieee1284_ops.c +++ b/drivers/parport/ieee1284_ops.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #undef DEBUG /* undef me for production */ diff --git a/drivers/parport/parport_gsc.c b/drivers/parport/parport_gsc.c index 6e3a60c78873..dd6d4ccb41e4 100644 --- a/drivers/parport/parport_gsc.c +++ b/drivers/parport/parport_gsc.c @@ -34,7 +34,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/parport/probe.c b/drivers/parport/probe.c index d763bc9e44c1..4d1d6eaf333d 100644 --- a/drivers/parport/probe.c +++ b/drivers/parport/probe.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const struct { const char *token; diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 74ed3e459a3e..8ee44a104ac4 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -23,7 +23,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) diff --git a/drivers/pci/hotplug/acpiphp_ibm.c b/drivers/pci/hotplug/acpiphp_ibm.c index f6221d739f59..68d105aaf4e2 100644 --- a/drivers/pci/hotplug/acpiphp_ibm.c +++ b/drivers/pci/hotplug/acpiphp_ibm.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "acpiphp.h" #include "../pci.h" diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c index ec009a7dba20..33d300d12411 100644 --- a/drivers/pci/hotplug/cpqphp_core.c +++ b/drivers/pci/hotplug/cpqphp_core.c @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cpqphp.h" #include "cpqphp_nvram.h" diff --git a/drivers/pci/hotplug/cpqphp_nvram.c b/drivers/pci/hotplug/cpqphp_nvram.c index c25fc9061059..daae8071a156 100644 --- a/drivers/pci/hotplug/cpqphp_nvram.c +++ b/drivers/pci/hotplug/cpqphp_nvram.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cpqphp.h" #include "cpqphp_nvram.h" diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c index 56013d0daf7f..7b0e97be9063 100644 --- a/drivers/pci/hotplug/pci_hotplug_core.c +++ b/drivers/pci/hotplug/pci_hotplug_core.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "../pci.h" #include "cpci_hotplug.h" diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 2408abe4ee8c..f82710a8694d 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "pci.h" diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index b91c4da68365..9bf993e1f71e 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "pci.h" SYSCALL_DEFINE5(pciconfig_read, unsigned long, bus, unsigned long, dfn, diff --git a/drivers/platform/x86/sony-laptop.c
b/drivers/platform/x86/sony-laptop.c index c890a49587e4..aa2ee51d3547 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -68,7 +68,7 @@ #include #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define dprintk(fmt, ...) \ diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index aa65a857a6b1..cacb43fb1df7 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -82,7 +82,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* ThinkPad CMOS commands */ diff --git a/drivers/pnp/interface.c b/drivers/pnp/interface.c index 4b6808ff0e5d..5c5b3d47b5f6 100644 --- a/drivers/pnp/interface.c +++ b/drivers/pnp/interface.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "base.h" diff --git a/drivers/pnp/pnpbios/proc.c b/drivers/pnp/pnpbios/proc.c index c212db0fc65d..5ee6b2a5f8d5 100644 --- a/drivers/pnp/pnpbios/proc.c +++ b/drivers/pnp/pnpbios/proc.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "pnpbios.h" diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 84ca314c87e3..dd46e96a3034 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* This is ugly... */ diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 67bf50c9946f..ade04216c970 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index 6c5d671304b4..8713fefd794b 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/s390/block/dasd_erp.c b/drivers/s390/block/dasd_erp.c index 113c1c1fa1af..9e3419124264 100644 --- a/drivers/s390/block/dasd_erp.c +++ b/drivers/s390/block/dasd_erp.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* This is ugly... */ #define PRINTK_HEADER "dasd_erp:" diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index e2fa759bf2ad..8b1341fb2e0d 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -16,7 +16,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* This is ugly... */ #define PRINTK_HEADER "dasd_gendisk:" diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 9dfbd972f844..ec65c1e51c2a 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* This is ugly... */ #define PRINTK_HEADER "dasd_ioctl:" diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c index bad7a196bf84..70dc2c4cd3f7 100644 --- a/drivers/s390/block/dasd_proc.c +++ b/drivers/s390/block/dasd_proc.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* This is ugly...
*/ #define PRINTK_HEADER "dasd_proc:" diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 288f59a4147b..b9d7e755c8a3 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -41,7 +41,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define XPRAM_NAME "xpram" #define XPRAM_DEVS 1 /* one partition */ diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c index 1b8d825623bd..9ec4ae056158 100644 --- a/drivers/s390/char/con3215.c +++ b/drivers/s390/char/con3215.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c index 7b9c50aa4cc9..82c913318b73 100644 --- a/drivers/s390/char/keyboard.c +++ b/drivers/s390/char/keyboard.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "keyboard.h" diff --git a/drivers/s390/char/monreader.c b/drivers/s390/char/monreader.c index ebdeaa53182d..027ac6ae5eea 100644 --- a/drivers/s390/char/monreader.c +++ b/drivers/s390/char/monreader.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/s390/char/monwriter.c b/drivers/s390/char/monwriter.c index 9b5d1138b2e2..571a7e352755 100644 --- a/drivers/s390/char/monwriter.c +++ b/drivers/s390/char/monwriter.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/char/sclp_rw.c b/drivers/s390/char/sclp_rw.c index 6010cd347a08..91b26df5227d 100644 --- a/drivers/s390/char/sclp_rw.c +++ b/drivers/s390/char/sclp_rw.c @@ -13,7 +13,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sclp.h" #include "sclp_rw.h" diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c index 9259017a1295..236b736ae136 100644 --- a/drivers/s390/char/sclp_tty.c +++ b/drivers/s390/char/sclp_tty.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ctrlchar.h" #include "sclp.h" diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c index 68d6ee7ae504..095481d32236 100644 --- a/drivers/s390/char/sclp_vt220.c +++ b/drivers/s390/char/sclp_vt220.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sclp.h" #include "ctrlchar.h" diff --git a/drivers/s390/char/tape_char.c b/drivers/s390/char/tape_char.c index 77f9b9c2f701..46ac1164f242 100644 --- a/drivers/s390/char/tape_char.c +++ b/drivers/s390/char/tape_char.c @@ -18,7 +18,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define TAPE_DBF_AREA tape_core_dbf diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c index 272cb6cd1b2a..e5ebe2fbee23 100644 --- a/drivers/s390/char/tty3270.c +++ b/drivers/s390/char/tty3270.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "raw3270.h" #include "tty3270.h" diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 2a67b496a9e2..65f5a794f26d 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "vmcp.h" static debug_info_t *vmcp_debug; diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index 3167e8581994..57974a1e0e03 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index ff18f373af9a..04aceb694d51 100644 --- a/drivers/s390/char/vmur.c
+++ b/drivers/s390/char/vmur.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index f771e5e9e26b..d3b51edb056e 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c index 9082476b51db..bf7f5d4c50e1 100644 --- a/drivers/s390/cio/blacklist.c +++ b/drivers/s390/cio/blacklist.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 854a6e58dfea..51eece9af577 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -36,7 +36,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/crypto/zcrypt_cex2a.c b/drivers/s390/crypto/zcrypt_cex2a.c index c7d48a18199e..b97c5d5ee5a4 100644 --- a/drivers/s390/crypto/zcrypt_cex2a.c +++ b/drivers/s390/crypto/zcrypt_cex2a.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_pcixcc.c b/drivers/s390/crypto/zcrypt_pcixcc.c index 26ceaa696765..600604782b65 100644 --- a/drivers/s390/crypto/zcrypt_pcixcc.c +++ b/drivers/s390/crypto/zcrypt_pcixcc.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "ap_bus.h" diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c index 2981024a2438..3f85b97ab8d2 100644 --- a/drivers/s390/net/netiucv.c +++ b/drivers/s390/net/netiucv.c @@ -62,7 +62,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c index 33fbe8249fd5..04efed171c88 100644 --- a/drivers/sbus/char/display7seg.c +++ b/drivers/sbus/char/display7seg.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> /* put_/get_user */ +#include <linux/uaccess.h> /* put_/get_user */ #include #include diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c index 5609b602c54d..56e962a01493 100644 --- a/drivers/sbus/char/envctrl.c +++ b/drivers/sbus/char/envctrl.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/sbus/char/flash.c b/drivers/sbus/char/flash.c index 206ef4232adf..216f923161d1 100644 --- a/drivers/sbus/char/flash.c +++ b/drivers/sbus/char/flash.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index a40ee1e37486..6ff61dad5e21 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -37,7 +37,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/sbus/char/openprom.c b/drivers/sbus/char/openprom.c index 4612691c6619..2c2e6a3b4c7e 100644 --- a/drivers/sbus/char/openprom.c +++ b/drivers/sbus/char/openprom.c @@ -40,7 +40,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_PCI #include diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 316f87fe3299..00e7968a1d70 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -92,7 +92,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index 970d8fa6bd53..b150e131b2e7 100644 --- a/drivers/scsi/3w-sas.c +++
b/drivers/scsi/3w-sas.c @@ -64,7 +64,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index aa412ab02765..33261b690774 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -210,7 +210,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index 6678d1fd897b..1ee7c654f7b8 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include /* For flush_kernel_dcache_page */ #include diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c index 5648b715fed9..e1daff230c7d 100644 --- a/drivers/scsi/aacraid/commctrl.c +++ b/drivers/scsi/aacraid/commctrl.c @@ -41,7 +41,7 @@ #include /* ssleep prototype */ #include #include -#include +#include #include #include "aacraid.h" diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 9e45749d55ed..af032c46ec0e 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -61,7 +61,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c index 9d253cb83ee7..d9e15210b110 100644 --- a/drivers/scsi/bfa/bfad.c +++ b/drivers/scsi/bfa/bfad.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include "bfad_drv.h" diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c index 27c0dce22e72..5f75e638ec95 100644 --- a/drivers/scsi/dpt_i2o.c +++ b/drivers/scsi/dpt_i2o.c @@ -37,7 +37,7 @@ MODULE_DESCRIPTION("Adaptec I2O RAID Driver"); //////////////////////////////////////////////////////////////// #include /* For SCSI-Passthrough */ -#include +#include #include #include /* for kmalloc() */ diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c index 0a767740bf02..d020a13646ae 100644 --- a/drivers/scsi/gdth.c +++ b/drivers/scsi/gdth.c @@ -130,7 +130,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c index a83f705ed8a5..db17ad15b0c1 100644 --- a/drivers/scsi/hptiop.c +++ b/drivers/scsi/hptiop.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index 45b9566b928e..b782bb60baf0 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -51,7 +51,7 @@ #define _IPS_H_ #include - #include +#include #include /* diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 9d05302a3bcd..3c63c292cb92 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/megaraid/megaraid_mm.h b/drivers/scsi/megaraid/megaraid_mm.h index 55b425c0a654..a30e725f2d5c 100644 --- a/drivers/scsi/megaraid/megaraid_mm.h +++ b/drivers/scsi/megaraid/megaraid_mm.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 6484c382f670..d5cf15eb8c5e 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 
a2960f5d98ec..e8196c55b633 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -52,7 +52,7 @@ static const char * osst_version = "0.99.4"; #include #include #include -#include +#include #include /* The driver prints some debugging information on the console if DEBUG diff --git a/drivers/scsi/qla2xxx/qla_sup.c b/drivers/scsi/qla2xxx/qla_sup.c index 9f6012b78e56..b4336e0cd85f 100644 --- a/drivers/scsi/qla2xxx/qla_sup.c +++ b/drivers/scsi/qla2xxx/qla_sup.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include /* * NVRAM support routines diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index c4f7b56fa6f6..8b8c814df5c7 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c index 7a74b82e8973..480a597b3877 100644 --- a/drivers/scsi/scsi_proc.c +++ b/drivers/scsi/scsi_proc.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 1622e23138e0..b1933041da39 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index bed2bbd6b923..94352e4df831 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index 03054c0e7689..dfffdf63e44c 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 605887d5ee57..5f35b863e1a7 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -41,7 +41,7 @@ static const char *verstr = "20160209"; #include #include -#include +#include #include #include diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index dfbb974927f2..dea16bb8c46a 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -127,7 +127,7 @@ static struct serial_state rs_table[1]; #define NR_PORTS ARRAY_SIZE(rs_table) -#include +#include #define serial_isroot() (capable(CAP_SYS_ADMIN)) diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c index ce864875330e..9b5c0fb216b5 100644 --- a/drivers/tty/hvc/hvc_console.c +++ b/drivers/tty/hvc/hvc_console.c @@ -42,7 +42,7 @@ #include #include -#include +#include #include "hvc_console.h" diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c index 3c4d7c2b4ade..7823d6d998cf 100644 --- a/drivers/tty/hvc/hvcs.c +++ b/drivers/tty/hvc/hvcs.c @@ -81,7 +81,7 @@ #include #include #include -#include +#include #include /* diff --git a/drivers/tty/hvc/hvsi.c b/drivers/tty/hvc/hvsi.c index 96ce6bd1cc6f..2e578d6433af 100644 --- a/drivers/tty/hvc/hvsi.c +++ b/drivers/tty/hvc/hvsi.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c index 60d37b225589..4caf0c3b1f99 100644 --- a/drivers/tty/moxa.c +++ b/drivers/tty/moxa.c @@ -47,7 +47,7 @@ #include #include -#include +#include #include "moxa.h" diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index 69294ae154be..7b8f383fb090 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -43,7 +43,7 @@ #include #include -#include +#include #include "mxser.h" diff --git 
a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c index a7fa016f31eb..eb278832f5ce 100644 --- a/drivers/tty/n_hdlc.c +++ b/drivers/tty/n_hdlc.c @@ -103,7 +103,7 @@ #include #include -#include +#include /* * Buffers for individual HDLC frames diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c index 345111467b85..305b6490d405 100644 --- a/drivers/tty/n_r3964.c +++ b/drivers/tty/n_r3964.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include /*#define DEBUG_QUEUE*/ diff --git a/drivers/tty/serial/icom.c b/drivers/tty/serial/icom.c index c60a8d5e4020..d83783cfbade 100644 --- a/drivers/tty/serial/icom.c +++ b/drivers/tty/serial/icom.c @@ -53,7 +53,7 @@ #include #include -#include +#include #include "icom.h" diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index d0847375ea64..9939c3d9912b 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -36,7 +36,7 @@ #include #include -#include +#include /* * This is used to lock changes in serial line configuration. diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index 415885c56435..657eed82eeb3 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -107,7 +107,7 @@ #define PUT_USER(error,value,addr) error = put_user(value,addr) #define COPY_TO_USER(error,dest,src,size) error = copy_to_user(dest,src,size) ? -EFAULT : 0 -#include +#include #define RCLRVALUE 0xffff diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 8267bcf2405e..31885f20fc15 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -77,7 +77,7 @@ #include #include #include -#include +#include #if defined(CONFIG_HDLC) || (defined(CONFIG_HDLC_MODULE) && defined(CONFIG_SYNCLINK_GT_MODULE)) #define SYNCLINK_GENERIC_HDLC 1 diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index d66620f7eaa3..51e8846cd68f 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -79,7 +79,7 @@ #define PUT_USER(error,value,addr) error = put_user(value,addr) #define COPY_TO_USER(error,dest,src,size) error = copy_to_user(dest,src,size) ? 
-EFAULT : 0 -#include +#include static MGSL_PARAMS default_params = { MGSL_MODE_HDLC, /* unsigned long mode */ diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index bf36ac9aee41..f27fc0f14c11 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -22,7 +22,7 @@ #include #include -#include +#include #undef TTY_DEBUG_WAIT_UNTIL_SENT diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c index 71e81406ef71..1f6e17fc3fb0 100644 --- a/drivers/tty/vt/consolemap.c +++ b/drivers/tty/vt/consolemap.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c index 368ce1803e8f..36e1b8c7680f 100644 --- a/drivers/tty/vt/selection.c +++ b/drivers/tty/vt/selection.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c index 14a2b5f11bca..56dcff6059d3 100644 --- a/drivers/tty/vt/vc_screen.c +++ b/drivers/tty/vt/vc_screen.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index f62c598810ff..a56edf2d58eb 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c index 4dec9df8764b..5a59da0dc98a 100644 --- a/drivers/usb/atm/usbatm.c +++ b/drivers/usb/atm/usbatm.c @@ -64,7 +64,7 @@ #include "usbatm.h" -#include +#include #include #include #include diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 143454ea385b..1fa5c0f29c64 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include "hub.h" diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 10b2576f8b6a..e8f4102d19df 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c index 5d3d914ab4fb..683098afa93e 100644 --- a/drivers/usb/host/uhci-hcd.c +++ b/drivers/usb/host/uhci-hcd.c @@ -42,7 +42,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/usb/misc/ftdi-elan.c b/drivers/usb/misc/ftdi-elan.c index 9a82f8308ad7..01a9373b7e18 100644 --- a/drivers/usb/misc/ftdi-elan.c +++ b/drivers/usb/misc/ftdi-elan.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/misc/idmouse.c b/drivers/usb/misc/idmouse.c index 2975e80b7a56..debc1fd74b0d 100644 --- a/drivers/usb/misc/idmouse.c +++ b/drivers/usb/misc/idmouse.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include /* image constants */ diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c index 9ca595632f17..3bc5356832db 100644 --- a/drivers/usb/misc/ldusb.c +++ b/drivers/usb/misc/ldusb.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c index c8fbe7b739a0..b10e26c74a90 100644 --- a/drivers/usb/misc/legousbtower.c +++ b/drivers/usb/misc/legousbtower.c @@ -83,7 +83,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/usb/mon/mon_bin.c 
b/drivers/usb/mon/mon_bin.c index 1a874a1f3890..91c22276c03b 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include "usb_mon.h" diff --git a/drivers/usb/mon/mon_stat.c b/drivers/usb/mon/mon_stat.c index 5388a339cfb8..5bdf73a57498 100644 --- a/drivers/usb/mon/mon_stat.c +++ b/drivers/usb/mon/mon_stat.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "usb_mon.h" diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c index e59334b09c41..db1a4abf2806 100644 --- a/drivers/usb/mon/mon_text.c +++ b/drivers/usb/mon/mon_text.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "usb_mon.h" diff --git a/drivers/usb/musb/musb_debugfs.c b/drivers/usb/musb/musb_debugfs.c index 9b22d946c089..4fef50e5c8c1 100644 --- a/drivers/usb/musb/musb_debugfs.c +++ b/drivers/usb/musb/musb_debugfs.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include "musb_core.h" #include "musb_debug.h" diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 1e11614322fe..42d02a206059 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/video/fbdev/68328fb.c b/drivers/video/fbdev/68328fb.c index 17f21cedff9b..c0c6b88d3839 100644 --- a/drivers/video/fbdev/68328fb.c +++ b/drivers/video/fbdev/68328fb.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/video/fbdev/hitfb.c b/drivers/video/fbdev/hitfb.c index 9d68dc9ee7bf..abe3e54d4506 100644 --- a/drivers/video/fbdev/hitfb.c +++ b/drivers/video/fbdev/hitfb.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/video/fbdev/hpfb.c b/drivers/video/fbdev/hpfb.c index 9476d196f510..16f16f5e1a4b 100644 --- a/drivers/video/fbdev/hpfb.c +++ b/drivers/video/fbdev/hpfb.c @@ -16,7 +16,7 @@ #include #include -#include +#include static struct fb_info fb_info = { .fix = { diff --git a/drivers/video/fbdev/mx3fb.c b/drivers/video/fbdev/mx3fb.c index 8778e01cebac..1c3c7ab26a95 100644 --- a/drivers/video/fbdev/mx3fb.c +++ b/drivers/video/fbdev/mx3fb.c @@ -33,7 +33,7 @@ #include #include -#include +#include #define MX3FB_NAME "mx3_sdc_fb" diff --git a/drivers/video/fbdev/q40fb.c b/drivers/video/fbdev/q40fb.c index 7487f76f6275..04ea330ccf5d 100644 --- a/drivers/video/fbdev/q40fb.c +++ b/drivers/video/fbdev/q40fb.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/video/fbdev/sm501fb.c b/drivers/video/fbdev/sm501fb.c index d0a4e2f79a57..d80bc8a3200f 100644 --- a/drivers/video/fbdev/sm501fb.c +++ b/drivers/video/fbdev/sm501fb.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #ifdef CONFIG_PM diff --git a/drivers/video/fbdev/stifb.c b/drivers/video/fbdev/stifb.c index 7df4228e25f0..accfef71e984 100644 --- a/drivers/video/fbdev/stifb.c +++ b/drivers/video/fbdev/stifb.c @@ -67,7 +67,7 @@ #include #include /* for HP-UX compatibility */ -#include +#include #include "sticore.h" diff --git a/drivers/video/fbdev/w100fb.c b/drivers/video/fbdev/w100fb.c index 10951c82f6ed..d570e19a2864 100644 --- a/drivers/video/fbdev/w100fb.c +++ b/drivers/video/fbdev/w100fb.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include