From fa07fb6a4ecad41d373eaf1574b9d54b4f0c1e75 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Sep 2016 08:54:06 -0400 Subject: workqueue: dump workqueue state on sanity check failures in destroy_workqueue() destroy_workqueue() performs a number of sanity checks to ensure that the workqueue is empty before proceeding with destruction. However, it's not always easy to tell what's going on just from the warning message. Let's dump workqueue state after sanity check failures to help debugging. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/CACT4Y+Zs6vkjHo9qHb4TrEiz3S4+quvvVQ9VWvj2Mx6pETGb9Q@mail.gmail.com Cc: Dmitry Vyukov --- kernel/workqueue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc3..4eaec8b86d65 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4021,6 +4021,7 @@ void destroy_workqueue(struct workqueue_struct *wq) for (i = 0; i < WORK_NR_COLORS; i++) { if (WARN_ON(pwq->nr_in_flight[i])) { mutex_unlock(&wq->mutex); + show_workqueue_state(); return; } } @@ -4029,6 +4030,7 @@ void destroy_workqueue(struct workqueue_struct *wq) WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { mutex_unlock(&wq->mutex); + show_workqueue_state(); return; } } -- cgit v1.2.3 From 3347fa0928210d96aaa2bd6cd5a8391d5e630873 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 Sep 2016 15:49:32 -0400 Subject: workqueue: make workqueue available early during boot Workqueue is currently initialized in an early init call; however, there are cases where early boot code has to be split and reordered to come after workqueue initialization, or the same code path which makes use of workqueues is used both before workqueue initialization and after. The latter cases have to gate workqueue usages with keventd_up() tests, which is nasty and easy to get wrong. Workqueue usages have become widespread and it'd be a lot more convenient if it could be used very early from boot. This patch splits workqueue initialization into two steps: workqueue_init_early(), which sets up the basic data structures so that workqueues can be created and work items queued, and workqueue_init(), which actually brings up workqueues online and starts executing queued work items. The former step can be done very early during boot once memory allocation, cpumasks and idr are initialized. The latter can be done right after kthreads become available. This allows work item queueing and canceling from very early boot, which is what most of these use cases want. * As system_wq being initialized doesn't indicate that workqueue is fully online anymore, update keventd_up() to test wq_online instead. The follow-up patches will get rid of all its usages and the function itself. * Flushing doesn't make sense before workqueue is fully initialized. The flush functions trigger WARN and return immediately before fully online. * Work items are never in-flight before fully online. Canceling can always succeed by skipping the flush step. * Some code paths can no longer assume to be called with irq enabled as irq is disabled during early boot. Use irqsave/restore operations instead. v2: Watchdog init, which requires timer to be running, moved from workqueue_init_early() to workqueue_init().
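
For illustration, a minimal sketch of what the two-stage model permits from early-boot code. This is not part of the patch; the caller early_setup() and the work function early_work_fn() are hypothetical names used only to show what becomes legal once workqueue_init_early() has run:

	#include <linux/workqueue.h>

	static void early_work_fn(struct work_struct *work)
	{
		/* Runs only once workqueue_init() has created kworkers. */
	}
	static DECLARE_WORK(early_work, early_work_fn);

	void __init early_setup(void)
	{
		/*
		 * Safe as soon as workqueue_init_early() has run; no
		 * keventd_up() gating needed. The item simply stays
		 * queued until kworkers come up.
		 */
		schedule_work(&early_work);

		/*
		 * Canceling is also safe: before wq_online, nothing can
		 * be in flight, so the flush step is skipped.
		 */
		cancel_work_sync(&early_work);

		/*
		 * flush_work()/flush_workqueue(), by contrast, WARN and
		 * return immediately until the system is fully online.
		 */
	}
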
Signed-off-by: Tejun Heo Suggested-by: Linus Torvalds Link: http://lkml.kernel.org/r/CA+55aFx0vPuMuxn00rBSM192n-Du5uxy+4AvKa0SBSOVJeuCGg@mail.gmail.com --- include/linux/workqueue.h | 7 ++++- init/main.c | 10 +++++++ kernel/workqueue.c | 76 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 76 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 26cc1df280d6..91d416f9c0a7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -358,6 +358,8 @@ extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; +extern bool wq_online; + extern struct workqueue_struct * __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); @@ -594,7 +596,7 @@ static inline bool schedule_delayed_work(struct delayed_work *dwork, */ static inline bool keventd_up(void) { - return system_wq != NULL; + return wq_online; } #ifndef CONFIG_SMP @@ -631,4 +633,7 @@ int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif +int __init workqueue_init_early(void); +int __init workqueue_init(void); + #endif diff --git a/init/main.c b/init/main.c index a8a58e2794a5..5c4fd68a8671 100644 --- a/init/main.c +++ b/init/main.c @@ -551,6 +551,14 @@ asmlinkage __visible void __init start_kernel(void) "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); idr_init_cache(); + + /* + * Allow workqueue creation and work item queueing/cancelling + * early. Work item execution depends on kthreads and starts after + * workqueue_init(). + */ + workqueue_init_early(); + rcu_init(); /* trace_printk() and trace points may be used after this */ @@ -1005,6 +1013,8 @@ static noinline void __init kernel_init_freeable(void) smp_prepare_cpus(setup_max_cpus); + workqueue_init(); + do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4eaec8b86d65..15d0811c9e91 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,6 +290,8 @@ module_param_named(disable_numa, wq_disable_numa, bool, 0444); static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); +bool wq_online; /* can kworkers be created yet? */ + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -2583,6 +2585,9 @@ void flush_workqueue(struct workqueue_struct *wq) }; int next_color; + if (WARN_ON(!wq_online)) + return; + lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); @@ -2843,6 +2848,9 @@ bool flush_work(struct work_struct *work) { struct wq_barrier barr; + if (WARN_ON(!wq_online)) + return false; + lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); @@ -2913,7 +2921,13 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) mark_work_canceling(work); local_irq_restore(flags); - flush_work(work); + /* + * This allows canceling during early boot. We know that @work + * isn't executing. 
+ */ + if (wq_online) + flush_work(work); + clear_work_data(work); /* @@ -3352,7 +3366,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* create and start the initial worker */ - if (!create_worker(pool)) + if (wq_online && !create_worker(pool)) goto fail; /* install */ @@ -3417,6 +3431,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; bool freezable = wq->flags & WQ_FREEZABLE; + unsigned long flags; /* for @wq->saved_max_active */ lockdep_assert_held(&wq->mutex); @@ -3425,7 +3440,8 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) if (!freezable && pwq->max_active == wq->saved_max_active) return; - spin_lock_irq(&pwq->pool->lock); + /* this function can be called during early boot w/ irq disabled */ + spin_lock_irqsave(&pwq->pool->lock, flags); /* * During [un]freezing, the caller is responsible for ensuring that @@ -3448,7 +3464,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock_irq(&pwq->pool->lock); + spin_unlock_irqrestore(&pwq->pool->lock, flags); } /* initialize newly alloced @pwq which is associated with @wq and @pool */ @@ -5457,7 +5473,17 @@ static void __init wq_numa_init(void) wq_numa_enabled = true; } -static int __init init_workqueues(void) +/** + * workqueue_init_early - early init for workqueue subsystem + * + * This is the first half of two-staged workqueue subsystem initialization + * and invoked as soon as the bare basics - memory allocation, cpumasks and + * idr are up. It sets up all the data structures and system workqueues + * and allows early boot code to create workqueues and queue/cancel work + * items. Actual work item execution starts only after kthreads can be + * created and scheduled right before early initcalls. + */ +int __init workqueue_init_early(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; @@ -5490,16 +5516,6 @@ static int __init init_workqueues(void) } } - /* create the initial worker */ - for_each_online_cpu(cpu) { - struct worker_pool *pool; - - for_each_cpu_worker_pool(pool, cpu) { - pool->flags &= ~POOL_DISASSOCIATED; - BUG_ON(!create_worker(pool)); - } - } - /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; @@ -5536,8 +5552,36 @@ static int __init init_workqueues(void) !system_power_efficient_wq || !system_freezable_power_efficient_wq); + return 0; +} + +/** + * workqueue_init - bring workqueue subsystem fully online + * + * This is the latter half of two-staged workqueue subsystem initialization + * and invoked as soon as kthreads can be created and scheduled. + * Workqueues have been created and work items queued on them, but there + * are no kworkers executing the work items yet. Populate the worker pools + * with the initial workers and enable future kworker creations. 
+ */ +int __init workqueue_init(void) +{ + struct worker_pool *pool; + int cpu, bkt; + + /* create the initial workers */ + for_each_online_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + pool->flags &= ~POOL_DISASSOCIATED; + BUG_ON(!create_worker(pool)); + } + } + + hash_for_each(unbound_pool_hash, bkt, pool, hash_node) + BUG_ON(!create_worker(pool)); + + wq_online = true; wq_watchdog_init(); return 0; } -early_initcall(init_workqueues); -- cgit v1.2.3 From a81f80f3eb759de72d74d18e478873fc0575abc5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 Sep 2016 15:49:33 -0400 Subject: power, workqueue: remove keventd_up() usage Now that workqueue can handle work item queueing/cancelling from very early during boot, there is no need to gate cancel_delayed_work_sync() while !keventd_up(). Remove it. Signed-off-by: Tejun Heo Acked-by: Rafael J. Wysocki Cc: Qiao Zhou --- kernel/power/qos.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 168ff442ebde..97b0df71303e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,16 +482,7 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - /* - * This function may be called very early during boot, for example, - * from of_clk_init(), where irq needs to stay disabled. - * cancel_delayed_work_sync() assumes that irq is enabled on - * invocation and re-enables it on return. Avoid calling it until - * workqueue is initialized. - */ - if (keventd_up()) - cancel_delayed_work_sync(&req->work); - + cancel_delayed_work_sync(&req->work); __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); -- cgit v1.2.3 From 863b710b664bdcb90c0c682ee24adb368f497a5b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 Sep 2016 15:49:34 -0400 Subject: workqueue: remove keventd_up() keventd_up() no longer has in-kernel users. Remove it and make wq_online static. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 10 ---------- kernel/workqueue.c | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 91d416f9c0a7..56417133c672 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -358,8 +358,6 @@ extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; -extern bool wq_online; - extern struct workqueue_struct * __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); @@ -591,14 +589,6 @@ static inline bool schedule_delayed_work(struct delayed_work *dwork, return queue_delayed_work(system_wq, dwork, delay); } -/** - * keventd_up - is workqueue initialized yet? - */ -static inline bool keventd_up(void) -{ - return wq_online; -} - #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 15d0811c9e91..ad0cd439223b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,7 +290,7 @@ module_param_named(disable_numa, wq_disable_numa, bool, 0444); static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); -bool wq_online; /* can kworkers be created yet? */ +static bool wq_online; /* can kworkers be created yet? 
*/ static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ -- cgit v1.2.3 From 57a09bf0a416700676e77102c28f9cfcb48267e0 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 18 Oct 2016 19:51:19 +0200 Subject: bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers A BPF program is required to check the return register of a map_lookup_elem() call before accessing memory. The verifier keeps track of this by converting the type of the result register from PTR_TO_MAP_VALUE_OR_NULL to PTR_TO_MAP_VALUE after a conditional jump ensures safety. This check is currently exclusively performed for the result register 0. In the event the compiler reorders instructions, BPF_MOV64_REG instructions may be moved before the conditional jump, which causes them to keep their type PTR_TO_MAP_VALUE_OR_NULL, to which the verifier objects when the register is accessed: 0: (b7) r1 = 10 1: (7b) *(u64 *)(r10 -8) = r1 2: (bf) r2 = r10 3: (07) r2 += -8 4: (18) r1 = 0x59c00000 6: (85) call 1 7: (bf) r4 = r0 8: (15) if r0 == 0x0 goto pc+1 R0=map_value(ks=8,vs=8) R4=map_value_or_null(ks=8,vs=8) R10=fp 9: (7a) *(u64 *)(r4 +0) = 0 R4 invalid mem access 'map_value_or_null' This commit extends the verifier to keep track of all identical PTR_TO_MAP_VALUE_OR_NULL registers after a map_lookup_elem() by assigning them an ID and then marking them all when the conditional jump is observed. Signed-off-by: Thomas Graf Reviewed-by: Josef Bacik Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 61 +++++++++++++++++------- tools/testing/selftests/bpf/test_verifier.c | 72 +++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7035b997aaa5..ac5b393ee6b2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -23,13 +23,13 @@ struct bpf_reg_state { * result in a bad access.
*/ u64 min_value, max_value; + u32 id; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; /* valid when type == PTR_TO_PACKET* */ struct { - u32 id; u16 off; u16 range; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99a7e5b388f2..846d7ceaf202 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -212,9 +212,10 @@ static void print_verifier_state(struct bpf_verifier_state *state) else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL || t == PTR_TO_MAP_VALUE_ADJ) - verbose("(ks=%d,vs=%d)", + verbose("(ks=%d,vs=%d,id=%u)", reg->map_ptr->key_size, - reg->map_ptr->value_size); + reg->map_ptr->value_size, + reg->id); if (reg->min_value != BPF_REGISTER_MIN_RANGE) verbose(",min_value=%llu", (unsigned long long)reg->min_value); @@ -447,6 +448,7 @@ static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; + regs[regno].id = 0; regs[regno].imm = 0; } @@ -1252,6 +1254,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose("unknown return type %d of func %d\n", fn->ret_type, func_id); @@ -1644,8 +1647,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg); return -EACCES; } - regs[insn->dst_reg].type = UNKNOWN_VALUE; - regs[insn->dst_reg].map_ptr = NULL; + mark_reg_unknown_value(regs, insn->dst_reg); } } else { /* case: R = imm @@ -1907,6 +1909,38 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, check_reg_overflow(true_reg); } +static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, + enum bpf_reg_type type) +{ + struct bpf_reg_state *reg = ®s[regno]; + + if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + reg->type = type; + if (type == UNKNOWN_VALUE) + mark_reg_unknown_value(regs, regno); + } +} + +/* The logic is similar to find_good_pkt_pointers(), both could eventually + * be folded together at some point. + */ +static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, + enum bpf_reg_type type) +{ + struct bpf_reg_state *regs = state->regs; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + mark_map_reg(regs, i, regs[regno].id, type); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, + regs[regno].id, type); + } +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -1994,18 +2028,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (opcode == BPF_JEQ) { - /* next fallthrough insn can access memory via - * this register - */ - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - /* branch targer cannot access it, since reg == 0 */ - mark_reg_unknown_value(other_branch->regs, - insn->dst_reg); - } else { - other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - mark_reg_unknown_value(regs, insn->dst_reg); - } + /* Mark all identical map registers in each branch as either + * safe or unknown depending R == 0 or R != 0 conditional. + */ + mark_map_regs(this_branch, insn->dst_reg, + opcode == BPF_JEQ ? 
PTR_TO_MAP_VALUE : UNKNOWN_VALUE); + mark_map_regs(other_branch, insn->dst_reg, + opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index ff5df121b2f6..0ef8eaf6cea7 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2588,6 +2588,78 @@ static struct bpf_test tests[] = { .result_unpriv = REJECT, .result = REJECT, }, + { + "multiple registers share map_lookup_elem result", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "invalid memory access with multiple map_lookup_elem calls", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .result = REJECT, + .errstr = "R4 !read_ok", + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "valid indirect map_lookup_elem access with 2nd lookup in branch", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_IMM(BPF_REG_2, 10), + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0, 3), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From 2186d9f940b6a04f263a3bacd48f2a7ba96df4cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Oct 2016 12:01:27 -0400 Subject: workqueue: move wq_numa_init() to workqueue_init() While splitting up workqueue initialization into two parts, ac8f73400782 ("workqueue: make workqueue available early during boot") put wq_numa_init() into workqueue_init_early(). 
Unfortunately, on some archs including power and arm64, cpu to node mapping isn't yet established by the time the early init is called, leading to incorrect NUMA initialization and subsequently the following oops due to zero cpumask on node-specific unbound pools. Unable to handle kernel paging request for data at address 0x00000038 Faulting instruction address: 0xc0000000000fc0cc Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.8.0-compiler_gcc-6.2.0-next-20161005 #94 task: c0000007f5400000 task.stack: c000001ffc084000 NIP: c0000000000fc0cc LR: c0000000000ed928 CTR: c0000000000fbfd0 REGS: c000001ffc087780 TRAP: 0300 Not tainted (4.8.0-compiler_gcc-6.2.0-next-20161005) MSR: 9000000002009033 CR: 48000424 XER: 00000000 CFAR: c0000000000089dc DAR: 0000000000000038 DSISR: 40000000 SOFTE: 0 GPR00: c0000000000ed928 c000001ffc087a00 c000000000e63200 c000000010d6d600 GPR04: c0000007f5409200 0000000000000021 000000000748e08c 000000000000001f GPR08: 0000000000000000 0000000000000021 000000000748f1f8 0000000000000000 GPR12: 0000000028000422 c00000000fb80000 c00000000000e0c8 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000021 0000000000000001 GPR20: ffffffffafb50401 0000000000000000 c000000010d6d600 000000000000ba7e GPR24: 000000000000ba7e c000000000d8bc58 afb504000afb5041 0000000000000001 GPR28: 0000000000000000 0000000000000004 c0000007f5409280 0000000000000000 NIP [c0000000000fc0cc] enqueue_task_fair+0xfc/0x18b0 LR [c0000000000ed928] activate_task+0x78/0xe0 Call Trace: [c000001ffc087a00] [c0000007f5409200] 0xc0000007f5409200 (unreliable) [c000001ffc087b10] [c0000000000ed928] activate_task+0x78/0xe0 [c000001ffc087b50] [c0000000000ede58] ttwu_do_activate+0x68/0xc0 [c000001ffc087b90] [c0000000000ef1b8] try_to_wake_up+0x208/0x4f0 [c000001ffc087c10] [c0000000000d3484] create_worker+0x144/0x250 [c000001ffc087cb0] [c000000000cd72d0] workqueue_init+0x124/0x150 [c000001ffc087d00] [c000000000cc0e74] kernel_init_freeable+0x158/0x360 [c000001ffc087dc0] [c00000000000e0e4] kernel_init+0x24/0x160 [c000001ffc087e30] [c00000000000bfa0] ret_from_kernel_thread+0x5c/0xbc Instruction dump: 62940401 3b800000 3aa00000 7f17c378 3a600001 3b600001 60000000 60000000 60420000 72490021 ebfe0150 2f890001 419e0de0 7fbee840 419e0e58 ---[ end trace 0000000000000000 ]--- Fix it by moving wq_numa_init() to workqueue_init(). As this means that the early initialization may not have full NUMA info for per-cpu pools and may ignore NUMA affinity for unbound pools, fix them up from workqueue_init() after wq_numa_init().
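
To make the fixup concrete, a minimal sketch of the stale-mapping hazard and the re-derivation idiom the patch applies. This is illustrative only; cached_node and both function names are hypothetical, not kernel code from this patch:

	#include <linux/cpumask.h>
	#include <linux/topology.h>

	static int cached_node[NR_CPUS] __initdata;

	void __init record_nodes_too_early(void)
	{
		int cpu;

		/* On power/arm64 this can still report node 0 for every CPU. */
		for_each_possible_cpu(cpu)
			cached_node[cpu] = cpu_to_node(cpu);
	}

	void __init fix_up_nodes(void)
	{
		int cpu;

		/*
		 * Re-derive once the arch has established the mapping,
		 * mirroring what workqueue_init() now does for pool->node.
		 */
		for_each_possible_cpu(cpu)
			cached_node[cpu] = cpu_to_node(cpu);
	}
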
Signed-off-by: Tejun Heo Reported-by: Michael Ellerman Link: http://lkml.kernel.org/r/87twck5wqo.fsf@concordia.ellerman.id.au Fixes: ac8f73400782 ("workqueue: make workqueue available early during boot") Signed-off-by: Tejun Heo --- kernel/workqueue.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ad0cd439223b..c56479b4884b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5495,8 +5495,6 @@ int __init workqueue_init_early(void) pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); - wq_numa_init(); - /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; @@ -5566,9 +5564,32 @@ int __init workqueue_init_early(void) */ int __init workqueue_init(void) { + struct workqueue_struct *wq; struct worker_pool *pool; int cpu, bkt; + /* + * It'd be simpler to initialize NUMA in workqueue_init_early() but + * CPU to node mapping may not be available that early on some + * archs such as power and arm64. As per-cpu pools created + * previously could be missing node hint and unbound pools NUMA + * affinity, fix them up. + */ + wq_numa_init(); + + mutex_lock(&wq_pool_mutex); + + for_each_possible_cpu(cpu) { + for_each_cpu_worker_pool(pool, cpu) { + pool->node = cpu_to_node(cpu); + } + } + + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, smp_processor_id(), true); + + mutex_unlock(&wq_pool_mutex); + /* create the initial workers */ for_each_online_cpu(cpu) { for_each_cpu_worker_pool(pool, cpu) { -- cgit v1.2.3 From 3c3fcb45d524feb5d14a14f332e3eec7f2aff8f3 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Wed, 19 Oct 2016 15:10:59 +0100 Subject: sched/fair: Kill the unused 'sched_shares_window_ns' tunable The last user of this tunable was removed in 2012 in commit: 82958366cfea ("sched: Replace update_shares weight distribution with per-entity computation") Delete it since its very existence confuses people. Signed-off-by: Matt Fleming Cc: Dietmar Eggemann Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161019141059.26408-1-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- include/linux/sched/sysctl.h | 1 - kernel/sched/fair.c | 7 ------- kernel/sysctl.c | 7 ------- 3 files changed, 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 22db1e63707e..441145351301 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -36,7 +36,6 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d941c97dfbc3..79d464a04417 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -93,13 +93,6 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -/* - * The exponential sliding window over which load is averaged for shares - * distribution. 
- * (default: 10msec) - */ -unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; - #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..739fb17371af 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -347,13 +347,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "sched_shares_window_ns", - .data = &sysctl_sched_shares_window, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHEDSTATS { .procname = "sched_schedstats", -- cgit v1.2.3 From 97dcaa0fcfd24daa9a36c212c1ad1d5a97759212 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 20 Oct 2016 18:15:16 -0700 Subject: kexec: Export kexec_in_progress to modules The bcm_sf2 driver uses kexec_in_progress to know whether it can power down an integrated PHY during shutdown, and can be built as a module. Other modules may be using this in the future, so export it. Fixes: 2399d6143f85 ("net: dsa: bcm_sf2: Prevent GPHY shutdown for kexec'd kernels") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- kernel/kexec_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 561675589511..786ab85a5452 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -59,6 +59,7 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +EXPORT_SYMBOL_GPL(kexec_in_progress); /* Location of the reserved area for the crash kernel */ -- cgit v1.2.3 From 0ce66d4f70d3bcfb67c89f473e76e7cf0a01288f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 21 Oct 2016 14:21:55 -0700 Subject: Revert "kexec: Export kexec_in_progress to modules" This reverts commit 97dcaa0fcfd24daa9a36c212c1ad1d5a97759212. Based on the review discussion with Eric, we will come up with a different fix for the bcm_sf2 driver which does not make it rely on the kexec_in_progress value. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- kernel/kexec_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 786ab85a5452..561675589511 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -59,7 +59,6 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; -EXPORT_SYMBOL_GPL(kexec_in_progress); /* Location of the reserved area for the crash kernel */ -- cgit v1.2.3 From 2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 21 Oct 2016 12:46:33 +0200 Subject: bpf: add helper for retrieving current numa node id Use case is mainly for soreuseport to select sockets for the local numa node, but since generic, lets also add this for other networking and tracing program types. Suggested-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 2 ++ 6 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c201017b5730..edcd96ded8aa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -319,6 +319,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; +extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f09c70b97eca..374ef582ae18 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -426,6 +426,12 @@ enum bpf_func_id { */ BPF_FUNC_set_hash_invalid, + /** + * bpf_get_numa_node_id() + * Returns the id of the current NUMA node. + */ + BPF_FUNC_get_numa_node_id, + __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index aa6d98154106..82a04143368e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1043,6 +1043,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 39918402e6e9..045cbe673356 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -92,6 +93,17 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_numa_node_id) +{ + return numa_node_id(); +} + +const struct bpf_func_proto bpf_get_numa_node_id_proto = { + .func = bpf_get_numa_node_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_ktime_get_ns) { /* NMI safe access to clock monotonic */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5dcb99281259..fa77311dadb2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -422,6 +422,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) return bpf_get_trace_printk_proto(); case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; case BPF_FUNC_probe_write_user: diff --git a/net/core/filter.c b/net/core/filter.c index 00351cdf7d0c..cd9e2ba66b0e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2492,6 +2492,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: -- cgit v1.2.3 From 119a0798dc42ed4c4f96d39b8b676efcea73aec6 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 17 Oct 2016 
12:16:08 +0200 Subject: padata: Remove unused but set variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the unused but set variable pinst in padata_parallel_worker to fix the following warning when building with 'W=1': kernel/padata.c: In function ‘padata_parallel_worker’: kernel/padata.c:68:26: warning: variable ‘pinst’ set but not used [-Wunused-but-set-variable] Also remove the now unused variable pd which is only used to set pinst. Signed-off-by: Tobias Klauser Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 7848f0566403..05316c9f32da 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -64,15 +64,11 @@ static int padata_cpu_hash(struct parallel_data *pd) static void padata_parallel_worker(struct work_struct *parallel_work) { struct padata_parallel_queue *pqueue; - struct parallel_data *pd; - struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); pqueue = container_of(parallel_work, struct padata_parallel_queue, work); - pd = pqueue->pd; - pinst = pd->pinst; spin_lock(&pqueue->parallel.lock); list_replace_init(&pqueue->parallel.list, &local_list); -- cgit v1.2.3 From a225023828038a1aaea876a65313c863ec23fa44 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Oct 2016 15:45:27 +0200 Subject: sched/core: Explain sleep/wakeup in a better way There were a few questions wrt. how sleep-wakeup works. Try and explain it more. Requested-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 52 +++++++++++++++++++++++++++++++++++---------------- kernel/sched/core.c | 17 +++++++++-------- 2 files changed, 45 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..3762fe4e3a80 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -262,20 +262,9 @@ extern char ___assert_task_state[1 - 2*!!( #define set_task_state(tsk, state_value) \ do { \ (tsk)->task_state_change = _THIS_IP_; \ - smp_store_mb((tsk)->state, (state_value)); \ + smp_store_mb((tsk)->state, (state_value)); \ } while (0) -/* - * set_current_state() includes a barrier so that the write of current->state - * is correctly serialised wrt the caller's subsequent test of whether to - * actually sleep: - * - * set_current_state(TASK_UNINTERRUPTIBLE); - * if (do_i_need_to_sleep()) - * schedule(); - * - * If the caller does not need such serialisation then use __set_current_state() - */ #define __set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ @@ -284,11 +273,19 @@ extern char ___assert_task_state[1 - 2*!!( #define set_current_state(state_value) \ do { \ current->task_state_change = _THIS_IP_; \ - smp_store_mb(current->state, (state_value)); \ + smp_store_mb(current->state, (state_value)); \ } while (0) #else +/* + * @tsk had better be current, or you get to keep the pieces. + * + * The only reason is that computing current can be more expensive than + * using a pointer that's already available. + * + * Therefore, see set_current_state(). 
+ */ #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ @@ -299,11 +296,34 @@ extern char ___assert_task_state[1 - 2*!!( * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * + * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); - * if (do_i_need_to_sleep()) - * schedule(); + * if (!need_sleep) + * break; + * + * schedule(); + * } + * __set_current_state(TASK_RUNNING); + * + * If the caller does not need such serialisation (because, for instance, the + * condition test and condition change and wakeup are under the same lock) then + * use __set_current_state(). + * + * The above is typically ordered against the wakeup, which does: + * + * need_sleep = false; + * wake_up_state(p, TASK_UNINTERRUPTIBLE); + * + * Where wake_up_state() (and all other wakeup primitives) imply enough + * barriers to order the store of the variable against wakeup. + * + * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is, + * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a + * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). + * + * This is obviously fine, since they both store the exact same value. * - * If the caller does not need such serialisation then use __set_current_state() + * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { current->state = (state_value); } while (0) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..b8c86ba44ca9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1995,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * - * Put it on the run-queue if it's not already there. The "current" - * thread is always on the run-queue (except when the actual - * re-schedule is in progress), and as such you're allowed to do - * the simpler "current->state = TASK_RUNNING" to mark yourself - * runnable without the overhead of this. - * - * Return: %true if @p was woken up, %false if it was already running. - * or @state didn't match @p's state. + * If (@state & @p->state) @p->state = TASK_RUNNING. + * + * If the task was not queued/runnable, also place it back on a runqueue. + * + * Atomic against schedule() which would dequeue a task, also see + * set_current_state(). + * + * Return: %true if @p->state changes (an actual wakeup was done), + * %false otherwise. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) -- cgit v1.2.3 From 3ca0ff571b092ee4d807f1168caa428d95b0173b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 13:36:04 +0200 Subject: locking/mutex: Rework mutex::owner The current mutex implementation has an atomic lock word and a non-atomic owner field. This disparity leads to a number of issues with the current mutex code as it means that we can have a locked mutex without an explicit owner (because the owner field has not been set, or already cleared). This leads to a number of weird corner cases, esp. between the optimistic spinning and debug code. Where the optimistic spinning code needs the owner field updated inside the lock region, the debug code is more relaxed because the whole lock is serialized by the wait_lock. 
Also, the spinning code itself has a few corner cases where we need to deal with a held lock without an owner field. Furthermore, it becomes even more of a problem when trying to fix starvation cases in the current code. We end up stacking special case on special case. To solve this, rework the basic mutex implementation to be a single atomic word that contains the owner and uses the low bits for extra state. This matches how PI futexes and rt_mutex already work. By making the owner an integral part of the lock state, a lot of the problems disappear and we get a better option to deal with starvation cases: direct owner handoff. Changing the basic mutex does, however, invalidate all the arch-specific mutex code; this patch leaves that unused in place, and a later patch will remove it. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mutex-debug.h | 24 --- include/linux/mutex.h | 46 ++++-- kernel/locking/mutex-debug.c | 13 -- kernel/locking/mutex-debug.h | 10 -- kernel/locking/mutex.c | 371 ++++++++++++++++++------------------- kernel/locking/mutex.h | 26 --- kernel/sched/core.c | 2 +- 7 files changed, 187 insertions(+), 305 deletions(-) delete mode 100644 include/linux/mutex-debug.h (limited to 'kernel') diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h deleted file mode 100644 index 4ac8b1977b73..000000000000 --- a/include/linux/mutex-debug.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef __LINUX_MUTEX_DEBUG_H -#define __LINUX_MUTEX_DEBUG_H - -#include -#include -#include - -/* - * Mutexes - debugging helpers: - */ - -#define __DEBUG_MUTEX_INITIALIZER(lockname) \ - , .magic = &lockname - -#define mutex_init(mutex) \ -do { \ - static struct lock_class_key __key; \ - \ - __mutex_init((mutex), #mutex, &__key); \ -} while (0) - -extern void mutex_destroy(struct mutex *lock); - -#endif diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2cb7531e7d7a..4d3bccabbea5 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -18,6 +18,7 @@ #include #include #include +#include /* * Simple, straightforward mutexes with strict semantics: @@ -48,16 +49,12 @@ * locks and tasks (and only those tasks) */ struct mutex { - /* 1: unlocked, 0: locked, negative: locked, possible waiters */ - atomic_t count; + atomic_long_t owner; spinlock_t wait_lock; - struct list_head wait_list; -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER) - struct task_struct *owner; -#endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif + struct list_head wait_list; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif @@ -66,6 +63,11 @@ struct mutex { #endif }; +static inline struct task_struct *__mutex_owner(struct mutex *lock) +{ + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~0x03); +} + /* * This is the control structure for tasks blocked on mutex, * which resides on the blocked task's kernel stack: @@ -79,9 +81,20 @@ struct mutex_waiter { }; #ifdef CONFIG_DEBUG_MUTEXES -# include + +#define __DEBUG_MUTEX_INITIALIZER(lockname) \ + , .magic = &lockname + +extern void mutex_destroy(struct mutex *lock); + +#else + +# define __DEBUG_MUTEX_INITIALIZER(lockname) + +static inline void mutex_destroy(struct mutex *lock) {} + +#endif + /** * mutex_init - initialize the mutex * @mutex: the mutex to be initialized @@ 
-90,14 +103,12 @@ struct mutex_waiter { * * It is not allowed to initialize an already locked mutex. */ -# define mutex_init(mutex) \ -do { \ - static struct lock_class_key __key; \ - \ - __mutex_init((mutex), #mutex, &__key); \ +#define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + __mutex_init((mutex), #mutex, &__key); \ } while (0) -static inline void mutex_destroy(struct mutex *lock) {} -#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ @@ -107,7 +118,7 @@ static inline void mutex_destroy(struct mutex *lock) {} #endif #define __MUTEX_INITIALIZER(lockname) \ - { .count = ATOMIC_INIT(1) \ + { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ __DEBUG_MUTEX_INITIALIZER(lockname) \ @@ -127,7 +138,10 @@ extern void __mutex_init(struct mutex *lock, const char *name, */ static inline int mutex_is_locked(struct mutex *lock) { - return atomic_read(&lock->count) != 1; + /* + * XXX think about spin_is_locked + */ + return __mutex_owner(lock) != NULL; } /* diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 9c951fade415..9aa713629387 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -73,21 +73,8 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic != lock); - - if (!lock->owner) - DEBUG_LOCKS_WARN_ON(!lock->owner); - else - DEBUG_LOCKS_WARN_ON(lock->owner != current); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } - - /* - * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug - * mutexes so that we can do it here after we've verified state. - */ - mutex_clear_owner(lock); - atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 57a871ae3c81..a459faa48987 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -27,16 +27,6 @@ extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} - #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a70b90db3909..de1ce0bae0d5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -27,41 +27,113 @@ #include #include -/* - * In the DEBUG case we are using the "NULL fastpath" for mutexes, - * which forces all calls into the slowpath: - */ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" -# include -/* - * Must be 0 for the debug case so we do not do the unlock outside of the - * wait_lock region. debug_mutex_unlock() will do the actual unlock in this - * case. 
- */ -# undef __mutex_slowpath_needs_to_unlock -# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" -# include #endif void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { - atomic_set(&lock->count, 1); + atomic_long_set(&lock->owner, 0); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); - mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif debug_mutex_init(lock, name, key); } - EXPORT_SYMBOL(__mutex_init); +/* + * @owner: contains: 'struct task_struct *' to the current lock owner, + * NULL means not owned. Since task_struct pointers are aligned at + * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low + * bits to store extra state. + * + * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + */ +#define MUTEX_FLAG_WAITERS 0x01 + +#define MUTEX_FLAGS 0x03 + +static inline struct task_struct *__owner_task(unsigned long owner) +{ + return (struct task_struct *)(owner & ~MUTEX_FLAGS); +} + +static inline unsigned long __owner_flags(unsigned long owner) +{ + return owner & MUTEX_FLAGS; +} + +/* + * Actual trylock that will work on any unlocked state. + */ +static inline bool __mutex_trylock(struct mutex *lock) +{ + unsigned long owner, curr = (unsigned long)current; + + owner = atomic_long_read(&lock->owner); + for (;;) { /* must loop, can race against a flag */ + unsigned long old; + + if (__owner_task(owner)) + return false; + + old = atomic_long_cmpxchg_acquire(&lock->owner, owner, + curr | __owner_flags(owner)); + if (old == owner) + return true; + + owner = old; + } +} + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * Lockdep annotations are contained to the slow paths for simplicity. + * There is nothing that would stop spreading the lockdep annotations outwards + * except more code. + */ + +/* + * Optimistic trylock that only works in the uncontended case. Make sure to + * follow with a __mutex_trylock() before failing. + */ +static __always_inline bool __mutex_trylock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + return true; + + return false; +} + +static __always_inline bool __mutex_unlock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) + return true; + + return false; +} +#endif + +static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_or(flag, &lock->owner); +} + +static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_andnot(flag, &lock->owner); +} + #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * We split the mutex lock/unlock logic into separate fastpath and @@ -69,7 +141,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); +static void __sched __mutex_lock_slowpath(struct mutex *lock); /** * mutex_lock - acquire the mutex @@ -95,14 +167,10 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); void __sched mutex_lock(struct mutex *lock) { might_sleep(); - /* - * The locking fastpath is the 1->0 transition from - * 'unlocked' into 'locked' state. 
- */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); - mutex_set_owner(lock); -} + if (!__mutex_trylock_fast(lock)) + __mutex_lock_slowpath(lock); +} EXPORT_SYMBOL(mutex_lock); #endif @@ -149,9 +217,6 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. - * - * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, - * as the fastpath and opportunistic spinning are disabled in that case. */ static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, @@ -176,7 +241,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, /* * Check if lock is contended, if not there is nobody to wake up */ - if (likely(atomic_read(&lock->base.count) == 0)) + if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) return; /* @@ -227,7 +292,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) bool ret = true; rcu_read_lock(); - while (lock->owner == owner) { + while (__mutex_owner(lock) == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking lock->owner still matches owner. If that fails, @@ -260,26 +325,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) return 0; rcu_read_lock(); - owner = READ_ONCE(lock->owner); + owner = __mutex_owner(lock); if (owner) retval = owner->on_cpu; rcu_read_unlock(); + /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. + * If lock->owner is not set, the mutex has been released. Return true + * such that we'll trylock in the spin path, which is a faster option + * than the blocking slow path. */ return retval; } -/* - * Atomically try to take the lock when it is available - */ -static inline bool mutex_try_to_acquire(struct mutex *lock) -{ - return !mutex_is_locked(lock) && - (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); -} - /* * Optimistic spinning. * @@ -288,13 +346,6 @@ static inline bool mutex_try_to_acquire(struct mutex *lock) * need to reschedule. The rationale is that if the lock owner is * running, it is likely to release the lock soon. * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. - * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - * * The mutex spinners are queued up using MCS lock so that only one * spinner can compete for the mutex. However, if mutex spinning isn't * going to happen, there is no point in going through the lock/unlock @@ -342,35 +393,16 @@ static bool mutex_optimistic_spin(struct mutex *lock, * If there's an owner, wait for it to either * release the lock or go to sleep. */ - owner = READ_ONCE(lock->owner); + owner = __mutex_owner(lock); if (owner && !mutex_spin_on_owner(lock, owner)) break; /* Try to acquire the mutex if it is unlocked. */ - if (mutex_try_to_acquire(lock)) { - lock_acquired(&lock->dep_map, ip); - - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - - ww_mutex_set_context_fastpath(ww, ww_ctx); - } - - mutex_set_owner(lock); + if (__mutex_trylock(lock)) { osq_unlock(&lock->osq); return true; } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. 
If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) - break; - /* * The cpu_relax() call is a compiler barrier which forces * everything in this loop to be re-loaded. We don't need @@ -406,8 +438,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, } #endif -__visible __used noinline -void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip); /** * mutex_unlock - release the mutex @@ -422,21 +453,12 @@ void __sched __mutex_unlock_slowpath(atomic_t *lock_count); */ void __sched mutex_unlock(struct mutex *lock) { - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(lock); +#ifndef CONFIG_DEBUG_LOCK_ALLOC + if (__mutex_unlock_fast(lock)) + return; #endif - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); + __mutex_unlock_slowpath(lock, _RET_IP_); } - EXPORT_SYMBOL(mutex_unlock); /** @@ -465,15 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) lock->ctx = NULL; } -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(&lock->base); -#endif - __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); + mutex_unlock(&lock->base); } EXPORT_SYMBOL(ww_mutex_unlock); @@ -520,20 +534,24 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { /* got the lock, yay! */ + lock_acquired(&lock->dep_map, ip); + if (use_ww_ctx) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } preempt_enable(); return 0; } spin_lock_mutex(&lock->wait_lock, flags); - /* - * Once more, try to acquire the lock. Only try-lock the mutex if - * it is unlocked to reduce unnecessary xchg() operations. + * After waiting to acquire the wait_lock, try again. */ - if (!mutex_is_locked(lock) && - (atomic_xchg_acquire(&lock->count, 0) == 1)) + if (__mutex_trylock(lock)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -543,21 +561,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + if (list_first_entry(&lock->wait_list, struct mutex_waiter, list) == &waiter) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + lock_contended(&lock->dep_map, ip); for (;;) { - /* - * Lets try to take the lock again - this is needed even if - * we get here for the first time (shortly after failing to - * acquire the lock), to make sure that we get a wakeup once - * it's unlocked. Later on, if we sleep, this is the - * operation that gives us the lock. We xchg it to -1, so - * that when we release the lock, we properly wake up the - * other waiters. 
We only attempt the xchg if the count is - * non-negative in order to avoid unnecessary xchg operations: - */ - if (atomic_read(&lock->count) >= 0 && - (atomic_xchg_acquire(&lock->count, -1) == 1)) + if (__mutex_trylock(lock)) break; /* @@ -585,15 +595,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); - /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); + __mutex_clear_flag(lock, MUTEX_FLAG_WAITERS); + debug_mutex_free_waiter(&waiter); skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - mutex_set_owner(lock); if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); @@ -631,7 +640,6 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched @@ -650,7 +658,6 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); static inline int @@ -715,29 +722,22 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); /* * Release the lock, slowpath: */ -static inline void -__mutex_unlock_common_slowpath(struct mutex *lock, int nested) +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { - unsigned long flags; + unsigned long owner, flags; WAKE_Q(wake_q); + mutex_release(&lock->dep_map, 1, ip); + /* - * As a performance measurement, release the lock before doing other - * wakeup related duties to follow. This allows other tasks to acquire - * the lock sooner, while still handling cleanups in past unlock calls. - * This can be done as we do not enforce strict equivalence between the - * mutex counter and wait_list. - * - * - * Some architectures leave the lock unlocked in the fastpath failure - * case, others need to leave it locked. In the later case we have to - * unlock it here - as the lock counter is currently 0 or negative. + * Release the lock before (potentially) taking the spinlock + * such that other contenders can get on with things ASAP. 
*/ - if (__mutex_slowpath_needs_to_unlock()) - atomic_set(&lock->count, 1); + owner = atomic_long_fetch_and_release(MUTEX_FLAGS, &lock->owner); + if (!__owner_flags(owner)) + return; spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { @@ -754,17 +754,6 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) wake_up_q(&wake_q); } -/* - * Release the lock, slowpath: - */ -__visible void -__mutex_unlock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - __mutex_unlock_common_slowpath(lock, 1); -} - #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * Here come the less common (and hence less performance-critical) APIs: @@ -789,38 +778,30 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock); */ int __sched mutex_lock_interruptible(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_interruptible_slowpath(lock); + + return __mutex_lock_interruptible_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_killable_slowpath(lock); + + return __mutex_lock_killable_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_killable); -__visible void __sched -__mutex_lock_slowpath(atomic_t *lock_count) +static noinline void __sched +__mutex_lock_slowpath(struct mutex *lock) { - struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_, NULL, 0); } @@ -856,37 +837,6 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, #endif -/* - * Spinlock based trylock, we take the spinlock and check whether we - * can get the lock: - */ -static inline int __mutex_trylock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - int prev; - - /* No need to trylock if the mutex is locked. 
*/ - if (mutex_is_locked(lock)) - return 0; - - spin_lock_mutex(&lock->wait_lock, flags); - - prev = atomic_xchg_acquire(&lock->count, -1); - if (likely(prev == 1)) { - mutex_set_owner(lock); - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } - - /* Set it back to 0 if there are no waiters: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - return prev == 1; -} - /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -903,13 +853,12 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) */ int __sched mutex_trylock(struct mutex *lock) { - int ret; + bool locked = __mutex_trylock(lock); - ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); - if (ret) - mutex_set_owner(lock); + if (locked) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return ret; + return locked; } EXPORT_SYMBOL(mutex_trylock); @@ -917,36 +866,28 @@ EXPORT_SYMBOL(mutex_trylock); int __sched __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock); int __sched __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_interruptible_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock_interruptible); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 6cd6b8e9efd7..4410a4af42a3 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -16,32 +16,6 @@ #define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * The mutex owner can get read and written to locklessly. - * We should use WRITE_ONCE when writing the owner value to - * avoid store tearing, otherwise, a thread could potentially - * read a partially written and incomplete owner value. 
- */ -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} -#else -static inline void mutex_set_owner(struct mutex *lock) -{ -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ -} -#endif - #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..8912aafd09e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -75,11 +75,11 @@ #include #include #include +#include #include #include #include -#include #ifdef CONFIG_PARAVIRT #include #endif -- cgit v1.2.3 From a3ea3d9b865c2a8f7fe455c7fa26db4b6fd066e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 13:45:15 +0200 Subject: locking/mutex: Allow MUTEX_SPIN_ON_OWNER when DEBUG_MUTEXES Now that mutex::count and mutex::owner are the same field, we can allow SPIN_ON_OWNER while DEBUG_MUTEXES. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index ebdb0043203a..84d882f3e299 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -225,7 +225,7 @@ config ARCH_SUPPORTS_ATOMIC_RMW config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config RWSEM_SPIN_ON_OWNER def_bool y -- cgit v1.2.3 From 9d659ae14b545c4296e812c70493bfdc999b5c1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 14:40:16 +0200 Subject: locking/mutex: Add lock handoff to avoid starvation Implement lock handoff to avoid lock starvation. Lock starvation is possible because mutex_lock() allows lock stealing, where a running (or optimistic spinning) task beats the woken waiter to the acquire. Lock stealing is an important performance optimization because waiting for a waiter to wake up and get runtime can take a significant time, during which everybody would stall on the lock. The down-side is of course that it allows for starvation. This patch has the waiter requesting a handoff if it fails to acquire the lock upon waking. This re-introduces some of the wait time, because once we do a handoff we have to wait for the waiter to wake up again. A future patch will add a round of optimistic spinning to attempt to alleviate this penalty, but if that turns out to not be enough, we can add a counter and only request handoff after multiple failed wakeups. There are a few tricky implementation details: - accepting a handoff must only be done in the wait-loop. Since the handoff condition is owner == current, it can easily cause recursive locking trouble. - accepting the handoff must be careful to provide the ACQUIRE semantics. - having the HANDOFF bit set on unlock requires care: we must not clear the owner. - we must be careful to not leave HANDOFF set after we've acquired the lock. The tricky scenario is setting the HANDOFF bit on an unlocked mutex. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 142 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 119 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index de1ce0bae0d5..b4ebd8b9bcd5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -54,8 +54,10 @@ EXPORT_SYMBOL(__mutex_init); * bits to store extra state. * * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + * Bit1 indicates unlock needs to hand the lock to the top-waiter */ #define MUTEX_FLAG_WAITERS 0x01 +#define MUTEX_FLAG_HANDOFF 0x02 #define MUTEX_FLAGS 0x03 @@ -71,20 +73,48 @@ static inline unsigned long __owner_flags(unsigned long owner) /* * Actual trylock that will work on any unlocked state. + * + * When setting the owner field, we must preserve the low flag bits. + * + * Be careful with @handoff, only set that in a wait-loop (where you set + * HANDOFF) to avoid recursive lock attempts. */ -static inline bool __mutex_trylock(struct mutex *lock) +static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ - unsigned long old; + unsigned long old, flags = __owner_flags(owner); + + if (__owner_task(owner)) { + if (handoff && unlikely(__owner_task(owner) == current)) { + /* + * Provide ACQUIRE semantics for the lock-handoff. + * + * We cannot easily use load-acquire here, since + * the actual load is a failed cmpxchg, which + * doesn't imply any barriers. + * + * Also, this is a fairly unlikely scenario, and + * this contains the cost. + */ + smp_mb(); /* ACQUIRE */ + return true; + } - if (__owner_task(owner)) return false; + } + + /* + * We set the HANDOFF bit, we must make sure it doesn't live + * past the point where we acquire it. This would be possible + * if we (accidentally) set the bit on an unlocked mutex. + */ + if (handoff) + flags &= ~MUTEX_FLAG_HANDOFF; - old = atomic_long_cmpxchg_acquire(&lock->owner, owner, - curr | __owner_flags(owner)); + old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); if (old == owner) return true; @@ -134,6 +164,39 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) atomic_long_andnot(flag, &lock->owner); } +static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter) +{ + return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; +} + +/* + * Give up ownership to a specific task, when @task = NULL, this is equivalent + * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE + * semantics like a regular unlock, the __mutex_trylock() provides matching + * ACQUIRE semantics for the handoff. 
+ */ +static void __mutex_handoff(struct mutex *lock, struct task_struct *task) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + for (;;) { + unsigned long old, new; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + new = (owner & MUTEX_FLAG_WAITERS); + new |= (unsigned long)task; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, new); + if (old == owner) + break; + + owner = old; + } +} + #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * We split the mutex lock/unlock logic into separate fastpath and @@ -398,7 +461,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, break; /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock)) { + if (__mutex_trylock(lock, false)) { osq_unlock(&lock->osq); return true; } @@ -523,6 +586,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; + bool first = false; int ret; if (use_ww_ctx) { @@ -534,7 +598,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + if (__mutex_trylock(lock, false) || + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (use_ww_ctx) { @@ -551,7 +616,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* * After waiting to acquire the wait_lock, try again. */ - if (__mutex_trylock(lock)) + if (__mutex_trylock(lock, false)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -561,13 +626,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - if (list_first_entry(&lock->wait_list, struct mutex_waiter, list) == &waiter) + if (__mutex_waiter_is_first(lock, &waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); lock_contended(&lock->dep_map, ip); for (;;) { - if (__mutex_trylock(lock)) + if (__mutex_trylock(lock, first)) break; /* @@ -586,17 +651,20 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, state); - - /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); + + if (!first && __mutex_waiter_is_first(lock, &waiter)) { + first = true; + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); + } } __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); if (likely(list_empty(&lock->wait_list))) - __mutex_clear_flag(lock, MUTEX_FLAG_WAITERS); + __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_free_waiter(&waiter); @@ -724,33 +792,61 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); */ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { + struct task_struct *next = NULL; unsigned long owner, flags; WAKE_Q(wake_q); mutex_release(&lock->dep_map, 1, ip); /* - * Release the lock before (potentially) taking the spinlock - * such that other contenders can get on with things ASAP. + * Release the lock before (potentially) taking the spinlock such that + * other contenders can get on with things ASAP. + * + * Except when HANDOFF, in that case we must not clear the owner field, + * but instead set it to the top waiter. 
*/ - owner = atomic_long_fetch_and_release(MUTEX_FLAGS, &lock->owner); - if (!__owner_flags(owner)) - return; + owner = atomic_long_read(&lock->owner); + for (;;) { + unsigned long old; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + if (owner & MUTEX_FLAG_HANDOFF) + break; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, + __owner_flags(owner)); + if (old == owner) { + if (owner & MUTEX_FLAG_WAITERS) + break; + + return; + } + + owner = old; + } spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = - list_entry(lock->wait_list.next, - struct mutex_waiter, list); + list_first_entry(&lock->wait_list, + struct mutex_waiter, list); + + next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - wake_q_add(&wake_q, waiter->task); + wake_q_add(&wake_q, next); } + if (owner & MUTEX_FLAG_HANDOFF) + __mutex_handoff(lock, next); + spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } @@ -853,7 +949,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock); + bool locked = __mutex_trylock(lock, false); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); -- cgit v1.2.3 From 5bbd7e644378334700889fb762d5893985a7311f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2016 13:42:12 +0200 Subject: locking/mutex: Restructure wait loop Doesn't really matter yet, but pull the HANDOFF and trylock out from under the wait_lock. The intention is to add an optimistic spin loop here, which requires we do not hold the wait_lock, so shuffle code around in preparation. Also clarify the purpose of taking the wait_lock in the wait loop: it's tempting to want to avoid it altogether, but the cancellation cases need it to avoid losing wakeups. Suggested-by: Waiman Long Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b4ebd8b9bcd5..8bb2304bb78d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -631,13 +631,21 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, lock_contended(&lock->dep_map, ip); + set_task_state(task, state); for (;;) { + /* + * Once we hold wait_lock, we're serialized against + * mutex_unlock() handing the lock off to us, do a trylock + * before testing the error conditions to make sure we pick up + * the handoff. + */ if (__mutex_trylock(lock, first)) - break; + goto acquired; /* - * got a signal? (This code gets eliminated in the - * TASK_UNINTERRUPTIBLE case.) + * Check for signals and wound conditions while holding + * wait_lock. This ensures the lock cancellation is ordered + * against mutex_unlock() and wake-ups do not go missing.
*/ if (unlikely(signal_pending_state(state, task))) { ret = -EINTR; @@ -650,16 +658,27 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - __set_task_state(task, state); spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); - spin_lock_mutex(&lock->wait_lock, flags); if (!first && __mutex_waiter_is_first(lock, &waiter)) { first = true; __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } + + set_task_state(task, state); + /* + * Here we order against unlock; we must either see it change + * state back to RUNNING and fall through the next schedule(), + * or we must see its unlock and acquire. + */ + if (__mutex_trylock(lock, first)) + break; + + spin_lock_mutex(&lock->wait_lock, flags); } + spin_lock_mutex(&lock->wait_lock, flags); +acquired: __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); @@ -682,6 +701,7 @@ skip_wait: return 0; err: + __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); -- cgit v1.2.3 From a40ca56577f628eb3f7af22b484e95edfdd047a2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 26 Aug 2016 19:35:08 -0400 Subject: locking/mutex: Simplify some ww_mutex code in __mutex_lock_common() This patch removes some of the redundant ww_mutex code in __mutex_lock_common(). Tested-by: Jason Low Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Ding Tianhong Cc: Imre Deak Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/1472254509-27508-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 8bb2304bb78d..6c0d3040e4dc 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -587,10 +587,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct mutex_waiter waiter; unsigned long flags; bool first = false; + struct ww_mutex *ww; int ret; if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + ww = container_of(lock, struct ww_mutex, base); if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; } @@ -602,12 +603,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - + if (use_ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); - } preempt_enable(); return 0; } @@ -691,10 +688,8 @@ skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (use_ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); - } spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable(); -- cgit v1.2.3 From b341afb325eb390f707a82cbefd65cda887302ab Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 26 Aug 2016 19:35:09 -0400 Subject: locking/mutex: Enable optimistic spinning of woken waiter This patch makes the waiter that sets the HANDOFF flag start spinning instead of sleeping until the handoff is complete or the owner sleeps. 
Otherwise, the handoff will cause the optimistic spinners to abort spinning as the handed-off owner may not be running. Tested-by: Jason Low Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Ding Tianhong Cc: Imre Deak Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/1472254509-27508-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 77 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 6c0d3040e4dc..17a88e929e6a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -416,24 +416,39 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) * * Returns true when the lock was taken, otherwise false, indicating * that we need to jump to the slowpath and sleep. + * + * The waiter flag is set to true if the spinner is a waiter in the wait + * queue. The waiter-spinner will spin on the lock directly and concurrently + * with the spinner at the head of the OSQ, if present, until the owner is + * changed to itself. */ static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { struct task_struct *task = current; - if (!mutex_can_spin_on_owner(lock)) - goto done; + if (!waiter) { + /* + * The purpose of the mutex_can_spin_on_owner() function is + * to eliminate the overhead of osq_lock() and osq_unlock() + * in case spinning isn't possible. As a waiter-spinner + * is not going to take OSQ lock anyway, there is no need + * to call mutex_can_spin_on_owner(). + */ + if (!mutex_can_spin_on_owner(lock)) + goto fail; - /* - * In order to avoid a stampede of mutex spinners trying to - * acquire the mutex all at once, the spinners need to take a - * MCS (queued) lock first before spinning on the owner field. - */ - if (!osq_lock(&lock->osq)) - goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ + if (!osq_lock(&lock->osq)) + goto fail; + } - while (true) { + for (;;) { struct task_struct *owner; if (use_ww_ctx && ww_ctx->acquired > 0) { @@ -449,7 +464,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * performed the optimistic spinning cannot be done. */ if (READ_ONCE(ww->ctx)) - break; + goto fail_unlock; } /* @@ -457,15 +472,20 @@ static bool mutex_optimistic_spin(struct mutex *lock, * release the lock or go to sleep. */ owner = __mutex_owner(lock); - if (owner && !mutex_spin_on_owner(lock, owner)) - break; + if (owner) { + if (waiter && owner == task) { + smp_mb(); /* ACQUIRE */ + break; + } - /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock, false)) { - osq_unlock(&lock->osq); - return true; + if (!mutex_spin_on_owner(lock, owner)) + goto fail_unlock; } + /* Try to acquire the mutex if it is unlocked. */ + if (__mutex_trylock(lock, waiter)) + break; + /* * The cpu_relax() call is a compiler barrier which forces * everything in this loop to be re-loaded. 
We don't need @@ -475,8 +495,17 @@ static bool mutex_optimistic_spin(struct mutex *lock, cpu_relax_lowlatency(); } - osq_unlock(&lock->osq); -done: + if (!waiter) + osq_unlock(&lock->osq); + + return true; + + +fail_unlock: + if (!waiter) + osq_unlock(&lock->osq); + +fail: /* * If we fell out of the spin path because of need_resched(), * reschedule now, before we try-lock the mutex. This avoids getting @@ -495,7 +524,8 @@ done: } #else static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { return false; } @@ -600,7 +630,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); if (__mutex_trylock(lock, false) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (use_ww_ctx) @@ -669,7 +699,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if (__mutex_trylock(lock, first)) + if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) || + __mutex_trylock(lock, first)) break; spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From 0ee1dd9f5e7eae4e55f95935b72d4beecb03de9c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Oct 2016 09:51:13 -0500 Subject: x86/dumpstack: Remove raw stack dump For mostly historical reasons, the x86 oops dump shows the raw stack values: ... [registers] Stack: ffff880079af7350 ffff880079905400 0000000000000000 ffffc900008f3ae0 ffffffffa0196610 0000000000000001 00010000ffffffff 0000000087654321 0000000000000002 0000000000000000 0000000000000000 0000000000000000 Call Trace: ... This seems to be an artifact from long ago, and probably isn't needed anymore. It generally just adds noise to the dump, and it can be actively harmful because it leaks kernel addresses. Linus says: "The stack dump actually goes back to forever, and it used to be useful back in 1992 or so. But it used to be useful mainly because stacks were simpler and we didn't have very good call traces anyway. I definitely remember having used them - I just do not remember having used them in the last ten+ years. Of course, it's still true that if you can trigger an oops, you've likely already lost the security game, but since the stack dump is so useless, let's aim to just remove it and make games like the above harder." This also removes the related 'kstack=' cmdline option and the 'kstack_depth_to_print' sysctl. Suggested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e83bd50df52d8fe88e94d2566426ae40d813bf8f.1477405374.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 3 -- Documentation/sysctl/kernel.txt | 8 ----- Documentation/x86/x86_64/boot-options.txt | 4 --- arch/x86/include/asm/stacktrace.h | 5 --- arch/x86/kernel/dumpstack.c | 21 ++---------- arch/x86/kernel/dumpstack_32.c | 33 +------------------ arch/x86/kernel/dumpstack_64.c | 53 +------------------------------ kernel/sysctl.c | 7 ---- 8 files changed, 4 insertions(+), 130 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 37babf91f2cb..049a9172ed22 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1958,9 +1958,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. kmemcheck=2 (one-shot mode) Default: 2 (one-shot mode) - kstack=N [X86] Print N words from the kernel stack - in oops dumps. - kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. Default is 0 (don't ignore, but inject #GP) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index ffab8b5caa60..065f18478c1c 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -40,7 +40,6 @@ show up in /proc/sys/kernel: - hung_task_warnings - kexec_load_disabled - kptr_restrict -- kstack_depth_to_print [ X86 only ] - l2cr [ PPC only ] - modprobe ==> Documentation/debugging-modules.txt - modules_disabled @@ -395,13 +394,6 @@ When kptr_restrict is set to (2), kernel pointers printed using ============================================================== -kstack_depth_to_print: (X86 only) - -Controls the number of words to print when dumping the raw -kernel stack. - -============================================================== - l2cr: (PPC only) This flag controls the L2 cache of G3 processor boards. If diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 0965a71f9942..61b611e9eeaf 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -277,10 +277,6 @@ IOMMU (input/output memory management unit) space might stop working. Use this option if you have devices that are accessed from userspace directly on some PCI host bridge. -Debugging - - kstack=N Print N words from the kernel stack in oops dumps. 
- Miscellaneous nogbpages diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 37f2e0b377ad..1e375b05cfa8 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -43,8 +43,6 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) addr + len > begin && addr + len <= end); } -extern int kstack_depth_to_print; - #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else @@ -86,9 +84,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl); -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl); - extern unsigned int code_bytes; /* The form of the top of the frame on the stack */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f967652500fa..499aa6f0fde5 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,7 +22,6 @@ int panic_on_unrecovered_nmi; int panic_on_io_nmi; unsigned int code_bytes = 64; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; bool in_task_stack(unsigned long *stack, struct task_struct *task, @@ -171,12 +170,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, KERN_DEFAULT); + show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -295,22 +294,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init kstack_setup(char *s) -{ - ssize_t ret; - unsigned long val; - - if (!s) - return -EINVAL; - - ret = kstrtoul(s, 0, &val); - if (ret) - return ret; - kstack_depth_to_print = val; - return 0; -} -early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { ssize_t ret; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 06eb322b5f9f..90cf460d50bd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -121,36 +121,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - sp = sp ? 
: get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %08lx", log_lvl, *stack++); - } else - pr_cont(" %08lx", *stack++); - touch_nmi_watchdog(); - } - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - - void show_regs(struct pt_regs *regs) { int i; @@ -168,8 +138,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - pr_emerg("Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_EMERG); + show_trace_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 36cf1a498227..310abf4542dc 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -140,56 +140,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *irq_stack_end; - unsigned long *irq_stack; - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); - irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - - sp = sp ? : get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - unsigned long word; - - if (stack >= irq_stack && stack <= irq_stack_end) { - if (stack == irq_stack_end) { - stack = (unsigned long *) (irq_stack_end[-1]); - pr_cont(" "); - } - } else { - if (kstack_end(stack)) - break; - } - - if (probe_kernel_address(stack, word)) - break; - - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %016lx", log_lvl, word); - } else - pr_cont(" %016lx", word); - - stack++; - touch_nmi_watchdog(); - } - - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - void show_regs(struct pt_regs *regs) { int i; @@ -207,8 +157,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..17a5a8253294 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -989,13 +989,6 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, - { - .procname = "kstack_depth_to_print", - .data = &kstack_depth_to_print, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "io_delay_type", .data = &io_delay_type, -- cgit v1.2.3 From ca7dfdbb33675151ad0854aea11340f95c38aff3 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 26 Oct 2016 16:37:53 +1100 Subject: kernel/smp: Define pr_fmt() for smp.c This makes all our pr_xxx()'s start with "smp: ", which helps pin down where they come from and generally looks nice. There is actually only one pr_xxx() use in smp.c at the moment, but we will add some more in the next commit. 
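For readers unfamiliar with the pr_fmt() convention, a minimal userspace sketch of the mechanism (illustrative only; this is not the kernel's printk machinery, and "smp" merely stands in for KBUILD_MODNAME):

	#include <stdio.h>

	/*
	 * pr_fmt() is expanded at every pr_*() call site, so defining it
	 * once near the top of a file prefixes all messages from that file.
	 */
	#define pr_fmt(fmt) "smp" ": " fmt
	#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

	int main(void)
	{
		pr_info("Brought up %d node%s, %d CPU%s\n", 2, "s", 1, "");
		/* prints: smp: Brought up 2 nodes, 1 CPU */
		return 0;
	}
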
Suggested-by: Borislav Petkov Signed-off-by: Michael Ellerman Cc: akpm@osdl.org Cc: jgross@suse.com Cc: ak@linux.intel.com Cc: tim.c.chen@linux.intel.com Cc: len.brown@intel.com Cc: peterz@infradead.org Cc: richard@nod.at Cc: jolsa@redhat.com Cc: boris.ostrovsky@oracle.com Cc: mgorman@techsingularity.net Link: http://lkml.kernel.org/r/1477460275-8266-1-git-send-email-mpe@ellerman.id.au Signed-off-by: Thomas Gleixner --- kernel/smp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index bba3b201668d..2d1f15d43022 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -3,6 +3,9 @@ * * (C) Jens Axboe 2008 */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include -- cgit v1.2.3 From 92b23278298304f72bbc786a737f2646f4b9aa9d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 26 Oct 2016 16:37:54 +1100 Subject: kernel/smp: Make the SMP boot message common on all arches Currently after bringing up secondary CPUs all arches print "Brought up %d CPUs". On x86 they also print the number of nodes that were brought online. It would be nice to also print the number of nodes on other arches. Although we could override smp_announce() on the other ~10 NUMA aware arches, it seems simpler to just always print the number of nodes. On non-NUMA arches there is just always 1 node. Having done that, smp_announce() is no longer weak, and seems small enough to just pull directly into smp_init(). Also update the printing of "%d CPUs" to be smart when an SMP kernel is booted on a single CPU system, or when only one CPU is available, eg: smp: Brought up 2 nodes, 1 CPU Signed-off-by: Michael Ellerman Reviewed-by: Borislav Petkov Cc: akpm@osdl.org Cc: jgross@suse.com Cc: ak@linux.intel.com Cc: tim.c.chen@linux.intel.com Cc: len.brown@intel.com Cc: peterz@infradead.org Cc: richard@nod.at Cc: jolsa@redhat.com Cc: boris.ostrovsky@oracle.com Cc: mgorman@techsingularity.net Link: http://lkml.kernel.org/r/1477460275-8266-2-git-send-email-mpe@ellerman.id.au Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 8 -------- kernel/smp.c | 13 +++++++------ 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 42f5eb7b4f6c..b9f02383f372 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -821,14 +821,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -void smp_announce(void) -{ - int num_nodes = num_online_nodes(); - - printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n", - num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus()); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { diff --git a/kernel/smp.c b/kernel/smp.c index 2d1f15d43022..4323c5db7d26 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -546,14 +546,10 @@ void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -void __weak smp_announce(void) -{ - printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); -} - /* Called by boot processor to activate the rest. */ void __init smp_init(void) { + int num_nodes, num_cpus; unsigned int cpu; idle_threads_init(); @@ -567,8 +563,13 @@ void __init smp_init(void) cpu_up(cpu); } + num_nodes = num_online_nodes(); + num_cpus = num_online_cpus(); + pr_info("Brought up %d node%s, %d CPU%s\n", + num_nodes, (num_nodes > 1 ? 
"s" : ""), + num_cpus, (num_cpus > 1 ? "s" : "")); + /* Any cleanup work */ - smp_announce(); smp_cpus_done(setup_max_cpus); } -- cgit v1.2.3 From 51111dce2509506d16efd321939895ff7ffe1dc2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 26 Oct 2016 16:37:55 +1100 Subject: kernel/smp: Tell the user we're bringing up secondary CPUs Currently we don't print anything before starting to bring up secondary CPUs. This can be confusing if it takes a long time to bring up the secondaries, or if the kernel crashes while doing so and produces no further output. On x86 they work around this by detecting when the first secondary CPU comes up and printing a message (see announce_cpu()). But doing it in smp_init() is simpler and works for all arches. Signed-off-by: Michael Ellerman Reviewed-by: Borislav Petkov Cc: akpm@osdl.org Cc: jgross@suse.com Cc: ak@linux.intel.com Cc: tim.c.chen@linux.intel.com Cc: len.brown@intel.com Cc: peterz@infradead.org Cc: richard@nod.at Cc: jolsa@redhat.com Cc: boris.ostrovsky@oracle.com Cc: mgorman@techsingularity.net Link: http://lkml.kernel.org/r/1477460275-8266-3-git-send-email-mpe@ellerman.id.au Signed-off-by: Thomas Gleixner --- kernel/smp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4323c5db7d26..77fcdb9f2775 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -555,6 +555,8 @@ void __init smp_init(void) idle_threads_init(); cpuhp_threads_init(); + pr_info("Bringing up secondary CPUs ...\n"); + /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { if (num_online_cpus() >= setup_max_cpus) -- cgit v1.2.3 From 6c5e9059692567740a4ee51530dffe51a4b9584d Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 21 Oct 2016 08:58:50 -0700 Subject: timers: Fix usleep_range() in the context of wake_up_process() Users of usleep_range() expect that it will _never_ return in less time than the minimum passed parameter. However, nothing in the code ensures this, when the sleeping task is woken by wake_up_process() or any other mechanism which can wake a task from uninterruptible state. Neither usleep_range() nor schedule_hrtimeout_range*() have any protection against wakeups. schedule_hrtimeout_range*() is designed this way despite the fact that the API documentation does not mention it. msleep() already has code to handle this case since it will loop as long as there was still time left. usleep_range() has no such loop, add it. Presumably this problem was not detected before because usleep_range() is only used in a few places and the function is mostly used in contexts which are not exposed to wakeups of any form. An effort was made to look for users relying on the old behavior by looking for usleep_range() in the same file as wake_up_process(). No problems were found by this search, though it is conceivable that someone could have put the sleep and wakeup in two different files. An effort was made to ask several upstream maintainers if they were aware of people relying on wake_up_process() to wake up usleep_range(). No maintainers were aware of that but they were aware of many people relying on usleep_range() never returning before the minimum. 
Reported-by: Tao Huang Signed-off-by: Douglas Anderson Cc: heiko@sntech.de Cc: broonie@kernel.org Cc: briannorris@chromium.org Cc: Andreas Mohr Cc: linux-rockchip@lists.infradead.org Cc: tony.xie@rock-chips.com Cc: John Stultz Cc: djkurtz@chromium.org Cc: linux@roeck-us.net Cc: tskd08@gmail.com Link: http://lkml.kernel.org/r/1477065531-30342-1-git-send-email-dianders@chromium.org Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d47980a1bc4..12681c9a7683 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1896,16 +1896,6 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); -static void __sched do_usleep_range(unsigned long min, unsigned long max) -{ - ktime_t kmin; - u64 delta; - - kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (u64)(max - min) * NSEC_PER_USEC; - schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); -} - /** * usleep_range - Sleep for an approximate time * @min: Minimum time in usecs to sleep @@ -1919,7 +1909,14 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max) */ void __sched usleep_range(unsigned long min, unsigned long max) { - __set_current_state(TASK_UNINTERRUPTIBLE); - do_usleep_range(min, max); + ktime_t exp = ktime_add_us(ktime_get(), min); + u64 delta = (u64)(max - min) * NSEC_PER_USEC; + + for (;;) { + __set_current_state(TASK_UNINTERRUPTIBLE); + /* Do not return before the requested sleep time has elapsed */ + if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) + break; + } } EXPORT_SYMBOL(usleep_range); -- cgit v1.2.3 From 4b7e9cf9c84b09adc428e0433cd376b91f9c52a7 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 21 Oct 2016 08:58:51 -0700 Subject: timers: Fix documentation for schedule_timeout() and similar The documentation for schedule_timeout(), schedule_hrtimeout(), and schedule_hrtimeout_range() all claim that the routines couldn't possibly return early if the task state was TASK_UNINTERRUPTIBLE. This is simply not true since wake_up_process() will cause those routines to exit early. We cannot make schedule_[hr]timeout() loop until the timeout expires if the task state is uninterruptible because we have users which rely on the existing and designed behaviour. Make the documentation match the (correct) implementation. schedule_hrtimeout() returns -EINTR even when an uninterruptible task was woken up. This might look strange, but making the return code depend on the state is too much of an effort as it would affect all the call sites. There is no value in doing so, but we spell it out clearly in the documentation.
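Taken together, the two patches above document and enforce one rule: the wait primitive may return early on an explicit wakeup, so only the caller (or a wrapper such as msleep() or the fixed usleep_range()) can guarantee a minimum delay, by looping against an absolute deadline. A userspace analogue of that loop, offered as a sketch rather than kernel code (clock_nanosleep() plays the role of schedule_hrtimeout_range(), and a signal plays the role of wake_up_process()):

	#include <errno.h>
	#include <time.h>

	/*
	 * Sleep at least min_us microseconds, re-waiting after any early
	 * wakeup. TIMER_ABSTIME keeps every retry aimed at the original
	 * expiry, so wakeups cannot shorten the total sleep.
	 */
	static void sleep_at_least_us(unsigned long min_us)
	{
		struct timespec deadline;

		clock_gettime(CLOCK_MONOTONIC, &deadline);
		deadline.tv_sec += min_us / 1000000;
		deadline.tv_nsec += (long)(min_us % 1000000) * 1000;
		if (deadline.tv_nsec >= 1000000000) {
			deadline.tv_sec++;
			deadline.tv_nsec -= 1000000000;
		}

		while (clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
				       &deadline, NULL) == EINTR)
			; /* woken early: not done yet, go back to sleep */
	}
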
Suggested-by: Daniel Kurtz Signed-off-by: Douglas Anderson Cc: huangtao@rock-chips.com Cc: heiko@sntech.de Cc: broonie@kernel.org Cc: briannorris@chromium.org Cc: Andreas Mohr Cc: linux-rockchip@lists.infradead.org Cc: tony.xie@rock-chips.com Cc: John Stultz Cc: linux@roeck-us.net Cc: tskd08@gmail.com Link: http://lkml.kernel.org/r/1477065531-30342-2-git-send-email-dianders@chromium.org Signed-off-by: Thomas Gleixner --- kernel/time/hrtimer.c | 20 ++++++++++++++------ kernel/time/timer.c | 11 +++++++---- 2 files changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bb5ec425dfe0..08be5c99d26b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1742,15 +1742,19 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) @@ -1772,15 +1776,19 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. + * delivered to the current task or the current task is explicitly woken + * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * - * Returns 0 when the timer has expired otherwise -EINTR + * Returns 0 when the timer has expired. If the task was woken before the + * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or + * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 12681c9a7683..88aab86a4594 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1691,11 +1691,12 @@ static void process_timeout(unsigned long __data) * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 + * pass before the routine returns unless the current task is explicitly + * woken up, (e.g. by wake_up_process())". * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time + * delivered to the current task or the current task is explicitly woken + * up. 
* * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. @@ -1704,7 +1705,9 @@ static void process_timeout(unsigned long __data) * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * - * In all cases the return value is guaranteed to be non-negative. + * Returns 0 when the timer has expired otherwise the remaining time in + * jiffies will be returned. In all cases the return value is guaranteed + * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { -- cgit v1.2.3 From a07ea4d9941af5a0c6f0be2a71b51ac9c083c5e5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:02 +0200 Subject: genetlink: no longer support using static family IDs Static family IDs have never really been used, the only use case was the workaround I introduced for those users that assumed their family ID was also their multicast group ID. Additionally, because static family IDs would never be reserved by the generic netlink code, using a relatively low ID would only work for built-in families that can be registered immediately after generic netlink is started, which is basically only the control family (apart from the workaround code, which I also had to add code for so it would reserve those IDs) Thus, anything other than GENL_ID_GENERATE is flawed and luckily not used except in the cases I mentioned. Move those workarounds into a few lines of code, and then get rid of GENL_ID_GENERATE entirely, making it more robust. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- drivers/acpi/event.c | 1 - drivers/net/gtp.c | 1 - drivers/net/macsec.c | 1 - drivers/net/team/team.c | 1 - drivers/net/wireless/mac80211_hwsim.c | 1 - drivers/scsi/pmcraid.c | 6 ------ drivers/target/target_core_user.c | 1 - drivers/thermal/thermal_core.c | 1 - fs/dlm/netlink.c | 1 - fs/quota/netlink.c | 7 ------- include/linux/genl_magic_func.h | 1 - include/net/genetlink.h | 7 ++----- include/uapi/linux/genetlink.h | 1 - kernel/taskstats.c | 1 - net/batman-adv/netlink.c | 1 - net/core/devlink.c | 1 - net/core/drop_monitor.c | 1 - net/hsr/hsr_netlink.c | 1 - net/ieee802154/netlink.c | 1 - net/ieee802154/nl802154.c | 1 - net/ipv4/fou.c | 1 - net/ipv4/tcp_metrics.c | 1 - net/ipv6/ila/ila_xlat.c | 1 - net/irda/irnetlink.c | 1 - net/l2tp/l2tp_netlink.c | 1 - net/netfilter/ipvs/ip_vs_ctl.c | 1 - net/netlabel/netlabel_calipso.c | 1 - net/netlabel/netlabel_cipso_v4.c | 1 - net/netlabel/netlabel_mgmt.c | 1 - net/netlabel/netlabel_unlabeled.c | 1 - net/netlink/genetlink.c | 37 +++++++++++++++++++++-------------- net/nfc/netlink.c | 1 - net/openvswitch/datapath.c | 4 ---- net/tipc/netlink.c | 1 - net/tipc/netlink_compat.c | 1 - net/wimax/stack.c | 1 - net/wireless/nl80211.c | 1 - 37 files changed, 24 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index e24ea4e796e4..8dfca3d53131 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -83,7 +83,6 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = { }; static struct genl_family acpi_event_genl_family = { - .id = GENL_ID_GENERATE, .name = ACPI_GENL_FAMILY_NAME, .version = ACPI_GENL_VERSION, .maxattr = ACPI_GENL_ATTR_MAX, diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 97e0cbca0a08..f66737ba1299 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1095,7 +1095,6 @@ static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info) } static struct genl_family 
gtp_genl_family = { - .id = GENL_ID_GENERATE, .name = "gtp", .version = 0, .hdrsize = 0, diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 1a134cb2d52c..a5309b81a786 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1422,7 +1422,6 @@ static void clear_tx_sa(struct macsec_tx_sa *tx_sa) } static struct genl_family macsec_fam = { - .id = GENL_ID_GENERATE, .name = MACSEC_GENL_NAME, .hdrsize = 0, .version = MACSEC_GENL_VERSION, diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index a380649bf6b5..0b50205764ff 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2151,7 +2151,6 @@ static struct rtnl_link_ops team_link_ops __read_mostly = { ***********************************/ static struct genl_family team_nl_family = { - .id = GENL_ID_GENERATE, .name = TEAM_GENL_NAME, .version = TEAM_GENL_VERSION, .maxattr = TEAM_ATTR_MAX, diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index e95b79bccf9b..54b6cd62676e 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -589,7 +589,6 @@ struct hwsim_radiotap_ack_hdr { /* MAC80211_HWSIM netlinf family */ static struct genl_family hwsim_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "MAC80211_HWSIM", .version = 1, diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index 68a5c347fae9..cc50eb87b28a 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -1369,12 +1369,6 @@ static struct genl_multicast_group pmcraid_mcgrps[] = { }; static struct genl_family pmcraid_event_family = { - /* - * Due to prior multicast group abuse (the code having assumed that - * the family ID can be used as a multicast group ID) we need to - * statically allocate a family (and thus group) ID. 
- */ - .id = GENL_ID_PMCRAID, .name = "pmcraid", .version = 1, .maxattr = PMCRAID_AEN_ATTR_MAX, diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 62bf4fe5704a..313a0ef3cda7 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -148,7 +148,6 @@ static const struct genl_multicast_group tcmu_mcgrps[] = { /* Our generic netlink family */ static struct genl_family tcmu_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "TCM-USER", .version = 1, diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 226b0b4aced6..68d7503f6417 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -2164,7 +2164,6 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = { }; static struct genl_family thermal_event_genl_family = { - .id = GENL_ID_GENERATE, .name = THERMAL_GENL_FAMILY_NAME, .version = THERMAL_GENL_VERSION, .maxattr = THERMAL_GENL_ATTR_MAX, diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 1e6e227134d7..00d226956264 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -17,7 +17,6 @@ static uint32_t dlm_nl_seqnum; static uint32_t listener_nlportid; static struct genl_family family = { - .id = GENL_ID_GENERATE, .name = DLM_GENL_NAME, .version = DLM_GENL_VERSION, }; diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 8b252673d454..3965a5cdfaa2 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -13,13 +13,6 @@ static const struct genl_multicast_group quota_mcgrps[] = { /* Netlink family structure for quota */ static struct genl_family quota_genl_family = { - /* - * Needed due to multicast group ID abuse - old code assumed - * the family ID was also a valid multicast group ID (which - * isn't true) and userspace might thus rely on it. Assign a - * static ID for this group to make dealing with that easier. - */ - .id = GENL_ID_VFS_DQUOT, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 667c31101b8b..7c070c1fe457 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -260,7 +260,6 @@ static struct genl_ops ZZZ_genl_ops[] __read_mostly = { */ #define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) static struct genl_family ZZZ_genl_family __read_mostly = { - .id = GENL_ID_GENERATE, .name = __stringify(GENL_MAGIC_FAMILY), .version = GENL_MAGIC_VERSION, #ifdef GENL_MAGIC_FAMILY_HDRSZ diff --git a/include/net/genetlink.h b/include/net/genetlink.h index ef9defb3f5bc..43a5c3975a2f 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -20,7 +20,7 @@ struct genl_info; /** * struct genl_family - generic netlink family - * @id: protocol family idenfitier + * @id: protocol family identifier (private) * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version @@ -48,7 +48,7 @@ struct genl_info; * @n_ops: number of operations supported by this family (private) */ struct genl_family { - unsigned int id; + unsigned int id; /* private */ unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; @@ -149,9 +149,6 @@ static inline int genl_register_family(struct genl_family *family) * Registers the specified family and operations from the specified table. * Only one family may be registered with the same family name or identifier. * - * The family id may equal GENL_ID_GENERATE causing an unique id to - * be automatically generated and assigned. 
- * * Either a doit or dumpit callback must be specified for every registered * operation or the function will fail. Only one operation structure per * command identifier may be registered. diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 5512c90af7e3..d9b2db4a29c6 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -26,7 +26,6 @@ struct genlmsghdr { /* * List of reserved static generic netlink identifiers: */ -#define GENL_ID_GENERATE 0 #define GENL_ID_CTRL NLMSG_MIN_TYPE #define GENL_ID_VFS_DQUOT (NLMSG_MIN_TYPE + 1) #define GENL_ID_PMCRAID (NLMSG_MIN_TYPE + 2) diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..d7a1a9461a10 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -42,7 +42,6 @@ static int family_registered; struct kmem_cache *taskstats_cache; static struct genl_family family = { - .id = GENL_ID_GENERATE, .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, .maxattr = TASKSTATS_CMD_ATTR_MAX, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 64cb6acbe0a6..a03b0ed7e8dd 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -49,7 +49,6 @@ #include "translation-table.h" struct genl_family batadv_netlink_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, diff --git a/net/core/devlink.c b/net/core/devlink.c index d2fd736de6a2..3008d9c33875 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -342,7 +342,6 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, } static struct genl_family devlink_nl_family = { - .id = GENL_ID_GENERATE, .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, .maxattr = DEVLINK_ATTR_MAX, diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 72cfb0c61125..a5320dfcd978 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -60,7 +60,6 @@ struct dm_hw_stat_delta { }; static struct genl_family net_drop_monitor_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "NET_DM", .version = 2, diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index d4d1617f43a8..2ad039492bee 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -132,7 +132,6 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = { }; static struct genl_family hsr_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "HSR", .version = 1, diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index c8133c07ceee..19144158b696 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -29,7 +29,6 @@ static unsigned int ieee802154_seq_num; static DEFINE_SPINLOCK(ieee802154_seq_lock); struct genl_family nl802154_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = IEEE802154_NL_NAME, .version = 1, diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 21aabadccd0e..182299858f1d 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -34,7 +34,6 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, /* the netlink family */ static struct genl_family nl802154_fam = { - .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ .name = NL802154_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index cf50f7e2b012..e3fc527c5d37 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -623,7 +623,6 @@ static int 
fou_destroy(struct net *net, struct fou_cfg *cfg) } static struct genl_family fou_nl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index bf1f3b2b29d1..3da305127b32 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -743,7 +743,6 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, } static struct genl_family tcp_metrics_nl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index e604013dd814..0d57e27d1cdd 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -119,7 +119,6 @@ static const struct rhashtable_params rht_params = { }; static struct genl_family ila_nl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = ILA_GENL_NAME, .version = ILA_GENL_VERSION, diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c index e15c40e86660..f23b81aa91fe 100644 --- a/net/irda/irnetlink.c +++ b/net/irda/irnetlink.c @@ -25,7 +25,6 @@ static struct genl_family irda_nl_family = { - .id = GENL_ID_GENERATE, .name = IRDA_NL_NAME, .hdrsize = 0, .version = IRDA_NL_VERSION, diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index bf3117771822..4fbf1f41ac52 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -32,7 +32,6 @@ static struct genl_family l2tp_nl_family = { - .id = GENL_ID_GENERATE, .name = L2TP_GENL_NAME, .version = L2TP_GENL_VERSION, .hdrsize = 0, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c3c809b2e712..ceed66cdd03e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2841,7 +2841,6 @@ static struct nf_sockopt_ops ip_vs_sockopts = { /* IPVS genetlink family */ static struct genl_family ip_vs_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 2ec93c5e77bb..152e503b8c5d 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -61,7 +61,6 @@ struct netlbl_domhsh_walk_arg { /* NetLabel Generic NETLINK CALIPSO family */ static struct genl_family netlbl_calipso_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_CALIPSO_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 7fd1104ba900..755b284e7ad4 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -60,7 +60,6 @@ struct netlbl_domhsh_walk_arg { /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_cipsov4_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_CIPSOV4_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index f85d0e07af2d..3b00f2368fcd 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -61,7 +61,6 @@ struct netlbl_domhsh_walk_arg { /* NetLabel Generic NETLINK CIPSOv4 family */ static struct genl_family netlbl_mgmt_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_MGMT_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 4528cff9138b..c2ea8d1f653a 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ 
b/net/netlabel/netlabel_unlabeled.c @@ -124,7 +124,6 @@ static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 01291b7a27bb..f19ec969edee 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -349,8 +349,6 @@ static int genl_validate_ops(const struct genl_family *family) * * Registers the specified family after validating it first. Only one * family may be registered with the same family name or identifier. - * The family id may equal GENL_ID_GENERATE causing an unique id to - * be automatically generated and assigned. * * The family's ops array must already be assigned, you can use the * genl_register_family_with_ops() helper function. @@ -359,13 +357,7 @@ static int genl_validate_ops(const struct genl_family *family) */ int __genl_register_family(struct genl_family *family) { - int err = -EINVAL, i; - - if (family->id && family->id < GENL_MIN_ID) - goto errout; - - if (family->id > GENL_MAX_ID) - goto errout; + int err, i; err = genl_validate_ops(family); if (err) @@ -378,8 +370,27 @@ int __genl_register_family(struct genl_family *family) goto errout_locked; } - if (family->id == GENL_ID_GENERATE) { - u16 newid = genl_generate_id(); + if (family == &genl_ctrl) { + family->id = GENL_ID_CTRL; + } else { + u16 newid; + + /* this should be left zero in the struct */ + WARN_ON(family->id); + + /* + * Sadly, a few cases need to be special-cased + * due to them having previously abused the API + * and having used their family ID also as their + * multicast group ID, so we use reserved IDs + * for both to be sure we can do that mapping. 
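+ * Only the "pmcraid" and "VFS_DQUOT" names are mapped to
+ * their old reserved IDs; any other family falls through
+ * to genl_generate_id() below.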
+ */ + if (strcmp(family->name, "pmcraid") == 0) + newid = GENL_ID_PMCRAID; + else if (strcmp(family->name, "VFS_DQUOT") == 0) + newid = GENL_ID_VFS_DQUOT; + else + newid = genl_generate_id(); if (!newid) { err = -ENOMEM; @@ -387,9 +398,6 @@ int __genl_register_family(struct genl_family *family) } family->id = newid; - } else if (genl_family_find_byid(family->id)) { - err = -EEXIST; - goto errout_locked; } if (family->maxattr && !family->parallel_ops) { @@ -419,7 +427,6 @@ int __genl_register_family(struct genl_family *family) errout_locked: genl_unlock_all(); -errout: return err; } EXPORT_SYMBOL(__genl_register_family); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 79786bf62b88..c230403e066c 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -39,7 +39,6 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = { }; static struct genl_family nfc_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 194435aa1165..f9fef7dfba15 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -671,7 +671,6 @@ static const struct genl_ops dp_packet_genl_ops[] = { }; static struct genl_family dp_packet_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, @@ -1436,7 +1435,6 @@ static const struct genl_ops dp_flow_genl_ops[] = { }; static struct genl_family dp_flow_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, @@ -1822,7 +1820,6 @@ static const struct genl_ops dp_datapath_genl_ops[] = { }; static struct genl_family dp_datapath_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, @@ -2244,7 +2241,6 @@ static const struct genl_ops dp_vport_genl_ops[] = { }; struct genl_family dp_vport_genl_family = { - .id = GENL_ID_GENERATE, .hdrsize = sizeof(struct ovs_header), .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 4b94f3cfe3af..383b8fedabc7 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -136,7 +136,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { * so we have a separate genl handling for the new API. 
*/ struct genl_family tipc_genl_family = { - .id = GENL_ID_GENERATE, .name = TIPC_GENL_V2_NAME, .version = TIPC_GENL_V2_VERSION, .hdrsize = 0, diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 1fd464764765..f04428e4c8e5 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1216,7 +1216,6 @@ send: } static struct genl_family tipc_genl_compat_family = { - .id = GENL_ID_GENERATE, .name = TIPC_GENL_NAME, .version = TIPC_GENL_VERSION, .hdrsize = TIPC_GENL_HDRLEN, diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 3f816e2971ee..8ac83a41585f 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -573,7 +573,6 @@ size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); struct genl_family wimax_gnl_family = { - .id = GENL_ID_GENERATE, .name = "WiMAX", .version = WIMAX_GNL_VERSION, .hdrsize = 0, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7d8cb3330c86..714beafe05e0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -39,7 +39,6 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, /* the netlink family */ static struct genl_family nl80211_fam = { - .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ .name = NL80211_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ -- cgit v1.2.3 From 489111e5c25b93be80340c3113d71903d7c82136 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:03 +0200 Subject: genetlink: statically initialize families Instead of providing macros/inline functions to initialize the families, make all users initialize them statically and get rid of the macros. This reduces the kernel code size by about 1.6k on x86-64 (with allyesconfig). Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller --- drivers/acpi/event.c | 1 + drivers/net/gtp.c | 21 +++++++---- drivers/net/macsec.c | 21 +++++++---- drivers/net/team/team.c | 22 +++++++---- drivers/net/wireless/mac80211_hwsim.c | 26 +++++++------ drivers/scsi/pmcraid.c | 1 + drivers/target/target_core_user.c | 1 + drivers/thermal/thermal_core.c | 1 + fs/dlm/netlink.c | 15 +++++--- fs/quota/netlink.c | 1 + include/linux/drbd_genl.h | 2 +- include/linux/genl_magic_func.h | 28 ++++++++------ include/net/genetlink.h | 71 ++++++----------------------------- kernel/taskstats.c | 17 ++++++--- net/batman-adv/netlink.c | 25 +++++++----- net/core/devlink.c | 27 +++++++------ net/core/drop_monitor.c | 20 ++++++---- net/hsr/hsr_netlink.c | 22 +++++++---- net/ieee802154/netlink.c | 23 +++++++----- net/ieee802154/nl802154.c | 34 ++++++++--------- net/ipv4/fou.c | 22 ++++++----- net/ipv4/tcp_metrics.c | 22 ++++++----- net/ipv6/ila/ila_xlat.c | 24 +++++++----- net/irda/irnetlink.c | 19 ++++++---- net/l2tp/l2tp_netlink.c | 25 +++++++----- net/netfilter/ipvs/ip_vs_ctl.c | 22 ++++++----- net/netlabel/netlabel_calipso.c | 20 ++++++---- net/netlabel/netlabel_cipso_v4.c | 21 ++++++----- net/netlabel/netlabel_mgmt.c | 20 ++++++---- net/netlabel/netlabel_unlabeled.c | 20 ++++++---- net/netlink/genetlink.c | 35 +++++++++-------- net/nfc/netlink.c | 24 +++++++----- net/openvswitch/datapath.c | 4 ++ net/tipc/netlink.c | 22 ++++++----- net/tipc/netlink_compat.c | 20 +++++----- net/wimax/stack.c | 19 +++++----- net/wireless/nl80211.c | 33 ++++++++-------- 37 files changed, 414 insertions(+), 337 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index 8dfca3d53131..1ab12ad7d5ba 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -83,6 +83,7 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = { }; static struct genl_family acpi_event_genl_family = { + .module = THIS_MODULE, .name = ACPI_GENL_FAMILY_NAME, .version = ACPI_GENL_VERSION, .maxattr = ACPI_GENL_ATTR_MAX, diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index f66737ba1299..0604fd78f826 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1094,13 +1094,7 @@ static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info) return 0; } -static struct genl_family gtp_genl_family = { - .name = "gtp", - .version = 0, - .hdrsize = 0, - .maxattr = GTPA_MAX, - .netnsok = true, -}; +static struct genl_family gtp_genl_family; static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, u32 type, struct pdp_ctx *pctx) @@ -1296,6 +1290,17 @@ static const struct genl_ops gtp_genl_ops[] = { }, }; +static struct genl_family gtp_genl_family = { + .name = "gtp", + .version = 0, + .hdrsize = 0, + .maxattr = GTPA_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = gtp_genl_ops, + .n_ops = ARRAY_SIZE(gtp_genl_ops), +}; + static int __net_init gtp_net_init(struct net *net) { struct gtp_net *gn = net_generic(net, gtp_net_id); @@ -1335,7 +1340,7 @@ static int __init gtp_init(void) if (err < 0) goto error_out; - err = genl_register_family_with_ops(>p_genl_family, gtp_genl_ops); + err = genl_register_family(>p_genl_family); if (err < 0) goto unreg_rtnl_link; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index a5309b81a786..63ca7a3c77cf 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1421,13 +1421,7 @@ static void clear_tx_sa(struct macsec_tx_sa *tx_sa) macsec_txsa_put(tx_sa); } -static struct genl_family macsec_fam = { - .name = MACSEC_GENL_NAME, - .hdrsize = 0, - .version = 
MACSEC_GENL_VERSION, - .maxattr = MACSEC_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family macsec_fam; static struct net_device *get_dev_from_nl(struct net *net, struct nlattr **attrs) @@ -2654,6 +2648,17 @@ static const struct genl_ops macsec_genl_ops[] = { }, }; +static struct genl_family macsec_fam = { + .name = MACSEC_GENL_NAME, + .hdrsize = 0, + .version = MACSEC_GENL_VERSION, + .maxattr = MACSEC_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = macsec_genl_ops, + .n_ops = ARRAY_SIZE(macsec_genl_ops), +}; + static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -3461,7 +3466,7 @@ static int __init macsec_init(void) if (err) goto notifier; - err = genl_register_family_with_ops(&macsec_fam, macsec_genl_ops); + err = genl_register_family(&macsec_fam); if (err) goto rtnl; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 0b50205764ff..46bf7c1216c0 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2150,12 +2150,7 @@ static struct rtnl_link_ops team_link_ops __read_mostly = { * Generic netlink custom interface ***********************************/ -static struct genl_family team_nl_family = { - .name = TEAM_GENL_NAME, - .version = TEAM_GENL_VERSION, - .maxattr = TEAM_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family team_nl_family; static const struct nla_policy team_nl_policy[TEAM_ATTR_MAX + 1] = { [TEAM_ATTR_UNSPEC] = { .type = NLA_UNSPEC, }, @@ -2745,6 +2740,18 @@ static const struct genl_multicast_group team_nl_mcgrps[] = { { .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, }, }; +static struct genl_family team_nl_family = { + .name = TEAM_GENL_NAME, + .version = TEAM_GENL_VERSION, + .maxattr = TEAM_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = team_nl_ops, + .n_ops = ARRAY_SIZE(team_nl_ops), + .mcgrps = team_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(team_nl_mcgrps), +}; + static int team_nl_send_multicast(struct sk_buff *skb, struct team *team, u32 portid) { @@ -2768,8 +2775,7 @@ static int team_nl_send_event_port_get(struct team *team, static int team_nl_init(void) { - return genl_register_family_with_ops_groups(&team_nl_family, team_nl_ops, - team_nl_mcgrps); + return genl_register_family(&team_nl_family); } static void team_nl_fini(void) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 54b6cd62676e..5d4637e586e8 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -587,14 +587,8 @@ struct hwsim_radiotap_ack_hdr { __le16 rt_chbitmask; } __packed; -/* MAC80211_HWSIM netlinf family */ -static struct genl_family hwsim_genl_family = { - .hdrsize = 0, - .name = "MAC80211_HWSIM", - .version = 1, - .maxattr = HWSIM_ATTR_MAX, - .netnsok = true, -}; +/* MAC80211_HWSIM netlink family */ +static struct genl_family hwsim_genl_family; enum hwsim_multicast_groups { HWSIM_MCGRP_CONFIG, @@ -3234,6 +3228,18 @@ static const struct genl_ops hwsim_ops[] = { }, }; +static struct genl_family hwsim_genl_family = { + .name = "MAC80211_HWSIM", + .version = 1, + .maxattr = HWSIM_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = hwsim_ops, + .n_ops = ARRAY_SIZE(hwsim_ops), + .mcgrps = hwsim_mcgrps, + .n_mcgrps = ARRAY_SIZE(hwsim_mcgrps), +}; + static void destroy_radio(struct work_struct *work) { struct mac80211_hwsim_data *data = @@ -3287,9 +3293,7 @@ static int hwsim_init_netlink(void) printk(KERN_INFO "mac80211_hwsim: initializing netlink\n"); - rc = 
genl_register_family_with_ops_groups(&hwsim_genl_family, - hwsim_ops, - hwsim_mcgrps); + rc = genl_register_family(&hwsim_genl_family); if (rc) goto failure; diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index cc50eb87b28a..c0ab7bb8c3ce 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -1369,6 +1369,7 @@ static struct genl_multicast_group pmcraid_mcgrps[] = { }; static struct genl_family pmcraid_event_family = { + .module = THIS_MODULE, .name = "pmcraid", .version = 1, .maxattr = PMCRAID_AEN_ATTR_MAX, diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 313a0ef3cda7..3483372f5562 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -148,6 +148,7 @@ static const struct genl_multicast_group tcmu_mcgrps[] = { /* Our generic netlink family */ static struct genl_family tcmu_genl_family = { + .module = THIS_MODULE, .hdrsize = 0, .name = "TCM-USER", .version = 1, diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 68d7503f6417..93b6caab2d9f 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -2164,6 +2164,7 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = { }; static struct genl_family thermal_event_genl_family = { + .module = THIS_MODULE, .name = THERMAL_GENL_FAMILY_NAME, .version = THERMAL_GENL_VERSION, .maxattr = THERMAL_GENL_ATTR_MAX, diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 00d226956264..04042d69573c 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -16,10 +16,7 @@ static uint32_t dlm_nl_seqnum; static uint32_t listener_nlportid; -static struct genl_family family = { - .name = DLM_GENL_NAME, - .version = DLM_GENL_VERSION, -}; +static struct genl_family family; static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) { @@ -75,9 +72,17 @@ static struct genl_ops dlm_nl_ops[] = { }, }; +static struct genl_family family = { + .name = DLM_GENL_NAME, + .version = DLM_GENL_VERSION, + .ops = dlm_nl_ops, + .n_ops = ARRAY_SIZE(dlm_nl_ops), + .module = THIS_MODULE, +}; + int __init dlm_netlink_init(void) { - return genl_register_family_with_ops(&family, dlm_nl_ops); + return genl_register_family(&family); } void dlm_netlink_exit(void) diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 3965a5cdfaa2..9457c7b0dfa2 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -13,6 +13,7 @@ static const struct genl_multicast_group quota_mcgrps[] = { /* Netlink family structure for quota */ static struct genl_family quota_genl_family = { + .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index c934d3a96b5e..2896f93808ae 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -67,7 +67,7 @@ * genl_magic_func.h * generates an entry in the static genl_ops array, * and static register/unregister functions to - * genl_register_family_with_ops(). + * genl_register_family(). 
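+ * The ops and multicast group arrays are now referenced
+ * from the family struct itself rather than being passed
+ * to the registration call.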
* * flags and handler: * GENL_op_init( .doit = x, .dumpit = y, .flags = something) diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 7c070c1fe457..40c2e39362c8 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -259,15 +259,7 @@ static struct genl_ops ZZZ_genl_ops[] __read_mostly = { * {{{2 */ #define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family) -static struct genl_family ZZZ_genl_family __read_mostly = { - .name = __stringify(GENL_MAGIC_FAMILY), - .version = GENL_MAGIC_VERSION, -#ifdef GENL_MAGIC_FAMILY_HDRSZ - .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), -#endif - .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, -}; - +static struct genl_family ZZZ_genl_family; /* * Magic: define multicast groups * Magic: define multicast group registration helper @@ -301,11 +293,23 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ #undef GENL_mc_group #define GENL_mc_group(group) +static struct genl_family ZZZ_genl_family __read_mostly = { + .name = __stringify(GENL_MAGIC_FAMILY), + .version = GENL_MAGIC_VERSION, +#ifdef GENL_MAGIC_FAMILY_HDRSZ + .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), +#endif + .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, + .ops = ZZZ_genl_ops, + .n_ops = ARRAY_SIZE(ZZZ_genl_ops), + .mcgrps = ZZZ_genl_mcgrps, + .n_mcgrps = ARRAY_SIZE(ZZZ_genl_mcgrps), + .module = THIS_MODULE, +}; + int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void) { - return genl_register_family_with_ops_groups(&ZZZ_genl_family, \ - ZZZ_genl_ops, \ - ZZZ_genl_mcgrps); + return genl_register_family(&ZZZ_genl_family); } void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 43a5c3975a2f..2298b50cee34 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -39,13 +39,14 @@ struct genl_info; * Note that unbind() will not be called symmetrically if the * generic netlink family is removed while there are still open * sockets. 
- * @attrbuf: buffer to store parsed attributes - * @family_list: family list - * @mcgrps: multicast groups used by this family (private) - * @n_mcgrps: number of multicast groups (private) + * @attrbuf: buffer to store parsed attributes (private) + * @family_list: family list (private) + * @mcgrps: multicast groups used by this family + * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family - * @ops: the operations supported by this family (private) - * @n_ops: number of operations supported by this family (private) + * (private) + * @ops: the operations supported by this family + * @n_ops: number of operations supported by this family */ struct genl_family { unsigned int id; /* private */ @@ -64,10 +65,10 @@ struct genl_family { int (*mcast_bind)(struct net *net, int group); void (*mcast_unbind)(struct net *net, int group); struct nlattr ** attrbuf; /* private */ - const struct genl_ops * ops; /* private */ - const struct genl_multicast_group *mcgrps; /* private */ - unsigned int n_ops; /* private */ - unsigned int n_mcgrps; /* private */ + const struct genl_ops * ops; + const struct genl_multicast_group *mcgrps; + unsigned int n_ops; + unsigned int n_mcgrps; unsigned int mcgrp_offset; /* private */ struct list_head family_list; /* private */ struct module *module; @@ -132,55 +133,7 @@ struct genl_ops { u8 flags; }; -int __genl_register_family(struct genl_family *family); - -static inline int genl_register_family(struct genl_family *family) -{ - family->module = THIS_MODULE; - return __genl_register_family(family); -} - -/** - * genl_register_family_with_ops - register a generic netlink family with ops - * @family: generic netlink family - * @ops: operations to be registered - * @n_ops: number of elements to register - * - * Registers the specified family and operations from the specified table. - * Only one family may be registered with the same family name or identifier. - * - * Either a doit or dumpit callback must be specified for every registered - * operation or the function will fail. Only one operation structure per - * command identifier may be registered. - * - * See include/net/genetlink.h for more documenation on the operations - * structure. - * - * Return 0 on success or a negative error code. 
- */ -static inline int -_genl_register_family_with_ops_grps(struct genl_family *family, - const struct genl_ops *ops, size_t n_ops, - const struct genl_multicast_group *mcgrps, - size_t n_mcgrps) -{ - family->module = THIS_MODULE; - family->ops = ops; - family->n_ops = n_ops; - family->mcgrps = mcgrps; - family->n_mcgrps = n_mcgrps; - return __genl_register_family(family); -} - -#define genl_register_family_with_ops(family, ops) \ - _genl_register_family_with_ops_grps((family), \ - (ops), ARRAY_SIZE(ops), \ - NULL, 0) -#define genl_register_family_with_ops_groups(family, ops, grps) \ - _genl_register_family_with_ops_grps((family), \ - (ops), ARRAY_SIZE(ops), \ - (grps), ARRAY_SIZE(grps)) - +int genl_register_family(struct genl_family *family); int genl_unregister_family(struct genl_family *family); void genl_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d7a1a9461a10..4075ece592f2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -41,11 +41,7 @@ static DEFINE_PER_CPU(__u32, taskstats_seqnum); static int family_registered; struct kmem_cache *taskstats_cache; -static struct genl_family family = { - .name = TASKSTATS_GENL_NAME, - .version = TASKSTATS_GENL_VERSION, - .maxattr = TASKSTATS_CMD_ATTR_MAX, -}; +static struct genl_family family; static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, @@ -650,6 +646,15 @@ static const struct genl_ops taskstats_ops[] = { }, }; +static struct genl_family family = { + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, + .module = THIS_MODULE, + .ops = taskstats_ops, + .n_ops = ARRAY_SIZE(taskstats_ops), +}; + /* Needed early in initialization */ void __init taskstats_init_early(void) { @@ -666,7 +671,7 @@ static int __init taskstats_init(void) { int rc; - rc = genl_register_family_with_ops(&family, taskstats_ops); + rc = genl_register_family(&family); if (rc) return rc; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a03b0ed7e8dd..e28cec34a016 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -48,13 +48,7 @@ #include "tp_meter.h" #include "translation-table.h" -struct genl_family batadv_netlink_family = { - .hdrsize = 0, - .name = BATADV_NL_NAME, - .version = 1, - .maxattr = BATADV_ATTR_MAX, - .netnsok = true, -}; +struct genl_family batadv_netlink_family; /* multicast groups */ enum batadv_netlink_multicast_groups { @@ -609,6 +603,19 @@ static struct genl_ops batadv_netlink_ops[] = { }; +struct genl_family batadv_netlink_family = { + .hdrsize = 0, + .name = BATADV_NL_NAME, + .version = 1, + .maxattr = BATADV_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = batadv_netlink_ops, + .n_ops = ARRAY_SIZE(batadv_netlink_ops), + .mcgrps = batadv_netlink_mcgrps, + .n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps), +}; + /** * batadv_netlink_register - register batadv genl netlink family */ @@ -616,9 +623,7 @@ void __init batadv_netlink_register(void) { int ret; - ret = genl_register_family_with_ops_groups(&batadv_netlink_family, - batadv_netlink_ops, - batadv_netlink_mcgrps); + ret = genl_register_family(&batadv_netlink_family); if (ret) pr_warn("unable to register netlink family"); } diff --git a/net/core/devlink.c b/net/core/devlink.c index 3008d9c33875..063da8091aef 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -341,14 +341,7 @@ static 
void devlink_nl_post_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); } -static struct genl_family devlink_nl_family = { - .name = DEVLINK_GENL_NAME, - .version = DEVLINK_GENL_VERSION, - .maxattr = DEVLINK_ATTR_MAX, - .netnsok = true, - .pre_doit = devlink_nl_pre_doit, - .post_doit = devlink_nl_post_doit, -}; +static struct genl_family devlink_nl_family; enum devlink_multicast_groups { DEVLINK_MCGRP_CONFIG, @@ -1619,6 +1612,20 @@ static const struct genl_ops devlink_nl_ops[] = { }, }; +static struct genl_family devlink_nl_family = { + .name = DEVLINK_GENL_NAME, + .version = DEVLINK_GENL_VERSION, + .maxattr = DEVLINK_ATTR_MAX, + .netnsok = true, + .pre_doit = devlink_nl_pre_doit, + .post_doit = devlink_nl_post_doit, + .module = THIS_MODULE, + .ops = devlink_nl_ops, + .n_ops = ARRAY_SIZE(devlink_nl_ops), + .mcgrps = devlink_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), +}; + /** * devlink_alloc - Allocate new devlink instance resources * @@ -1841,9 +1848,7 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister); static int __init devlink_module_init(void) { - return genl_register_family_with_ops_groups(&devlink_nl_family, - devlink_nl_ops, - devlink_nl_mcgrps); + return genl_register_family(&devlink_nl_family); } static void __exit devlink_module_exit(void) diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index a5320dfcd978..80c002794ff6 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -59,11 +59,7 @@ struct dm_hw_stat_delta { unsigned long last_drop_val; }; -static struct genl_family net_drop_monitor_family = { - .hdrsize = 0, - .name = "NET_DM", - .version = 2, -}; +static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); @@ -350,6 +346,17 @@ static const struct genl_ops dropmon_ops[] = { }, }; +static struct genl_family net_drop_monitor_family = { + .hdrsize = 0, + .name = "NET_DM", + .version = 2, + .module = THIS_MODULE, + .ops = dropmon_ops, + .n_ops = ARRAY_SIZE(dropmon_ops), + .mcgrps = dropmon_mcgrps, + .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), +}; + static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; @@ -366,8 +373,7 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family_with_ops_groups(&net_drop_monitor_family, - dropmon_ops, dropmon_mcgrps); + rc = genl_register_family(&net_drop_monitor_family); if (rc) { pr_err("Could not create drop monitor netlink family\n"); return rc; diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 2ad039492bee..aab34c7f6f89 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -131,12 +131,7 @@ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = { [HSR_A_IF2_SEQ] = { .type = NLA_U16 }, }; -static struct genl_family hsr_genl_family = { - .hdrsize = 0, - .name = "HSR", - .version = 1, - .maxattr = HSR_A_MAX, -}; +static struct genl_family hsr_genl_family; static const struct genl_multicast_group hsr_mcgrps[] = { { .name = "hsr-network", }, @@ -466,6 +461,18 @@ static const struct genl_ops hsr_ops[] = { }, }; +static struct genl_family hsr_genl_family = { + .hdrsize = 0, + .name = "HSR", + .version = 1, + .maxattr = HSR_A_MAX, + .module = THIS_MODULE, + .ops = hsr_ops, + .n_ops = ARRAY_SIZE(hsr_ops), + .mcgrps = hsr_mcgrps, + .n_mcgrps = ARRAY_SIZE(hsr_mcgrps), +}; + int __init hsr_netlink_init(void) { int rc; @@ -474,8 +481,7 @@ int __init hsr_netlink_init(void) if (rc) goto fail_rtnl_link_register; - rc = 
genl_register_family_with_ops_groups(&hsr_genl_family, hsr_ops, - hsr_mcgrps); + rc = genl_register_family(&hsr_genl_family); if (rc) goto fail_genl_register_family; diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 19144158b696..08e62470bac2 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -28,13 +28,6 @@ static unsigned int ieee802154_seq_num; static DEFINE_SPINLOCK(ieee802154_seq_lock); -struct genl_family nl802154_family = { - .hdrsize = 0, - .name = IEEE802154_NL_NAME, - .version = 1, - .maxattr = IEEE802154_ATTR_MAX, -}; - /* Requests to userspace */ struct sk_buff *ieee802154_nl_create(int flags, u8 req) { @@ -138,11 +131,21 @@ static const struct genl_multicast_group ieee802154_mcgrps[] = { [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, }, }; +struct genl_family nl802154_family = { + .hdrsize = 0, + .name = IEEE802154_NL_NAME, + .version = 1, + .maxattr = IEEE802154_ATTR_MAX, + .module = THIS_MODULE, + .ops = ieee8021154_ops, + .n_ops = ARRAY_SIZE(ieee8021154_ops), + .mcgrps = ieee802154_mcgrps, + .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps), +}; + int __init ieee802154_nl_init(void) { - return genl_register_family_with_ops_groups(&nl802154_family, - ieee8021154_ops, - ieee802154_mcgrps); + return genl_register_family(&nl802154_family); } void ieee802154_nl_exit(void) diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 182299858f1d..f7e75578aedd 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -26,22 +26,8 @@ #include "rdev-ops.h" #include "core.h" -static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info); - -static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info); - /* the netlink family */ -static struct genl_family nl802154_fam = { - .name = NL802154_GENL_NAME, /* have users key off the name instead */ - .hdrsize = 0, /* no private header */ - .version = 1, /* no particular meaning now */ - .maxattr = NL802154_ATTR_MAX, - .netnsok = true, - .pre_doit = nl802154_pre_doit, - .post_doit = nl802154_post_doit, -}; +static struct genl_family nl802154_fam; /* multicast groups */ enum nl802154_multicast_groups { @@ -2476,11 +2462,25 @@ static const struct genl_ops nl802154_ops[] = { #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; +static struct genl_family nl802154_fam = { + .name = NL802154_GENL_NAME, /* have users key off the name instead */ + .hdrsize = 0, /* no private header */ + .version = 1, /* no particular meaning now */ + .maxattr = NL802154_ATTR_MAX, + .netnsok = true, + .pre_doit = nl802154_pre_doit, + .post_doit = nl802154_post_doit, + .module = THIS_MODULE, + .ops = nl802154_ops, + .n_ops = ARRAY_SIZE(nl802154_ops), + .mcgrps = nl802154_mcgrps, + .n_mcgrps = ARRAY_SIZE(nl802154_mcgrps), +}; + /* initialisation/exit functions */ int nl802154_init(void) { - return genl_register_family_with_ops_groups(&nl802154_fam, nl802154_ops, - nl802154_mcgrps); + return genl_register_family(&nl802154_fam); } void nl802154_exit(void) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index e3fc527c5d37..5b5226a2434f 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -622,13 +622,7 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) return err; } -static struct genl_family fou_nl_family = { - .hdrsize = 0, - .name = FOU_GENL_NAME, - .version = FOU_GENL_VERSION, - .maxattr = FOU_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family fou_nl_family; static const struct 
nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_PORT] = { .type = NLA_U16, }, @@ -830,6 +824,17 @@ static const struct genl_ops fou_nl_ops[] = { }, }; +static struct genl_family fou_nl_family = { + .hdrsize = 0, + .name = FOU_GENL_NAME, + .version = FOU_GENL_VERSION, + .maxattr = FOU_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = fou_nl_ops, + .n_ops = ARRAY_SIZE(fou_nl_ops), +}; + size_t fou_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct udphdr); @@ -1085,8 +1090,7 @@ static int __init fou_init(void) if (ret) goto exit; - ret = genl_register_family_with_ops(&fou_nl_family, - fou_nl_ops); + ret = genl_register_family(&fou_nl_family); if (ret < 0) goto unregister; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 3da305127b32..bba3c72c4a39 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -742,13 +742,7 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, rcu_read_unlock(); } -static struct genl_family tcp_metrics_nl_family = { - .hdrsize = 0, - .name = TCP_METRICS_GENL_NAME, - .version = TCP_METRICS_GENL_VERSION, - .maxattr = TCP_METRICS_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family tcp_metrics_nl_family; static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, @@ -1115,6 +1109,17 @@ static const struct genl_ops tcp_metrics_nl_ops[] = { }, }; +static struct genl_family tcp_metrics_nl_family = { + .hdrsize = 0, + .name = TCP_METRICS_GENL_NAME, + .version = TCP_METRICS_GENL_VERSION, + .maxattr = TCP_METRICS_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = tcp_metrics_nl_ops, + .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops), +}; + static unsigned int tcpmhash_entries; static int __init set_tcpmhash_entries(char *str) { @@ -1178,8 +1183,7 @@ void __init tcp_metrics_init(void) if (ret < 0) panic("Could not allocate the tcp_metrics hash table\n"); - ret = genl_register_family_with_ops(&tcp_metrics_nl_family, - tcp_metrics_nl_ops); + ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) panic("Could not register tcp_metrics generic netlink\n"); } diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 0d57e27d1cdd..97f7b0cc4675 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -118,14 +118,7 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = ila_cmpfn, }; -static struct genl_family ila_nl_family = { - .hdrsize = 0, - .name = ILA_GENL_NAME, - .version = ILA_GENL_VERSION, - .maxattr = ILA_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, -}; +static struct genl_family ila_nl_family; static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, @@ -560,6 +553,18 @@ static const struct genl_ops ila_nl_ops[] = { }, }; +static struct genl_family ila_nl_family = { + .hdrsize = 0, + .name = ILA_GENL_NAME, + .version = ILA_GENL_VERSION, + .maxattr = ILA_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .ops = ila_nl_ops, + .n_ops = ARRAY_SIZE(ila_nl_ops), +}; + #define ILA_HASH_TABLE_SIZE 1024 static __net_init int ila_init_net(struct net *net) @@ -630,8 +635,7 @@ int ila_xlat_init(void) if (ret) goto exit; - ret = genl_register_family_with_ops(&ila_nl_family, - ila_nl_ops); + ret = genl_register_family(&ila_nl_family); if (ret < 0) goto unregister; diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c index f23b81aa91fe..07877347c2f7 100644 --- a/net/irda/irnetlink.c +++ 
b/net/irda/irnetlink.c @@ -24,12 +24,7 @@ -static struct genl_family irda_nl_family = { - .name = IRDA_NL_NAME, - .hdrsize = 0, - .version = IRDA_NL_VERSION, - .maxattr = IRDA_NL_CMD_MAX, -}; +static struct genl_family irda_nl_family; static struct net_device * ifname_to_netdev(struct net *net, struct genl_info *info) { @@ -146,9 +141,19 @@ static const struct genl_ops irda_nl_ops[] = { }; +static struct genl_family irda_nl_family = { + .name = IRDA_NL_NAME, + .hdrsize = 0, + .version = IRDA_NL_VERSION, + .maxattr = IRDA_NL_CMD_MAX, + .module = THIS_MODULE, + .ops = irda_nl_ops, + .n_ops = ARRAY_SIZE(irda_nl_ops), +}; + int irda_nl_register(void) { - return genl_register_family_with_ops(&irda_nl_family, irda_nl_ops); + return genl_register_family(&irda_nl_family); } void irda_nl_unregister(void) diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 4fbf1f41ac52..e4e8c0769a6b 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -31,13 +31,7 @@ #include "l2tp_core.h" -static struct genl_family l2tp_nl_family = { - .name = L2TP_GENL_NAME, - .version = L2TP_GENL_VERSION, - .hdrsize = 0, - .maxattr = L2TP_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family l2tp_nl_family; static const struct genl_multicast_group l2tp_multicast_group[] = { { @@ -976,6 +970,19 @@ static const struct genl_ops l2tp_nl_ops[] = { }, }; +static struct genl_family l2tp_nl_family = { + .name = L2TP_GENL_NAME, + .version = L2TP_GENL_VERSION, + .hdrsize = 0, + .maxattr = L2TP_ATTR_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = l2tp_nl_ops, + .n_ops = ARRAY_SIZE(l2tp_nl_ops), + .mcgrps = l2tp_multicast_group, + .n_mcgrps = ARRAY_SIZE(l2tp_multicast_group), +}; + int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops) { int ret; @@ -1012,9 +1019,7 @@ EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops); static int l2tp_nl_init(void) { pr_info("L2TP netlink interface\n"); - return genl_register_family_with_ops_groups(&l2tp_nl_family, - l2tp_nl_ops, - l2tp_multicast_group); + return genl_register_family(&l2tp_nl_family); } static void l2tp_nl_cleanup(void) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ceed66cdd03e..ea3e8aed063f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2840,13 +2840,7 @@ static struct nf_sockopt_ops ip_vs_sockopts = { */ /* IPVS genetlink family */ -static struct genl_family ip_vs_genl_family = { - .hdrsize = 0, - .name = IPVS_GENL_NAME, - .version = IPVS_GENL_VERSION, - .maxattr = IPVS_CMD_MAX, - .netnsok = true, /* Make ipvsadm to work on netns */ -}; +static struct genl_family ip_vs_genl_family; /* Policy used for first-level command attributes */ static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { @@ -3871,10 +3865,20 @@ static const struct genl_ops ip_vs_genl_ops[] = { }, }; +static struct genl_family ip_vs_genl_family = { + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, + .netnsok = true, /* Make ipvsadm to work on netns */ + .module = THIS_MODULE, + .ops = ip_vs_genl_ops, + .n_ops = ARRAY_SIZE(ip_vs_genl_ops), +}; + static int __init ip_vs_genl_register(void) { - return genl_register_family_with_ops(&ip_vs_genl_family, - ip_vs_genl_ops); + return genl_register_family(&ip_vs_genl_family); } static void ip_vs_genl_unregister(void) diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index 152e503b8c5d..ca7c9c411a5c 100644 --- 
a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -60,12 +60,7 @@ struct netlbl_domhsh_walk_arg { }; /* NetLabel Generic NETLINK CALIPSO family */ -static struct genl_family netlbl_calipso_gnl_family = { - .hdrsize = 0, - .name = NETLBL_NLTYPE_CALIPSO_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_CALIPSO_A_MAX, -}; +static struct genl_family netlbl_calipso_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy calipso_genl_policy[NLBL_CALIPSO_A_MAX + 1] = { @@ -354,6 +349,16 @@ static const struct genl_ops netlbl_calipso_ops[] = { }, }; +static struct genl_family netlbl_calipso_gnl_family = { + .hdrsize = 0, + .name = NETLBL_NLTYPE_CALIPSO_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_CALIPSO_A_MAX, + .module = THIS_MODULE, + .ops = netlbl_calipso_ops, + .n_ops = ARRAY_SIZE(netlbl_calipso_ops), +}; + /* NetLabel Generic NETLINK Protocol Functions */ @@ -367,8 +372,7 @@ static const struct genl_ops netlbl_calipso_ops[] = { */ int __init netlbl_calipso_genl_init(void) { - return genl_register_family_with_ops(&netlbl_calipso_gnl_family, - netlbl_calipso_ops); + return genl_register_family(&netlbl_calipso_gnl_family); } static const struct netlbl_calipso_ops *calipso_ops; diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 755b284e7ad4..a665eae91245 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -59,13 +59,7 @@ struct netlbl_domhsh_walk_arg { }; /* NetLabel Generic NETLINK CIPSOv4 family */ -static struct genl_family netlbl_cipsov4_gnl_family = { - .hdrsize = 0, - .name = NETLBL_NLTYPE_CIPSOV4_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_CIPSOV4_A_MAX, -}; - +static struct genl_family netlbl_cipsov4_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_cipsov4_genl_policy[NLBL_CIPSOV4_A_MAX + 1] = { [NLBL_CIPSOV4_A_DOI] = { .type = NLA_U32 }, @@ -766,6 +760,16 @@ static const struct genl_ops netlbl_cipsov4_ops[] = { }, }; +static struct genl_family netlbl_cipsov4_gnl_family = { + .hdrsize = 0, + .name = NETLBL_NLTYPE_CIPSOV4_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_CIPSOV4_A_MAX, + .module = THIS_MODULE, + .ops = netlbl_cipsov4_ops, + .n_ops = ARRAY_SIZE(netlbl_cipsov4_ops), +}; + /* * NetLabel Generic NETLINK Protocol Functions */ @@ -780,6 +784,5 @@ static const struct genl_ops netlbl_cipsov4_ops[] = { */ int __init netlbl_cipsov4_genl_init(void) { - return genl_register_family_with_ops(&netlbl_cipsov4_gnl_family, - netlbl_cipsov4_ops); + return genl_register_family(&netlbl_cipsov4_gnl_family); } diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 3b00f2368fcd..ecfe8eb149db 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -60,12 +60,7 @@ struct netlbl_domhsh_walk_arg { }; /* NetLabel Generic NETLINK CIPSOv4 family */ -static struct genl_family netlbl_mgmt_gnl_family = { - .hdrsize = 0, - .name = NETLBL_NLTYPE_MGMT_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_MGMT_A_MAX, -}; +static struct genl_family netlbl_mgmt_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { @@ -833,6 +828,16 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = { }, }; +static struct genl_family netlbl_mgmt_gnl_family = { + .hdrsize = 0, + .name = NETLBL_NLTYPE_MGMT_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_MGMT_A_MAX, + .module = 
THIS_MODULE, + .ops = netlbl_mgmt_genl_ops, + .n_ops = ARRAY_SIZE(netlbl_mgmt_genl_ops), +}; + /* * NetLabel Generic NETLINK Protocol Functions */ @@ -847,6 +852,5 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = { */ int __init netlbl_mgmt_genl_init(void) { - return genl_register_family_with_ops(&netlbl_mgmt_gnl_family, - netlbl_mgmt_genl_ops); + return genl_register_family(&netlbl_mgmt_gnl_family); } diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index c2ea8d1f653a..5dbbad41114f 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -123,12 +123,7 @@ static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def; static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ -static struct genl_family netlbl_unlabel_gnl_family = { - .hdrsize = 0, - .name = NETLBL_NLTYPE_UNLABELED_NAME, - .version = NETLBL_PROTO_VERSION, - .maxattr = NLBL_UNLABEL_A_MAX, -}; +static struct genl_family netlbl_unlabel_gnl_family; /* NetLabel Netlink attribute policy */ static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = { @@ -1377,6 +1372,16 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = { }, }; +static struct genl_family netlbl_unlabel_gnl_family = { + .hdrsize = 0, + .name = NETLBL_NLTYPE_UNLABELED_NAME, + .version = NETLBL_PROTO_VERSION, + .maxattr = NLBL_UNLABEL_A_MAX, + .module = THIS_MODULE, + .ops = netlbl_unlabel_genl_ops, + .n_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops), +}; + /* * NetLabel Generic NETLINK Protocol Functions */ @@ -1391,8 +1396,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = { */ int __init netlbl_unlabel_genl_init(void) { - return genl_register_family_with_ops(&netlbl_unlabel_gnl_family, - netlbl_unlabel_genl_ops); + return genl_register_family(&netlbl_unlabel_gnl_family); } /* diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index f19ec969edee..ca582ee4ae05 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -344,18 +344,18 @@ static int genl_validate_ops(const struct genl_family *family) } /** - * __genl_register_family - register a generic netlink family + * genl_register_family - register a generic netlink family * @family: generic netlink family * * Registers the specified family after validating it first. Only one * family may be registered with the same family name or identifier. * - * The family's ops array must already be assigned, you can use the - * genl_register_family_with_ops() helper function. + * The family's ops, multicast groups and module pointer must already + * be assigned. * * Return 0 on success or a negative error code. 
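+ *
+ * A minimal sketch of the resulting usage pattern (the foo_* names
+ * below are purely illustrative, not an existing family):
+ *
+ *	static const struct genl_ops foo_ops[] = {
+ *		{ .cmd = FOO_CMD_GET, .doit = foo_get_doit },
+ *	};
+ *
+ *	static struct genl_family foo_family = {
+ *		.name		= "foo",
+ *		.version	= 1,
+ *		.maxattr	= FOO_ATTR_MAX,
+ *		.module		= THIS_MODULE,
+ *		.ops		= foo_ops,
+ *		.n_ops		= ARRAY_SIZE(foo_ops),
+ *	};
+ *
+ *	static int __init foo_init(void)
+ *	{
+ *		return genl_register_family(&foo_family);
+ *	}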
*/ -int __genl_register_family(struct genl_family *family) +int genl_register_family(struct genl_family *family) { int err, i; @@ -429,7 +429,7 @@ errout_locked: genl_unlock_all(); return err; } -EXPORT_SYMBOL(__genl_register_family); +EXPORT_SYMBOL(genl_register_family); /** * genl_unregister_family - unregister generic netlink family @@ -452,7 +452,6 @@ int genl_unregister_family(struct genl_family *family) genl_unregister_mc_groups(family); list_del(&rc->family_list); - family->n_ops = 0; up_write(&cb_lock); wait_event(genl_sk_destructing_waitq, atomic_read(&genl_sk_destructing_cnt) == 0); @@ -681,13 +680,7 @@ static void genl_rcv(struct sk_buff *skb) * Controller **************************************************************************/ -static struct genl_family genl_ctrl = { - .id = GENL_ID_CTRL, - .name = "nlctrl", - .version = 0x2, - .maxattr = CTRL_ATTR_MAX, - .netnsok = true, -}; +static struct genl_family genl_ctrl; static int ctrl_fill_info(struct genl_family *family, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) @@ -997,6 +990,19 @@ static const struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; +static struct genl_family genl_ctrl = { + .module = THIS_MODULE, + .ops = genl_ctrl_ops, + .n_ops = ARRAY_SIZE(genl_ctrl_ops), + .mcgrps = genl_ctrl_groups, + .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups), + .id = GENL_ID_CTRL, + .name = "nlctrl", + .version = 0x2, + .maxattr = CTRL_ATTR_MAX, + .netnsok = true, +}; + static int genl_bind(struct net *net, int group) { int i, err = -ENOENT; @@ -1086,8 +1092,7 @@ static int __init genl_init(void) for (i = 0; i < GENL_FAM_TAB_SIZE; i++) INIT_LIST_HEAD(&family_ht[i]); - err = genl_register_family_with_ops_groups(&genl_ctrl, genl_ctrl_ops, - genl_ctrl_groups); + err = genl_register_family(&genl_ctrl); if (err < 0) goto problem; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index c230403e066c..450b1e5144cc 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -38,13 +38,7 @@ static const struct genl_multicast_group nfc_genl_mcgrps[] = { { .name = NFC_GENL_MCAST_EVENT_NAME, }, }; -static struct genl_family nfc_genl_family = { - .hdrsize = 0, - .name = NFC_GENL_NAME, - .version = NFC_GENL_VERSION, - .maxattr = NFC_ATTR_MAX, -}; - +static struct genl_family nfc_genl_family; static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, @@ -1752,6 +1746,18 @@ static const struct genl_ops nfc_genl_ops[] = { }, }; +static struct genl_family nfc_genl_family = { + .hdrsize = 0, + .name = NFC_GENL_NAME, + .version = NFC_GENL_VERSION, + .maxattr = NFC_ATTR_MAX, + .module = THIS_MODULE, + .ops = nfc_genl_ops, + .n_ops = ARRAY_SIZE(nfc_genl_ops), + .mcgrps = nfc_genl_mcgrps, + .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps), +}; + struct urelease_work { struct work_struct w; @@ -1837,9 +1843,7 @@ int __init nfc_genl_init(void) { int rc; - rc = genl_register_family_with_ops_groups(&nfc_genl_family, - nfc_genl_ops, - nfc_genl_mcgrps); + rc = genl_register_family(&nfc_genl_family); if (rc) return rc; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f9fef7dfba15..ad6a111a0014 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -679,6 +679,7 @@ static struct genl_family dp_packet_genl_family = { .parallel_ops = true, .ops = dp_packet_genl_ops, .n_ops = ARRAY_SIZE(dp_packet_genl_ops), + .module = THIS_MODULE, }; static void get_dp_stats(const struct datapath *dp, struct 
ovs_dp_stats *stats, @@ -1445,6 +1446,7 @@ static struct genl_family dp_flow_genl_family = { .n_ops = ARRAY_SIZE(dp_flow_genl_ops), .mcgrps = &ovs_dp_flow_multicast_group, .n_mcgrps = 1, + .module = THIS_MODULE, }; static size_t ovs_dp_cmd_msg_size(void) @@ -1830,6 +1832,7 @@ static struct genl_family dp_datapath_genl_family = { .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), .mcgrps = &ovs_dp_datapath_multicast_group, .n_mcgrps = 1, + .module = THIS_MODULE, }; /* Called with ovs_mutex or RCU read lock. */ @@ -2251,6 +2254,7 @@ struct genl_family dp_vport_genl_family = { .n_ops = ARRAY_SIZE(dp_vport_genl_ops), .mcgrps = &ovs_dp_vport_multicast_group, .n_mcgrps = 1, + .module = THIS_MODULE, }; static struct genl_family * const dp_genl_families[] = { diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 383b8fedabc7..74a405bf107b 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -135,14 +135,6 @@ const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { /* Users of the legacy API (tipc-config) can't handle that we add operations, * so we have a separate genl handling for the new API. */ -struct genl_family tipc_genl_family = { - .name = TIPC_GENL_V2_NAME, - .version = TIPC_GENL_V2_VERSION, - .hdrsize = 0, - .maxattr = TIPC_NLA_MAX, - .netnsok = true, -}; - static const struct genl_ops tipc_genl_v2_ops[] = { { .cmd = TIPC_NL_BEARER_DISABLE, @@ -257,6 +249,17 @@ static const struct genl_ops tipc_genl_v2_ops[] = { #endif }; +struct genl_family tipc_genl_family = { + .name = TIPC_GENL_V2_NAME, + .version = TIPC_GENL_V2_VERSION, + .hdrsize = 0, + .maxattr = TIPC_NLA_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = tipc_genl_v2_ops, + .n_ops = ARRAY_SIZE(tipc_genl_v2_ops), +}; + int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) { u32 maxattr = tipc_genl_family.maxattr; @@ -272,8 +275,7 @@ int tipc_netlink_start(void) { int res; - res = genl_register_family_with_ops(&tipc_genl_family, - tipc_genl_v2_ops); + res = genl_register_family(&tipc_genl_family); if (res) { pr_err("Failed to register netlink interface\n"); return res; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index f04428e4c8e5..07b19931e458 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1215,27 +1215,29 @@ send: return err; } +static struct genl_ops tipc_genl_compat_ops[] = { + { + .cmd = TIPC_GENL_CMD, + .doit = tipc_nl_compat_recv, + }, +}; + static struct genl_family tipc_genl_compat_family = { .name = TIPC_GENL_NAME, .version = TIPC_GENL_VERSION, .hdrsize = TIPC_GENL_HDRLEN, .maxattr = 0, .netnsok = true, -}; - -static struct genl_ops tipc_genl_compat_ops[] = { - { - .cmd = TIPC_GENL_CMD, - .doit = tipc_nl_compat_recv, - }, + .module = THIS_MODULE, + .ops = tipc_genl_compat_ops, + .n_ops = ARRAY_SIZE(tipc_genl_compat_ops), }; int tipc_netlink_compat_start(void) { int res; - res = genl_register_family_with_ops(&tipc_genl_compat_family, - tipc_genl_compat_ops); + res = genl_register_family(&tipc_genl_compat_family); if (res) { pr_err("Failed to register legacy compat interface\n"); return res; diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 8ac83a41585f..587e1627681f 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -572,15 +572,20 @@ struct d_level D_LEVEL[] = { size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); +static const struct genl_multicast_group wimax_gnl_mcgrps[] = { + { .name = "msg", }, +}; + struct genl_family wimax_gnl_family = { .name = "WiMAX", .version = WIMAX_GNL_VERSION, .hdrsize = 0, .maxattr = 
WIMAX_GNL_ATTR_MAX, -}; - -static const struct genl_multicast_group wimax_gnl_mcgrps[] = { - { .name = "msg", }, + .module = THIS_MODULE, + .ops = wimax_gnl_ops, + .n_ops = ARRAY_SIZE(wimax_gnl_ops), + .mcgrps = wimax_gnl_mcgrps, + .n_mcgrps = ARRAY_SIZE(wimax_gnl_mcgrps), }; @@ -595,11 +600,7 @@ int __init wimax_subsys_init(void) d_parse_params(D_LEVEL, D_LEVEL_SIZE, wimax_debug_params, "wimax.debug"); - snprintf(wimax_gnl_family.name, sizeof(wimax_gnl_family.name), - "WiMAX"); - result = genl_register_family_with_ops_groups(&wimax_gnl_family, - wimax_gnl_ops, - wimax_gnl_mcgrps); + result = genl_register_family(&wimax_gnl_family); if (unlikely(result < 0)) { pr_err("cannot register generic netlink family: %d\n", result); goto error_register_family; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 714beafe05e0..8e5ca3c47593 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -32,21 +32,8 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, struct cfg80211_crypto_settings *settings, int cipher_limit); -static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info); -static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, - struct genl_info *info); - /* the netlink family */ -static struct genl_family nl80211_fam = { - .name = NL80211_GENL_NAME, /* have users key off the name instead */ - .hdrsize = 0, /* no private header */ - .version = 1, /* no particular meaning now */ - .maxattr = NL80211_ATTR_MAX, - .netnsok = true, - .pre_doit = nl80211_pre_doit, - .post_doit = nl80211_post_doit, -}; +static struct genl_family nl80211_fam; /* multicast groups */ enum nl80211_multicast_groups { @@ -12599,6 +12586,21 @@ static const struct genl_ops nl80211_ops[] = { }, }; +static struct genl_family nl80211_fam = { + .name = NL80211_GENL_NAME, /* have users key off the name instead */ + .hdrsize = 0, /* no private header */ + .version = 1, /* no particular meaning now */ + .maxattr = NL80211_ATTR_MAX, + .netnsok = true, + .pre_doit = nl80211_pre_doit, + .post_doit = nl80211_post_doit, + .module = THIS_MODULE, + .ops = nl80211_ops, + .n_ops = ARRAY_SIZE(nl80211_ops), + .mcgrps = nl80211_mcgrps, + .n_mcgrps = ARRAY_SIZE(nl80211_mcgrps), +}; + /* notification functions */ void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, @@ -14565,8 +14567,7 @@ int nl80211_init(void) { int err; - err = genl_register_family_with_ops_groups(&nl80211_fam, nl80211_ops, - nl80211_mcgrps); + err = genl_register_family(&nl80211_fam); if (err) return err; -- cgit v1.2.3 From 56989f6d8568c21257dcec0f5e644d5570ba3281 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:05 +0200 Subject: genetlink: mark families as __ro_after_init Now genl_register_family() is the only thing (other than the users themselves, perhaps, but I didn't find any doing that) writing to the family struct. In all families that I found, genl_register_family() is only called from __init functions (some indirectly, in which case I've added __init annotations to clarify things), so all can actually be marked __ro_after_init. This protects the data structure from accidental corruption.
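For reference, the registration pattern this series converges on can be sketched roughly as follows. This is a minimal illustration with a hypothetical "demo" family; DEMO_CMD_GET, DEMO_ATTR_MAX and demo_cmd_get_doit() are placeholders, not symbols from any of the converted files:

/* ops array first, so the family can reference it via .ops/.n_ops */
static const struct genl_ops demo_genl_ops[] = {
	{
		.cmd  = DEMO_CMD_GET,		/* hypothetical command */
		.doit = demo_cmd_get_doit,	/* hypothetical handler */
	},
};

static struct genl_family demo_genl_family __ro_after_init = {
	.name		= "demo",
	.version	= 1,
	.maxattr	= DEMO_ATTR_MAX,	/* hypothetical */
	.module		= THIS_MODULE,
	.ops		= demo_genl_ops,
	.n_ops		= ARRAY_SIZE(demo_genl_ops),
};

static int __init demo_genl_init(void)
{
	return genl_register_family(&demo_genl_family);
}

Since .ops/.n_ops must reference a complete array, the patches above move each family definition below its ops array; registering only from __init context is what makes the __ro_after_init annotation safe.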
Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- drivers/acpi/event.c | 4 ++-- drivers/net/gtp.c | 2 +- drivers/net/macsec.c | 2 +- drivers/net/team/team.c | 4 ++-- drivers/net/wireless/mac80211_hwsim.c | 4 ++-- drivers/scsi/pmcraid.c | 4 ++-- drivers/target/target_core_user.c | 2 +- drivers/thermal/thermal_core.c | 4 ++-- fs/dlm/netlink.c | 2 +- fs/quota/netlink.c | 2 +- include/linux/genl_magic_func.h | 2 +- kernel/taskstats.c | 2 +- net/batman-adv/netlink.c | 2 +- net/core/devlink.c | 2 +- net/core/drop_monitor.c | 2 +- net/hsr/hsr_netlink.c | 2 +- net/ieee802154/netlink.c | 2 +- net/ieee802154/nl802154.c | 4 ++-- net/ipv4/fou.c | 2 +- net/ipv4/tcp_metrics.c | 2 +- net/ipv6/ila/ila_xlat.c | 4 ++-- net/irda/irnetlink.c | 4 ++-- net/l2tp/l2tp_netlink.c | 4 ++-- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- net/netlabel/netlabel_calipso.c | 2 +- net/netlabel/netlabel_cipso_v4.c | 2 +- net/netlabel/netlabel_mgmt.c | 2 +- net/netlabel/netlabel_unlabeled.c | 2 +- net/netlink/genetlink.c | 2 +- net/nfc/netlink.c | 2 +- net/openvswitch/datapath.c | 10 +++++----- net/tipc/netlink.c | 4 ++-- net/tipc/netlink_compat.c | 4 ++-- net/wimax/stack.c | 2 +- net/wireless/nl80211.c | 4 ++-- 35 files changed, 51 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/event.c b/drivers/acpi/event.c index 1ab12ad7d5ba..7fceb3b4691b 100644 --- a/drivers/acpi/event.c +++ b/drivers/acpi/event.c @@ -82,7 +82,7 @@ static const struct genl_multicast_group acpi_event_mcgrps[] = { { .name = ACPI_GENL_MCAST_GROUP_NAME, }, }; -static struct genl_family acpi_event_genl_family = { +static struct genl_family acpi_event_genl_family __ro_after_init = { .module = THIS_MODULE, .name = ACPI_GENL_FAMILY_NAME, .version = ACPI_GENL_VERSION, @@ -144,7 +144,7 @@ int acpi_bus_generate_netlink_event(const char *device_class, EXPORT_SYMBOL(acpi_bus_generate_netlink_event); -static int acpi_event_genetlink_init(void) +static int __init acpi_event_genetlink_init(void) { return genl_register_family(&acpi_event_genl_family); } diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 0604fd78f826..719d19f35673 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1290,7 +1290,7 @@ static const struct genl_ops gtp_genl_ops[] = { }, }; -static struct genl_family gtp_genl_family = { +static struct genl_family gtp_genl_family __ro_after_init = { .name = "gtp", .version = 0, .hdrsize = 0, diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 63ca7a3c77cf..0a715ab9d9cc 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -2648,7 +2648,7 @@ static const struct genl_ops macsec_genl_ops[] = { }, }; -static struct genl_family macsec_fam = { +static struct genl_family macsec_fam __ro_after_init = { .name = MACSEC_GENL_NAME, .hdrsize = 0, .version = MACSEC_GENL_VERSION, diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 46bf7c1216c0..bdc58567d10e 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2740,7 +2740,7 @@ static const struct genl_multicast_group team_nl_mcgrps[] = { { .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, }, }; -static struct genl_family team_nl_family = { +static struct genl_family team_nl_family __ro_after_init = { .name = TEAM_GENL_NAME, .version = TEAM_GENL_VERSION, .maxattr = TEAM_ATTR_MAX, @@ -2773,7 +2773,7 @@ static int team_nl_send_event_port_get(struct team *team, port); } -static int team_nl_init(void) +static int __init team_nl_init(void) { return genl_register_family(&team_nl_family); } diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index
5d4637e586e8..220e9dc8ccf8 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3228,7 +3228,7 @@ static const struct genl_ops hwsim_ops[] = { }, }; -static struct genl_family hwsim_genl_family = { +static struct genl_family hwsim_genl_family __ro_after_init = { .name = "MAC80211_HWSIM", .version = 1, .maxattr = HWSIM_ATTR_MAX, @@ -3287,7 +3287,7 @@ static struct notifier_block hwsim_netlink_notifier = { .notifier_call = mac80211_hwsim_netlink_notify, }; -static int hwsim_init_netlink(void) +static int __init hwsim_init_netlink(void) { int rc; diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c index c0ab7bb8c3ce..845affa112f7 100644 --- a/drivers/scsi/pmcraid.c +++ b/drivers/scsi/pmcraid.c @@ -1368,7 +1368,7 @@ static struct genl_multicast_group pmcraid_mcgrps[] = { { .name = "events", /* not really used - see ID discussion below */ }, }; -static struct genl_family pmcraid_event_family = { +static struct genl_family pmcraid_event_family __ro_after_init = { .module = THIS_MODULE, .name = "pmcraid", .version = 1, @@ -1384,7 +1384,7 @@ static struct genl_family pmcraid_event_family = { * 0 if the pmcraid_event_family is successfully registered * with netlink generic, non-zero otherwise */ -static int pmcraid_netlink_init(void) +static int __init pmcraid_netlink_init(void) { int result; diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 3483372f5562..0f173bf7dbac 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -147,7 +147,7 @@ static const struct genl_multicast_group tcmu_mcgrps[] = { }; /* Our generic netlink family */ -static struct genl_family tcmu_genl_family = { +static struct genl_family tcmu_genl_family __ro_after_init = { .module = THIS_MODULE, .hdrsize = 0, .name = "TCM-USER", diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 93b6caab2d9f..911fd964c742 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -2163,7 +2163,7 @@ static const struct genl_multicast_group thermal_event_mcgrps[] = { { .name = THERMAL_GENL_MCAST_GROUP_NAME, }, }; -static struct genl_family thermal_event_genl_family = { +static struct genl_family thermal_event_genl_family __ro_after_init = { .module = THIS_MODULE, .name = THERMAL_GENL_FAMILY_NAME, .version = THERMAL_GENL_VERSION, @@ -2235,7 +2235,7 @@ int thermal_generate_netlink_event(struct thermal_zone_device *tz, } EXPORT_SYMBOL_GPL(thermal_generate_netlink_event); -static int genetlink_init(void) +static int __init genetlink_init(void) { return genl_register_family(&thermal_event_genl_family); } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 04042d69573c..0643ae44f342 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -72,7 +72,7 @@ static struct genl_ops dlm_nl_ops[] = { }, }; -static struct genl_family family = { +static struct genl_family family __ro_after_init = { .name = DLM_GENL_NAME, .version = DLM_GENL_VERSION, .ops = dlm_nl_ops, diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 9457c7b0dfa2..e99b1a72d9a7 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -12,7 +12,7 @@ static const struct genl_multicast_group quota_mcgrps[] = { }; /* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { +static struct genl_family quota_genl_family __ro_after_init = { .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 
40c2e39362c8..377257d8f7e3 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -293,7 +293,7 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \ #undef GENL_mc_group #define GENL_mc_group(group) -static struct genl_family ZZZ_genl_family __read_mostly = { +static struct genl_family ZZZ_genl_family __ro_after_init = { .name = __stringify(GENL_MAGIC_FAMILY), .version = GENL_MAGIC_VERSION, #ifdef GENL_MAGIC_FAMILY_HDRSZ diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4075ece592f2..9b7f838511ce 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -646,7 +646,7 @@ static const struct genl_ops taskstats_ops[] = { }, }; -static struct genl_family family = { +static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, .maxattr = TASKSTATS_CMD_ATTR_MAX, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index e28cec34a016..005012ba9b48 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -603,7 +603,7 @@ static struct genl_ops batadv_netlink_ops[] = { }; -struct genl_family batadv_netlink_family = { +struct genl_family batadv_netlink_family __ro_after_init = { .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, diff --git a/net/core/devlink.c b/net/core/devlink.c index 063da8091aef..c14f8b661db9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1612,7 +1612,7 @@ static const struct genl_ops devlink_nl_ops[] = { }, }; -static struct genl_family devlink_nl_family = { +static struct genl_family devlink_nl_family __ro_after_init = { .name = DEVLINK_GENL_NAME, .version = DEVLINK_GENL_VERSION, .maxattr = DEVLINK_ATTR_MAX, diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 80c002794ff6..8e0c0635ee97 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -346,7 +346,7 @@ static const struct genl_ops dropmon_ops[] = { }, }; -static struct genl_family net_drop_monitor_family = { +static struct genl_family net_drop_monitor_family __ro_after_init = { .hdrsize = 0, .name = "NET_DM", .version = 2, diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index aab34c7f6f89..1ab30e7d3f99 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -461,7 +461,7 @@ static const struct genl_ops hsr_ops[] = { }, }; -static struct genl_family hsr_genl_family = { +static struct genl_family hsr_genl_family __ro_after_init = { .hdrsize = 0, .name = "HSR", .version = 1, diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c index 08e62470bac2..6bde9e5a5503 100644 --- a/net/ieee802154/netlink.c +++ b/net/ieee802154/netlink.c @@ -131,7 +131,7 @@ static const struct genl_multicast_group ieee802154_mcgrps[] = { [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, }, }; -struct genl_family nl802154_family = { +struct genl_family nl802154_family __ro_after_init = { .hdrsize = 0, .name = IEEE802154_NL_NAME, .version = 1, diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index f7e75578aedd..fc60cd061f39 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -2462,7 +2462,7 @@ static const struct genl_ops nl802154_ops[] = { #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; -static struct genl_family nl802154_fam = { +static struct genl_family nl802154_fam __ro_after_init = { .name = NL802154_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ @@ -2478,7 +2478,7 @@ static 
struct genl_family nl802154_fam = { }; /* initialisation/exit functions */ -int nl802154_init(void) +int __init nl802154_init(void) { return genl_register_family(&nl802154_fam); } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 5b5226a2434f..6cb57bb8692d 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -824,7 +824,7 @@ static const struct genl_ops fou_nl_ops[] = { }, }; -static struct genl_family fou_nl_family = { +static struct genl_family fou_nl_family __ro_after_init = { .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index bba3c72c4a39..d46f4d5b1c62 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1109,7 +1109,7 @@ static const struct genl_ops tcp_metrics_nl_ops[] = { }, }; -static struct genl_family tcp_metrics_nl_family = { +static struct genl_family tcp_metrics_nl_family __ro_after_init = { .hdrsize = 0, .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 97f7b0cc4675..628ae6d85b59 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -553,7 +553,7 @@ static const struct genl_ops ila_nl_ops[] = { }, }; -static struct genl_family ila_nl_family = { +static struct genl_family ila_nl_family __ro_after_init = { .hdrsize = 0, .name = ILA_GENL_NAME, .version = ILA_GENL_VERSION, @@ -627,7 +627,7 @@ static int ila_xlat_addr(struct sk_buff *skb, bool set_csum_neutral) return 0; } -int ila_xlat_init(void) +int __init ila_xlat_init(void) { int ret; diff --git a/net/irda/irnetlink.c b/net/irda/irnetlink.c index 07877347c2f7..7fc340e574cf 100644 --- a/net/irda/irnetlink.c +++ b/net/irda/irnetlink.c @@ -141,7 +141,7 @@ static const struct genl_ops irda_nl_ops[] = { }; -static struct genl_family irda_nl_family = { +static struct genl_family irda_nl_family __ro_after_init = { .name = IRDA_NL_NAME, .hdrsize = 0, .version = IRDA_NL_VERSION, @@ -151,7 +151,7 @@ static struct genl_family irda_nl_family = { .n_ops = ARRAY_SIZE(irda_nl_ops), }; -int irda_nl_register(void) +int __init irda_nl_register(void) { return genl_register_family(&irda_nl_family); } diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index e4e8c0769a6b..59aa2d204e4a 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -970,7 +970,7 @@ static const struct genl_ops l2tp_nl_ops[] = { }, }; -static struct genl_family l2tp_nl_family = { +static struct genl_family l2tp_nl_family __ro_after_init = { .name = L2TP_GENL_NAME, .version = L2TP_GENL_VERSION, .hdrsize = 0, @@ -1016,7 +1016,7 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type) } EXPORT_SYMBOL_GPL(l2tp_nl_unregister_ops); -static int l2tp_nl_init(void) +static int __init l2tp_nl_init(void) { pr_info("L2TP netlink interface\n"); return genl_register_family(&l2tp_nl_family); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ea3e8aed063f..6b85ded4f91d 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3865,7 +3865,7 @@ static const struct genl_ops ip_vs_genl_ops[] = { }, }; -static struct genl_family ip_vs_genl_family = { +static struct genl_family ip_vs_genl_family __ro_after_init = { .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c index ca7c9c411a5c..d177dd066504 100644 --- a/net/netlabel/netlabel_calipso.c +++ b/net/netlabel/netlabel_calipso.c @@ -349,7 +349,7 @@ static const struct 
genl_ops netlbl_calipso_ops[] = { }, }; -static struct genl_family netlbl_calipso_gnl_family = { +static struct genl_family netlbl_calipso_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_CALIPSO_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index a665eae91245..4149d3e63589 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -760,7 +760,7 @@ static const struct genl_ops netlbl_cipsov4_ops[] = { }, }; -static struct genl_family netlbl_cipsov4_gnl_family = { +static struct genl_family netlbl_cipsov4_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_CIPSOV4_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index ecfe8eb149db..21e0095b1d14 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -828,7 +828,7 @@ static const struct genl_ops netlbl_mgmt_genl_ops[] = { }, }; -static struct genl_family netlbl_mgmt_gnl_family = { +static struct genl_family netlbl_mgmt_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_MGMT_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 5dbbad41114f..22dc1b9d6362 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1372,7 +1372,7 @@ static const struct genl_ops netlbl_unlabel_genl_ops[] = { }, }; -static struct genl_family netlbl_unlabel_gnl_family = { +static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = { .hdrsize = 0, .name = NETLBL_NLTYPE_UNLABELED_NAME, .version = NETLBL_PROTO_VERSION, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 85659921e7b2..df0cbcddda2c 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -936,7 +936,7 @@ static const struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; -static struct genl_family genl_ctrl = { +static struct genl_family genl_ctrl __ro_after_init = { .module = THIS_MODULE, .ops = genl_ctrl_ops, .n_ops = ARRAY_SIZE(genl_ctrl_ops), diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 450b1e5144cc..03f3d5c7beb8 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1746,7 +1746,7 @@ static const struct genl_ops nfc_genl_ops[] = { }, }; -static struct genl_family nfc_genl_family = { +static struct genl_family nfc_genl_family __ro_after_init = { .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ad6a111a0014..fa8760176b7d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -670,7 +670,7 @@ static const struct genl_ops dp_packet_genl_ops[] = { } }; -static struct genl_family dp_packet_genl_family = { +static struct genl_family dp_packet_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, @@ -1435,7 +1435,7 @@ static const struct genl_ops dp_flow_genl_ops[] = { }, }; -static struct genl_family dp_flow_genl_family = { +static struct genl_family dp_flow_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, @@ -1821,7 +1821,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = { }, }; -static struct genl_family dp_datapath_genl_family = { +static struct genl_family dp_datapath_genl_family __ro_after_init = { .hdrsize = 
sizeof(struct ovs_header), .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, @@ -2243,7 +2243,7 @@ static const struct genl_ops dp_vport_genl_ops[] = { }, }; -struct genl_family dp_vport_genl_family = { +struct genl_family dp_vport_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, @@ -2272,7 +2272,7 @@ static void dp_unregister_genl(int n_families) genl_unregister_family(dp_genl_families[i]); } -static int dp_register_genl(void) +static int __init dp_register_genl(void) { int err; int i; diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 74a405bf107b..26ca8dd64ded 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -249,7 +249,7 @@ static const struct genl_ops tipc_genl_v2_ops[] = { #endif }; -struct genl_family tipc_genl_family = { +struct genl_family tipc_genl_family __ro_after_init = { .name = TIPC_GENL_V2_NAME, .version = TIPC_GENL_V2_VERSION, .hdrsize = 0, @@ -271,7 +271,7 @@ int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy); } -int tipc_netlink_start(void) +int __init tipc_netlink_start(void) { int res; diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 07b19931e458..e1ae8a8a2b8e 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1222,7 +1222,7 @@ static struct genl_ops tipc_genl_compat_ops[] = { }, }; -static struct genl_family tipc_genl_compat_family = { +static struct genl_family tipc_genl_compat_family __ro_after_init = { .name = TIPC_GENL_NAME, .version = TIPC_GENL_VERSION, .hdrsize = TIPC_GENL_HDRLEN, @@ -1233,7 +1233,7 @@ static struct genl_family tipc_genl_compat_family = { .n_ops = ARRAY_SIZE(tipc_genl_compat_ops), }; -int tipc_netlink_compat_start(void) +int __init tipc_netlink_compat_start(void) { int res; diff --git a/net/wimax/stack.c b/net/wimax/stack.c index 587e1627681f..5db731512014 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -576,7 +576,7 @@ static const struct genl_multicast_group wimax_gnl_mcgrps[] = { { .name = "msg", }, }; -struct genl_family wimax_gnl_family = { +struct genl_family wimax_gnl_family __ro_after_init = { .name = "WiMAX", .version = WIMAX_GNL_VERSION, .hdrsize = 0, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8e5ca3c47593..271707dacfea 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12586,7 +12586,7 @@ static const struct genl_ops nl80211_ops[] = { }, }; -static struct genl_family nl80211_fam = { +static struct genl_family nl80211_fam __ro_after_init = { .name = NL80211_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ @@ -14563,7 +14563,7 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev) /* initialisation/exit functions */ -int nl80211_init(void) +int __init nl80211_init(void) { int err; -- cgit v1.2.3 From ef295ecf090d3e86e5b742fc6ab34f1122a43773 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2016 08:48:16 -0600 Subject: block: better op and flags encoding Now that we don't need the common flags to overflow outside the range of a 32-bit type we can encode them the same way for both the bio and request fields. This in addition allows us to place the operation first (and make some room for more ops while we're at it) and to stop having to shift around the operation values. 
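The resulting layout can be sketched as follows; a standalone illustration under simplified assumptions (only two ops, and REQ_SYNC placed at the first bit above the op field, whereas the real enum req_flag_bits assigns the failfast bits first):

#include <stdio.h>

#define REQ_OP_BITS	8
#define REQ_OP_MASK	((1U << REQ_OP_BITS) - 1)

enum { REQ_OP_READ, REQ_OP_WRITE };		/* subset of enum req_opf */
#define REQ_SYNC	(1U << REQ_OP_BITS)	/* simplified bit position */

/* what the new bio_op()/req_op() accessors reduce to */
static unsigned int op_of(unsigned int opf)
{
	return opf & REQ_OP_MASK;
}

/* mirrors the new op_is_sync() helper: reads are implicitly sync */
static int op_is_sync(unsigned int opf)
{
	return op_of(opf) == REQ_OP_READ || (opf & REQ_SYNC) != 0;
}

int main(void)
{
	/* one 32-bit value carries both the operation and its flags */
	unsigned int opf = REQ_OP_WRITE | REQ_SYNC;

	printf("op=%u sync=%d\n", op_of(opf), op_is_sync(opf));
	return 0;
}

With the op and flags sharing one value, bi_opf and cmd_flags can simply be assigned to each other, which is why the patch below can delete req_set_op()/req_set_op_attrs() and reduce bio_set_op_attrs() to a plain OR marked obsolete.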
In addition this allows passing around only one value in the block layer instead of two (and eventually also in the file systems, but we can do that later) and thus clean up a lot of code. Last but not least this allows decreasing the size of the cmd_flags field in struct request to 32-bits. Various functions passing this value could also be updated, but I'd like to avoid the churn for now. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 +- block/blk-core.c | 60 ++++++++++-------------------- block/blk-flush.c | 2 +- block/blk-lib.c | 2 +- block/blk-map.c | 2 + block/blk-mq.c | 28 ++++++-------- block/cfq-iosched.c | 66 ++++++++++++++++----------------- block/elevator.c | 4 +- drivers/md/dm-crypt.c | 2 +- drivers/scsi/sd.c | 3 +- fs/btrfs/inode.c | 5 +-- fs/buffer.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/gfs2/lops.c | 2 +- include/linux/blk-cgroup.h | 11 +++--- include/linux/blk_types.h | 83 +++++++++++++++++++----------------------- include/linux/blkdev.h | 26 +------------ include/linux/blktrace_api.h | 2 +- include/linux/dm-io.h | 2 +- include/linux/elevator.h | 4 +- include/trace/events/bcache.h | 12 ++---- include/trace/events/block.h | 31 ++++++---------- kernel/trace/blktrace.c | 14 +++---- 23 files changed, 148 insertions(+), 221 deletions(-) (limited to 'kernel') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 6acea160298c..01ddeaf64b0f 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -553,8 +553,8 @@ struct request { struct request_list *rl; } -See the rq_flag_bits definitions for an explanation of the various flags -available. Some bits are used by the block layer or i/o scheduler. +See the req_ops and req_flag_bits definitions for an explanation of the various +flags available. Some bits are used by the block layer or i/o scheduler. The behaviour of the various sector counts are almost the same as before, except that since we have multi-segment bios, current_nr_sectors refers diff --git a/block/blk-core.c b/block/blk-core.c index fd416651a676..0bfaa54d3e9f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1056,8 +1056,7 @@ static struct io_context *rq_ioc(struct bio *bio) /** * __get_request - get a free request * @rl: request list to allocate from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1068,23 +1067,22 @@ static struct io_context *rq_ioc(struct bio *bio) * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*.
*/ -static struct request *__get_request(struct request_list *rl, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *__get_request(struct request_list *rl, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { struct request_queue *q = rl->q; struct request *rq; struct elevator_type *et = q->elevator->type; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); int may_queue; req_flags_t rq_flags = RQF_ALLOCED; if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); - may_queue = elv_may_queue(q, op, op_flags); + may_queue = elv_may_queue(q, op); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; @@ -1154,7 +1152,7 @@ static struct request *__get_request(struct request_list *rl, int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; rq->rq_flags = rq_flags; /* init elvpriv */ @@ -1232,8 +1230,7 @@ rq_starved: /** * get_request - get a free request * @q: request_queue to allocate request from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1244,18 +1241,17 @@ rq_starved: * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ -static struct request *get_request(struct request_queue *q, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *get_request(struct request_queue *q, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); struct request_list *rl; struct request *rq; rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, op_flags, bio, gfp_mask); + rq = __get_request(rl, op, bio, gfp_mask); if (!IS_ERR(rq)) return rq; @@ -1297,7 +1293,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); - rq = get_request(q, rw, 0, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); return rq; @@ -1446,7 +1442,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) */ if (rq_flags & RQF_ALLOCED) { struct request_list *rl = blk_rq_rl(req); - bool sync = rw_is_sync(req_op(req), req->cmd_flags); + bool sync = op_is_sync(req->cmd_flags); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); @@ -1652,8 +1648,6 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; - - req->cmd_flags |= bio->bi_opf & REQ_COMMON_MASK; if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; @@ -1665,9 +1659,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { - const bool sync = !!(bio->bi_opf & REQ_SYNC); struct blk_plug *plug; - int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT; + int el_ret, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; @@ -1722,24 +1715,11 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - /* - * This sync check and mask will be re-done in init_request_from_bio(), - * but we need to set it earlier to expose the sync flag to the - * rq 
allocator and io schedulers. - */ - if (sync) - rw_flags |= REQ_SYNC; - - /* - * Add in META/PRIO flags, if set, before we get to the IO scheduler - */ - rw_flags |= (bio->bi_opf & (REQ_META | REQ_PRIO)); - /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { bio->bi_error = PTR_ERR(req); bio_endio(bio); @@ -2946,8 +2926,6 @@ EXPORT_SYMBOL_GPL(__blk_end_request_err); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { - req_set_op(rq, bio_op(bio)); - if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); @@ -3031,8 +3009,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - req_set_op_attrs(dst, req_op(src), - (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE); + dst->cmd_flags = src->cmd_flags | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -3537,8 +3514,11 @@ EXPORT_SYMBOL(blk_set_runtime_active); int __init blk_dev_init(void) { - BUILD_BUG_ON(__REQ_NR_BITS > 8 * + BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * FIELD_SIZEOF(struct request, cmd_flags)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * + FIELD_SIZEOF(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", diff --git a/block/blk-flush.c b/block/blk-flush.c index 3990b9cfbda5..95f1d4d357df 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH); + flush_rq->cmd_flags = REQ_OP_FLUSH | WRITE_FLUSH; flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; diff --git a/block/blk-lib.c b/block/blk-lib.c index 46fe9248410d..18abda862915 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int granularity; - enum req_op op; + unsigned int op; int alignment; sector_t bs_mask; diff --git a/block/blk-map.c b/block/blk-map.c index 2c5ae5fef473..0173a72a8aa9 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -16,6 +16,8 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) { if (!rq->bio) { + rq->cmd_flags &= REQ_OP_MASK; + rq->cmd_flags |= (bio->bi_opf & REQ_OP_MASK); blk_rq_bio_prep(rq->q, rq, bio); } else { if (!ll_back_merge_fn(rq->q, rq, bio)) diff --git a/block/blk-mq.c b/block/blk-mq.c index b49c6658eb05..2da1a0ee3318 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -139,14 +139,13 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) EXPORT_SYMBOL(blk_mq_can_queue); static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, int op, - unsigned int op_flags) + struct request *rq, unsigned int op) { INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = q; rq->mq_ctx = ctx; - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; if (blk_queue_io_stat(q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch 
atomic flags, it needs atomic ops against the timer */ @@ -183,11 +182,11 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->end_io_data = NULL; rq->next_rq = NULL; - ctx->rq_dispatched[rw_is_sync(op, op_flags)]++; + ctx->rq_dispatched[op_is_sync(op)]++; } static struct request * -__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) +__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op) { struct request *rq; unsigned int tag; @@ -202,7 +201,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) } rq->tag = tag; - blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags); + blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } @@ -225,7 +224,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); blk_mq_put_ctx(ctx); if (!rq) { @@ -277,7 +276,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); if (!rq) { ret = -EWOULDBLOCK; goto out_queue_exit; @@ -1196,19 +1195,14 @@ static struct request *blk_mq_map_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct request *rq; - int op = bio_data_dir(bio); - int op_flags = 0; blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, ctx->cpu); - if (rw_is_sync(bio_op(bio), bio->bi_opf)) - op_flags |= REQ_SYNC; - - trace_block_getrq(q, bio, op); + trace_block_getrq(q, bio, bio->bi_opf); blk_mq_set_alloc_data(data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(data, op, op_flags); + rq = __blk_mq_alloc_request(data, bio->bi_opf); data->hctx->queued++; return rq; @@ -1256,7 +1250,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) */ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_mq_alloc_data data; struct request *rq; @@ -1350,7 +1344,7 @@ done: */ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_plug *plug; unsigned int request_count = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5e24d880306c..c96186adaa66 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -667,10 +667,10 @@ static inline void cfqg_put(struct cfq_group *cfqg) } while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, - int op_flags) + struct cfq_group *curr_cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.queued, op, 1); cfqg_stats_end_empty_time(&cfqg->stats); cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); } @@ -684,30 +684,29 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, #endif } -static inline void 
cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, -1); + blkg_rwstat_add(&cfqg->stats.queued, op, -1); } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.merged, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.merged, op, 1); } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { struct cfqg_stats *stats = &cfqg->stats; unsigned long long now = sched_clock(); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, op_flags, - now - io_start_time); + blkg_rwstat_add(&stats->service_time, op, now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, op_flags, + blkg_rwstat_add(&stats->wait_time, op, io_start_time - start_time); } @@ -786,16 +785,16 @@ static inline void cfqg_put(struct cfq_group *cfqg) { } #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, int op_flags) { } + struct cfq_group *curr_cfqg, unsigned int op) { } static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, uint64_t time, unsigned long unaccounted_time) { } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) { } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) { } +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { } +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) { } + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -2474,10 +2473,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); cfq_add_rq_rb(rq); cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, - req_op(rq), rq->cmd_flags); + rq->cmd_flags); } static struct request * @@ -2530,7 +2529,7 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); if (rq->cmd_flags & REQ_PRIO) { WARN_ON(!cfqq->prio_pending); cfqq->prio_pending--; @@ -2565,7 +2564,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf); + cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf); } static void @@ -2588,7 +2587,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq 
= rq; cfq_remove_request(next); - cfqg_stats_update_io_merged(RQ_CFQG(rq), req_op(next), next->cmd_flags); + cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); cfqq = RQ_CFQQ(next); /* @@ -4142,7 +4141,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, req_op(rq), + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, rq->cmd_flags); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -4240,8 +4239,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), - rq_io_start_time_ns(rq), req_op(rq), - rq->cmd_flags); + rq_io_start_time_ns(rq), rq->cmd_flags); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -4319,14 +4317,14 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -static void cfqq_boost_on_prio(struct cfq_queue *cfqq, int op_flags) +static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) { /* * If REQ_PRIO is set, boost class and prio level, if it's below * BE/NORM. If prio is not set, restore the potentially boosted * class/prio level. */ - if (!(op_flags & REQ_PRIO)) { + if (!(op & REQ_PRIO)) { cfqq->ioprio_class = cfqq->org_ioprio_class; cfqq->ioprio = cfqq->org_ioprio; } else { @@ -4347,7 +4345,7 @@ static inline int __cfq_may_queue(struct cfq_queue *cfqq) return ELV_MQUEUE_MAY; } -static int cfq_may_queue(struct request_queue *q, int op, int op_flags) +static int cfq_may_queue(struct request_queue *q, unsigned int op) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -4364,10 +4362,10 @@ static int cfq_may_queue(struct request_queue *q, int op, int op_flags) if (!cic) return ELV_MQUEUE_MAY; - cfqq = cic_to_cfqq(cic, rw_is_sync(op, op_flags)); + cfqq = cic_to_cfqq(cic, op_is_sync(op)); if (cfqq) { cfq_init_prio_data(cfqq, cic); - cfqq_boost_on_prio(cfqq, op_flags); + cfqq_boost_on_prio(cfqq, op); return __cfq_may_queue(cfqq); } diff --git a/block/elevator.c b/block/elevator.c index ac80f89a0842..a18a5db274e4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -714,12 +714,12 @@ void elv_put_request(struct request_queue *q, struct request *rq) e->type->ops.elevator_put_req_fn(rq); } -int elv_may_queue(struct request_queue *q, int op, int op_flags) +int elv_may_queue(struct request_queue *q, unsigned int op) { struct elevator_queue *e = q->elevator; if (e->type->ops.elevator_may_queue_fn) - return e->type->ops.elevator_may_queue_fn(q, op, op_flags); + return e->type->ops.elevator_may_queue_fn(q, op); return ELV_MQUEUE_MAY; } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a2768835d394..68a9eb4f3f36 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1135,7 +1135,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; - bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio)); + clone->bi_opf = io->base_bio->bi_opf; } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index cef1f78031d4..65738b0aad36 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1031,8 +1031,7 @@ static int 
sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) } else if (rq_data_dir(rq) == READ) { SCpnt->cmnd[0] = READ_6; } else { - scmd_printk(KERN_ERR, SCpnt, "Unknown command %llu,%llx\n", - req_op(rq), (unsigned long long) rq->cmd_flags); + scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq)); goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2b790bda7998..9a377079af26 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8427,7 +8427,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8465,8 +8465,7 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), - bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; diff --git a/fs/buffer.c b/fs/buffer.c index b205a629001d..a29335867e30 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3118,7 +3118,7 @@ EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) * @op: whether to %READ or %WRITE - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8de18a168a..2cf4f7f09e32 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -688,7 +688,7 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int op; /* contains REQ_OP_ */ - int op_flags; /* rq_flag_bits */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 49d5a1b61b06..b1f9144b42c7 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -231,7 +231,7 @@ static void gfs2_end_log_write(struct bio *bio) * gfs2_log_flush_bio - Submit any pending log bio * @sdp: The superblock * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3bf5d33800ab..ddaf28d0988f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -581,15 +581,14 @@ static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat - * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The * caller is responsible for synchronizing calls to this function. 
*/ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int op, int op_flags, uint64_t val) + unsigned int op, uint64_t val) { struct percpu_counter *cnt; @@ -600,7 +599,7 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); - if (op_flags & REQ_SYNC) + if (op & REQ_SYNC) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; @@ -705,9 +704,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, if (!throtl) { blkg = blkg ?: q->root_blkg; - blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_opf, + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_opf, 1); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } rcu_read_unlock(); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ec69a8fe3b29..dca972d67548 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -88,24 +88,6 @@ struct bio { struct bio_vec bi_inline_vecs[0]; }; -#define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) -#define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) -#define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) - -#define bio_set_op_attrs(bio, op, op_flags) do { \ - if (__builtin_constant_p(op)) \ - BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \ - else \ - WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \ - if (__builtin_constant_p(op_flags)) \ - BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - else \ - WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - (bio)->bi_opf = bio_flags(bio); \ - (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \ - (bio)->bi_opf |= (op_flags); \ -} while (0) - #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) /* @@ -147,26 +129,40 @@ struct bio { #endif /* CONFIG_BLOCK */ /* - * Request flags. For use in the cmd_flags field of struct request, and in - * bi_opf of struct bio. Note that some flags are only valid in either one. + * Operations and flags common to the bio and request structures. + * We use 8 bits for encoding the operation, and the remaining 24 for flags. 
*/ -enum rq_flag_bits { - /* common flags */ - __REQ_FAILFAST_DEV, /* no driver retries of device errors */ +#define REQ_OP_BITS 8 +#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1) +#define REQ_FLAG_BITS 24 + +enum req_opf { + REQ_OP_READ, + REQ_OP_WRITE, + REQ_OP_DISCARD, /* request to discard sectors */ + REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ + REQ_OP_WRITE_SAME, /* write same block many times */ + REQ_OP_FLUSH, /* request for cache flush */ + REQ_OP_ZONE_REPORT, /* Get zone information */ + REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ + + REQ_OP_LAST, +}; + +enum req_flag_bits { + __REQ_FAILFAST_DEV = /* no driver retries of device errors */ + REQ_OP_BITS, __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ - __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ - __REQ_NOMERGE, /* don't touch this for merging */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ - __REQ_NR_BITS, /* stops here */ }; @@ -176,37 +172,32 @@ enum rq_flag_bits { #define REQ_SYNC (1ULL << __REQ_SYNC) #define REQ_META (1ULL << __REQ_META) #define REQ_PRIO (1ULL << __REQ_PRIO) +#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) #define REQ_NOIDLE (1ULL << __REQ_NOIDLE) #define REQ_INTEGRITY (1ULL << __REQ_INTEGRITY) +#define REQ_FUA (1ULL << __REQ_FUA) +#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -#define REQ_COMMON_MASK \ - (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_RAHEAD) -#define REQ_CLONE_MASK REQ_COMMON_MASK -/* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) -#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) -#define REQ_FUA (1ULL << __REQ_FUA) -#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) -#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define bio_op(bio) \ + ((bio)->bi_opf & REQ_OP_MASK) +#define req_op(req) \ + ((req)->cmd_flags & REQ_OP_MASK) -enum req_op { - REQ_OP_READ, - REQ_OP_WRITE, - REQ_OP_DISCARD, /* request to discard sectors */ - REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ - REQ_OP_WRITE_SAME, /* write same block many times */ - REQ_OP_FLUSH, /* request for cache flush */ - REQ_OP_ZONE_REPORT, /* Get zone information */ - REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ -}; +/* obsolete, don't use in new code */ +#define bio_set_op_attrs(bio, op, op_flags) \ + ((bio)->bi_opf |= (op | op_flags)) -#define REQ_OP_BITS 3 +static inline bool op_is_sync(unsigned int op) +{ + return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC); +} typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b4415feac679..8396da2bb698 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -142,7 +142,7 @@ struct request { int cpu; unsigned cmd_type; - u64 cmd_flags; + unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; unsigned long atomic_flags; @@ -244,20 +244,6 @@ struct request { struct request *next_rq; }; -#define 
REQ_OP_SHIFT (8 * sizeof(u64) - REQ_OP_BITS) -#define req_op(req) ((req)->cmd_flags >> REQ_OP_SHIFT) - -#define req_set_op(req, op) do { \ - WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (req)->cmd_flags &= ((1ULL << REQ_OP_SHIFT) - 1); \ - (req)->cmd_flags |= ((u64) (op) << REQ_OP_SHIFT); \ -} while (0) - -#define req_set_op_attrs(req, op, flags) do { \ - req_set_op(req, op); \ - (req)->cmd_flags |= flags; \ -} while (0) - static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; @@ -741,17 +727,9 @@ static inline unsigned int blk_queue_zone_size(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } -/* - * We regard a request as sync, if either a read or a sync write - */ -static inline bool rw_is_sync(int op, unsigned int rw_flags) -{ - return op == REQ_OP_READ || (rw_flags & REQ_SYNC); -} - static inline bool rq_is_sync(struct request *rq) { - return rw_is_sync(req_op(rq), rq->cmd_flags); + return op_is_sync(rq->cmd_flags); } static inline bool blk_rl_full(struct request_list *rl, bool sync) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index cceb72f9e29f..e417f080219a 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -118,7 +118,7 @@ static inline int blk_cmd_buf_len(struct request *rq) } extern void blk_dump_cmd(char *buf, struct request *rq); -extern void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes); +extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); #endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h index b91b023deffb..a52c6580cc9a 100644 --- a/include/linux/dm-io.h +++ b/include/linux/dm-io.h @@ -58,7 +58,7 @@ struct dm_io_notify { struct dm_io_client; struct dm_io_request { int bi_op; /* REQ_OP */ - int bi_op_flags; /* rq_flag_bits */ + int bi_op_flags; /* req_flag_bits */ struct dm_io_memory mem; /* Memory to use for io */ struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */ struct dm_io_client *client; /* Client memory handler */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index e7f358d2e5fc..f219c9aed360 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -30,7 +30,7 @@ typedef int (elevator_dispatch_fn) (struct request_queue *, int); typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *); typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); -typedef int (elevator_may_queue_fn) (struct request_queue *, int, int); +typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int); typedef void (elevator_init_icq_fn) (struct io_cq *); typedef void (elevator_exit_icq_fn) (struct io_cq *); @@ -139,7 +139,7 @@ extern struct request *elv_former_request(struct request_queue *, struct request extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); extern void elv_unregister_queue(struct request_queue *q); -extern int elv_may_queue(struct request_queue *, int, int); +extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, struct bio *bio, gfp_t gfp_mask); diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 
d336b890e31f..df3e9ae5ad8d 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -27,8 +27,7 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->sector = bio->bi_iter.bi_sector; __entry->orig_sector = bio->bi_iter.bi_sector - 16; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", @@ -102,8 +101,7 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u", @@ -138,8 +136,7 @@ TRACE_EVENT(bcache_read, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); __entry->cache_hit = hit; __entry->bypass = bypass; ), @@ -170,8 +167,7 @@ TRACE_EVENT(bcache_write, __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); __entry->writeback = writeback; __entry->bypass = bypass; ), diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 8f3a163b8166..3e02e3a25413 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -84,8 +84,7 @@ DECLARE_EVENT_CLASS(block_rq_with_error, 0 : blk_rq_sectors(rq); __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); ), @@ -163,7 +162,7 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector = nr_bytes >> 9; __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, nr_bytes); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); blk_dump_cmd(__get_str(cmd), rq); ), @@ -199,8 +198,7 @@ DECLARE_EVENT_CLASS(block_rq, __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 
blk_rq_bytes(rq) : 0; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -274,8 +272,7 @@ TRACE_EVENT(block_bio_bounce, bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -313,8 +310,7 @@ TRACE_EVENT(block_bio_complete, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = error; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u [%d]", @@ -341,8 +337,7 @@ DECLARE_EVENT_CLASS(block_bio_merge, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -409,8 +404,7 @@ TRACE_EVENT(block_bio_queue, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -438,7 +432,7 @@ DECLARE_EVENT_CLASS(block_get_rq, __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? bio_sectors(bio) : 0; - blk_fill_rwbs(__entry->rwbs, bio ? bio_op(bio) : 0, + blk_fill_rwbs(__entry->rwbs, bio ? 
bio->bi_opf : 0, __entry->nr_sector); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -573,8 +567,7 @@ TRACE_EVENT(block_split, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -617,8 +610,7 @@ TRACE_EVENT(block_bio_remap, __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", @@ -664,8 +656,7 @@ TRACE_EVENT(block_rq_remap, __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index dbafc5df03f3..95cecbf67f5c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1777,14 +1777,14 @@ void blk_dump_cmd(char *buf, struct request *rq) } } -void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) +void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; - if (rw & REQ_PREFLUSH) + if (op & REQ_PREFLUSH) rwbs[i++] = 'F'; - switch (op) { + switch (op & REQ_OP_MASK) { case REQ_OP_WRITE: case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; @@ -1806,13 +1806,13 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) rwbs[i++] = 'N'; } - if (rw & REQ_FUA) + if (op & REQ_FUA) rwbs[i++] = 'F'; - if (rw & REQ_RAHEAD) + if (op & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_SYNC) + if (op & REQ_SYNC) rwbs[i++] = 'S'; - if (rw & REQ_META) + if (op & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; -- cgit v1.2.3 From ebb676daa1a340ccef25eb769aefc09b79c01f8a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 27 Oct 2016 11:23:51 +0200 Subject: bpf: Print function name in addition to function id The verifier currently prints raw function ids when printing CALL instructions or when complaining: 5: (85) call 23 unknown func 23 print a meaningful function name instead: 5: (85) call bpf_redirect#23 unknown func bpf_redirect#23 Moves the function documentation to a single comment and renames all helper names in the list to conform to the bpf_ prefix notation so they can be grepped in the kernel source. Signed-off-by: Thomas Graf Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/uapi/linux/bpf.h | 574 ++++++++++++++++++++++++----------------------- kernel/bpf/verifier.c | 35 ++- 2 files changed, 316 insertions(+), 293 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 374ef582ae18..e2f38e0091b6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,297 +143,301 @@ union bpf_attr { }; } __attribute__((aligned(8))); +/* BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(&map, &key) + * Return: Map value or NULL + * + * int bpf_map_update_elem(&map, &key, &value, flags) + * Return: 0 on success or negative error + * + * int bpf_map_delete_elem(&map, &key) + * Return: 0 on success or negative error + * + * int bpf_probe_read(void *dst, int size, void *src) + * Return: 0 on success or negative error + * + * u64 bpf_ktime_get_ns(void) + * Return: current ktime + * + * int bpf_trace_printk(const char *fmt, int fmt_size, ...) + * Return: length of buffer written or negative error + * + * u32 bpf_prandom_u32(void) + * Return: random value + * + * u32 bpf_raw_smp_processor_id(void) + * Return: SMP processor ID + * + * int bpf_skb_store_bytes(skb, offset, from, len, flags) + * store bytes into packet + * @skb: pointer to skb + * @offset: offset within packet from skb->mac_header + * @from: pointer where to copy bytes from + * @len: number of bytes to store into packet + * @flags: bit 0 - if true, recompute skb->csum + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_l3_csum_replace(skb, offset, from, to, flags) + * recompute IP checksum + * @skb: pointer to skb + * @offset: offset within packet where IP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_l4_csum_replace(skb, offset, from, to, flags) + * recompute TCP/UDP checksum + * @skb: pointer to skb + * @offset: offset within packet where TCP/UDP checksum is located + * @from: old value of header field + * @to: new value of header field + * @flags: bits 0-3 - size of header field + * bit 4 - is pseudo header + * other bits - reserved + * Return: 0 on success or negative error + * + * int bpf_tail_call(ctx, prog_array_map, index) + * jump into another BPF program + * @ctx: context pointer passed to next program + * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY + * @index: index inside array that selects specific program to run + * Return: 0 on success or negative error + * + * int bpf_clone_redirect(skb, ifindex, flags) + * redirect to another netdev + * @skb: pointer to skb + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: 0 on success or negative error + * + * u64 bpf_get_current_pid_tgid(void) + * Return: current->tgid << 32 | current->pid + * + * u64 bpf_get_current_uid_gid(void) + * Return: current_gid << 32 | current_uid + * + * int bpf_get_current_comm(char *buf, int size_of_buf) + * stores current->comm into buf + * Return: 0 on success or negative error + * + * u32 bpf_get_cgroup_classid(skb) + * retrieve a proc's classid + * @skb: pointer to skb + * Return: classid if != 0 + * + * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) + * Return: 0 on success or negative error + * + * int bpf_skb_vlan_pop(skb) + * Return: 0 on success or negative error + * + * int bpf_skb_get_tunnel_key(skb, 
key, size, flags) + * int bpf_skb_set_tunnel_key(skb, key, size, flags) + * retrieve or populate tunnel metadata + * @skb: pointer to skb + * @key: pointer to 'struct bpf_tunnel_key' + * @size: size of 'struct bpf_tunnel_key' + * @flags: room for future extensions + * Return: 0 on success or negative error + * + * u64 bpf_perf_event_read(&map, index) + * Return: Number events read or error code + * + * int bpf_redirect(ifindex, flags) + * redirect to another netdev + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: TC_ACT_REDIRECT + * + * u32 bpf_get_route_realm(skb) + * retrieve a dst's tclassid + * @skb: pointer to skb + * Return: realm if != 0 + * + * int bpf_perf_event_output(ctx, map, index, data, size) + * output perf raw sample + * @ctx: struct pt_regs* + * @map: pointer to perf_event_array map + * @index: index of event in the map + * @data: data on stack to be output as raw data + * @size: size of data + * Return: 0 on success or negative error + * + * int bpf_get_stackid(ctx, map, flags) + * walk user or kernel stack and return id + * @ctx: struct pt_regs* + * @map: pointer to stack_trace map + * @flags: bits 0-7 - numer of stack frames to skip + * bit 8 - collect user stack instead of kernel + * bit 9 - compare stacks by hash only + * bit 10 - if two different stacks hash into the same stackid + * discard old + * other bits - reserved + * Return: >= 0 stackid on success or negative error + * + * s64 bpf_csum_diff(from, from_size, to, to_size, seed) + * calculate csum diff + * @from: raw from buffer + * @from_size: length of from buffer + * @to: raw to buffer + * @to_size: length of to buffer + * @seed: optional seed + * Return: csum result or negative error code + * + * int bpf_skb_get_tunnel_opt(skb, opt, size) + * retrieve tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: option size + * + * int bpf_skb_set_tunnel_opt(skb, opt, size) + * populate tunnel options metadata + * @skb: pointer to skb + * @opt: pointer to raw tunnel option data + * @size: size of @opt + * Return: 0 on success or negative error + * + * int bpf_skb_change_proto(skb, proto, flags) + * Change protocol of the skb. Currently supported is v4 -> v6, + * v6 -> v4 transitions. The helper will also resize the skb. eBPF + * program is expected to fill the new headers via skb_store_bytes + * and lX_csum_replace. + * @skb: pointer to skb + * @proto: new skb->protocol type + * @flags: reserved + * Return: 0 on success or negative error + * + * int bpf_skb_change_type(skb, type) + * Change packet type of skb. + * @skb: pointer to skb + * @type: new skb->pkt_type type + * Return: 0 on success or negative error + * + * int bpf_skb_under_cgroup(skb, map, index) + * Check cgroup2 membership of skb + * @skb: pointer to skb + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 skb failed the cgroup2 descendant test + * == 1 skb succeeded the cgroup2 descendant test + * < 0 error + * + * u32 bpf_get_hash_recalc(skb) + * Retrieve and possibly recalculate skb->hash. 
+ * @skb: pointer to skb + * Return: hash + * + * u64 bpf_get_current_task(void) + * Returns current task_struct + * Return: current + * + * int bpf_probe_write_user(void *dst, void *src, int len) + * safely attempt to write to a location + * @dst: destination address in userspace + * @src: source address on stack + * @len: number of bytes to copy + * Return: 0 on success or negative error + * + * int bpf_current_task_under_cgroup(map, index) + * Check cgroup2 membership of current task + * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type + * @index: index of the cgroup in the bpf_map + * Return: + * == 0 current failed the cgroup2 descendant test + * == 1 current succeeded the cgroup2 descendant test + * < 0 error + * + * int bpf_skb_change_tail(skb, len, flags) + * The helper will resize the skb to the given new size, to be used f.e. + * with control messages. + * @skb: pointer to skb + * @len: new skb length + * @flags: reserved + * Return: 0 on success or negative error + * + * int bpf_skb_pull_data(skb, len) + * The helper will pull in non-linear data in case the skb is non-linear + * and not all of len are part of the linear section. Only needed for + * read/write with direct packet access. + * @skb: pointer to skb + * @len: len to make read/writeable + * Return: 0 on success or negative error + * + * s64 bpf_csum_update(skb, csum) + * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. + * @skb: pointer to skb + * @csum: csum to add + * Return: csum on success or negative error + * + * void bpf_set_hash_invalid(skb) + * Invalidate current skb->hash. + * @skb: pointer to skb + * + * int bpf_get_numa_node_id() + * Return: Id of current NUMA node. + */ +#define __BPF_FUNC_MAPPER(FN) \ + FN(unspec), \ + FN(map_lookup_elem), \ + FN(map_update_elem), \ + FN(map_delete_elem), \ + FN(probe_read), \ + FN(ktime_get_ns), \ + FN(trace_printk), \ + FN(get_prandom_u32), \ + FN(get_smp_processor_id), \ + FN(skb_store_bytes), \ + FN(l3_csum_replace), \ + FN(l4_csum_replace), \ + FN(tail_call), \ + FN(clone_redirect), \ + FN(get_current_pid_tgid), \ + FN(get_current_uid_gid), \ + FN(get_current_comm), \ + FN(get_cgroup_classid), \ + FN(skb_vlan_push), \ + FN(skb_vlan_pop), \ + FN(skb_get_tunnel_key), \ + FN(skb_set_tunnel_key), \ + FN(perf_event_read), \ + FN(redirect), \ + FN(get_route_realm), \ + FN(perf_event_output), \ + FN(skb_load_bytes), \ + FN(get_stackid), \ + FN(csum_diff), \ + FN(skb_get_tunnel_opt), \ + FN(skb_set_tunnel_opt), \ + FN(skb_change_proto), \ + FN(skb_change_type), \ + FN(skb_under_cgroup), \ + FN(get_hash_recalc), \ + FN(get_current_task), \ + FN(probe_write_user), \ + FN(current_task_under_cgroup), \ + FN(skb_change_tail), \ + FN(skb_pull_data), \ + FN(csum_update), \ + FN(set_hash_invalid), \ + FN(get_numa_node_id), + /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call */ +#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x enum bpf_func_id { - BPF_FUNC_unspec, - BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ - BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ - BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ - BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ - BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ - BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) 
*/ - BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ - BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ - - /** - * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet - * @skb: pointer to skb - * @offset: offset within packet from skb->mac_header - * @from: pointer where to copy bytes from - * @len: number of bytes to store into packet - * @flags: bit 0 - if true, recompute skb->csum - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_skb_store_bytes, - - /** - * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum - * @skb: pointer to skb - * @offset: offset within packet where IP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_l3_csum_replace, - - /** - * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum - * @skb: pointer to skb - * @offset: offset within packet where TCP/UDP checksum is located - * @from: old value of header field - * @to: new value of header field - * @flags: bits 0-3 - size of header field - * bit 4 - is pseudo header - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_l4_csum_replace, - - /** - * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program - * @ctx: context pointer passed to next program - * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY - * @index: index inside array that selects specific program to run - * Return: 0 on success - */ - BPF_FUNC_tail_call, - - /** - * bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev - * @skb: pointer to skb - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: 0 on success - */ - BPF_FUNC_clone_redirect, - - /** - * u64 bpf_get_current_pid_tgid(void) - * Return: current->tgid << 32 | current->pid - */ - BPF_FUNC_get_current_pid_tgid, - - /** - * u64 bpf_get_current_uid_gid(void) - * Return: current_gid << 32 | current_uid - */ - BPF_FUNC_get_current_uid_gid, - - /** - * bpf_get_current_comm(char *buf, int size_of_buf) - * stores current->comm into buf - * Return: 0 on success - */ - BPF_FUNC_get_current_comm, - - /** - * bpf_get_cgroup_classid(skb) - retrieve a proc's classid - * @skb: pointer to skb - * Return: classid if != 0 - */ - BPF_FUNC_get_cgroup_classid, - BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */ - BPF_FUNC_skb_vlan_pop, /* bpf_skb_vlan_pop(skb) */ - - /** - * bpf_skb_[gs]et_tunnel_key(skb, key, size, flags) - * retrieve or populate tunnel metadata - * @skb: pointer to skb - * @key: pointer to 'struct bpf_tunnel_key' - * @size: size of 'struct bpf_tunnel_key' - * @flags: room for future extensions - * Retrun: 0 on success - */ - BPF_FUNC_skb_get_tunnel_key, - BPF_FUNC_skb_set_tunnel_key, - BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */ - /** - * bpf_redirect(ifindex, flags) - redirect to another netdev - * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT - */ - BPF_FUNC_redirect, - - /** - * bpf_get_route_realm(skb) - retrieve a dst's tclassid - * @skb: pointer to skb - * Return: realm if != 0 - */ - BPF_FUNC_get_route_realm, - - /** - * bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample - * @ctx: struct pt_regs* 
- * @map: pointer to perf_event_array map - * @index: index of event in the map - * @data: data on stack to be output as raw data - * @size: size of data - * Return: 0 on success - */ - BPF_FUNC_perf_event_output, - BPF_FUNC_skb_load_bytes, - - /** - * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id - * @ctx: struct pt_regs* - * @map: pointer to stack_trace map - * @flags: bits 0-7 - numer of stack frames to skip - * bit 8 - collect user stack instead of kernel - * bit 9 - compare stacks by hash only - * bit 10 - if two different stacks hash into the same stackid - * discard old - * other bits - reserved - * Return: >= 0 stackid on success or negative error - */ - BPF_FUNC_get_stackid, - - /** - * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff - * @from: raw from buffer - * @from_size: length of from buffer - * @to: raw to buffer - * @to_size: length of to buffer - * @seed: optional seed - * Return: csum result - */ - BPF_FUNC_csum_diff, - - /** - * bpf_skb_[gs]et_tunnel_opt(skb, opt, size) - * retrieve or populate tunnel options metadata - * @skb: pointer to skb - * @opt: pointer to raw tunnel option data - * @size: size of @opt - * Return: 0 on success for set, option size for get - */ - BPF_FUNC_skb_get_tunnel_opt, - BPF_FUNC_skb_set_tunnel_opt, - - /** - * bpf_skb_change_proto(skb, proto, flags) - * Change protocol of the skb. Currently supported is - * v4 -> v6, v6 -> v4 transitions. The helper will also - * resize the skb. eBPF program is expected to fill the - * new headers via skb_store_bytes and lX_csum_replace. - * @skb: pointer to skb - * @proto: new skb->protocol type - * @flags: reserved - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_change_proto, - - /** - * bpf_skb_change_type(skb, type) - * Change packet type of skb. - * @skb: pointer to skb - * @type: new skb->pkt_type type - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_change_type, - - /** - * bpf_skb_under_cgroup(skb, map, index) - Check cgroup2 membership of skb - * @skb: pointer to skb - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 skb failed the cgroup2 descendant test - * == 1 skb succeeded the cgroup2 descendant test - * < 0 error - */ - BPF_FUNC_skb_under_cgroup, - - /** - * bpf_get_hash_recalc(skb) - * Retrieve and possibly recalculate skb->hash. - * @skb: pointer to skb - * Return: hash - */ - BPF_FUNC_get_hash_recalc, - - /** - * u64 bpf_get_current_task(void) - * Returns current task_struct - * Return: current - */ - BPF_FUNC_get_current_task, - - /** - * bpf_probe_write_user(void *dst, void *src, int len) - * safely attempt to write to a location - * @dst: destination address in userspace - * @src: source address on stack - * @len: number of bytes to copy - * Return: 0 on success or negative error - */ - BPF_FUNC_probe_write_user, - - /** - * bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership of current task - * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type - * @index: index of the cgroup in the bpf_map - * Return: - * == 0 current failed the cgroup2 descendant test - * == 1 current succeeded the cgroup2 descendant test - * < 0 error - */ - BPF_FUNC_current_task_under_cgroup, - - /** - * bpf_skb_change_tail(skb, len, flags) - * The helper will resize the skb to the given new size, - * to be used f.e. with control messages. 
- * @skb: pointer to skb - * @len: new skb length - * @flags: reserved - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_change_tail, - - /** - * bpf_skb_pull_data(skb, len) - * The helper will pull in non-linear data in case the - * skb is non-linear and not all of len are part of the - * linear section. Only needed for read/write with direct - * packet access. - * @skb: pointer to skb - * @len: len to make read/writeable - * Return: 0 on success or negative error - */ - BPF_FUNC_skb_pull_data, - - /** - * bpf_csum_update(skb, csum) - * Adds csum into skb->csum in case of CHECKSUM_COMPLETE. - * @skb: pointer to skb - * @csum: csum to add - * Return: csum on success or negative error - */ - BPF_FUNC_csum_update, - - /** - * bpf_set_hash_invalid(skb) - * Invalidate current skb>hash. - * @skb: pointer to skb - */ - BPF_FUNC_set_hash_invalid, - - /** - * bpf_get_numa_node_id() - * Returns the id of the current NUMA node. - */ - BPF_FUNC_get_numa_node_id, - + __BPF_FUNC_MAPPER(__BPF_ENUM_FN) __BPF_FUNC_MAX_ID, }; +#undef __BPF_ENUM_FN /* All flags used by eBPF helper functions, placed here. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 846d7ceaf202..900257578934 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19,6 +19,7 @@ #include #include #include +#include /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. @@ -190,6 +191,22 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +static const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + static void print_verifier_state(struct bpf_verifier_state *state) { struct bpf_reg_state *reg; @@ -354,7 +371,8 @@ static void print_bpf_insn(struct bpf_insn *insn) u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %d\n", insn->code, insn->imm); + verbose("(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose("(%02x) goto pc%+d\n", insn->code, insn->off); @@ -1114,8 +1132,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %d\n", - map->map_type, func_id); + verbose("cannot pass map_type %d into func %s#%d\n", + map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1172,7 +1190,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %d\n", func_id); + verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1180,7 +1198,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %d\n", func_id); + verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); return -EINVAL; } @@ -1200,7 +1218,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %d\n", func_id); + verbose("kernel subsystem misconfigured 
func %s#%d\n", + func_id_name(func_id), func_id); return err; } @@ -1256,8 +1275,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; } else { - verbose("unknown return type %d of func %d\n", - fn->ret_type, func_id); + verbose("unknown return type %d of func %s#%d\n", + fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } -- cgit v1.2.3 From 0f98621bef5d2b7ad41f6595899660af344f5016 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 29 Oct 2016 02:30:46 +0200 Subject: bpf, inode: add support for symlinks and fix mtime/ctime While commit bb35a6ef7da4 ("bpf, inode: allow for rename and link ops") added support for hard links that can be used for prog and map nodes, this work adds simple symlink support, which can be used f.e. for directories also when unprivileged and works with cmdline tooling that understands S_IFLNK anyway. Since the switch in e27f4a942a0e ("bpf: Use mount_nodev not mount_ns to mount the bpf filesystem"), there can be various mount instances with mount_nodev() and thus hierarchy can be flattened to facilitate object sharing. Thus, we can also keep bpf tooling working by repointing paths. Most of the functionality can be used from vfs library operations. The symlink is stored in the inode itself, that is in i_link, which is sufficient in our case as opposed to storing it in the page cache. While at it, I noticed that bpf_mkdir() and bpf_mkobj() don't update the directory's mtime and ctime, so add a common helper for it called bpf_dentry_finalize() that takes care of it for all cases now. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 45 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 1ed8473ec537..2565809fbb34 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -87,6 +87,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, switch (mode & S_IFMT) { case S_IFDIR: case S_IFREG: + case S_IFLNK: break; default: return ERR_PTR(-EINVAL); @@ -119,6 +120,16 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) return 0; } +static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + d_instantiate(dentry, inode); + dget(dentry); + + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; +} + static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -133,9 +144,7 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inc_nlink(inode); inc_nlink(dir); - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -151,9 +160,7 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, inode->i_op = iops; inode->i_private = dentry->d_fsdata; - d_instantiate(dentry, inode); - dget(dentry); - + bpf_dentry_finalize(dentry, inode, dir); return 0; } @@ -181,13 +188,37 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { if (strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); + return simple_lookup(dir, dentry, flags); } +static int bpf_symlink(struct inode *dir, struct dentry *dentry, + const char *target) +{ + char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); + struct inode *inode; + + if (!link) + return -ENOMEM; + + inode =
bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); + if (IS_ERR(inode)) { + kfree(link); + return PTR_ERR(inode); + } + + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = link; + + bpf_dentry_finalize(dentry, inode, dir); + return 0; +} + static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, + .symlink = bpf_symlink, .rmdir = simple_rmdir, .rename = simple_rename, .link = simple_link, @@ -324,6 +355,8 @@ static void bpf_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); } -- cgit v1.2.3 From 405c0759712f57b680f66aee9c55cd06ad1cbdef Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 31 Oct 2016 08:11:43 -0700 Subject: fork: Add task stack refcounting sanity check and prevent premature task stack freeing If something goes wrong with task stack refcounting and a stack refcount hits zero too early, warn and leak it rather than potentially freeing it early (and silently). Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f29119c783a9680a4b4656e751b6123917ace94b.1477926663.git.luto@kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..997ac1d584f7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,9 @@ static void account_kernel_stack(struct task_struct *tsk, int account) static void release_task_stack(struct task_struct *tsk) { + if (WARN_ON(tsk->state != TASK_DEAD)) + return; /* Better to leak the stack than to free prematurely */ + account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); @@ -1862,6 +1865,7 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + p->state = TASK_DEAD; put_task_stack(p); free_task(p); fork_out: -- cgit v1.2.3 From 70fd76140a6cb63262bd47b68d57b42e889c10ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2016 07:40:10 -0600 Subject: block,fs: use REQ_* flags directly Remove the WRITE_* and READ_SYNC wrappers, and just use the flags directly. Where applicable this also drops usage of the bio_set_op_attrs wrapper. 
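For reference, the substitutions in the hunks below all follow the same mapping from the old wrappers (whose fs.h definitions this patch removes) to bare REQ_* flags. This table is an illustrative summary drawn from the replacements themselves, not a reproduction of the old wrapper definitions:

	WRITE_SYNC      -> REQ_SYNC
	WRITE_ODIRECT   -> REQ_SYNC | REQ_IDLE
	WRITE_FLUSH     -> REQ_PREFLUSH
	WRITE_FUA       -> REQ_FUA
	WRITE_FLUSH_FUA -> REQ_PREFLUSH | REQ_FUA
	READ_SYNC       -> 0 (reads are already treated as synchronous by op_is_sync())
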
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 4 ++-- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/xen-blkback/blkback.c | 10 ++++---- drivers/md/bcache/btree.c | 4 ++-- drivers/md/bcache/debug.c | 4 ++-- drivers/md/bcache/request.c | 2 +- drivers/md/bcache/super.c | 4 ++-- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-log.c | 2 +- drivers/md/dm-raid1.c | 4 ++-- drivers/md/dm-snap-persistent.c | 4 ++-- drivers/md/dm.c | 2 +- drivers/md/md.c | 4 ++-- drivers/md/raid5-cache.c | 4 ++-- drivers/md/raid5.c | 2 +- drivers/nvme/target/io-cmd.c | 4 ++-- drivers/target/target_core_iblock.c | 8 +++---- fs/btrfs/disk-io.c | 6 ++--- fs/btrfs/extent_io.c | 16 ++++++------- fs/btrfs/inode.c | 6 ++--- fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 2 +- fs/buffer.c | 8 +++---- fs/direct-io.c | 2 +- fs/ext4/mmp.c | 6 ++--- fs/ext4/page-io.c | 2 +- fs/ext4/super.c | 2 +- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 16 ++++++------- fs/f2fs/gc.c | 6 ++--- fs/f2fs/inline.c | 2 +- fs/f2fs/node.c | 4 ++-- fs/f2fs/segment.c | 8 +++---- fs/f2fs/super.c | 2 +- fs/gfs2/log.c | 4 ++-- fs/gfs2/meta_io.c | 6 ++--- fs/gfs2/ops_fstype.c | 2 +- fs/hfsplus/super.c | 4 ++-- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 9 +++---- fs/jbd2/journal.c | 15 ++++++------ fs/jbd2/revoke.c | 2 +- fs/jfs/jfs_logmgr.c | 4 ++-- fs/mpage.c | 6 ++--- fs/nilfs2/super.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/reiserfs/journal.c | 6 +++-- fs/xfs/xfs_aops.c | 11 +++++---- fs/xfs/xfs_buf.c | 2 +- include/linux/fs.h | 47 ------------------------------------- include/trace/events/f2fs.h | 10 ++++---- kernel/power/swap.c | 19 +++++++-------- 53 files changed, 133 insertions(+), 182 deletions(-) (limited to 'kernel') diff --git a/block/blk-flush.c b/block/blk-flush.c index 95f1d4d357df..d35beca18481 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - flush_rq->cmd_flags = REQ_OP_FLUSH | WRITE_FLUSH; + flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; @@ -486,7 +486,7 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, bio = bio_alloc(gfp_mask, 0); bio->bi_bdev = bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 942384f34e22..a89538cb3eaa 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1266,7 +1266,7 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont bio->bi_bdev = device->ldev->backing_bdev; bio->bi_private = octx; bio->bi_end_io = one_flush_endio; - bio_set_op_attrs(bio, REQ_OP_FLUSH, WRITE_FLUSH); + bio->bi_opf = REQ_OP_FLUSH | REQ_PREFLUSH; device->flush_jif = jiffies; set_bit(FLUSH_PENDING, &device->flags); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 4a80ee752597..726c32e35db9 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -1253,14 +1253,14 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, case BLKIF_OP_WRITE: ring->st_wr_req++; operation = REQ_OP_WRITE; - operation_flags = WRITE_ODIRECT; + operation_flags = REQ_SYNC | REQ_IDLE; break; case 
BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: ring->st_f_req++; operation = REQ_OP_WRITE; - operation_flags = WRITE_FLUSH; + operation_flags = REQ_PREFLUSH; break; default: operation = 0; /* make gcc happy */ @@ -1272,7 +1272,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, nseg = req->operation == BLKIF_OP_INDIRECT ? req->u.indirect.nr_segments : req->u.rw.nr_segments; - if (unlikely(nseg == 0 && operation_flags != WRITE_FLUSH) || + if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) || unlikely((req->operation != BLKIF_OP_INDIRECT) && (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) || unlikely((req->operation == BLKIF_OP_INDIRECT) && @@ -1334,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, } /* Wait on all outstanding I/O's and once that has been completed - * issue the WRITE_FLUSH. + * issue the flush. */ if (drain) xen_blk_drain_io(pending_req->ring); @@ -1380,7 +1380,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, /* This will be hit if the operation was a flush or discard. */ if (!bio) { - BUG_ON(operation_flags != WRITE_FLUSH); + BUG_ON(operation_flags != REQ_PREFLUSH); bio = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 81d3db40cd7b..6fdd8e252760 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -297,7 +297,7 @@ static void bch_btree_node_read(struct btree *b) bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; bio->bi_end_io = btree_node_read_endio; bio->bi_private = &cl; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); + bio->bi_opf = REQ_OP_READ | REQ_META; bch_bio_map(bio, b->keys.set[0].data); @@ -393,7 +393,7 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); - bio_set_op_attrs(b->bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA); + b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; bch_bio_map(b->bio, i); /* diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 333a1e5f6ae6..1c9130ae0073 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -52,7 +52,7 @@ void bch_btree_verify(struct btree *b) bio->bi_bdev = PTR_CACHE(b->c, &b->key, 0)->bdev; bio->bi_iter.bi_sector = PTR_OFFSET(&b->key, 0); bio->bi_iter.bi_size = KEY_SIZE(&v->key) << 9; - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); + bio->bi_opf = REQ_OP_READ | REQ_META; bch_bio_map(bio, sorted); submit_bio_wait(bio); @@ -113,7 +113,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_clone(bio, GFP_NOIO); if (!check) return; - bio_set_op_attrs(check, REQ_OP_READ, READ_SYNC); + check->bi_opf = REQ_OP_READ; if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e8a2b693c928..0d99b5f4b3e6 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -923,7 +923,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) flush->bi_bdev = bio->bi_bdev; flush->bi_end_io = request_endio; flush->bi_private = cl; - bio_set_op_attrs(flush, REQ_OP_WRITE, WRITE_FLUSH); + flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; closure_bio_submit(flush, cl); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 849ad441cd76..988edf928466 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -381,7 +381,7 @@ 
static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) return "bad uuid pointer"; bkey_copy(&c->uuid_bucket, k); - uuid_io(c, REQ_OP_READ, READ_SYNC, k, cl); + uuid_io(c, REQ_OP_READ, 0, k, cl); if (j->version < BCACHE_JSET_VERSION_UUIDv1) { struct uuid_entry_v0 *u0 = (void *) c->uuids; @@ -600,7 +600,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) ca->prio_last_buckets[bucket_nr] = bucket; bucket_nr++; - prio_io(ca, bucket, REQ_OP_READ, READ_SYNC); + prio_io(ca, bucket, REQ_OP_READ, 0); if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 125aedc3875f..b3ba142e59a4 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1316,7 +1316,7 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) { struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = WRITE_FLUSH, + .bi_op_flags = REQ_PREFLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = c->dm_io, diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 07fc1ad42ec5..33e71ea6cc14 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -308,7 +308,7 @@ static int flush_header(struct log_c *lc) }; lc->io_req.bi_op = REQ_OP_WRITE; - lc->io_req.bi_op_flags = WRITE_FLUSH; + lc->io_req.bi_op_flags = REQ_PREFLUSH; return dm_io(&lc->io_req, 1, &null_location, NULL); } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index bdf1606f67bc..1a176d7c8b90 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti) struct mirror *m; struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = WRITE_FLUSH, + .bi_op_flags = REQ_PREFLUSH, .mem.type = DM_IO_KMEM, .mem.ptr.addr = NULL, .client = ms->io_client, @@ -657,7 +657,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio) struct mirror *m; struct dm_io_request io_req = { .bi_op = REQ_OP_WRITE, - .bi_op_flags = bio->bi_opf & WRITE_FLUSH_FUA, + .bi_op_flags = bio->bi_opf & (REQ_FUA | REQ_PREFLUSH), .mem.type = DM_IO_BIO, .mem.ptr.bio = bio, .notify.fn = write_callback, diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index b8cf956b577b..b93476c3ba3f 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -741,7 +741,7 @@ static void persistent_commit_exception(struct dm_exception_store *store, /* * Commit exceptions to disk. 
*/ - if (ps->valid && area_io(ps, REQ_OP_WRITE, WRITE_FLUSH_FUA)) + if (ps->valid && area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA)) ps->valid = 0; /* @@ -818,7 +818,7 @@ static int persistent_commit_merge(struct dm_exception_store *store, for (i = 0; i < nr_merged; i++) clear_exception(ps, ps->current_committed - 1 - i); - r = area_io(ps, REQ_OP_WRITE, WRITE_FLUSH_FUA); + r = area_io(ps, REQ_OP_WRITE, REQ_PREFLUSH | REQ_FUA); if (r < 0) return r; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 147af9536d0c..b2abfa41af3e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1527,7 +1527,7 @@ static struct mapped_device *alloc_dev(int minor) bio_init(&md->flush_bio); md->flush_bio.bi_bdev = md->bdev; - bio_set_op_attrs(&md->flush_bio, REQ_OP_WRITE, WRITE_FLUSH); + md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; dm_stats_init(&md->stats); diff --git a/drivers/md/md.c b/drivers/md/md.c index eac84d8ff724..b69ec7da4bae 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -394,7 +394,7 @@ static void submit_flushes(struct work_struct *ws) bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; - bio_set_op_attrs(bi, REQ_OP_WRITE, WRITE_FLUSH); + bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; atomic_inc(&mddev->flush_pending); submit_bio(bi); rcu_read_lock(); @@ -743,7 +743,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA; atomic_inc(&mddev->pending_writes); submit_bio(bio); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 1b1ab4a1d132..28d015c6fffe 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -685,7 +685,7 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log) bio_reset(&log->flush_bio); log->flush_bio.bi_bdev = log->rdev->bdev; log->flush_bio.bi_end_io = r5l_log_flush_endio; - bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH); + log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(&log->flush_bio); } @@ -1053,7 +1053,7 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, mb->checksum = cpu_to_le32(crc); if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, - WRITE_FUA, false)) { + REQ_FUA, false)) { __free_page(page); return -EIO; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 92ac251e91e6..70acdd379e44 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -913,7 +913,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { op = REQ_OP_WRITE; if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags)) - op_flags = WRITE_FUA; + op_flags = REQ_FUA; if (test_bit(R5_Discard, &sh->dev[i].flags)) op = REQ_OP_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 4a96c2049b7b..c2784cfc5e29 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -58,7 +58,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) if (req->cmd->rw.opcode == nvme_cmd_write) { op = REQ_OP_WRITE; - op_flags = WRITE_ODIRECT; + op_flags = REQ_SYNC | REQ_IDLE; if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) op_flags |= REQ_FUA; } else { @@ -109,7 +109,7 @@ static void nvmet_execute_flush(struct nvmet_req *req) bio->bi_bdev = req->ns->bdev; bio->bi_private 
= req; bio->bi_end_io = nvmet_bio_done; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; submit_bio(bio); } diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 372d744315f3..d316ed537d59 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -388,7 +388,7 @@ iblock_execute_sync_cache(struct se_cmd *cmd) bio = bio_alloc(GFP_KERNEL, 0); bio->bi_end_io = iblock_end_io_flush; bio->bi_bdev = ib_dev->ibd_bd; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; if (!immed) bio->bi_private = cmd; submit_bio(bio); @@ -686,15 +686,15 @@ iblock_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, struct iblock_dev *ib_dev = IBLOCK_DEV(dev); struct request_queue *q = bdev_get_queue(ib_dev->ibd_bd); /* - * Force writethrough using WRITE_FUA if a volatile write cache + * Force writethrough using REQ_FUA if a volatile write cache * is not enabled, or if initiator set the Force Unit Access bit. */ op = REQ_OP_WRITE; if (test_bit(QUEUE_FLAG_FUA, &q->queue_flags)) { if (cmd->se_cmd_flags & SCF_FUA) - op_flags = WRITE_FUA; + op_flags = REQ_FUA; else if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) - op_flags = WRITE_FUA; + op_flags = REQ_FUA; } } else { op = REQ_OP_READ; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c8454a8e35f2..fe10afd51e02 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3485,9 +3485,9 @@ static int write_dev_supers(struct btrfs_device *device, * to go down lazy. */ if (i == 0) - ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh); else - ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); if (ret) errors++; } @@ -3551,7 +3551,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 66a755150056..ff87bff7bdb6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -127,7 +127,7 @@ struct extent_page_data { */ unsigned int extent_locked:1; - /* tells the submit_bio code to use a WRITE_SYNC */ + /* tells the submit_bio code to use REQ_SYNC */ unsigned int sync_io:1; }; @@ -2047,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(bio)) { @@ -2388,7 +2388,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct inode *inode = page->mapping->host; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct bio *bio; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2404,9 +2404,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, } if (failed_bio->bi_vcnt > 1) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + read_mode |= REQ_FAILFAST_DEV; phy_offset >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, @@ -3484,7 
+3482,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; + write_flags = REQ_SYNC; trace___extent_writepage(page, inode, wbc); @@ -3729,7 +3727,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, unsigned long i, num_pages; unsigned long bio_flags = 0; unsigned long start, end; - int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META; + int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -4076,7 +4074,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) int ret; bio_set_op_attrs(epd->bio, REQ_OP_WRITE, - epd->sync_io ? WRITE_SYNC : 0); + epd->sync_io ? REQ_SYNC : 0); ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9a377079af26..c8eb82a416b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7917,7 +7917,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec; struct bio *bio; int isector; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7936,9 +7936,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, if ((failed_bio->bi_vcnt > 1) || (failed_bio->bi_io_vec->bv_len > BTRFS_I(inode)->root->sectorsize)) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; isector >>= inode->i_sb->s_blocksize_bits; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fffb9ab8526e..ff3078234d94 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -4440,7 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { leave_with_eio: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index deda46cf1292..0d7d635d8bfb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6023,7 +6023,7 @@ static void btrfs_end_bio(struct bio *bio) else btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_READ_ERRS); - if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH) + if (bio->bi_opf & REQ_PREFLUSH) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_FLUSH_ERRS); btrfs_dev_stat_print_on_error(dev); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 09ed29c67848..f137ffe6654c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -62,7 +62,7 @@ struct btrfs_device { int running_pending; /* regular prio bios */ struct btrfs_pending_bios pending_bios; - /* WRITE_SYNC bios */ + /* sync bios */ struct btrfs_pending_bios pending_sync_bios; struct block_device *bdev; diff --git a/fs/buffer.c b/fs/buffer.c index a29335867e30..bc7c2bb30a9b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -753,7 +753,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * still in flight on potentially older * contents. */ - write_dirty_buffer(bh, WRITE_SYNC); + write_dirty_buffer(bh, REQ_SYNC); /* * Kick off IO for the previous mapping. Note @@ -1684,7 +1684,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * * prevents this contention from occurring. 
* * If block_write_full_page() is called with wbc->sync_mode == - * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ int __block_write_full_page(struct inode *inode, struct page *page, @@ -1697,7 +1697,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -3210,7 +3210,7 @@ EXPORT_SYMBOL(__sync_dirty_buffer); int sync_dirty_buffer(struct buffer_head *bh) { - return __sync_dirty_buffer(bh, WRITE_SYNC); + return __sync_dirty_buffer(bh, REQ_SYNC); } EXPORT_SYMBOL(sync_dirty_buffer); diff --git a/fs/direct-io.c b/fs/direct-io.c index fb9aa16a7727..a5138c564019 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1209,7 +1209,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->inode = inode; if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; - dio->op_flags = WRITE_ODIRECT; + dio->op_flags = REQ_SYNC | REQ_IDLE; } else { dio->op = REQ_OP_READ; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d89754ef1aab..eb9835638680 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -35,7 +35,7 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) } /* - * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * Write the MMP block using REQ_SYNC to try to get the block on-disk * faster. */ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) @@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { ret = -EIO; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0094923e5ebf..e0b3b54cdef3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -340,7 +340,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0; + REQ_SYNC : 0; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6db81fbcbaa6..f31eb286af90 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4553,7 +4553,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) unlock_buffer(sbh); if (sync) { error = __sync_dirty_buffer(sbh, - test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); + test_opt(sb, BARRIER) ? 
REQ_FUA : REQ_SYNC); if (error) return error; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7e9b504bd8b2..d935c06a84f0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -65,7 +65,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = READ_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, @@ -160,7 +160,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, + .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ae194fd2fdb..b80bf10603d7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -198,11 +198,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - if (test_opt(sbi, NOBARRIER)) - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META | - REQ_PRIO; + io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -483,7 +481,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, 0, false); if (IS_ERR(page)) return page; @@ -509,7 +507,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, 0, for_write); if (IS_ERR(page)) return page; @@ -1251,7 +1249,7 @@ static int f2fs_write_data_page(struct page *page, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? 
REQ_SYNC : 0, .page = page, .encrypted_page = NULL, }; @@ -1663,7 +1661,7 @@ repeat: err = PTR_ERR(bio); goto fail; } - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 93985c64d8a8..9eb11b2244ea 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -550,7 +550,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_READ, - .op_flags = READ_SYNC, + .op_flags = 0, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -625,7 +625,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) f2fs_wait_on_page_writeback(dn.node_page, NODE, true); fio.op = REQ_OP_WRITE; - fio.op_flags = WRITE_SYNC; + fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -663,7 +663,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC, + .op_flags = REQ_SYNC, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 5f1a67f756af..2e7f54c191b4 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -111,7 +111,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .sbi = F2FS_I_SB(dn->inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 01177ecdeab8..932f3f8bb57b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1134,7 +1134,7 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, 0); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1570,7 +1570,7 @@ static int f2fs_write_node_page(struct page *page, .sbi = sbi, .type = NODE, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? 
REQ_SYNC : 0, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fc886f008449..f1b4a1775ebe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -259,7 +259,7 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -420,7 +420,7 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); llist_for_each_entry_safe(cmd, next, @@ -454,7 +454,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) atomic_inc(&fcc->submit_flush); bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; ret = submit_bio_wait(bio); atomic_dec(&fcc->submit_flush); bio_put(bio); @@ -1515,7 +1515,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6132b4ce4e4c..2cac6bb86080 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1238,7 +1238,7 @@ static int __f2fs_commit_super(struct buffer_head *bh, unlock_buffer(bh); /* it's rare case, we can do fua all the time */ - return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e58ccef09c91..27c00a16def0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int op_flags = WRITE_FLUSH_FUA | REQ_META; + int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); @@ -682,7 +682,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); - op_flags = WRITE_SYNC | REQ_META | REQ_PRIO; + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; } sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 373639a59782..e562b1191c9c 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -38,7 +38,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb struct buffer_head *bh, *head; int nr_underway = 0; int write_flags = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + (wbc->sync_mode == WB_SYNC_ALL ? 
REQ_SYNC : 0); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -285,7 +285,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } } - gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num); + gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; @@ -453,7 +453,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ff72ac6439c8..a34308df927f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -246,7 +246,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_META); submit_bio(bio); wait_on_page_locked(page); bio_put(bio); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 11854dd84572..67aedf4c2e7c 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error = error2; if (!write_backup) @@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error2 = error; out: diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 684996c8a3a4..4055f51617ef 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -186,7 +186,7 @@ __flush_batch(journal_t *journal, int *batch_count) blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC); blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 31f8ca046639..8c514367ba5a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -155,9 +155,10 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh); + ret = submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh); else - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); *cbh = bh; return ret; @@ -402,7 +403,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_SYNC); + REQ_SYNC); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); @@ -717,7 +718,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); } cond_resched(); stats.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 927da4956a89..8ed971eeab44 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -913,7 +913,7 @@ int __jbd2_update_log_tail(journal_t 
*journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); if (ret) goto out; @@ -1306,7 +1306,7 @@ static int journal_reset(journal_t *journal) /* Lock here to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); /* - * Update log tail information. We use WRITE_FUA since new + * Update log tail information. We use REQ_FUA since new * transaction will start reusing journal space and so we * must make sure information about current log tail is on * disk before that. @@ -1314,7 +1314,7 @@ static int journal_reset(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_FUA); + REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1454,7 +1454,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_FUA); + jbd2_write_superblock(journal, REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1720,7 +1720,8 @@ int jbd2_journal_destroy(journal_t *journal) ++journal->j_transaction_sequence; write_unlock(&journal->j_state_lock); - jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); + jbd2_mark_journal_empty(journal, + REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1979,7 +1980,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2025,7 +2026,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... 
*/ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 91171dc352cb..cfc38b552118 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -648,7 +648,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, WRITE_SYNC); + write_dirty_buffer(descriptor, REQ_SYNC); } #endif diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a21ea8b3e5fa..bb1da1feafeb 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2002,7 +2002,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; @@ -2146,7 +2146,7 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; /* check if journaling to disk has been disabled */ if (log->no_integrity) { diff --git a/fs/mpage.c b/fs/mpage.c index d2413af0823a..f35e2819d0c6 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? REQ_SYNC : 0); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -705,7 +705,7 @@ mpage_writepages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } } @@ -726,7 +726,7 @@ int mpage_writepage(struct page *page, get_block_t get_block, int ret = __mpage_writepage(page, wbc, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? 
- WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } return ret; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c95d369e90aa..12eeae62a2b1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -189,7 +189,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag) set_buffer_dirty(nilfs->ns_sbh[0]); if (nilfs_test_opt(nilfs, BARRIER)) { err = __sync_dirty_buffer(nilfs->ns_sbh[0], - WRITE_SYNC | WRITE_FLUSH_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } else { err = sync_dirty_buffer(nilfs->ns_sbh[0]); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 636abcbd4650..52eef16edb01 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -627,7 +627,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index bc2dde2423c2..aa40c242f1db 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1111,7 +1111,8 @@ static int flush_commit_list(struct super_block *s, mark_buffer_dirty(jl->j_commit_bh) ; depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) - __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(jl->j_commit_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock_nested(s, depth); @@ -1269,7 +1270,8 @@ static int _update_journal_header_block(struct super_block *sb, depth = reiserfs_write_unlock_nested(sb); if (reiserfs_barrier_flush(sb)) - __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(journal->j_header_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(journal->j_header_bh); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3e57a56cf829..594e02c485b2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -495,8 +495,10 @@ xfs_submit_ioend( ioend->io_bio->bi_private = ioend; ioend->io_bio->bi_end_io = xfs_end_bio; - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE; + if (wbc->sync_mode == WB_SYNC_ALL) + ioend->io_bio->bi_opf |= REQ_SYNC; + /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -567,8 +569,9 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE; + if (wbc->sync_mode == WB_SYNC_ALL) + ioend->io_bio->bi_opf |= REQ_SYNC; submit_bio(ioend->io_bio); ioend->io_bio = new; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b5b9bffe3520..33c435f3316c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1304,7 +1304,7 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; if (bp->b_flags & XBF_SYNCIO) - op_flags = WRITE_SYNC; + op_flags = REQ_SYNC; if (bp->b_flags & XBF_FUA) op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) diff --git a/include/linux/fs.h b/include/linux/fs.h index 46a74209917f..7a1b78ab7c15 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -151,58 +151,11 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, */ #define CHECK_IOVEC_ONLY -1 -/* - * The below are the various read and write flags that we support. Some of - * them include behavioral modifiers that send information down to the - * block layer and IO scheduler. They should be used along with a req_op. - * Terminology: - * - * The block layer uses device plugging to defer IO a little bit, in - * the hope that we will see more IO very shortly. This increases - * coalescing of adjacent IO and thus reduces the number of IOs we - * have to send to the device. It also allows for better queuing, - * if the IO isn't mergeable. If the caller is going to be waiting - * for the IO, then he must ensure that the device is unplugged so - * that the IO is dispatched to the driver. - * - * All IO is handled async in Linux. This is fine for background - * writes, but for reads or writes that someone waits for completion - * on, we want to notify the block layer and IO scheduler so that they - * know about it. That allows them to make better scheduling - * decisions. So when the below references 'sync' and 'async', it - * is referencing this priority hint. - * - * With that in mind, the available types are: - * - * READ A normal read operation. Device will be plugged. - * READ_SYNC A synchronous read. Device is not plugged, caller can - * immediately wait on this read without caring about - * unplugging. - * WRITE A normal async write. Device will be plugged. - * WRITE_SYNC Synchronous write. Identical to WRITE, but passes down - * the hint that someone will be waiting on this IO - * shortly. The write equivalent of READ_SYNC. - * WRITE_ODIRECT Special case write for O_DIRECT only. - * WRITE_FLUSH Like WRITE_SYNC but with preceding cache flush. - * WRITE_FUA Like WRITE_SYNC but data is guaranteed to be on - * non-volatile media on completion. - * WRITE_FLUSH_FUA Combination of WRITE_FLUSH and FUA. The IO is preceded - * by a cache flush and data is guaranteed to be on - * non-volatile media on completion. - * - */ #define RW_MASK REQ_OP_WRITE #define READ REQ_OP_READ #define WRITE REQ_OP_WRITE -#define READ_SYNC 0 -#define WRITE_SYNC REQ_SYNC -#define WRITE_ODIRECT (REQ_SYNC | REQ_IDLE) -#define WRITE_FLUSH REQ_PREFLUSH -#define WRITE_FUA REQ_FUA -#define WRITE_FLUSH_FUA (REQ_PREFLUSH | REQ_FUA) - /* * Attribute flags. These should be or-ed together to figure out what * has been changed! 
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index a9d34424450d..5da2c829a718 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -55,7 +55,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) -#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | WRITE_FLUSH_FUA)) +#define F2FS_BIO_FLAG_MASK(t) (t & (REQ_RAHEAD | REQ_PREFLUSH | REQ_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) #define show_bio_type(op_flags) show_bio_op_flags(op_flags), \ @@ -65,11 +65,9 @@ TRACE_DEFINE_ENUM(CP_DISCARD); __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \ { 0, "WRITE" }, \ { REQ_RAHEAD, "READAHEAD" }, \ - { READ_SYNC, "READ_SYNC" }, \ - { WRITE_SYNC, "WRITE_SYNC" }, \ - { WRITE_FLUSH, "WRITE_FLUSH" }, \ - { WRITE_FUA, "WRITE_FUA" }, \ - { WRITE_FLUSH_FUA, "WRITE_FLUSH_FUA" }) + { REQ_SYNC, "REQ_SYNC" }, \ + { REQ_PREFLUSH, "REQ_PREFLUSH" }, \ + { REQ_FUA, "REQ_FUA" }) #define show_bio_extra(type) \ __print_symbolic(F2FS_BIO_EXTRA_MASK(type), \ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a3b1e617bcdc..32e0c232efba 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -307,7 +307,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { @@ -317,7 +317,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -397,7 +397,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) } else { src = buf; } - return hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, offset, src, hb); + return hib_submit_io(REQ_OP_WRITE, REQ_SYNC, offset, src, hb); } static void release_swap_writer(struct swap_map_handle *handle) @@ -1000,8 +1000,7 @@ static int get_swap_reader(struct swap_map_handle *handle, return -ENOMEM; } - error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, - tmp->map, NULL); + error = hib_submit_io(REQ_OP_READ, 0, offset, tmp->map, NULL); if (error) { release_swap_reader(handle); return error; @@ -1025,7 +1024,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = hib_submit_io(REQ_OP_READ, READ_SYNC, offset, buf, hb); + error = hib_submit_io(REQ_OP_READ, 0, offset, buf, hb); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { @@ -1534,7 +1533,7 @@ int swsusp_check(void) if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); - error = hib_submit_io(REQ_OP_READ, READ_SYNC, + error = hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (error) @@ -1543,7 +1542,7 @@ int swsusp_check(void) if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { @@ -1588,11 +1587,11 @@ int swsusp_unmark(void) 
{ int error; - hib_submit_io(REQ_OP_READ, READ_SYNC, swsusp_resume_block, + hib_submit_io(REQ_OP_READ, 0, swsusp_resume_block, swsusp_header, NULL); if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) { memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10); - error = hib_submit_io(REQ_OP_WRITE, WRITE_SYNC, + error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { -- cgit v1.2.3 From 285fdfc5d9959a2104021b6bbdec39b8c26e99ef Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Tue, 20 Sep 2016 19:39:47 +0200 Subject: seccomp: Fix documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix struct seccomp_filter and seccomp_run_filters() signatures. Signed-off-by: Mickaël Salaün Cc: Andy Lutomirski Cc: James Morris Cc: Kees Cook Cc: Will Drewry Signed-off-by: Kees Cook --- kernel/seccomp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0db7c8a2afe2..494cba230ca0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -41,8 +41,7 @@ * outside of a lifetime-guarded section. In general, this * is only needed for handling filters shared across tasks. * @prev: points to a previously installed, or inherited, filter - * @len: the number of instructions in the program - * @insnsi: the BPF program instructions to evaluate + * @prog: the BPF program to evaluate * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -168,8 +167,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) } /** - * seccomp_run_filters - evaluates all seccomp filters against @syscall - * @syscall: number of the current system call + * seccomp_run_filters - evaluates all seccomp filters against @sd + * @sd: optional seccomp data to be passed to filters * * Returns valid seccomp BPF response codes. */ -- cgit v1.2.3 From ceb75787bc75d0a7b88519ab8a68067ac690f55a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 1 Nov 2016 11:49:56 +0100 Subject: PM / sleep: fix device reference leak in test_suspend Make sure to drop the reference taken by class_find_device() after opening the RTC device. Fixes: 77437fd4e61f (pm: boot time suspend selftest) Signed-off-by: Johan Hovold Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend_test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 084452e34a12..bdff5ed57f10 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -203,8 +203,10 @@ static int __init test_suspend(void) /* RTCs have initialized by now too ... can we use one? */ dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); - if (dev) + if (dev) { rtc = rtc_class_open(dev_name(dev)); + put_device(dev); + } if (!rtc) { printk(warn_no_rtc); return 0; -- cgit v1.2.3 From 382005027fedc50b28d40ae64ef1461cca38953e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 2 Nov 2016 19:50:29 +0900 Subject: sched/core: Fix oops in sched_show_task() When CONFIG_THREAD_INFO_IN_TASK=y, it is possible that an exited thread remains in the task list after its stack pointer was already set to NULL. Therefore, thread_saved_pc() and stack_not_used() in sched_show_task() will trigger NULL pointer dereference if an attempt to dump such thread's traces (e.g. SysRq-t, khungtaskd) is made. 
Since show_stack() in sched_show_task() calls try_get_task_stack() and sched_show_task() is called from interrupt context, calling try_get_task_stack() from sched_show_task() will be safe as well. Signed-off-by: Tetsuo Handa Acked-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@alien8.de Cc: brgerst@gmail.com Cc: jann@thejh.net Cc: keescook@chromium.org Cc: linux-api@vger.kernel.org Cc: tycho.andersen@canonical.com Link: http://lkml.kernel.org/r/201611021950.FEJ34368.HFFJOOMLtQOVSF@I-love.SAKURA.ne.jp Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 42d4027f9e26..9abf66b6c271 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5192,6 +5192,8 @@ void sched_show_task(struct task_struct *p) int ppid; unsigned long state = p->state; + if (!try_get_task_stack(p)) + return; if (state) state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, @@ -5221,6 +5223,7 @@ void sched_show_task(struct task_struct *p) print_worker_info(KERN_INFO, p); show_stack(p, NULL); + put_task_stack(p); } void show_state_filter(unsigned long state_filter) -- cgit v1.2.3 From 8243d5597793b5e85143c9a935e1b971c59740a9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 1 Nov 2016 17:47:18 -0600 Subject: sched/core: Remove pointless printout in sched_show_task() In sched_show_task() we print out a useless hex number, not even a symbol, and there's a big question mark whether this even makes sense anyway; I suspect we should just remove it all. Signed-off-by: Linus Torvalds Acked-by: Andy Lutomirski Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: bp@alien8.de Cc: brgerst@gmail.com Cc: jann@thejh.net Cc: keescook@chromium.org Cc: linux-api@vger.kernel.org Cc: tycho.andersen@canonical.com Link: http://lkml.kernel.org/r/CA+55aFzphURPFzAvU4z6Moy7ZmimcwPuUdYU8bj9z0J+S8X1rw@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9abf66b6c271..154fd689fe02 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5198,17 +5198,8 @@ void sched_show_task(struct task_struct *p) state = __ffs(state) + 1; printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); -#if BITS_PER_LONG == 32 - if (state == TASK_RUNNING) - printk(KERN_CONT " running "); - else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); -#else if (state == TASK_RUNNING) printk(KERN_CONT " running task "); - else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); -#endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif -- cgit v1.2.3 From 243d52126184b072a18fe2130ce0008f8aa3a340 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 3 Nov 2016 09:42:36 -0700 Subject: taskstats: fix the length of cgroupstats_cmd_get_policy cgroupstats_cmd_get_policy is [CGROUPSTATS_CMD_ATTR_MAX+1] and taskstats_cmd_get_policy is [TASKSTATS_CMD_ATTR_MAX+1], but their family.maxattr is TASKSTATS_CMD_ATTR_MAX. CGROUPSTATS_CMD_ATTR_MAX is less than TASKSTATS_CMD_ATTR_MAX, so we could end up with an out-of-bounds access. Change cgroupstats_cmd_get_policy to TASKSTATS_CMD_ATTR_MAX+1; this is safe because the rest are initialized to 0's. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- kernel/taskstats.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee20d18..cbb387a265db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { +/* + * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. + * Make sure they are always aligned. + */ +static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From 483bed2b0ddd12ec33fc9407e0c6e1088e77a97c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Nov 2016 00:01:19 +0100 Subject: bpf: fix htab map destruction when extra reserve is in use Commit a6ed3ea65d98 ("bpf: restore behavior of bpf_map_update_elem") added an extra per-cpu reserve to the hash table map to restore the old behaviour from pre-prealloc times. When non-prealloc is in use for a map, the problem is that once a hash table extra element has been linked into the hash table and the hash table is destroyed due to its refcount dropping to zero, htab_map_free() -> delete_all_elements() will walk the whole hash table and drop all elements via htab_elem_free(). The element from the extra reserve is then first fed to the wrong backend allocator and eventually freed twice. Fixes: a6ed3ea65d98 ("bpf: restore behavior of bpf_map_update_elem") Reported-by: Dmitry Vyukov Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 570eeca7bdfa..ad1bc67aff1b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab_elem_free(htab, l); + if (l->state != HTAB_EXTRA_ELEM_USED) + htab_elem_free(htab, l); } } } -- cgit v1.2.3 From 20b2b24f91f70e7d3f0918c077546cb21bd73a87 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Nov 2016 00:56:31 +0100 Subject: bpf: fix map not being uncharged during map creation failure In map_create(), we first find and create the map, then once that has succeeded, we charge it to the user's RLIMIT_MEMLOCK, and then fetch a new anon fd through anon_inode_getfd(). The problem is that once the latter fails, e.g. due to the RLIMIT_NOFILE limit, we only destroy the map via map->ops->map_free(), but without uncharging the previously locked memory first. That means that the user_struct allocation is leaked and the accounted RLIMIT_MEMLOCK memory is never released. Make the label names in the fix consistent with bpf_prog_load(). Fixes: aaac3ba95e4c ("bpf: charge user for creation of BPF maps and programs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962447a5..237f3d6a7ddc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -194,7 +194,7 @@ static int map_create(union bpf_attr *attr) err = bpf_map_charge_memlock(map); if (err) - goto free_map; + goto free_map_nouncharge; err = bpf_map_new_fd(map); if (err < 0) @@ -204,6 +204,8 @@ static int map_create(union bpf_attr *attr) return err; free_map: + bpf_map_uncharge_memlock(map); +free_map_nouncharge: map->ops->map_free(map); return err; } -- cgit v1.2.3 From 7ee7e87dfb158e79019ea1d5ea1b0e6f2bc93ee4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Nov 2016 19:57:00 +0100 Subject: genirq: Use irq type from irqdata instead of irqdesc The type flags in the irq descriptor are there for historical reasons and are only updated via irq_modify_status() or irq_set_type(). Both functions also update the type flags in irqdata. __setup_irq() is the only leftover user of the type flags in the irq descriptor. If __setup_irq() is called with empty irq type flags, then the type flags are retrieved from irqdata. If an interrupt is shared, then the type flags are compared with the type flags stored in the irq descriptor. On x86 the ioapic does not have an irq_set_type() callback because the type is defined in the BIOS tables and cannot be changed. The type is stored in irqdata at setup time without updating the type data in the irq descriptor. As a result the comparison described above fails. There is no point in updating the irq descriptor flags because the only relevant storage is irqdata. Use the type flags from irqdata for both retrieval and comparison in __setup_irq() instead. Aside from that, the printout in case of non-matching type flags has the old and new type flags arguments flipped. Fix that as well. For correctness' sake the flags stored in the irq descriptor should be removed, but this is beyond the scope of this bugfix and will be done in a later patch. Fixes: 4b357daed698 ("genirq: Look-up trigger type if not specified by caller") Reported-and-tested-by: Mika Westerberg Signed-off-by: Thomas Gleixner Cc: Marc Zyngier Cc: Jon Hunter Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1611072020360.3501@nanos Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9c4d30483264..6b669593e7eb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1341,12 +1341,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } else if (new->flags & IRQF_TRIGGER_MASK) { unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; - unsigned int omsk = irq_settings_get_trigger_mask(desc); + unsigned int omsk = irqd_get_trigger_type(&desc->irq_data); if (nmsk != omsk) /* hope the handler works with current trigger mode */ pr_warn("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + irq, omsk, nmsk); } *old_ptr = new; -- cgit v1.2.3 From 212bd846223c718b6577d4df16fd8d05a55ad914 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2016 17:15:02 -0800 Subject: genirq/affinity: Handle pre/post vectors in irq_calc_affinity_vectors() Only calculate the affinity for the main I/O vectors, and skip the pre or post vectors specified by struct irq_affinity.
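[Editor's note: a worked example of the new accounting, inserted between the paragraphs of the commit message above. The snippet is an illustrative sketch, not part of the patch; example_calc() and the vector counts are made-up assumptions, while irq_calc_affinity_vectors() and struct irq_affinity are the interfaces this series introduces.]

#include <linux/interrupt.h>

static int example_calc(void)
{
	/*
	 * Reserve the first and last vector (e.g. for an admin queue);
	 * only the remaining vectors are spread across CPUs.
	 */
	static const struct irq_affinity affd = {
		.pre_vectors	= 1,
		.post_vectors	= 1,
	};

	/* On an 8-CPU machine this returns min(8, 16 - 2) + 2 == 10 */
	return irq_calc_affinity_vectors(16, &affd);
}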
Also remove the irq_affinity cpumask argument that has never been used. If we ever need it in the future we can pass it through struct irq_affinity. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Jens Axboe Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Link: http://lkml.kernel.org/r/1478654107-7384-3-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- drivers/pci/msi.c | 8 ++++---- include/linux/interrupt.h | 4 ++-- kernel/irq/affinity.c | 24 ++++++++++-------------- 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index ad70507cfb56..dad2da7cf80e 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1061,6 +1061,7 @@ EXPORT_SYMBOL(pci_msi_enabled); static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, unsigned int flags) { + static const struct irq_affinity default_affd; bool affinity = flags & PCI_IRQ_AFFINITY; int nvec; int rc; @@ -1091,8 +1092,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, for (;;) { if (affinity) { - nvec = irq_calc_affinity_vectors(dev->irq_affinity, - nvec); + nvec = irq_calc_affinity_vectors(nvec, &default_affd); if (nvec < minvec) return -ENOSPC; } @@ -1132,6 +1132,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, int maxvec, unsigned int flags) { + static const struct irq_affinity default_affd; bool affinity = flags & PCI_IRQ_AFFINITY; int rc, nvec = maxvec; @@ -1140,8 +1141,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, for (;;) { if (affinity) { - nvec = irq_calc_affinity_vectors(dev->irq_affinity, - nvec); + nvec = irq_calc_affinity_vectors(nvec, &default_affd); if (nvec < minvec) return -ENOSPC; } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 6b5268688a81..9081f23bc0ff 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -291,7 +291,7 @@ extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec); -int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec); +int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd); #else /* CONFIG_SMP */ @@ -331,7 +331,7 @@ irq_create_affinity_masks(const struct cpumask *affinity, int nvec) } static inline int -irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) { return maxvec; } diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 17f51d63da56..8d9259727cb4 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -131,24 +131,20 @@ out: } /** - * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask - * @affinity: The affinity mask to spread. 
If NULL cpu_online_mask - * is used - * @maxvec: The maximum number of vectors available + * irq_calc_affinity_vectors - Calculate the optimal number of vectors + * @maxvec: The maximum number of vectors available + * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec) +int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd) { - int cpus, ret; + int resv = affd->pre_vectors + affd->post_vectors; + int vecs = maxvec - resv; + int cpus; /* Stabilize the cpumasks */ get_online_cpus(); - /* If the supplied affinity mask is NULL, use cpu online mask */ - if (!affinity) - affinity = cpu_online_mask; - - cpus = cpumask_weight(affinity); - ret = (cpus < maxvec) ? cpus : maxvec; - + cpus = cpumask_weight(cpu_online_mask); put_online_cpus(); - return ret; + + return min(cpus, vecs) + resv; } -- cgit v1.2.3 From 67c93c218dc5d1b45d547771f1fdb44a381e1faf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2016 17:15:03 -0800 Subject: genirq/affinity: Handle pre/post vectors in irq_create_affinity_masks() Only calculate the affinity for the main I/O vectors, and skip the pre or post vectors specified by struct irq_affinity. Also remove the irq_affinity cpumask argument that has never been used. If we ever need it in the future we can pass it through struct irq_affinity. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Bjorn Helgaas Acked-by: Jens Axboe Cc: linux-block@vger.kernel.org Cc: linux-pci@vger.kernel.org Link: http://lkml.kernel.org/r/1478654107-7384-4-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- drivers/pci/msi.c | 6 ++++-- include/linux/interrupt.h | 4 ++-- kernel/irq/affinity.c | 46 +++++++++++++++++++++++++--------------------- 3 files changed, 31 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index dad2da7cf80e..f4a108b59336 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -553,12 +553,13 @@ error_attrs: static struct msi_desc * msi_setup_entry(struct pci_dev *dev, int nvec, bool affinity) { + static const struct irq_affinity default_affd; struct cpumask *masks = NULL; struct msi_desc *entry; u16 control; if (affinity) { - masks = irq_create_affinity_masks(dev->irq_affinity, nvec); + masks = irq_create_affinity_masks(nvec, &default_affd); if (!masks) pr_err("Unable to allocate affinity masks, ignoring\n"); } @@ -692,12 +693,13 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, struct msix_entry *entries, int nvec, bool affinity) { + static const struct irq_affinity default_affd; struct cpumask *curmsk, *masks = NULL; struct msi_desc *entry; int ret, i; if (affinity) { - masks = irq_create_affinity_masks(dev->irq_affinity, nvec); + masks = irq_create_affinity_masks(nvec, &default_affd); if (!masks) pr_err("Unable to allocate affinity masks, ignoring\n"); } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 9081f23bc0ff..53144e78a369 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -290,7 +290,7 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); -struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, int nvec); +struct cpumask *irq_create_affinity_masks(int nvec, const struct irq_affinity *affd); int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd); #else 
/* CONFIG_SMP */ @@ -325,7 +325,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) } static inline struct cpumask * -irq_create_affinity_masks(const struct cpumask *affinity, int nvec) +irq_create_affinity_masks(int nvec, const struct irq_affinity *affd) { return NULL; } diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 8d9259727cb4..17360bd9619b 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -51,16 +51,16 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) /** * irq_create_affinity_masks - Create affinity masks for multiqueue spreading - * @affinity: The affinity mask to spread. If NULL cpu_online_mask - * is used - * @nvecs: The number of vectors + * @nvecs: The total number of vectors + * @affd: Description of the affinity requirements * * Returns the masks pointer or NULL if allocation failed. */ -struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, - int nvec) +struct cpumask * +irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { - int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0; + int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; + int affv = nvecs - affd->pre_vectors - affd->post_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; cpumask_var_t nmsk; @@ -68,46 +68,46 @@ struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return NULL; - masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL); + masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) goto out; + /* Fill out vectors at the beginning that don't need affinity */ + for (curvec = 0; curvec < affd->pre_vectors; curvec++) + cpumask_copy(masks + curvec, cpu_possible_mask); + /* Stabilize the cpumasks */ get_online_cpus(); - /* If the supplied affinity mask is NULL, use cpu online mask */ - if (!affinity) - affinity = cpu_online_mask; - - nodes = get_nodes_in_cpumask(affinity, &nodemsk); + nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); /* * If the number of nodes in the mask is less than or equal the * number of vectors we just spread the vectors across the nodes. 
*/ - if (nvec <= nodes) { + if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, cpumask_of_node(n)); - if (++curvec == nvec) + if (++curvec == affv) break; } - goto outonl; + goto done; } /* Spread the vectors per node */ - vecs_per_node = nvec / nodes; + vecs_per_node = affv / nodes; /* Account for rounding errors */ - extra_vecs = nvec - (nodes * vecs_per_node); + extra_vecs = affv - (nodes * vecs_per_node); for_each_node_mask(n, nodemsk) { int ncpus, v, vecs_to_assign = vecs_per_node; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, affinity, cpumask_of_node(n)); + cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); - for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) { + for (v = 0; curvec < affv && v < vecs_to_assign; curvec++, v++) { cpus_per_vec = ncpus / vecs_to_assign; /* Account for extra vectors to compensate rounding errors */ @@ -119,12 +119,16 @@ struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity, irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= nvec) + if (curvec >= affv) break; } -outonl: +done: put_online_cpus(); + + /* Fill out vectors at the end that don't need affinity */ + for (; curvec < nvecs; curvec++) + cpumask_copy(masks + curvec, cpu_possible_mask); out: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3 From 90b14889d2f9b29d7e5b4b2d36251c13ce3dd13f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 3 Nov 2016 15:49:58 +0100 Subject: kernel/printk: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161103145021.28528-3-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + kernel/printk/printk.c | 29 +++++++++++++---------------- 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 69b74fa0da60..4174083280d7 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -31,6 +31,7 @@ enum cpuhp_state { CPUHP_S390_PFAULT_DEAD, CPUHP_BLK_MQ_DEAD, CPUHP_FS_BUFF_DEAD, + CPUHP_PRINTK_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index de08fc90baaf..4487ffcd42d5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2185,27 +2185,18 @@ void resume_console(void) /** * console_cpu_notify - print deferred console messages after CPU hotplug - * @self: notifier struct - * @action: CPU hotplug event - * @hcpu: unused + * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be spooled but will not show up on the console. This function is * called when a new CPU comes online (or fails to come up), and ensures * that any such output gets printed. 
*/ -static int console_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - case CPU_DOWN_FAILED: - case CPU_UP_CANCELED: - console_lock(); - console_unlock(); - } - return NOTIFY_OK; +static int console_cpu_notify(unsigned int cpu) +{ + console_lock(); + console_unlock(); + return 0; } /** @@ -2843,6 +2834,7 @@ EXPORT_SYMBOL(unregister_console); static int __init printk_late_init(void) { struct console *con; + int ret; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { @@ -2857,7 +2849,12 @@ static int __init printk_late_init(void) unregister_console(con); } } - hotcpu_notifier(console_cpu_notify, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, + console_cpu_notify); + WARN_ON(ret < 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", + console_cpu_notify, NULL); + WARN_ON(ret < 0); return 0; } late_initcall(printk_late_init); -- cgit v1.2.3 From de464375daf0d10f04fa5add2e889f42328d2ade Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 8 Nov 2016 16:40:28 +0100 Subject: bpf: Remove unused but set variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the unused but set variables min_set and max_set in adjust_reg_min_max_vals to fix the following warning when building with 'W=1': kernel/bpf/verifier.c:1483:7: warning: variable ‘min_set’ set but not used [-Wunused-but-set-variable] There is no warning about max_set being unused, but since it is only used in the assignment of min_set it can be removed as well. They were introduced in commit 484611357c19 ("bpf: allow access into map value arrays") but seem to have never been used. Cc: Josef Bacik Signed-off-by: Tobias Klauser Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 900257578934..89f787ca47ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1499,7 +1499,6 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, { struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; - bool min_set = false, max_set = false; u8 opcode = BPF_OP(insn->code); dst_reg = ®s[insn->dst_reg]; @@ -1522,7 +1521,6 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, } else if (insn->imm < BPF_REGISTER_MAX_RANGE && (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { min_val = max_val = insn->imm; - min_set = max_set = true; } /* We don't know anything about what was done to this register, mark it -- cgit v1.2.3 From 83f06168ef15da5dc735c7ea14fae67609ed9538 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Tue, 8 Nov 2016 00:02:07 -0800 Subject: locking/lockdep: Remove unused parameter from the add_lock_to_list() function The 'class' parameter is not used, remove it. 
Signed-off-by: Tahsin Erdogan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1478592127-4376-1-git-send-email-tahsin@google.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..e74b438c850b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -840,9 +840,9 @@ static struct lock_list *alloc_list_entry(void) /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, - int distance, struct stack_trace *trace) +static int add_lock_to_list(struct lock_class *this, struct list_head *head, + unsigned long ip, int distance, + struct stack_trace *trace) { struct lock_list *entry; /* @@ -1869,14 +1869,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + ret = add_lock_to_list(hlock_class(next), &hlock_class(prev)->locks_after, next->acquire_ip, distance, &trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + ret = add_lock_to_list(hlock_class(prev), &hlock_class(next)->locks_before, next->acquire_ip, distance, &trace); if (!ret) -- cgit v1.2.3 From 9846d50df3def5bff9d8a408a958722e79bcaa10 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Tue, 8 Nov 2016 11:15:23 +0100 Subject: sched/deadline: Fix typo in a comment In the comment: /* * The task might have changed its scheduling policy to something * different than SCHED_DEADLINE (through switched_fromd_dl()). */ s/fromd/from/ Signed-off-by: Daniel Bristot de Oliveira Cc: Juri Lelli Cc: Linus Torvalds Cc: Luca Abeni Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5408b3b3f9ee197a7b7f10fb834341100a4f2c88.1478599881.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 37e2449186c4..c61b461248a3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -586,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) /* * The task might have changed its scheduling policy to something - * different than SCHED_DEADLINE (through switched_fromd_dl()). + * different than SCHED_DEADLINE (through switched_from_dl()). */ if (!dl_task(p)) { __dl_clear_params(p); -- cgit v1.2.3 From c6c7d83b9c9e6a8b3e6d84c820ac61fbffc9e396 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 10 Nov 2016 10:46:26 -0800 Subject: Revert "console: don't prefer first registered if DT specifies stdout-path" This reverts commit 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path"). The reverted commit changes existing behavior on which many ARM boards rely. Many ARM small-board computers, such as the Raspberry Pi, have both a video output and a serial console. Depending on whether the user is using the device as a regular computer or as a headless device, we need to have the console on either one or the other.
Many users rely on the kernel behavior of the console being present on both outputs. Before the reverted commit, the console setup with no console= kernel arguments on an ARM board which sets stdout-path in DT would look like this: [root@localhost ~]# cat /proc/consoles ttyS0 -W- (EC p a) 4:64 tty0 -WU (E p ) 4:1 Whereas after the reverted commit, it looks like this: [root@localhost ~]# cat /proc/consoles ttyS0 -W- (EC p a) 4:64 This commit reverts commit 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path"), restoring the original behavior. Fixes: 05fd007e4629 ("console: don't prefer first registered if DT specifies stdout-path") Link: http://lkml.kernel.org/r/20161104121135.4780-2-hdegoede@redhat.com Signed-off-by: Hans de Goede Cc: Paul Burton Cc: Rob Herring Cc: Frank Rowand Cc: Thorsten Leemhuis Cc: Greg Kroah-Hartman Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/base.c | 2 -- include/linux/console.h | 6 ------ kernel/printk/printk.c | 13 +------------ 3 files changed, 1 insertion(+), 20 deletions(-) (limited to 'kernel') diff --git a/drivers/of/base.c b/drivers/of/base.c index d687e6de24a0..a0bccb54a9bd 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -2077,8 +2077,6 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)) name = of_get_property(of_aliases, "stdout", NULL); if (name) of_stdout = of_find_node_opts_by_path(name, &of_stdout_options); - if (of_stdout) - console_set_by_of(); } if (!of_aliases) diff --git a/include/linux/console.h b/include/linux/console.h index 3672809234a7..d530c4627e54 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -173,12 +173,6 @@ static inline void console_sysfs_notify(void) #endif extern bool console_suspend_enabled; -#ifdef CONFIG_OF -extern void console_set_by_of(void); -#else -static inline void console_set_by_of(void) {} -#endif - /* Suspend and resume console messages over PM events */ extern void suspend_console(void); extern void resume_console(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index de08fc90baaf..5028f4fd504a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,17 +253,6 @@ static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); -#ifdef CONFIG_OF -static bool of_specified_console; - -void console_set_by_of(void) -{ - of_specified_console = true; -} -#else -# define of_specified_console false -#endif - /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -2657,7 +2646,7 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0 && !of_specified_console) { + if (preferred_console < 0) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || -- cgit v1.2.3 From c540594f864bb4645573c2c0a304919fabb3d7ea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Nov 2016 22:02:34 +0100 Subject: bpf, mlx4: fix prog refcount in mlx4_en_try_alloc_resources error path Commit 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings scheme") added a bug in that the prog's reference count is not dropped in the error path when mlx4_en_try_alloc_resources() is failing from mlx4_xdp_set(). We previously took bpf_prog_add(prog, priv->rx_ring_num - 1), which we need to release again.
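For illustration, the same take-extra-references/undo-on-error pattern in a minimal userspace sketch (illustrative names and C11 atomics; this is not the mlx4/bpf code itself):

----
#include <assert.h>
#include <stdatomic.h>

struct prog { atomic_int refcnt; };

/* Take n extra references up front, one per ring that will hold the prog. */
static void prog_add(struct prog *p, int n)
{
	atomic_fetch_add(&p->refcnt, n);
}

/* Undo a previous prog_add() on an error path. Another holder must remain. */
static void prog_sub(struct prog *p, int n)
{
	int old = atomic_fetch_sub(&p->refcnt, n);
	assert(old - n > 0);
}

static int attach(struct prog *p, int nr_rings, int alloc_ok)
{
	prog_add(p, nr_rings - 1);          /* caller already holds one ref */
	if (!alloc_ok) {
		prog_sub(p, nr_rings - 1);  /* drop only the refs we took */
		return -1;                  /* caller's own ref is put later */
	}
	return 0;
}
----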
Earlier in the call path, dev_change_xdp_fd() itself holds a reference to the prog as well (hence the '- 1' in the bpf_prog_add()), so a simple atomic_sub() is safe to use here. When an error is propagated, bpf_prog_put() is eventually called from dev_change_xdp_fd(). Fixes: 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings scheme") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 5 ++++- include/linux/bpf.h | 5 +++++ kernel/bpf/syscall.c | 11 +++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 0f6225c042be..9bf7320107b0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2747,8 +2747,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) } err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof); - if (err) + if (err) { + if (prog) + bpf_prog_sub(prog, priv->rx_ring_num - 1); goto unlock_out; + } if (priv->port_up) { port_up = 1; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index edcd96ded8aa..01c1487277b2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -234,6 +234,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i); +void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); @@ -303,6 +304,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) return ERR_PTR(-EOPNOTSUPP); } +static inline void bpf_prog_sub(struct bpf_prog *prog, int i) +{ +} + static inline void bpf_prog_put(struct bpf_prog *prog) { } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962447a5..23eb2050f15e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -680,6 +680,17 @@ struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) } EXPORT_SYMBOL_GPL(bpf_prog_add); +void bpf_prog_sub(struct bpf_prog *prog, int i) +{ + /* Only to be used for undoing previous bpf_prog_add() in some + * error path. We still know that another entity in our call + * path holds a reference to the program, thus atomic_sub() can + * be safely used in such cases! + */ + WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); +} +EXPORT_SYMBOL_GPL(bpf_prog_sub); + struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) { return bpf_prog_add(prog, 1); -- cgit v1.2.3 From f5c9f9c72395c3291c2e35c905dedae2b98475a4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 14 Nov 2016 09:31:52 -0800 Subject: Revert "printk: make reading the kernel log flush pending lines" This reverts commit bfd8d3f23b51018388be0411ccbc2d56277fe294. It turns out that this flushes things much too aggressively, and causes lines to break up when the system logger races with new continuation lines being printed. There's a pending patch to make printk() flushing much more straightforward, but it's too invasive for 4.9, so in the meantime let's just not make the system message logging flush continuation lines. They'll be flushed by the final newline anyway.
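The race is easiest to picture with an ordinary line-buffered stream; a rough userspace analogy (stdio stands in for printk's cont buffer here, none of this is printk code):

----
#include <stdio.h>

int main(void)
{
	/* line-buffered, like the cont buffer: flushed by a newline */
	setvbuf(stdout, NULL, _IOLBF, 0);
	printf("part one,");	/* continuation data stays buffered... */
	/* a reader forcing a flush at this point would split the line */
	printf(" part two\n");	/* ...the final newline emits it whole */
	return 0;
}
----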
Suggested-by: Petr Mladek Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5028f4fd504a..f7a55e9ff2f7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -783,8 +783,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return ret; } -static void cont_flush(void); - static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -800,7 +798,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -863,7 +860,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) return -ESPIPE; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); switch (whence) { case SEEK_SET: /* the first record */ @@ -902,7 +898,6 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) @@ -1289,7 +1284,6 @@ static int syslog_print(char __user *buf, int size) size_t skip; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -1349,7 +1343,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) return -ENOMEM; raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (buf) { u64 next_seq; u64 seq; @@ -1511,7 +1504,6 @@ int do_syslog(int type, char __user *buf, int len, int source) /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: raw_spin_lock_irq(&logbuf_lock); - cont_flush(); if (syslog_seq < log_first_seq) { /* messages are gone, move to first one */ syslog_seq = log_first_seq; @@ -3028,7 +3020,6 @@ void kmsg_dump(enum kmsg_dump_reason reason) dumper->active = true; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); dumper->cur_seq = clear_seq; dumper->cur_idx = clear_idx; dumper->next_seq = log_next_seq; @@ -3119,7 +3110,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, bool ret; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); raw_spin_unlock_irqrestore(&logbuf_lock, flags); @@ -3162,7 +3152,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); - cont_flush(); if (dumper->cur_seq < log_first_seq) { /* messages are gone, move to first available one */ dumper->cur_seq = log_first_seq; -- cgit v1.2.3 From b8f2ed538477d9ab803c6458f497df1b1c6cf4ce Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Aug 2016 06:51:47 -0700 Subject: rcu: Tighten up __call_rcu() rcu_head alignment check Commit 720abae3d68ae ("rcu: force alignment on struct callback_head/rcu_head") forced the rcu_head (AKA callback_head) structure's alignment to pointer size, that is, to 4-byte boundaries on 32-bit systems and to 8-byte boundaries on 64-bit systems. This commit therefore checks for this same alignment in __call_rcu(), which used to contain a looser check for two-byte alignment. Signed-off-by: Paul E. McKenney Tested-by: Geert Uytterhoeven Acked-by: Kirill A. 
Shutemov Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 69a5611a7e7c..37e4f7d2be0c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3121,7 +3121,9 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, unsigned long flags; struct rcu_data *rdp; - WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ + /* Misaligned rcu_head! */ + WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + if (debug_rcu_head_queue(head)) { /* Probable double call_rcu(), so leak the callback. */ WRITE_ONCE(head->func, rcu_leak_callback); -- cgit v1.2.3 From 5403d367a746cfc51676e3b39be600d94f587867 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Sep 2016 05:04:24 -0700 Subject: rcu: Remove obsolete rcu_check_callbacks() header comment In the deep past, rcu_check_callbacks() was only invoked if rcu_pending() returned true. Which was fine, but these days rcu_check_callbacks() is invoked unconditionally. This commit therefore removes the obsolete sentence from the header comment. Reported-by: Michalis Kokologiannakis Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 37e4f7d2be0c..d52780024f9f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2828,8 +2828,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * Also schedule RCU core processing. * * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. If rcu_pending returns - * false, there is no point in invoking rcu_check_callbacks(). + * invoked from the scheduling-clock interrupt. */ void rcu_check_callbacks(int user) { -- cgit v1.2.3 From 1f21b50b7785b26aff2a0770d8a674921514ff0c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Sep 2016 05:15:53 -0700 Subject: rcu: Remove obsolete comment from __call_rcu() The __call_rcu() comment about opportunistically noting grace period beginnings and endings is obsolete. RCU still does such opportunistic noting, but in __call_rcu_core() rather than __call_rcu(), and there already is an appropriate comment in __call_rcu_core(). This commit therefore removes the obsolete comment. Reported-by: Michalis Kokologiannakis Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d52780024f9f..865af187498e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3131,13 +3131,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, } head->func = func; head->next = NULL; - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); -- cgit v1.2.3 From d0af39e89ec59fe7c92c4bcbc2d652ea4c0ee644 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Oct 2016 18:26:04 -0700 Subject: torture: Trace long read-side delays Although rcutorture will occasionally do a 50-millisecond grace-period delay, these delays are quite rare. And rightly so, because otherwise the read rate would be quite low. 
This means that it can be important to identify whether or not a given run contained a long-delay read. This commit therefore inserts a trace_rcu_torture_read() event to flag runs containing long delays. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 5 ++++- kernel/rcu/rcutorture.c | 11 ++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d3e756539d44..9d4f9b3a2b7b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -698,7 +698,10 @@ TRACE_EVENT(rcu_batch_end, /* * Tracepoint for rcutorture readers. The first argument is the name * of the RCU flavor from rcutorture's viewpoint and the second argument - * is the callback address. + * is the callback address. The third argument is the start time in + * seconds, and the last two arguments are the grace period numbers + * at the beginning and end of the read, respectively. Note that the + * callback address can be NULL. */ TRACE_EVENT(rcu_torture_read, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bf08fee53dc7..87c51225ceec 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -289,15 +289,24 @@ static int rcu_torture_read_lock(void) __acquires(RCU) static void rcu_read_delay(struct torture_random_state *rrsp) { + unsigned long started; + unsigned long completed; const unsigned long shortdelay_us = 200; const unsigned long longdelay_ms = 50; + unsigned long long ts; /* We want a short delay sometimes to make a reader delay the grace * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { + started = cur_ops->completed(); + ts = rcu_trace_clock_local(); mdelay(longdelay_ms); + completed = cur_ops->completed(); + do_trace_rcu_torture_read(cur_ops->name, NULL, ts, + started, completed); + } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT -- cgit v1.2.3 From 0742ac3e2f9f4b8a3a394a270d8685078837662b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Oct 2016 06:09:59 -0700 Subject: rcu: Make expedited grace periods recheck dyntick idle state Expedited grace periods check dyntick-idle state, and avoid sending IPIs to idle CPUs, including those running guest OSes, and, on NOHZ_FULL kernels, nohz_full CPUs. However, the kernel has been observed checking a CPU while it was non-idle, but sending the IPI after it has gone idle. This commit therefore rechecks idle state immediately before sending the IPI, refraining from IPIing CPUs that have since gone idle. Reported-by: Rik van Riel Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e99a5234d9ed..fe98dd24adf8 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -404,6 +404,7 @@ struct rcu_data { atomic_long_t exp_workdone1; /* # done by others #1. */ atomic_long_t exp_workdone2; /* # done by others #2. */ atomic_long_t exp_workdone3; /* # done by others #3. */ + int exp_dynticks_snap; /* Double-check need for IPI. */ /* 7) Callback offloading.
*/ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 24343eb87b58..d3053e99fdb6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -358,8 +358,10 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + rdp->exp_dynticks_snap = + atomic_add_return(0, &rdtp->dynticks); if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1) || + !(rdp->exp_dynticks_snap & 0x1) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -377,9 +379,17 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, /* IPI the remaining CPUs for expedited quiescent state. */ for_each_leaf_node_possible_cpu(rnp, cpu) { unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + if (!(mask_ofl_ipi & mask)) continue; retry_ipi: + if (atomic_add_return(0, &rdtp->dynticks) != + rdp->exp_dynticks_snap) { + mask_ofl_test |= mask; + continue; + } ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; -- cgit v1.2.3 From aa3e0bf1aa76579c6c51a2c1116a630aa716379d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Mar 2016 10:43:23 -0700 Subject: rcu: Don't kick unless grace period or request The current code can result in spurious kicks when there are no grace periods in progress and no grace-period-related requests. This is sort of OK for a diagnostic aid, but the resulting ftrace-dump messages in dmesg are annoying. This commit therefore avoids spurious kicks in the common case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 865af187498e..96c52e43f7ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1304,7 +1304,8 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) if (!rcu_kick_kthreads) return; j = READ_ONCE(rsp->jiffies_kick_kthreads); - if (time_after(jiffies, j) && rsp->gp_kthread) { + if (time_after(jiffies, j) && rsp->gp_kthread && + (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); rcu_ftrace_dump(DUMP_ALL); wake_up_process(rsp->gp_kthread); -- cgit v1.2.3 From 8443075eacb51df8539916c4170d2fdfe7c81433 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 10 Nov 2016 01:39:49 -0500 Subject: audit: tame initialization warning len_abuf in audit_log_execve_info Tame the initialization warning for len_abuf in audit_log_execve_info(), even though the code introduced by commit 43761473c254 ("audit: fix a double fetch in audit_log_single_execve_arg()") doesn't presently contain a bug. Using UNINITIALIZED_VAR instead may mask future bugs.
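As a reduced illustration of the class of warning being tamed (hypothetical code, not the audit function): with -Wmaybe-uninitialized the compiler cannot always prove that a variable assigned only conditionally is set before use, and an explicit initializer is the transparent fix, while uninitialized_var() would silence the warning even if a real bug appeared later.

----
/* Hypothetical reduced case, compiled with: gcc -O2 -Wmaybe-uninitialized */
static long demo(const long *lens, int n)
{
	long len_abuf = 0;	/* explicit zero tames the warning;
				 * uninitialized_var() would hide it even if
				 * a path really did skip the assignment */
	int i;

	for (i = 0; i < n; i++) {
		if (lens[i] > 0)
			len_abuf = lens[i] * 2;	/* only conditionally set */
	}
	return len_abuf;
}
----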
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5abf1dc1f91c..8c434318ec8d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1000,7 +1000,7 @@ static void audit_log_execve_info(struct audit_context *context, long len_rem; long len_full; long len_buf; - long len_abuf; + long len_abuf = 0; long len_tmp; bool require_data; bool encode; -- cgit v1.2.3 From 535e7b4b5ef220be374b895684f274872aebd0f8 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Sun, 13 Nov 2016 19:44:03 +0100 Subject: bpf: Use u64_to_user_ptr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the custom u64_to_ptr() function with the u64_to_user_ptr() macro. Signed-off-by: Mickaël Salaün Cc: Alexei Starovoitov Cc: Arnd Bergmann Cc: Daniel Borkmann Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 23eb2050f15e..cdc06546401b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -17,6 +17,7 @@ #include #include #include +#include DEFINE_PER_CPU(int, bpf_prog_active); @@ -252,12 +253,6 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } -/* helper to convert user pointers passed inside __aligned_u64 fields */ -static void __user *u64_to_ptr(__u64 val) -{ - return (void __user *) (unsigned long) val; -} - int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { return -ENOTSUPP; @@ -268,8 +263,8 @@ int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) static int map_lookup_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; @@ -342,8 +337,8 @@ err_put: static int map_update_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *uvalue = u64_to_ptr(attr->value); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; @@ -420,7 +415,7 @@ err_put: static int map_delete_elem(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); + void __user *ukey = u64_to_user_ptr(attr->key); int ufd = attr->map_fd; struct bpf_map *map; struct fd f; @@ -464,8 +459,8 @@ err_put: static int map_get_next_key(union bpf_attr *attr) { - void __user *ukey = u64_to_ptr(attr->key); - void __user *unext_key = u64_to_ptr(attr->next_key); + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *unext_key = u64_to_user_ptr(attr->next_key); int ufd = attr->map_fd; struct bpf_map *map; void *key, *next_key; @@ -741,7 +736,7 @@ static int bpf_prog_load(union bpf_attr *attr) return -EINVAL; /* copy eBPF program license from user space */ - if (strncpy_from_user(license, u64_to_ptr(attr->license), + if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; @@ -771,7 +766,7 @@ static int bpf_prog_load(union bpf_attr *attr) prog->len = attr->insn_cnt; err = -EFAULT; - if 
(copy_from_user(prog->insns, u64_to_ptr(attr->insns), + if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), prog->len * sizeof(struct bpf_insn)) != 0) goto free_prog; @@ -822,7 +817,7 @@ static int bpf_obj_pin(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ)) return -EINVAL; - return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); + return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); } static int bpf_obj_get(const union bpf_attr *attr) @@ -830,7 +825,7 @@ static int bpf_obj_get(const union bpf_attr *attr) if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) return -EINVAL; - return bpf_obj_get_user(u64_to_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); } SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) -- cgit v1.2.3 From 977c1f9c8c022d0173181766b34a0db3705265a4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 7 Nov 2016 15:14:20 -0800 Subject: ftrace: Ignore FTRACE_FL_DISABLED while walking dyn_ftrace records ftrace_shutdown() checks for sanity of ftrace records and if dyn_ftrace->flags is not zero, it will warn. It can happen that 'flags' are set to FTRACE_FL_DISABLED at this point, since some module was loaded, but before ftrace_module_enable() cleared the flags for this module. In other words, module.c is doing: ftrace_module_init(mod); // calls ftrace_update_code() that sets flags=FTRACE_FL_DISABLED ... // here ftrace_shutdown() is called that warns, since err = prepare_coming_module(mod); // didn't have a chance to clear FTRACE_FL_DISABLED Fix it by ignoring disabled records. It's similar to what __ftrace_hash_rec_update() is already doing. Link: http://lkml.kernel.org/r/1478560460-3818619-1-git-send-email-ast@fb.com Cc: stable@vger.kernel.org Fixes: b7ffffbb46f2 "ftrace: Add infrastructure for delayed enabling of module functions" Signed-off-by: Alexei Starovoitov Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2050a7652a86..326498baab83 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2763,7 +2763,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) struct dyn_ftrace *rec; do_for_each_ftrace_rec(pg, rec) { - if (FTRACE_WARN_ON_ONCE(rec->flags)) + if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED)) pr_warn(" %pS flags:%lx\n", (void *)rec->ip, rec->flags); } while_for_each_ftrace_rec(); -- cgit v1.2.3 From 546fece4eae871f033925ccf0ff2b740725ae915 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 14 Nov 2016 16:31:49 -0500 Subject: ftrace: Add more checks for FTRACE_FL_DISABLED in processing ip records When a module is first loaded and its function ip records are added to the ftrace list of functions to modify, they are set to DISABLED, as their text is still in a read-only state. When the module is fully loaded, and can be updated, the flag is cleared, and if there are any functions that should be tracing them, they are updated at that moment. But there are several locations that do record accounting and should ignore records that are marked as disabled, or they can cause issues. Alexei already fixed one location, but others need to be addressed.
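The recurring pattern these fixes add, as a hedged standalone sketch (names simplified from the ftrace code):

----
#define FL_DISABLED (1UL << 0)	/* stand-in for FTRACE_FL_DISABLED */

struct rec {
	unsigned long flags;
	unsigned long ip;
};

static void walk_records(struct rec *recs, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		/* Module still loading: its text is not yet writable,
		 * so matching/accounting must skip the record. */
		if (recs[i].flags & FL_DISABLED)
			continue;
		/* ... match, update or account the record here ... */
	}
}
----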
Cc: stable@vger.kernel.org Fixes: b7ffffbb46f2 "ftrace: Add infrastructure for delayed enabling of module functions" Reported-by: Alexei Starovoitov Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 326498baab83..da87b3cba5b3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1862,6 +1862,10 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, /* Update rec->flags */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + /* We need to update only differences of filter_hash */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -1884,6 +1888,10 @@ rollback: /* Roll back what we did above */ do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (rec == end) goto err_out; @@ -2397,6 +2405,10 @@ void __weak ftrace_replace_code(int enable) return; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec); @@ -3598,6 +3610,10 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) goto out_unlock; do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) { ret = enter_record(hash, rec, clear_filter); if (ret < 0) { @@ -3793,6 +3809,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (!ftrace_match_record(rec, &func_g, NULL, 0)) continue; @@ -4685,6 +4704,9 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) do_for_each_ftrace_rec(pg, rec) { + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (ftrace_match_record(rec, &func_g, NULL, 0)) { /* if it is in the array */ exists = false; -- cgit v1.2.3 From 60f1d5e3bac44b598f67d36062da96c095d2b700 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 5 Oct 2016 20:58:15 +0900 Subject: ftrace: Support full glob matching Use glob_match() to support flexible glob wildcards (*,?) and character classes ([) for ftrace. Since the full glob matching is slower than the current partial matching routines (*pat, pat*, *pat*), this leaves those routines in place and just adds MATCH_GLOB for complex glob expressions. e.g.
---- [root@localhost tracing]# echo 'sched*group' > set_ftrace_filter [root@localhost tracing]# cat set_ftrace_filter sched_free_group sched_change_group sched_create_group sched_online_group sched_destroy_group sched_offline_group [root@localhost tracing]# echo '[Ss]y[Ss]_*' > set_ftrace_filter [root@localhost tracing]# head set_ftrace_filter sys_arch_prctl sys_rt_sigreturn sys_ioperm SyS_iopl sys_modify_ldt SyS_mmap SyS_set_thread_area SyS_get_thread_area SyS_set_tid_address sys_fork ---- Link: http://lkml.kernel.org/r/147566869501.29136.6462645009894738056.stgit@devbox Acked-by: Namhyung Kim Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- Documentation/trace/events.txt | 9 +++------ Documentation/trace/ftrace.txt | 9 +++------ kernel/trace/Kconfig | 2 ++ kernel/trace/ftrace.c | 4 ++++ kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 ++ kernel/trace/trace_events_filter.c | 17 ++++++++++++++++- 7 files changed, 31 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index 08d74d75150d..2cc08d4a326e 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -189,16 +189,13 @@ And for string fields they are: ==, !=, ~ -The glob (~) only accepts a wild card character (*) at the start and or -end of the string. For example: +The glob (~) accepts a wild card character (*,?) and character classes +([). For example: prev_comm ~ "*sh" prev_comm ~ "sh*" prev_comm ~ "*sh*" - -But does not allow for it to be within the string: - - prev_comm ~ "ba*sh" <-- is invalid + prev_comm ~ "ba*sh" 5.2 Setting filters ------------------- diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 185c39fea2a0..1bc66c1db0cb 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -2218,16 +2218,13 @@ hrtimer_interrupt sys_nanosleep -Perhaps this is not enough. The filters also allow simple wild -cards. Only the following are currently available +Perhaps this is not enough. The filters also allow glob(7) matching. <match>* - will match functions that begin with <match> *<match> - will match functions that end with <match> *<match>* - will match functions that have <match> in it - -These are the only wild cards which are supported. - - <match>*<match> will not work. + <match1>*<match2> - will match functions that begin with + <match1> and end with <match2> Note: It is better to use quotes to enclose the wild cards, otherwise the shell may expand the parameters into names diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2a96b063d659..d5038005eb5d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -70,6 +70,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -133,6 +134,7 @@ config FUNCTION_TRACER select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER + select GLOB help Enable the kernel to trace every kernel function.
This is done by using a compiler feature to insert a small, 5-byte No-Operation diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index da87b3cba5b3..356bb70d071e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3511,6 +3511,10 @@ static int ftrace_match(char *str, struct ftrace_glob *g) memcmp(str + slen - g->len, g->search, g->len) == 0) matched = 1; break; + case MATCH_GLOB: + if (glob_match(g->search, str)) + matched = 1; + break; } return matched; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8696ce6bf2f6..d904516dfdab 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4065,7 +4065,7 @@ static const char readme_msg[] = "\n available_filter_functions - list of functions that can be filtered on\n" " set_ftrace_filter\t- echo function name in here to only trace these\n" "\t\t\t functions\n" - "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t accepts: func_full_name or glob-matching-pattern\n" "\t modules: Can select a group via module\n" "\t Format: :mod:<module-name>\n" "\t example: echo :mod:ext3 > set_ftrace_filter\n" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fd24b1f9ac43..4b7918902ab8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,6 +15,7 @@ #include #include #include +#include <linux/glob.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ #endif @@ -1257,6 +1258,7 @@ enum regex_type { MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY, + MATCH_GLOB, }; struct regex { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9daa9b3bc6d9..e1c7e2cdc240 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -344,6 +344,12 @@ static int regex_match_end(char *str, struct regex *r, int len) return 0; } +static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused) +{ + if (glob_match(r->pattern, str)) + return 1; + return 0; +} /** * filter_parse_regex - parse a basic regex * @buff: the raw regex @@ -380,14 +386,20 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) if (!i) { *search = buff + 1; type = MATCH_END_ONLY; - } else { + } else if (i == len - 1) { if (type == MATCH_END_ONLY) type = MATCH_MIDDLE_ONLY; else type = MATCH_FRONT_ONLY; buff[i] = 0; break; + } else { /* pattern continues, use full glob */ + type = MATCH_GLOB; + break; } + } else if (strchr("[?\\", buff[i])) { + type = MATCH_GLOB; + break; } } @@ -420,6 +432,9 @@ static void filter_build_regex(struct filter_pred *pred) case MATCH_END_ONLY: r->match = regex_match_end; break; + case MATCH_GLOB: + r->match = regex_match_glob; + break; } pred->not ^= not; -- cgit v1.2.3 From fdf5b679864b8a1165712164b8f17debeb6b10b6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 25 Oct 2016 16:14:28 -0400 Subject: tracing: Optimise comparison filters and fix binary and for 64 bit Currently, when the filter logic for comparisons (like greater-than and less-than) is used, the comparisons all share the same function, and a switch statement is used to jump to the comparison type to perform. This is done in the extreme hot path of the tracing code, and it does not take much more space to create a unique comparison function to perform each type of comparison and remove the switch statement. Also, a bug was found where the binary-and operation for 64 bits could fail if the resulting bits were greater than 32 bits, because the result was passed into a 32 bit variable. This was fixed when adding the separate binary-and function.
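The 64-bit binary-and truncation is easy to reproduce in isolation; a minimal standalone demo (not the trace filter code itself):

----
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t addr = 0x100000000ULL;	/* only bit 32 set */
	uint64_t val  = 0x100000000ULL;

	int broken = (addr & val);	/* truncated to 32 bits: 0, "no match" */
	int fixed  = !!(addr & val);	/* normalize before narrowing: 1 */

	printf("broken=%d fixed=%d\n", broken, fixed);
	return 0;
}
----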
Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 80 +++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e1c7e2cdc240..1ba7a6b86f55 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -145,34 +145,50 @@ struct pred_stack { /* If not of not match is equal to not of not, then it is a match */ #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event) \ +static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ - int match = 0; \ - \ - switch (pred->op) { \ - case OP_LT: \ - match = (*addr < val); \ - break; \ - case OP_LE: \ - match = (*addr <= val); \ - break; \ - case OP_GT: \ - match = (*addr > val); \ - break; \ - case OP_GE: \ - match = (*addr >= val); \ - break; \ - case OP_BAND: \ - match = (*addr & val); \ - break; \ - default: \ - break; \ - } \ - \ + int match = (*addr < val); \ return !!match == !pred->not; \ -} +} \ +static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr <= val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr > val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = (*addr >= val); \ + return !!match == !pred->not; \ +} \ +static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = !!(*addr & val); \ + return match == !pred->not; \ +} \ +static const filter_pred_fn_t pred_funcs_##type[] = { \ + filter_pred_LT_##type, \ + filter_pred_LE_##type, \ + filter_pred_GT_##type, \ + filter_pred_GE_##type, \ + filter_pred_BAND_##type, \ +}; + +#define PRED_FUNC_START OP_LT #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ @@ -982,33 +998,33 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, if (op == OP_EQ || op == OP_NE) fn = filter_pred_64; else if (field_is_signed) - fn = filter_pred_s64; + fn = pred_funcs_s64[op - PRED_FUNC_START]; else - fn = filter_pred_u64; + fn = pred_funcs_u64[op - PRED_FUNC_START]; break; case 4: if (op == OP_EQ || op == OP_NE) fn = filter_pred_32; else if (field_is_signed) - fn = filter_pred_s32; + fn = pred_funcs_s32[op - PRED_FUNC_START]; else - fn = filter_pred_u32; + fn = pred_funcs_u32[op - PRED_FUNC_START]; break; case 2: if (op == OP_EQ || op == OP_NE) fn = filter_pred_16; else if (field_is_signed) - fn = filter_pred_s16; + fn = pred_funcs_s16[op - PRED_FUNC_START]; else - fn = filter_pred_u16; + fn = pred_funcs_u16[op - PRED_FUNC_START]; break; case 1: if (op == OP_EQ || op == OP_NE) fn = filter_pred_8; else if (field_is_signed) - fn = filter_pred_s8; + fn = pred_funcs_s8[op - PRED_FUNC_START]; else - fn = filter_pred_u8; + fn = pred_funcs_u8[op - PRED_FUNC_START]; break; } -- cgit v1.2.3 From 3f303fbccfc423a28765df6e1be0428fdf1aac59 
Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Oct 2016 15:58:03 -0400 Subject: tracing/filter: Define op as the enum that it is The trace_events_filter.c filter logic can be a bit complex. I copy this into a userspace program where I can debug it more easily. One issue is that the op is defined in most places as an int instead of as an enum, and gdb just gives the value when debugging. Having the actual op name shown in gdb is more useful. This has no functionality change, but helps in debugging when the file is debugged in user space. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1ba7a6b86f55..59a411ff60c7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -108,12 +108,12 @@ static char *err_text[] = { }; struct opstack_op { - int op; + enum filter_op_ids op; struct list_head list; }; struct postfix_elt { - int op; + enum filter_op_ids op; char *operand; struct list_head list; }; @@ -977,7 +977,7 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_legal_op(struct ftrace_event_field *field, int op) +static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op) { if (is_string_field(field) && (op != OP_EQ && op != OP_NE && op != OP_GLOB)) @@ -988,8 +988,8 @@ static bool is_legal_op(struct ftrace_event_field *field, int op) return true; } -static filter_pred_fn_t select_comparison_fn(int op, int field_size, - int field_is_signed) +static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) { filter_pred_fn_t fn = NULL; @@ -1197,7 +1197,8 @@ static inline int append_operand_char(struct filter_parse_state *ps, char c) return 0; } -static int filter_opstack_push(struct filter_parse_state *ps, int op) +static int filter_opstack_push(struct filter_parse_state *ps, + enum filter_op_ids op) { struct opstack_op *opstack_op; @@ -1231,7 +1232,7 @@ static int filter_opstack_top(struct filter_parse_state *ps) static int filter_opstack_pop(struct filter_parse_state *ps) { struct opstack_op *opstack_op; - int op; + enum filter_op_ids op; if (filter_opstack_empty(ps)) return OP_NONE; @@ -1276,7 +1277,7 @@ static int postfix_append_operand(struct filter_parse_state *ps, char *operand) return 0; } -static int postfix_append_op(struct filter_parse_state *ps, int op) +static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op) { struct postfix_elt *elt; @@ -1306,8 +1307,8 @@ static void postfix_clear(struct filter_parse_state *ps) static int filter_parse(struct filter_parse_state *ps) { + enum filter_op_ids op, top_op; int in_string = 0; - int op, top_op; char ch; while ((ch = infix_next(ps))) { @@ -1398,7 +1399,8 @@ parse_operand: static struct filter_pred *create_pred(struct filter_parse_state *ps, struct trace_event_call *call, - int op, char *operand1, char *operand2) + enum filter_op_ids op, + char *operand1, char *operand2) { struct ftrace_event_field *field; static struct filter_pred pred; -- cgit v1.2.3 From 8d414bd2f77ce858f6b9d119c63b9ce29cf0b75d Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Mon, 14 Nov 2016 11:19:13 +0800 Subject: tracing: Allow wakeup_dl tracer to be used by instances Allow the wakeup_dl tracer to be used by instances, like the wakeup and wakeup_rt tracers.
Link: http://lkml.kernel.org/r/1479093553-31264-1-git-send-email-zhouchengming1@huawei.com Signed-off-by: Zhou Chengming Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 9d4399b553a3..1bf2324dc682 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -790,6 +790,7 @@ static struct tracer wakeup_dl_tracer __read_mostly = #endif .open = wakeup_trace_open, .close = wakeup_trace_close, + .allow_instances = true, .use_max_tr = true, }; -- cgit v1.2.3 From 981ee2d444408fc55b9390d6a4a54a6697513611 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 15 Nov 2016 03:06:50 +0100 Subject: sched/cputime, powerpc: Remove cputime_to_scaled() Currently cputime_to_scaled() just returns its argument on all implementations, so we don't need to call this function. Signed-off-by: Stanislaw Gruszka Signed-off-by: Frederic Weisbecker Reviewed-by: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Neuling Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1479175612-14718-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/cputime.h | 7 ------- include/asm-generic/cputime_jiffies.h | 1 - include/asm-generic/cputime_nsecs.h | 1 - kernel/sched/cputime.c | 26 ++++++++++++-------------- 4 files changed, 12 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 9f5dcf73b608..aa2e6a34b872 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -52,13 +52,6 @@ static inline unsigned long cputime_to_jiffies(const cputime_t ct) return mulhdu((__force u64) ct, __cputime_jiffies_factor); } -/* Estimate the scaled cputime by scaling the real cputime based on - * the last scaled to real ratio */ -static inline cputime_t cputime_to_scaled(const cputime_t ct) -{ - return ct; -} - static inline cputime_t jiffies_to_cputime(const unsigned long jif) { u64 ct; diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h index fe386fc6e85e..6bb8cd45f53b 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h @@ -7,7 +7,6 @@ typedef unsigned long __nocast cputime_t; #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) -#define cputime_to_scaled(__ct) (__ct) #define jiffies_to_cputime(__hz) (__force cputime_t)(__hz) typedef u64 __nocast cputime64_t; diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index a84e28e0c634..4e3b18e559b1 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -34,7 +34,6 @@ typedef u64 __nocast cputime64_t; */ #define cputime_to_jiffies(__ct) \ cputime_div(__ct, NSEC_PER_SEC / HZ) -#define cputime_to_scaled(__ct) (__ct) #define jiffies_to_cputime(__jif) \ (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) #define cputime64_to_jiffies64(__ct) \ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5ebee3164e64..3229c7244fdd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -390,7 +390,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { u64 cputime =
(__force u64) cputime_one_jiffy * ticks; - cputime_t scaled, other; + cputime_t other; /* * When returning from idle, many ticks can get accounted at @@ -403,7 +403,6 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, if (other >= cputime) return; cputime -= other; - scaled = cputime_to_scaled(cputime); if (this_cpu_ksoftirqd() == p) { /* @@ -411,15 +410,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime, scaled); + account_user_time(p, cputime, cputime); } else if (p == rq->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime, scaled); + account_guest_time(p, cputime, cputime); } else { - __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); + __account_system_time(p, cputime, cputime, CPUTIME_SYSTEM); } } @@ -502,7 +501,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t cputime, scaled, steal; + cputime_t cputime, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -520,12 +519,11 @@ void account_process_tick(struct task_struct *p, int user_tick) return; cputime -= steal; - scaled = cputime_to_scaled(cputime); if (user_tick) - account_user_time(p, cputime, scaled); + account_user_time(p, cputime, cputime); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime, cputime); else account_idle_time(cputime); } @@ -746,7 +744,7 @@ static void __vtime_account_system(struct task_struct *tsk) { cputime_t delta_cpu = get_vtime_delta(tsk); - account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); + account_system_time(tsk, irq_count(), delta_cpu, delta_cpu); } void vtime_account_system(struct task_struct *tsk) @@ -767,7 +765,7 @@ void vtime_account_user(struct task_struct *tsk) tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) { delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + account_user_time(tsk, delta_cpu, delta_cpu); } write_seqcount_end(&tsk->vtime_seqcount); } @@ -940,8 +938,8 @@ void task_cputime_scaled(struct task_struct *t, fetch_task_cputime(t, utimescaled, stimescaled, &t->utimescaled, &t->stimescaled, &udelta, &sdelta); if (utimescaled) - *utimescaled += cputime_to_scaled(udelta); + *utimescaled += udelta; if (stimescaled) - *stimescaled += cputime_to_scaled(sdelta); + *stimescaled += sdelta; } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From 40565b5aedd6d0ca88b7dfd3859d709d2f6f8cf9 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 15 Nov 2016 03:06:51 +0100 Subject: sched/cputime, powerpc, s390: Make scaled cputime arch specific Only s390 and powerpc have hardware facilities that allow measuring cputimes scaled by frequency. On all other architectures utimescaled/stimescaled are equal to utime/stime (however they are accounted separately). Remove {u,s}timescaled accounting on all architectures except powerpc and s390, where those values are explicitly accounted in the proper places.
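A hedged sketch of the resulting shape (reduced struct; field names follow the patch, everything else is illustrative):

----
/* Reduced sketch of the task-struct layout after this change. */
struct task_times {
	unsigned long utime, stime;		/* always present */
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	unsigned long utimescaled, stimescaled;	/* powerpc/s390 only */
#endif
};

static void get_scaled(struct task_times *t,
		       unsigned long *ut, unsigned long *st)
{
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	*ut = t->utimescaled;		/* hardware-scaled values */
	*st = t->stimescaled;
#else
	*ut = t->utime;			/* fall back to raw cputime */
	*st = t->stime;
#endif
}
----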
Signed-off-by: Stanislaw Gruszka Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161031162143.GB12646@redhat.com Signed-off-by: Ingo Molnar --- arch/Kconfig | 3 +++ arch/ia64/kernel/time.c | 4 +-- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/time.c | 6 +++-- arch/s390/Kconfig | 1 + arch/s390/kernel/vtime.c | 9 ++++--- include/linux/kernel_stat.h | 4 +-- include/linux/sched.h | 23 ++++++++++++----- kernel/fork.c | 2 ++ kernel/sched/cputime.c | 61 +++++++++++---------------------------------- 10 files changed, 53 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 659bdd079277..abab6590f08f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -512,6 +512,9 @@ config HAVE_CONTEXT_TRACKING config HAVE_VIRT_CPU_ACCOUNTING bool +config ARCH_HAS_SCALED_CPUTIME + bool + config HAVE_VIRT_CPU_ACCOUNTING_GEN bool default y if 64BIT diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 6f892b94e906..021f44ab4bfb 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -68,7 +68,7 @@ void vtime_account_user(struct task_struct *tsk) if (ti->ac_utime) { delta_utime = cycle_to_cputime(ti->ac_utime); - account_user_time(tsk, delta_utime, delta_utime); + account_user_time(tsk, delta_utime); ti->ac_utime = 0; } } @@ -112,7 +112,7 @@ void vtime_account_system(struct task_struct *tsk) { cputime_t delta = vtime_delta(tsk); - account_system_time(tsk, 0, delta, delta); + account_system_time(tsk, 0, delta); } EXPORT_SYMBOL_GPL(vtime_account_system); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 65fba4c34cd7..c7f120aaa98f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -160,6 +160,7 @@ config PPC select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS select GENERIC_CPU_AUTOPROBE select HAVE_VIRT_CPU_ACCOUNTING + select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select HAVE_ARCH_HARDENED_USERCOPY select HAVE_KERNEL_GZIP diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 81051986739c..be9751f1cb2a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -358,7 +358,8 @@ void vtime_account_system(struct task_struct *tsk) unsigned long delta, sys_scaled, stolen; delta = vtime_delta(tsk, &sys_scaled, &stolen); - account_system_time(tsk, 0, delta, sys_scaled); + account_system_time(tsk, 0, delta); + tsk->stimescaled += sys_scaled; if (stolen) account_steal_time(stolen); } @@ -391,7 +392,8 @@ void vtime_account_user(struct task_struct *tsk) acct->user_time = 0; acct->user_time_scaled = 0; acct->utime_sspurr = 0; - account_user_time(tsk, utime, utimescaled); + account_user_time(tsk, utime); + tsk->utimescaled += utimescaled; } #ifdef CONFIG_PPC32 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 426481d4cc86..028f97be5bae 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -171,6 +171,7 @@ config S390 select SYSCTL_EXCEPTION_TRACE select TTY select VIRT_CPU_ACCOUNTING + select ARCH_HAS_SCALED_CPUTIME select VIRT_TO_BUS select HAVE_NMI diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 856e30d8463f..1bd5dde2d5a9 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -137,8 +137,10 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) user_scaled = (user_scaled * mult) / div; system_scaled = 
(system_scaled * mult) / div; } - account_user_time(tsk, user, user_scaled); - account_system_time(tsk, hardirq_offset, system, system_scaled); + account_user_time(tsk, user); + tsk->utimescaled += user_scaled; + account_system_time(tsk, hardirq_offset, system); + tsk->stimescaled += system_scaled; steal = S390_lowcore.steal_timer; if ((s64) steal > 0) { @@ -202,7 +204,8 @@ void vtime_account_irq_enter(struct task_struct *tsk) system_scaled = (system_scaled * mult) / div; } - account_system_time(tsk, 0, system, system_scaled); + account_system_time(tsk, 0, system); + tsk->stimescaled += system_scaled; virt_timer_forward(system); } diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44fda64ad434..00f776816aa3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -78,8 +78,8 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) return kstat_cpu(cpu).irqs_sum; } -extern void account_user_time(struct task_struct *, cputime_t, cputime_t); -extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); +extern void account_user_time(struct task_struct *, cputime_t); +extern void account_system_time(struct task_struct *, int, cputime_t); extern void account_steal_time(cputime_t); extern void account_idle_time(cputime_t); diff --git a/include/linux/sched.h b/include/linux/sched.h index 3762fe4e3a80..f72e81395dac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1647,7 +1647,10 @@ struct task_struct { int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - cputime_t utime, stime, utimescaled, stimescaled; + cputime_t utime, stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + cputime_t utimescaled, stimescaled; +#endif cputime_t gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -2240,8 +2243,6 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime); -extern void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, cputime_t *stimescaled); extern cputime_t task_gtime(struct task_struct *t); #else static inline void task_cputime(struct task_struct *t, @@ -2253,6 +2254,13 @@ static inline void task_cputime(struct task_struct *t, *stime = t->stime; } +static inline cputime_t task_gtime(struct task_struct *t) +{ + return t->gtime; +} +#endif + +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME static inline void task_cputime_scaled(struct task_struct *t, cputime_t *utimescaled, cputime_t *stimescaled) @@ -2262,12 +2270,15 @@ static inline void task_cputime_scaled(struct task_struct *t, if (stimescaled) *stimescaled = t->stimescaled; } - -static inline cputime_t task_gtime(struct task_struct *t) +#else +static inline void task_cputime_scaled(struct task_struct *t, + cputime_t *utimescaled, + cputime_t *stimescaled) { - return t->gtime; + task_cputime(t, utimescaled, stimescaled); } #endif + extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); diff --git a/kernel/fork.c b/kernel/fork.c index 997ac1d584f7..600e93b5e539 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1551,7 +1551,9 @@ static __latent_entropy struct task_struct *copy_process( init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = 
p->stimescaled = 0; +#endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3229c7244fdd..ba55ebf77f9a 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -128,16 +128,13 @@ static inline void task_group_account_field(struct task_struct *p, int index, * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +void account_user_time(struct task_struct *p, cputime_t cputime) { int index; /* Add user time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; @@ -153,16 +150,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime, * Account guest cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +static void account_guest_time(struct task_struct *p, cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ p->utime += cputime; - p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); p->gtime += cputime; @@ -180,16 +174,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, * Account system cpu time to a process and desired cpustat field * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated + * @index: pointer to cpustat field that has to be updated */ static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, int index) +void __account_system_time(struct task_struct *p, cputime_t cputime, int index) { /* Add system time to process. */ p->stime += cputime; - p->stimescaled += cputime_scaled; account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -204,15 +195,14 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) + cputime_t cputime) { int index; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime, cputime_scaled); + account_guest_time(p, cputime); return; } @@ -223,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, cputime_scaled, index); + __account_system_time(p, cputime, index); } /* @@ -410,15 +400,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. 
*/ - __account_system_time(p, cputime, cputime, CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime, cputime); + account_user_time(p, cputime); } else if (p == rq->idle) { account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime, cputime); + account_guest_time(p, cputime); } else { - __account_system_time(p, cputime, cputime, CPUTIME_SYSTEM); + __account_system_time(p, cputime, CPUTIME_SYSTEM); } } @@ -521,9 +511,9 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime -= steal; if (user_tick) - account_user_time(p, cputime, cputime); + account_user_time(p, cputime); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime, cputime); + account_system_time(p, HARDIRQ_OFFSET, cputime); else account_idle_time(cputime); } @@ -744,7 +734,7 @@ static void __vtime_account_system(struct task_struct *tsk) { cputime_t delta_cpu = get_vtime_delta(tsk); - account_system_time(tsk, irq_count(), delta_cpu, delta_cpu); + account_system_time(tsk, irq_count(), delta_cpu); } void vtime_account_system(struct task_struct *tsk) @@ -765,7 +755,7 @@ void vtime_account_user(struct task_struct *tsk) tsk->vtime_snap_whence = VTIME_SYS; if (vtime_delta(tsk)) { delta_cpu = get_vtime_delta(tsk); - account_user_time(tsk, delta_cpu, delta_cpu); + account_user_time(tsk, delta_cpu); } write_seqcount_end(&tsk->vtime_seqcount); } @@ -921,25 +911,4 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) if (stime) *stime += sdelta; } - -void task_cputime_scaled(struct task_struct *t, - cputime_t *utimescaled, cputime_t *stimescaled) - { - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utimescaled) - *utimescaled = t->utimescaled; - if (stimescaled) - *stimescaled = t->stimescaled; - return; - } - - fetch_task_cputime(t, utimescaled, stimescaled, - &t->utimescaled, &t->stimescaled, &udelta, &sdelta); - if (utimescaled) - *utimescaled += udelta; - if (stimescaled) - *stimescaled += sdelta; -} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3 From 353c50ebe329daaf2c94dc41c1c481cbba2a31fd Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 15 Nov 2016 03:06:52 +0100 Subject: sched/cputime: Simplify task_cputime() Since fetch_task_cputime() now has no users other than task_cputime(), its code can be folded directly into task_cputime(). Moreover, since only 2 of the 17 task_cputime() calls use a NULL argument, we can pass dummy variables at those call sites and remove the NULL checks from task_cputime(). Also remove the NULL checks from task_cputime_scaled().
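For illustration, the caller-side pattern after this change looks like the sketch below (a hypothetical caller, not taken from the patch): a caller that needs only one of the two values passes a dummy variable instead of NULL, exactly as the apm_32.c and posix-cpu-timers.c hunks in the diff do.

/* Hypothetical caller: only utime is wanted, so stime is a dummy. */
static cputime_t get_utime_only(struct task_struct *p)
{
	cputime_t utime, stime;	/* stime is the dummy variable */

	task_cputime(p, &utime, &stime);
	return utime;
}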
Signed-off-by: Stanislaw Gruszka Signed-off-by: Frederic Weisbecker Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Neuling Cc: Paul Mackerras Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1479175612-14718-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 4 +-- include/linux/sched.h | 12 +++------ kernel/sched/cputime.c | 57 +++++++++++------------------------------- kernel/time/posix-cpu-timers.c | 4 +-- 4 files changed, 23 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c7364bd633e1..d90749b883f5 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev, static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ static unsigned int last_stime; /* = 0 */ - cputime_t stime; + cputime_t stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; recalc: - task_cputime(current, NULL, &stime); + task_cputime(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { diff --git a/include/linux/sched.h b/include/linux/sched.h index f72e81395dac..fe3ce46cfd03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2248,10 +2248,8 @@ extern cputime_t task_gtime(struct task_struct *t); static inline void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { - if (utime) - *utime = t->utime; - if (stime) - *stime = t->stime; + *utime = t->utime; + *stime = t->stime; } static inline cputime_t task_gtime(struct task_struct *t) @@ -2265,10 +2263,8 @@ static inline void task_cputime_scaled(struct task_struct *t, cputime_t *utimescaled, cputime_t *stimescaled) { - if (utimescaled) - *utimescaled = t->utimescaled; - if (stimescaled) - *stimescaled = t->stimescaled; + *utimescaled = t->utimescaled; + *stimescaled = t->stimescaled; } #else static inline void task_cputime_scaled(struct task_struct *t, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ba55ebf77f9a..7700a9cba335 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -851,29 +851,25 @@ cputime_t task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -static void -fetch_task_cputime(struct task_struct *t, - cputime_t *u_dst, cputime_t *s_dst, - cputime_t *u_src, cputime_t *s_src, - cputime_t *udelta, cputime_t *sdelta) +void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { + cputime_t delta; unsigned int seq; - unsigned long long delta; - do { - *udelta = 0; - *sdelta = 0; + if (!vtime_accounting_enabled()) { + *utime = t->utime; + *stime = t->stime; + return; + } + do { seq = read_seqcount_begin(&t->vtime_seqcount); - if (u_dst) - *u_dst = *u_src; - if (s_dst) - *s_dst = *s_src; + *utime = t->utime; + *stime = t->stime; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_INACTIVE || - is_idle_task(t)) + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; delta = vtime_delta(t); @@ -882,33 +878,10 @@ fetch_task_cputime(struct task_struct *t, * Task runs either in user or kernel space, add pending nohz time to * the right place. 
*/ - if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { - *udelta = delta; - } else { - if (t->vtime_snap_whence == VTIME_SYS) - *sdelta = delta; - } + if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) + *utime += delta; + else if (t->vtime_snap_whence == VTIME_SYS) + *stime += delta; } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } - - -void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) -{ - cputime_t udelta, sdelta; - - if (!vtime_accounting_enabled()) { - if (utime) - *utime = t->utime; - if (stime) - *stime = t->stime; - return; - } - - fetch_task_cputime(t, utime, stime, &t->utime, - &t->stime, &udelta, &sdelta); - if (utime) - *utime += udelta; - if (stime) - *stime += sdelta; -} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 39008d78927a..e887ffc8eef3 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -133,9 +133,9 @@ static inline unsigned long long prof_ticks(struct task_struct *p) } static inline unsigned long long virt_ticks(struct task_struct *p) { - cputime_t utime; + cputime_t utime, stime; - task_cputime(p, &utime, NULL); + task_cputime(p, &utime, &stime); return cputime_to_expires(utime); } -- cgit v1.2.3 From 864c2357ca898c6171fe5284f5ecc795c8ce27a8 Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Tue, 1 Nov 2016 11:52:58 -0700 Subject: perf/core: Do not set cpuctx->cgrp for unscheduled cgroups Commit: db4a835601b7 ("perf/core: Set cgroup in CPU contexts for new cgroup events") failed to verify that event->cgrp is actually the scheduled cgroup in a CPU before setting cpuctx->cgrp. This patch fixes that. Now that there are different paths for scheduled and unscheduled cgroups, add a warning to catch when cpuctx->cgrp is still set after the last cgroup event has been unscheduled. To verify the bug: # Create 2 cgroups. mkdir /dev/cgroups/devices/g1 mkdir /dev/cgroups/devices/g2 # launch a task, bind it to a cpu and move it to g1 CPU=2 while :; do : ; done & P=$! taskset -pc $CPU $P echo $P > /dev/cgroups/devices/g1/tasks # monitor g2 (it runs no tasks) and observe output perf stat -e cycles -I 1000 -C $CPU -G g2 # time counts unit events 1.000091408 7,579,527 cycles g2 2.000350111 cycles g2 3.000589181 cycles g2 4.000771428 cycles g2 # note the first line, which shows that a task ran in g2 despite # g2 having no tasks. This is because cpuctx->cgrp was wrongly # set when the context of the new event was installed. # After applying the fix we obtain the right output: perf stat -e cycles -I 1000 -C $CPU -G g2 # time counts unit events 1.000119615 cycles g2 2.000389430 cycles g2 3.000590962 cycles g2 Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Nilay Vaish Cc: Paul Turner Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Link: http://lkml.kernel.org/r/1478026378-86083-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0e292132efac..ff230bb4a02e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -902,6 +902,17 @@ list_update_cgroup_event(struct perf_event *event, * this will always be called from the right CPU.
*/ cpuctx = __get_cpu_context(ctx); + + /* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */ + if (perf_cgroup_from_task(current, ctx) != event->cgrp) { + /* + * We are removing the last cpu event in this context. + * If that event is not active in this cpu, cpuctx->cgrp + * should've been cleared by perf_cgroup_switch. + */ + WARN_ON_ONCE(!add && cpuctx->cgrp); + return; + } cpuctx->cgrp = add ? event->cgrp : NULL; } -- cgit v1.2.3 From 3a08c2fd763450a927d1130de078d6f9e74944fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:06 -0800 Subject: bpf: LRU List Introduce bpf_lru_list, which will provide LRU capability to the bpf_htab in a later patch.

* General Thoughts:
1. Target use case: reads are more frequent than updates (i.e. bpf_lookup_elem() is called more often than bpf_update_elem()). If a bpf_prog does a bpf_lookup_elem() first and then an in-place update, it still counts as a read operation as far as the LRU list is concerned.
2. It may be useful to think of it as an LRU cache.
3. Optimize the read case: 3.1 No lock in the read case. 3.2 LRU maintenance is only done during bpf_update_elem().
4. If there is a percpu LRU list, it loses the system-wide LRU property. A completely isolated percpu LRU list has the best performance, but its memory utilization is not ideal considering the workload may be imbalanced.
5. Hence, this patch starts the LRU implementation with a global LRU list, with operations batched up before the global LRU list is accessed. Since an LRU cache sees far more reads than updates or inserts (#read >> #update/#insert), this works well.
6. There is a local list (for each cpu) named 'struct bpf_lru_locallist'. This local list is not used to maintain the LRU ordering. Instead, it batches enough operations before the lock of the global LRU list is acquired. More details on this later.
7. A later patch allows a percpu LRU list, selected by a map attribute, for scalability reasons and for use cases that need to prepare for the worst (and pathological) case, like a DoS attack. The percpu LRU lists are completely isolated from each other, and the LRU nodes (including free nodes) cannot be moved across lists. The following description is for the global LRU list but is mostly applicable to the percpu LRU list also.

* Global LRU List:
1. It has three sub-lists: active-list, inactive-list and free-list.
2. The two-list idea, active and inactive, is borrowed from the page cache.
3. All nodes are pre-allocated and all sit on the free-list (of the global LRU list) at the beginning. The pre-allocation reasoning is similar to the existing BPF_MAP_TYPE_HASH. However, opting out of preallocation (BPF_F_NO_PREALLOC) is not supported in the LRU map.

* Active/Inactive List (of the global LRU list):
1. The active list, as its name says, maintains the active set of nodes. We can think of it as the working set of more frequently accessed nodes. The access frequency is approximated by a ref-bit, which is set during bpf_lookup_elem().
2. The inactive list, as its name also says, maintains a less active set of nodes. They are the candidates to be removed from the bpf_htab when we are running out of free nodes.
3. The ordering of these two lists acts as a rough clock. The tail of the inactive list holds the older nodes, which should be released first when the bpf_htab needs a free element.

* Rotating the Active/Inactive List (of the global LRU list):
1. It is the basic operation that maintains the LRU property of the global list.
2. The active list is only rotated when the inactive list is running low. This idea is similar to the current page cache. "Inactive running low" is currently defined as "# of inactive < # of active".
3. The active list rotation always starts from the tail. It moves nodes without the ref-bit set to the head of the inactive list, and moves nodes with the ref-bit set back to the head of the active list, clearing their ref-bit.
4. The inactive rotation is pretty simple. It walks the inactive list and moves a node back to the head of the active list if its ref-bit is set; the ref-bit is cleared after the move. If a node does not have the ref-bit set, it is left where it is, because it is already in the inactive list.

* Shrinking the Inactive List (of the global LRU list):
1. Shrinking is the operation that obtains free nodes when the bpf_htab is full.
2. It usually only shrinks the inactive list to get free nodes.
3. During shrinking, it walks the inactive list from the tail and deletes the nodes without the ref-bit set from the bpf_htab.
4. If no free node is found after step (3), it forcefully takes one node from the tail of the inactive or active list. "Forcefully" means that it ignores the ref-bit.

* Local List:
1. Each CPU has a 'struct bpf_lru_locallist'. The purpose is to batch enough operations before acquiring the lock of the global LRU list.
2. A local list has two sub-lists, free-list and pending-list.
3. During bpf_update_elem(), it first tries to get a node from the free-list of the current CPU's local list.
4. If the local free-list is empty, it acquires nodes from the global LRU list. The global LRU list can satisfy the request either from its global free-list or by shrinking the global inactive list. Since the global LRU list lock has been acquired anyway, it grabs at most LOCAL_FREE_TARGET elements for the local free-list.
5. When a new element is added to the bpf_htab, it first sits on the pending-list of the local list. The pending-list is flushed to the global LRU list the next time free nodes need to be acquired from the global list.

* Lock Consideration: The LRU list has a lock (lru_lock). Each bucket of the htab has a lock (buck_lock). If both locks need to be held together, the lock order is always lru_lock -> buck_lock, and this only happens in the bpf_lru_list.c logic. In hashtab.c, the two locks are never held together (one lock is always released before the other is acquired).
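For illustration only (not part of the original commit message), the update-side flow above reduces to the following sketch; 'struct my_elem' and 'my_alloc_elem' are hypothetical users of the API introduced below:

/* Hypothetical element embedding a bpf_lru_node, as the bpf_htab will
 * do in a later patch; bpf_lru_init() is assumed to have been called
 * with hash_offset == offsetof(struct my_elem, hash).
 */
struct my_elem {
	struct bpf_lru_node lru_node;
	u32 hash;
};

static struct my_elem *my_alloc_elem(struct bpf_lru *lru, u32 hash)
{
	/* bpf_lru_pop_free() tries, in order: the current CPU's local
	 * free-list; a batched refill of up to LOCAL_FREE_TARGET nodes
	 * from the global list (flushing the local pending-list and
	 * rotating/shrinking the global lists as needed); and finally
	 * stealing from another CPU's local lists.
	 */
	struct bpf_lru_node *node = bpf_lru_pop_free(lru, hash);

	return node ? container_of(node, struct my_elem, lru_node) : NULL;
}

Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S.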
Miller --- kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_lru_list.c | 567 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/bpf_lru_list.h | 80 +++++++ 3 files changed, 648 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_lru_list.c create mode 100644 kernel/bpf/bpf_lru_list.h (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index eed911d091da..c4d89d6e2058 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c new file mode 100644 index 000000000000..73f67094f93a --- /dev/null +++ b/kernel/bpf/bpf_lru_list.c @@ -0,0 +1,567 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/cpumask.h> +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include "bpf_lru_list.h" + +#define LOCAL_FREE_TARGET (128) +#define LOCAL_NR_SCANS LOCAL_FREE_TARGET + +/* Helpers to get the local list index */ +#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) +#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) +#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) +#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) + +static int get_next_cpu(int cpu) +{ + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_possible_mask); + return cpu; +} + +/* Local list helpers */ +static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_FREE_LIST_IDX]; +} + +static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) +{ + return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; +} + +/* bpf_lru_node helpers */ +static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) +{ + return node->ref; +} + +static void bpf_lru_list_count_inc(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]++; +} + +static void bpf_lru_list_count_dec(struct bpf_lru_list *l, + enum bpf_lru_list_type type) +{ + if (type < NR_BPF_LRU_LIST_COUNT) + l->counts[type]--; +} + +static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, + struct bpf_lru_node *node, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + /* If the removing node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also.
+ */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + bpf_lru_list_count_dec(l, node->type); + + node->type = tgt_free_type; + list_move(&node->list, free_list); +} + +/* Move nodes from local list to the LRU list */ +static void __bpf_lru_node_move_in(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + node->ref = 0; + list_move(&node->list, &l->lists[tgt_type]); +} + +/* Move nodes between or within active and inactive list (like + * active to inactive, inactive to active or tail of active back to + * the head of active). + */ +static void __bpf_lru_node_move(struct bpf_lru_list *l, + struct bpf_lru_node *node, + enum bpf_lru_list_type tgt_type) +{ + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || + WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) + return; + + if (node->type != tgt_type) { + bpf_lru_list_count_dec(l, node->type); + bpf_lru_list_count_inc(l, tgt_type); + node->type = tgt_type; + } + node->ref = 0; + + /* If the moving node is the next_inactive_rotation candidate, + * move the next_inactive_rotation pointer also. + */ + if (&node->list == l->next_inactive_rotation) + l->next_inactive_rotation = l->next_inactive_rotation->prev; + + list_move(&node->list, &l->lists[tgt_type]); +} + +static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) +{ + return l->counts[BPF_LRU_LIST_T_INACTIVE] < + l->counts[BPF_LRU_LIST_T_ACTIVE]; +} + +/* Rotate the active list: + * 1. Start from tail + * 2. If the node has the ref bit set, it will be rotated + * back to the head of active list with the ref bit cleared. + * Give this node one more chance to survive in the active list. + * 3. If the ref bit is not set, move it to the head of the + * inactive list. + * 4. It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int i = 0; + + first_node = list_first_entry(active, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, active, list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + + if (++i == lru->nr_scans || node == first_node) + break; + } +} + +/* Rotate the inactive list. It starts from the next_inactive_rotation + * 1. If the node has ref bit set, it will be moved to the head + * of active list with the ref bit cleared. + * 2. If the node does not have ref bit set, it will leave it + * at its current location (i.e. do nothing) so that it can + * be considered during the next inactive_shrink. + * 3. 
It will at most scan nr_scans nodes + */ +static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct list_head *cur, *next, *last; + struct bpf_lru_node *node; + unsigned int i = 0; + + if (list_empty(inactive)) + return; + + last = l->next_inactive_rotation->next; + if (last == inactive) + last = last->next; + + cur = l->next_inactive_rotation; + while (i < lru->nr_scans) { + if (cur == inactive) { + cur = cur->prev; + continue; + } + + node = list_entry(cur, struct bpf_lru_node, list); + next = cur->prev; + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + if (cur == last) + break; + cur = next; + i++; + } + + l->next_inactive_rotation = next; +} + +/* Shrink the inactive list. It starts from the tail of the + * inactive list and only move the nodes without the ref bit + * set to the designated free list. + */ +static unsigned int +__bpf_lru_list_shrink_inactive(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) +{ + struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + struct bpf_lru_node *node, *tmp_node, *first_node; + unsigned int nshrinked = 0; + unsigned int i = 0; + + first_node = list_first_entry(inactive, struct bpf_lru_node, list); + list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { + if (bpf_lru_node_is_ref(node)) { + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); + } else if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + if (++nshrinked == tgt_nshrink) + break; + } + + if (++i == lru->nr_scans) + break; + } + + return nshrinked; +} + +/* 1. Rotate the active list (if needed) + * 2. Always rotate the inactive list + */ +static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) +{ + if (bpf_lru_list_inactive_low(l)) + __bpf_lru_list_rotate_active(lru, l); + + __bpf_lru_list_rotate_inactive(lru, l); +} + +/* Calls __bpf_lru_list_shrink_inactive() to shrink some + * ref-bit-cleared nodes and move them to the designated + * free list. + * + * If it cannot get a free node after calling + * __bpf_lru_list_shrink_inactive(). It will just remove + * one node from either inactive or active list without + * honoring the ref-bit. It prefers inactive list to active + * list in this situation. 
+ */ +static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, + struct bpf_lru_list *l, + unsigned int tgt_nshrink, + struct list_head *free_list, + enum bpf_lru_list_type tgt_free_type) + +{ + struct bpf_lru_node *node, *tmp_node; + struct list_head *force_shrink_list; + unsigned int nshrinked; + + nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, + free_list, tgt_free_type); + if (nshrinked) + return nshrinked; + + /* Do a force shrink by ignoring the reference bit */ + if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) + force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + else + force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; + + list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, + list) { + if (lru->del_from_htab(lru->del_arg, node)) { + __bpf_lru_node_move_to_free(l, node, free_list, + tgt_free_type); + return 1; + } + } + + return 0; +} + +/* Flush the nodes from the local pending list to the LRU list */ +static void __local_list_flush(struct bpf_lru_list *l, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node, *tmp_node; + + list_for_each_entry_safe_reverse(node, tmp_node, + local_pending_list(loc_l), list) { + if (bpf_lru_node_is_ref(node)) + __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); + else + __bpf_lru_node_move_in(l, node, + BPF_LRU_LIST_T_INACTIVE); + } +} + +static void bpf_lru_list_push_free(struct bpf_lru_list *l, + struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) + return; + + raw_spin_lock_irqsave(&l->lock, flags); + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + struct bpf_lru_node *node, *tmp_node; + unsigned int nfree = 0; + + raw_spin_lock(&l->lock); + + __local_list_flush(l, loc_l); + + __bpf_lru_list_rotate(lru, l); + + list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], + list) { + __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + if (++nfree == LOCAL_FREE_TARGET) + break; + } + + if (nfree < LOCAL_FREE_TARGET) + __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + local_free_list(loc_l), + BPF_LRU_LOCAL_LIST_T_FREE); + + raw_spin_unlock(&l->lock); +} + +static void __local_list_add_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l, + int cpu, + struct bpf_lru_node *node, + u32 hash) +{ + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->cpu = cpu; + node->type = BPF_LRU_LOCAL_LIST_T_PENDING; + node->ref = 0; + list_add(&node->list, local_pending_list(loc_l)); +} + +struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + + node = list_first_entry_or_null(local_free_list(loc_l), + struct bpf_lru_node, + list); + if (node) + list_del(&node->list); + + return node; +} + +struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru, + struct bpf_lru_locallist *loc_l) +{ + struct bpf_lru_node *node; + bool force = false; + +ignore_ref: + /* Get from the tail (i.e. older element) of the pending list. 
*/ + list_for_each_entry_reverse(node, local_pending_list(loc_l), + list) { + if ((!bpf_lru_node_is_ref(node) || force) && + lru->del_from_htab(lru->del_arg, node)) { + list_del(&node->list); + return node; + } + } + + if (!force) { + force = true; + goto ignore_ref; + } + + return NULL; +} + +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +{ + struct bpf_lru_locallist *loc_l, *steal_loc_l; + struct bpf_common_lru *clru = &lru->common_lru; + struct bpf_lru_node *node; + int steal, first_steal; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + loc_l = per_cpu_ptr(clru->local_list, cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + node = __local_list_pop_free(loc_l); + if (!node) { + bpf_lru_list_pop_free_to_local(lru, loc_l); + node = __local_list_pop_free(loc_l); + } + + if (node) + __local_list_add_pending(lru, loc_l, cpu, node, hash); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + + if (node) + return node; + + /* No free nodes found from the local free list and + * the global LRU list. + * + * Steal from the local free/pending list of the + * current CPU and remote CPU in RR. It starts + * with the loc_l->next_steal CPU. + */ + + first_steal = loc_l->next_steal; + steal = first_steal; + do { + steal_loc_l = per_cpu_ptr(clru->local_list, steal); + + raw_spin_lock_irqsave(&steal_loc_l->lock, flags); + + node = __local_list_pop_free(steal_loc_l); + if (!node) + node = __local_list_pop_pending(lru, steal_loc_l); + + raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); + + steal = get_next_cpu(steal); + } while (!node && steal != first_steal); + + loc_l->next_steal = steal; + + if (node) { + raw_spin_lock_irqsave(&loc_l->lock, flags); + __local_list_add_pending(lru, loc_l, cpu, node, hash); + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + } + + return node; +} + +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +{ + unsigned long flags; + + if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + return; + + if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); + + raw_spin_lock_irqsave(&loc_l->lock, flags); + + if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + goto check_lru_list; + } + + node->type = BPF_LRU_LOCAL_LIST_T_FREE; + node->ref = 0; + list_move(&node->list, local_free_list(loc_l)); + + raw_spin_unlock_irqrestore(&loc_l->lock, flags); + return; + } + +check_lru_list: + bpf_lru_list_push_free(&lru->common_lru.lru_list, node); +} + +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + struct bpf_lru_list *l = &lru->common_lru.lru_list; + u32 i; + + for (i = 0; i < nr_elems; i++) { + struct bpf_lru_node *node; + + node = (struct bpf_lru_node *)(buf + node_offset); + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + buf += elem_size; + } +} + +static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) + INIT_LIST_HEAD(&loc_l->lists[i]); + + loc_l->next_steal = cpu; + + raw_spin_lock_init(&loc_l->lock); +} + +static void bpf_lru_list_init(struct bpf_lru_list *l) +{ + int i; + + for (i = 0; i < NR_BPF_LRU_LIST_T; i++) + INIT_LIST_HEAD(&l->lists[i]); + + for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) + 
l->counts[i] = 0; + + l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; + + raw_spin_lock_init(&l->lock); +} + +int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, + del_from_htab_func del_from_htab, void *del_arg) +{ + int cpu; + struct bpf_common_lru *clru = &lru->common_lru; + + clru->local_list = alloc_percpu(struct bpf_lru_locallist); + if (!clru->local_list) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + bpf_lru_locallist_init(loc_l, cpu); + } + + bpf_lru_list_init(&clru->lru_list); + lru->nr_scans = LOCAL_NR_SCANS; + + lru->del_from_htab = del_from_htab; + lru->del_arg = del_arg; + lru->hash_offset = hash_offset; + + return 0; +} + +void bpf_lru_destroy(struct bpf_lru *lru) +{ + free_percpu(lru->common_lru.local_list); +} diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h new file mode 100644 index 000000000000..aaa2445ed1ca --- /dev/null +++ b/kernel/bpf/bpf_lru_list.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __BPF_LRU_LIST_H_ +#define __BPF_LRU_LIST_H_ + +#include <linux/list.h> +#include <linux/spinlock_types.h> + +#define NR_BPF_LRU_LIST_T (3) +#define NR_BPF_LRU_LIST_COUNT (2) +#define NR_BPF_LRU_LOCAL_LIST_T (2) +#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T + +enum bpf_lru_list_type { + BPF_LRU_LIST_T_ACTIVE, + BPF_LRU_LIST_T_INACTIVE, + BPF_LRU_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_FREE, + BPF_LRU_LOCAL_LIST_T_PENDING, +}; + +struct bpf_lru_node { + struct list_head list; + u16 cpu; + u8 type; + u8 ref; +}; + +struct bpf_lru_list { + struct list_head lists[NR_BPF_LRU_LIST_T]; + unsigned int counts[NR_BPF_LRU_LIST_COUNT]; + /* The next inactive list rotation starts from here */ + struct list_head *next_inactive_rotation; + + raw_spinlock_t lock ____cacheline_aligned_in_smp; +}; + +struct bpf_lru_locallist { + struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; + u16 next_steal; + raw_spinlock_t lock; +}; + +struct bpf_common_lru { + struct bpf_lru_list lru_list; + struct bpf_lru_locallist __percpu *local_list; +}; + +typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); + +struct bpf_lru { + struct bpf_common_lru common_lru; + del_from_htab_func del_from_htab; + void *del_arg; + unsigned int hash_offset; + unsigned int nr_scans; +}; + +static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) +{ + /* ref is an approximation of access frequency. It does not + * have to be very accurate. Hence, no protection is used. + */ + node->ref = 1; +} + +int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, + del_from_htab_func del_from_htab, void *delete_arg); +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems); +void bpf_lru_destroy(struct bpf_lru *lru); +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); +void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); + +#endif -- cgit v1.2.3 From 961578b63474d13ad0e2f615fcc2901c5197dda6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:07 -0800 Subject: bpf: Add percpu LRU list Instead of having a common LRU list, this patch allows a percpu LRU list, which can be selected by specifying a map attribute.
The map attribute will be added in a later patch. While the common use case for LRU is #reads >> #updates, a percpu LRU list allows a bpf prog to absorb an unusual number of updates in pathological cases (e.g. an external-traffic-facing machine that could be under attack). Each percpu LRU list is isolated from the others, and the LRU nodes (including free nodes) cannot be moved across different LRU lists. Here is the update performance comparison between the common LRU list and the percpu LRU list (the test code is in the last patch): [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 16 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2934082 updates 4 cpus: 7391434 updates 8 cpus: 6500576 updates [root@kerneltest003.31.prn1 ~]# for i in 1 4 8; do echo -n "$i cpus: "; \ ./map_perf_test 32 $i | awk '{r += $3}END{print r " updates"}'; done 1 cpus: 2896553 updates 4 cpus: 9766395 updates 8 cpus: 17460553 updates Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/bpf_lru_list.c | 162 +++++++++++++++++++++++++++++++++++++++++----- kernel/bpf/bpf_lru_list.h | 8 ++- 2 files changed, 151 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 73f67094f93a..bfebff010ba9 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,6 +13,9 @@ #define LOCAL_FREE_TARGET (128) #define LOCAL_NR_SCANS LOCAL_FREE_TARGET +#define PERCPU_FREE_TARGET (16) +#define PERCPU_NR_SCANS PERCPU_FREE_TARGET + /* Helpers to get the local list index */ #define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) #define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) @@ -396,7 +399,40 @@ ignore_ref: return NULL; } -struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, + u32 hash) +{ + struct list_head *free_list; + struct bpf_lru_node *node = NULL; + struct bpf_lru_list *l; + unsigned long flags; + int cpu = raw_smp_processor_id(); + + l = per_cpu_ptr(lru->percpu_lru, cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_list_rotate(lru, l); + + free_list = &l->lists[BPF_LRU_LIST_T_FREE]; + if (list_empty(free_list)) + __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, + BPF_LRU_LIST_T_FREE); + + if (!list_empty(free_list)) { + node = list_first_entry(free_list, struct bpf_lru_node, list); + *(u32 *)((void *)node + lru->hash_offset) = hash; + node->ref = 0; + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); + } + + raw_spin_unlock_irqrestore(&l->lock, flags); + + return node; +} + +static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, + u32 hash) { struct bpf_lru_locallist *loc_l, *steal_loc_l; struct bpf_common_lru *clru = &lru->common_lru; @@ -458,7 +494,16 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) return node; } -void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +{ + if (lru->percpu) + return bpf_percpu_lru_pop_free(lru, hash); + else + return bpf_common_lru_pop_free(lru, hash); +} + +static void bpf_common_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) { unsigned long flags; @@ -490,8 +535,31 @@ check_lru_list: bpf_lru_list_push_free(&lru->common_lru.lru_list, node); } -void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, - u32 elem_size, u32 nr_elems) +static
void bpf_percpu_lru_push_free(struct bpf_lru *lru, + struct bpf_lru_node *node) +{ + struct bpf_lru_list *l; + unsigned long flags; + + l = per_cpu_ptr(lru->percpu_lru, node->cpu); + + raw_spin_lock_irqsave(&l->lock, flags); + + __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); + + raw_spin_unlock_irqrestore(&l->lock, flags); +} + +void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +{ + if (lru->percpu) + bpf_percpu_lru_push_free(lru, node); + else + bpf_common_lru_push_free(lru, node); +} + +void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; @@ -507,6 +575,47 @@ void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, } } +void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + u32 i, pcpu_entries; + int cpu; + struct bpf_lru_list *l; + + pcpu_entries = nr_elems / num_possible_cpus(); + + i = 0; + + for_each_possible_cpu(cpu) { + struct bpf_lru_node *node; + + l = per_cpu_ptr(lru->percpu_lru, cpu); +again: + node = (struct bpf_lru_node *)(buf + node_offset); + node->cpu = cpu; + node->type = BPF_LRU_LIST_T_FREE; + node->ref = 0; + list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } +} + +void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, + u32 elem_size, u32 nr_elems) +{ + if (lru->percpu) + bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); + else + bpf_common_lru_populate(lru, buf, node_offset, elem_size, + nr_elems); +} + static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { int i; @@ -534,26 +643,42 @@ static void bpf_lru_list_init(struct bpf_lru_list *l) raw_spin_lock_init(&l->lock); } -int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *del_arg) { int cpu; - struct bpf_common_lru *clru = &lru->common_lru; - clru->local_list = alloc_percpu(struct bpf_lru_locallist); - if (!clru->local_list) - return -ENOMEM; + if (percpu) { + lru->percpu_lru = alloc_percpu(struct bpf_lru_list); + if (!lru->percpu_lru) + return -ENOMEM; - for_each_possible_cpu(cpu) { - struct bpf_lru_locallist *loc_l; + for_each_possible_cpu(cpu) { + struct bpf_lru_list *l; - loc_l = per_cpu_ptr(clru->local_list, cpu); - bpf_lru_locallist_init(loc_l, cpu); - } + l = per_cpu_ptr(lru->percpu_lru, cpu); + bpf_lru_list_init(l); + } + lru->nr_scans = PERCPU_NR_SCANS; + } else { + struct bpf_common_lru *clru = &lru->common_lru; - bpf_lru_list_init(&clru->lru_list); - lru->nr_scans = LOCAL_NR_SCANS; + clru->local_list = alloc_percpu(struct bpf_lru_locallist); + if (!clru->local_list) + return -ENOMEM; + for_each_possible_cpu(cpu) { + struct bpf_lru_locallist *loc_l; + + loc_l = per_cpu_ptr(clru->local_list, cpu); + bpf_lru_locallist_init(loc_l, cpu); + } + + bpf_lru_list_init(&clru->lru_list); + lru->nr_scans = LOCAL_NR_SCANS; + } + + lru->percpu = percpu; lru->del_from_htab = del_from_htab; lru->del_arg = del_arg; lru->hash_offset = hash_offset; @@ -563,5 +688,8 @@ int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, void bpf_lru_destroy(struct bpf_lru *lru) { - free_percpu(lru->common_lru.local_list); + if (lru->percpu) + free_percpu(lru->percpu_lru); + else + free_percpu(lru->common_lru.local_list); } diff --git 
a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index aaa2445ed1ca..5c35a98d02bf 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -53,11 +53,15 @@ struct bpf_common_lru { typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); struct bpf_lru { - struct bpf_common_lru common_lru; + union { + struct bpf_common_lru common_lru; + struct bpf_lru_list __percpu *percpu_lru; + }; del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; unsigned int nr_scans; + bool percpu; }; static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) @@ -68,7 +72,7 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) node->ref = 1; } -int bpf_lru_init(struct bpf_lru *lru, u32 hash_offset, +int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *delete_arg); void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems); -- cgit v1.2.3 From fd91de7b3c69a7f108b92521e1115df3e058af55 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:08 -0800 Subject: bpf: Refactor codes handling percpu map Refactor the codes that populate the value of a htab_elem in a BPF_MAP_TYPE_PERCPU_HASH typed bpf_map. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ad1bc67aff1b..b478d80f9771 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -420,6 +420,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } } +static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + u32 size = round_up(htab->map.value_size, 8); + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, @@ -479,18 +497,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - if (!onallcpus) { - /* copy true value_size bytes */ - memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); - } else { - int off = 0, cpu; + pcpu_copy_value(htab, pptr, value, onallcpus); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); } else { @@ -606,22 +614,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, goto err; if (l_old) { - void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); - u32 size = htab->map.value_size; - /* per-cpu hash map can update value in-place */ - if (!onallcpus) { - memcpy(this_cpu_ptr(pptr), value, size); - } else { - int off = 0, cpu; - - size = round_up(size, 8); - for_each_possible_cpu(cpu) { - bpf_long_memcpy(per_cpu_ptr(pptr, cpu), - value + off, size); - off += size; - } - } + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); } else { l_new = alloc_htab_elem(htab, key, value, key_size, hash, true, onallcpus, false); -- cgit v1.2.3 From 
29ba732acbeece1e34c68483d1ec1f3720fa1bb3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:09 -0800 Subject: bpf: Add BPF_MAP_TYPE_LRU_HASH Provide a LRU version of the existing BPF_MAP_TYPE_HASH. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 8 ++ kernel/bpf/hashtab.c | 266 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 260 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2f38e0091b6..ed8c6799fb14 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -85,6 +85,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, }; enum bpf_prog_type { @@ -106,6 +107,13 @@ enum bpf_prog_type { #define BPF_EXIST 2 /* update existing element */ #define BPF_F_NO_PREALLOC (1U << 0) +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ +#define BPF_F_NO_COMMON_LRU (1U << 1) union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b478d80f9771..4a9e71a7c41f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -15,6 +15,7 @@ #include #include #include "percpu_freelist.h" +#include "bpf_lru_list.h" struct bucket { struct hlist_head head; @@ -25,7 +26,10 @@ struct bpf_htab { struct bpf_map map; struct bucket *buckets; void *elems; - struct pcpu_freelist freelist; + union { + struct pcpu_freelist freelist; + struct bpf_lru lru; + }; void __percpu *extra_elems; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ @@ -48,11 +52,19 @@ struct htab_elem { union { struct rcu_head rcu; enum extra_elem_state state; + struct bpf_lru_node lru_node; }; u32 hash; char key[0] __aligned(8); }; +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); + +static bool htab_is_lru(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH; +} + static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, void __percpu *pptr) { @@ -87,7 +99,22 @@ free_elems: vfree(htab->elems); } -static int prealloc_elems_and_freelist(struct bpf_htab *htab) +static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, + u32 hash) +{ + struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); + struct htab_elem *l; + + if (node) { + l = container_of(node, struct htab_elem, lru_node); + memcpy(l->key, key, htab->map.key_size); + return l; + } + + return NULL; +} + +static int prealloc_init(struct bpf_htab *htab) { int err = -ENOMEM, i; @@ -110,12 +137,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) } skip_percpu_elems: - err = pcpu_freelist_init(&htab->freelist); + if (htab_is_lru(htab)) + err = bpf_lru_init(&htab->lru, + htab->map.map_flags & BPF_F_NO_COMMON_LRU, + offsetof(struct htab_elem, hash) - + offsetof(struct htab_elem, lru_node), + htab_lru_map_delete_node, + htab); + else + err = pcpu_freelist_init(&htab->freelist); + if (err) goto free_elems; - pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, - htab->map.max_entries); + if (htab_is_lru(htab)) + bpf_lru_populate(&htab->lru, htab->elems, + offsetof(struct 
htab_elem, lru_node), + htab->elem_size, htab->map.max_entries); + else + pcpu_freelist_populate(&htab->freelist, htab->elems, + htab->elem_size, htab->map.max_entries); + return 0; free_elems: @@ -123,6 +165,16 @@ free_elems: return err; } +static void prealloc_destroy(struct bpf_htab *htab) +{ + htab_free_elems(htab); + + if (htab_is_lru(htab)) + bpf_lru_destroy(&htab->lru); + else + pcpu_freelist_destroy(&htab->freelist); +} + static int alloc_extra_elems(struct bpf_htab *htab) { void __percpu *pptr; @@ -144,14 +196,34 @@ static int alloc_extra_elems(struct bpf_htab *htab) static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; + bool lru = attr->map_type == BPF_MAP_TYPE_LRU_HASH; + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; u64 cost; - if (attr->map_flags & ~BPF_F_NO_PREALLOC) + if (lru && !capable(CAP_SYS_ADMIN)) + /* LRU implementation is much complicated than other + * maps. Hence, limit to CAP_SYS_ADMIN for now. + */ + return ERR_PTR(-EPERM); + + if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); + if (!lru && percpu_lru) + return ERR_PTR(-EINVAL); + + if (lru && !prealloc) + return ERR_PTR(-ENOTSUPP); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -171,6 +243,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size == 0) goto free_htab; + if (percpu_lru) { + /* ensure each CPU's lru list has >=1 elements. + * since we are at it, make each lru list has the same + * number of elements. + */ + htab->map.max_entries = roundup(attr->max_entries, + num_possible_cpus()); + if (htab->map.max_entries < attr->max_entries) + htab->map.max_entries = rounddown(attr->max_entries, + num_possible_cpus()); + } + /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); @@ -241,14 +325,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->buckets[i].lock); } - if (!percpu) { + if (!percpu && !lru) { + /* lru itself can remove the least used element, so + * there is no need for an extra elem during map_update. + */ err = alloc_extra_elems(htab); if (err) goto free_buckets; } - if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { - err = prealloc_elems_and_freelist(htab); + if (prealloc) { + err = prealloc_init(htab); if (err) goto free_extra_elems; } @@ -323,6 +410,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return l->key + round_up(map->key_size, 8); + } + + return NULL; +} + +/* It is called from the bpf_lru_list when the LRU needs to delete + * older elements from the htab. 
+ */ +static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) +{ + struct bpf_htab *htab = (struct bpf_htab *)arg; + struct htab_elem *l, *tgt_l; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + + tgt_l = container_of(node, struct htab_elem, lru_node); + b = __select_bucket(htab, tgt_l->hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l == tgt_l) { + hlist_del_rcu(&l->hash_node); + break; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + + return l == tgt_l; +} + /* Called from syscall */ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -579,6 +706,70 @@ err: return ret; } +static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old = NULL; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because getting free nodes from LRU may need + * to remove older elements from htab and this removal + * operation will need a bucket lock. + */ + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + bpf_lru_node_set_ref(&l_new->lru_node); + hlist_del_rcu(&l_old->hash_node); + } + ret = 0; + +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + + if (ret) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + else if (l_old) + bpf_lru_push_free(&htab->lru, &l_old->lru_node); + + return ret; +} + static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags, bool onallcpus) @@ -671,6 +862,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } +static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_irqsave(&b->lock, flags); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + ret = 0; + } + + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l) + bpf_lru_push_free(&htab->lru, &l->lru_node); + return ret; +} + static void delete_all_elements(struct bpf_htab *htab) { int i; @@ -703,12 +927,11 @@ static void htab_map_free(struct bpf_map *map) * not have executed. Wait for them. 
*/ rcu_barrier(); - if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + if (htab->map.map_flags & BPF_F_NO_PREALLOC) delete_all_elements(htab); - } else { - htab_free_elems(htab); - pcpu_freelist_destroy(&htab->freelist); - } + else + prealloc_destroy(htab); + free_percpu(htab->extra_elems); kvfree(htab->buckets); kfree(htab); @@ -728,6 +951,20 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +static const struct bpf_map_ops htab_lru_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_map_lookup_elem, + .map_update_elem = htab_lru_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_type __read_mostly = { + .ops = &htab_lru_ops, + .type = BPF_MAP_TYPE_LRU_HASH, +}; + /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -798,6 +1035,7 @@ static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); + bpf_register_map_type(&htab_lru_type); return 0; } late_initcall(register_htab_map); -- cgit v1.2.3 From 8f8449384ec364ba2a654f11f94e754e4ff719e0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 11 Nov 2016 10:55:10 -0800 Subject: bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH Provide a LRU version of the existing BPF_MAP_TYPE_PERCPU_HASH Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 +- kernel/bpf/hashtab.c | 129 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/bpf/syscall.c | 8 ++- 3 files changed, 131 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ed8c6799fb14..7d9b2832c280 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -86,6 +86,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, }; enum bpf_prog_type { @@ -108,7 +109,7 @@ enum bpf_prog_type { #define BPF_F_NO_PREALLOC (1U << 0) /* Instead of having one common LRU list in the - * BPF_MAP_TYPE_LRU_HASH map, use a percpu LRU list + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list * which can scale and perform better. * Note, the LRU nodes (including free nodes) cannot be moved * across different LRU lists. 
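For illustration only (a hedged sketch, not part of this patch): from userspace, a map using per-CPU LRU lists would be created through the bpf(2) syscall with the new flag. The attribute names follow the uapi above; key/value sizes are arbitrary and error handling is omitted.

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static int create_percpu_lru_map(void)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.map_type    = BPF_MAP_TYPE_LRU_HASH;
            attr.key_size    = sizeof(__u32);
            attr.value_size  = sizeof(__u64);
            attr.max_entries = 1024;  /* rounded to a multiple of the CPU count */
            attr.map_flags   = BPF_F_NO_COMMON_LRU;

            return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }

As htab_map_alloc() above shows, BPF_F_NO_COMMON_LRU causes max_entries to be rounded to a multiple of num_possible_cpus() so that every per-CPU LRU list holds the same number of elements.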
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4a9e71a7c41f..34debc1a9641 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -62,7 +62,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); static bool htab_is_lru(const struct bpf_htab *htab) { - return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH; + return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +} + +static bool htab_is_percpu(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || + htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; } static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, @@ -85,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab) { int i; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto free_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -122,7 +129,7 @@ static int prealloc_init(struct bpf_htab *htab) if (!htab->elems) return -ENOMEM; - if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + if (!htab_is_percpu(htab)) goto skip_percpu_elems; for (i = 0; i < htab->map.max_entries; i++) { @@ -195,8 +202,10 @@ static int alloc_extra_elems(struct bpf_htab *htab) /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { - bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; - bool lru = attr->map_type == BPF_MAP_TYPE_LRU_HASH; + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); /* percpu_lru means each cpu has its own LRU list. * it is different from BPF_MAP_TYPE_PERCPU_HASH where * the map's value itself is percpu. percpu_lru has @@ -823,12 +832,84 @@ err: return ret; } +static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* For LRU, we need to alloc before taking bucket's + * spinlock because LRU's elem alloc may need + * to remove older elem from htab and this removal + * operation will need a bucket lock. 
+ */ + if (map_flags != BPF_EXIST) { + l_new = prealloc_lru_pop(htab, key, hash); + if (!l_new) + return -ENOMEM; + } + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + bpf_lru_node_set_ref(&l_old->lru_node); + + /* per-cpu hash map can update value in-place */ + pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), + value, onallcpus); + } else { + pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + value, onallcpus); + hlist_add_head_rcu(&l_new->hash_node, head); + l_new = NULL; + } + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + if (l_new) + bpf_lru_push_free(&htab->lru, &l_new->lru_node); + return ret; +} + static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } +static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, + false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { @@ -976,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) { + bpf_lru_node_set_ref(&l->lru_node); + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + } + + return NULL; +} + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -993,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; + if (htab_is_lru(htab)) + bpf_lru_node_set_ref(&l->lru_node); pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, @@ -1008,10 +1104,16 @@ out: int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, u64 map_flags) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); int ret; rcu_read_lock(); - ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + if (htab_is_lru(htab)) + ret = __htab_lru_percpu_map_update_elem(map, key, value, + map_flags, true); + else + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, + true); rcu_read_unlock(); return ret; @@ -1031,11 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = { .type = BPF_MAP_TYPE_PERCPU_HASH, }; +static const struct bpf_map_ops htab_lru_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_lru_percpu_map_lookup_elem, + .map_update_elem = htab_lru_percpu_map_update_elem, + .map_delete_elem = htab_lru_map_delete_elem, +}; + +static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = { + .ops = &htab_lru_percpu_ops, + .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); bpf_register_map_type(&htab_lru_type); + bpf_register_map_type(&htab_lru_percpu_type); return 0; } 
late_initcall(register_htab_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 233e3ac836a6..ce1b7de7d72c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -292,6 +292,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -302,7 +303,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -366,6 +368,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else @@ -385,7 +388,8 @@ static int map_update_elem(union bpf_attr *attr) */ preempt_disable(); __this_cpu_inc(bpf_prog_active); - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); -- cgit v1.2.3 From fa32e8557b470f5ff90babc6cbacc61535a81a0f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jul 2016 15:25:08 -0400 Subject: tracing: Add new trace_marker_raw A new file is created: /sys/kernel/debug/tracing/trace_marker_raw This allows applications to create data structures and write the binary data directly into it, and then read the trace data out from trace_pipe_raw into the same type of data structure. This saves on converting numbers into ASCII that would be required by trace_marker. Suggested-by: Olof Johansson Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 6 ++ kernel/trace/trace.c | 165 +++++++++++++++++++++++++++++++++-------- kernel/trace/trace.h | 2 + kernel/trace/trace_entries.h | 15 ++++ kernel/trace/trace_output.c | 30 ++++++++ 5 files changed, 187 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 1bc66c1db0cb..6c374c5fe400 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -396,6 +396,12 @@ of ftrace. Here is a list of some of the key files: trace_fd = open("trace_marker", WR_ONLY); + trace_marker_raw: + + This is similar to trace_marker above, but is meant for binary data + to be written to it, where a tool can be used to parse the data + from trace_pipe_raw. + uprobe_events: Add dynamic tracepoints in programs.
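A hedged usage sketch (not part of the patch): the record layout below is hypothetical; the interface only requires that the data starts with an unsigned int tag and stays within the size limit checked in tracing_mark_raw_write().

    #include <fcntl.h>
    #include <unistd.h>

    struct my_raw_event {
            unsigned int id;        /* mandatory leading tag */
            int payload;            /* arbitrary binary data */
    };

    int main(void)
    {
            struct my_raw_event ev = { .id = 0x10, .payload = 42 };
            int fd = open("/sys/kernel/debug/tracing/trace_marker_raw", O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, &ev, sizeof(ev));
            close(fd);
            return 0;
    }

The id written here is what the TRACE_RAW_DATA output format below prints back, letting a tool pair each record read from trace_pipe_raw with the structure that encoded it.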
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d904516dfdab..57069e7f369c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4054,6 +4054,7 @@ static const char readme_msg[] = " x86-tsc: TSC cycle counter\n" #endif "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" " tracing_cpumask\t- Limit which CPUs to trace\n" " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" @@ -5514,35 +5515,15 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static ssize_t -tracing_mark_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static inline int lock_user_pages(const char __user *ubuf, size_t cnt, + struct page **pages, void **map_page, + int *offset) { unsigned long addr = (unsigned long)ubuf; - struct trace_array *tr = filp->private_data; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct print_entry *entry; - unsigned long irq_flags; - struct page *pages[2]; - void *map_page[2]; int nr_pages = 1; - ssize_t written; - int offset; - int size; - int len; int ret; int i; - if (tracing_disabled) - return -EINVAL; - - if (!(tr->trace_flags & TRACE_ITER_MARKERS)) - return -EINVAL; - - if (cnt > TRACE_BUF_SIZE) - cnt = TRACE_BUF_SIZE; - /* * Userspace is injecting traces into the kernel trace buffer. * We want to be as non intrusive as possible. @@ -5557,26 +5538,70 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, * pages directly. We then write the data directly into the * ring buffer. */ - BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); /* check if we cross pages */ if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) nr_pages = 2; - offset = addr & (PAGE_SIZE - 1); + *offset = addr & (PAGE_SIZE - 1); addr &= PAGE_MASK; ret = get_user_pages_fast(addr, nr_pages, 0, pages); if (ret < nr_pages) { while (--ret >= 0) put_page(pages[ret]); - written = -EFAULT; - goto out; + return -EFAULT; } for (i = 0; i < nr_pages; i++) map_page[i] = kmap_atomic(pages[i]); + return nr_pages; +} + +static inline void unlock_user_pages(struct page **pages, + void **map_page, int nr_pages) +{ + int i; + + for (i = nr_pages - 1; i >= 0; i--) { + kunmap_atomic(map_page[i]); + put_page(pages[i]); + } +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + void *map_page[2]; + int nr_pages = 1; + ssize_t written; + int offset; + int size; + int len; + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + + nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, &offset); + if (nr_pages < 0) + return nr_pages; + local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ buffer = tr->trace_buffer.buffer; @@ -5611,11 +5636,79 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, *fpos += written; out_unlock: - for (i = nr_pages - 1; i >= 0; i--) { - kunmap_atomic(map_page[i]); - put_page(pages[i]); + unlock_user_pages(pages, map_page, nr_pages); + + return written; +} + +/* Limit it for now to 3K 
(including tag) */ +#define RAW_DATA_MAX_SIZE (1024*3) + +static ssize_t +tracing_mark_raw_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct trace_array *tr = filp->private_data; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct raw_data_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + void *map_page[2]; + int nr_pages = 1; + ssize_t written; + int offset; + int size; + int len; + + if (tracing_disabled) + return -EINVAL; + + if (!(tr->trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + + /* The marker must at least have a tag id */ + if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); + + nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, &offset); + if (nr_pages < 0) + return nr_pages; + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt; + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, + irq_flags, preempt_count()); + if (!event) { + /* Ring buffer disabled, return as if not open for write */ + written = -EBADF; + goto out_unlock; } - out: + + entry = ring_buffer_event_data(event); + + if (nr_pages == 2) { + len = PAGE_SIZE - offset; + memcpy(&entry->id, map_page[0] + offset, len); + memcpy(((char *)&entry->id) + len, map_page[1], cnt - len); + } else + memcpy(&entry->id, map_page[0] + offset, cnt); + + __buffer_unlock_commit(buffer, event); + + written = cnt; + + *fpos += written; + + out_unlock: + unlock_user_pages(pages, map_page, nr_pages); + return written; } @@ -5945,6 +6038,13 @@ static const struct file_operations tracing_mark_fops = { .release = tracing_release_generic_tr, }; +static const struct file_operations tracing_mark_raw_fops = { + .open = tracing_open_generic_tr, + .write = tracing_mark_raw_write, + .llseek = generic_file_llseek, + .release = tracing_release_generic_tr, +}; + static const struct file_operations trace_clock_fops = { .open = tracing_clock_open, .read = seq_read, @@ -7214,6 +7314,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); + trace_create_file("trace_marker_raw", 0220, d_tracer, + tr, &tracing_mark_raw_fops); + trace_create_file("trace_clock", 0644, d_tracer, tr, &trace_clock_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4b7918902ab8..9294f8606ade 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -40,6 +40,7 @@ enum trace_type { TRACE_BLK, TRACE_BPUTS, TRACE_HWLAT, + TRACE_RAW_DATA, __TRACE_LAST_TYPE, }; @@ -331,6 +332,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ + IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index d1cc37e78f99..eb7396b7e7c3 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -244,6 +244,21 @@ FTRACE_ENTRY(print, print_entry, FILTER_OTHER ); +FTRACE_ENTRY(raw_data, raw_data_entry, + + TRACE_RAW_DATA, + + F_STRUCT( + __field( unsigned int, id ) + __dynamic_array( char, buf ) + ), + + F_printk("id:%04x %08x", + __entry->id, 
(int)__entry->buf[0]), + + FILTER_OTHER +); + FTRACE_ENTRY(bputs, bputs_entry, TRACE_BPUTS, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3fc20422c166..5d33a7352919 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1288,6 +1288,35 @@ static struct trace_event trace_print_event = { .funcs = &trace_print_funcs, }; +static enum print_line_t trace_raw_data(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct raw_data_entry *field; + int i; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "# %x buf:", field->id); + + for (i = 0; i < iter->ent_size - offsetof(struct raw_data_entry, buf); i++) + trace_seq_printf(&iter->seq, " %02x", + (unsigned char)field->buf[i]); + + trace_seq_putc(&iter->seq, '\n'); + + return trace_handle_return(&iter->seq); +} + +static struct trace_event_functions trace_raw_data_funcs = { + .trace = trace_raw_data, + .raw = trace_raw_data, +}; + +static struct trace_event trace_raw_data_event = { + .type = TRACE_RAW_DATA, + .funcs = &trace_raw_data_funcs, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, @@ -1299,6 +1328,7 @@ static struct trace_event *events[] __initdata = { &trace_bprint_event, &trace_print_event, &trace_hwlat_event, + &trace_raw_data_event, NULL }; -- cgit v1.2.3 From d032ae8921ea792c1e6b2abb44022b2403f651f6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 15 Nov 2016 12:31:20 -0800 Subject: ftrace: Provide API to use global filtering for ftrace ops Currently the global_ops filtering hash is not available to outside users registering for function tracing. Provide an API for those users to be able to choose global filtering. This is in preparation for pstore's ftrace feature to be able to use the global filters. 
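A hedged sketch of the intended call sequence; the callback and ops below are hypothetical stand-ins for a user such as pstore, not actual kernel code.

    static void my_trace_func(unsigned long ip, unsigned long parent_ip,
                              struct ftrace_ops *op, struct pt_regs *regs)
    {
            /* record ip, e.g. into a persistent buffer */
    }

    static struct ftrace_ops my_ops = {
            .func = my_trace_func,
    };

    static int __init my_tracer_init(void)
    {
            /* must run before registration, while the ops are uninitialized */
            ftrace_ops_set_global_filter(&my_ops);
            return register_ftrace_function(&my_ops);
    }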
Suggested-by: Steven Rostedt Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Joel Fernandes Acked-by: Steven Rostedt Signed-off-by: Kees Cook --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b3d34d3e0e7e..d4a884db16a3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -398,6 +398,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, void ftrace_set_global_filter(unsigned char *buf, int len, int reset); void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); void ftrace_free_filter(struct ftrace_ops *ops); +void ftrace_ops_set_global_filter(struct ftrace_ops *ops); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); @@ -645,6 +646,7 @@ static inline unsigned long ftrace_location(unsigned long ip) #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_free_filter(ops) do { } while (0) +#define ftrace_ops_set_global_filter(ops) do { } while (0) static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return -ENODEV; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2050a7652a86..89d46e1c9302 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4239,6 +4239,23 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); +/** + * ftrace_ops_set_global_filter - setup ops to use global filters + * @ops - the ops which will use the global filters + * + * ftrace users who need global function trace filtering should call this. + * It can set the global filter only if ops were not initialized before. + */ +void ftrace_ops_set_global_filter(struct ftrace_ops *ops) +{ + if (ops->flags & FTRACE_OPS_FL_INITIALIZED) + return; + + ftrace_ops_init(ops); + ops->func_hash = &global_ops.local_hash; +} +EXPORT_SYMBOL_GPL(ftrace_ops_set_global_filter); + static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) -- cgit v1.2.3 From 74ba181e61c6accf9066d6980f44588de2f854f6 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 11 Nov 2016 00:10:08 -0500 Subject: timer: Move sys_alarm from timer.c to itimer.c Move the only user of alarm_setitimer to itimer.c where it is defined. This allows for making alarm_setitimer static, and dropping it from the build when __ARCH_WANT_SYS_ALARM is not defined. 
Signed-off-by: Nicolas Pitre Acked-by: John Stultz Cc: Paul Bolle Cc: linux-kbuild@vger.kernel.org Cc: netdev@vger.kernel.org Cc: Richard Cochran Cc: Josh Triplett Cc: Michal Marek Cc: Edward Cree Link: http://lkml.kernel.org/r/1478841010-28605-5-git-send-email-nicolas.pitre@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/time.h | 2 -- kernel/time/itimer.c | 15 ++++++++++++++- kernel/time/timer.c | 13 ------------- 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index 4cea09d94208..23f0f5ce3090 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -172,8 +172,6 @@ extern int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue); extern int do_getitimer(int which, struct itimerval *value); -extern unsigned int alarm_setitimer(unsigned int seconds); - extern long do_utimes(int dfd, const char __user *filename, struct timespec *times, int flags); struct tms; diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 1d5c7204ddc9..2b9f45bc955d 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -238,6 +238,8 @@ again: return 0; } +#ifdef __ARCH_WANT_SYS_ALARM + /** * alarm_setitimer - set alarm in seconds * @@ -250,7 +252,7 @@ again: * On 32 bit machines the seconds value is limited to (INT_MAX/2) to avoid * negative timeval settings which would cause immediate expiry. */ -unsigned int alarm_setitimer(unsigned int seconds) +static unsigned int alarm_setitimer(unsigned int seconds) { struct itimerval it_new, it_old; @@ -275,6 +277,17 @@ unsigned int alarm_setitimer(unsigned int seconds) return it_old.it_value.tv_sec; } +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, struct itimerval __user *, ovalue) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 88aab86a4594..42d27aa242b9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1662,19 +1662,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -#ifdef __ARCH_WANT_SYS_ALARM - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. - */ -SYSCALL_DEFINE1(alarm, unsigned int, seconds) -{ - return alarm_setitimer(seconds); -} - -#endif - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); -- cgit v1.2.3 From 53d3eaa31508222e445b489f3c3ac4c63542a4ef Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 11 Nov 2016 00:10:09 -0500 Subject: posix_cpu_timers: Move the add_device_randomness() call to a proper place There is no logical relation between add_device_randomness() and posix_cpu_timers_exit(). Let's move the former to where the latter is called. This way, when posix-cpu-timers.c is compiled out, there is no need to worry about losing a call to add_device_randomness().
Signed-off-by: Nicolas Pitre Acked-by: John Stultz Cc: Paul Bolle Cc: linux-kbuild@vger.kernel.org Cc: netdev@vger.kernel.org Cc: Richard Cochran Cc: Josh Triplett Cc: Michal Marek Cc: Edward Cree Link: http://lkml.kernel.org/r/1478841010-28605-6-git-send-email-nicolas.pitre@linaro.org Signed-off-by: Thomas Gleixner --- kernel/exit.c | 4 ++++ kernel/time/posix-cpu-timers.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9d68c45ebbe3..d16bcdd89dbe 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -54,6 +54,7 @@ #include #include #include +#include <linux/random.h> #include #include @@ -116,6 +117,9 @@ static void __exit_signal(struct task_struct *tsk) sig->curr_target = next_thread(tsk); } + add_device_randomness((const void*) &tsk->se.sum_exec_runtime, + sizeof(unsigned long long)); + /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 39008d78927a..e582f20f47a4 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -9,7 +9,6 @@ #include #include #include -#include <linux/random.h> #include #include @@ -447,10 +446,7 @@ static void cleanup_timers(struct list_head *head) */ void posix_cpu_timers_exit(struct task_struct *tsk) { - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, - sizeof(unsigned long long)); cleanup_timers(tsk->cpu_timers); - } void posix_cpu_timers_exit_group(struct task_struct *tsk) { -- cgit v1.2.3 From baa73d9e478ff32d62f3f9422822b59dd9a95a21 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 11 Nov 2016 00:10:10 -0500 Subject: posix-timers: Make them configurable Some embedded systems have no use for them. This removes about 25KB from the kernel binary size when configured out. Corresponding syscalls are routed to a stub that logs the attempt to use those syscalls, which should be enough of a clue if they were disabled without proper consideration. They are: timer_create, timer_gettime, timer_getoverrun, timer_settime, timer_delete, clock_adjtime, setitimer, getitimer, alarm. The clock_settime, clock_gettime, clock_getres and clock_nanosleep syscalls are replaced by simple wrappers compatible with CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_BOOTTIME only, which should cover the vast majority of use cases with very little code.
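For illustration (hypothetical userspace, not part of the patch): on a kernel built with CONFIG_POSIX_TIMERS=n, one of the stubbed syscalls simply fails with ENOSYS and the kernel logs the attempt.

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            timer_t id;
            struct sigevent sev = {
                    .sigev_notify = SIGEV_SIGNAL,
                    .sigev_signo  = SIGALRM,
            };

            if (timer_create(CLOCK_MONOTONIC, &sev, &id) && errno == ENOSYS)
                    fprintf(stderr, "POSIX timers not compiled in\n");
            return 0;
    }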
Signed-off-by: Nicolas Pitre Acked-by: Richard Cochran Acked-by: Thomas Gleixner Acked-by: John Stultz Reviewed-by: Josh Triplett Cc: Paul Bolle Cc: linux-kbuild@vger.kernel.org Cc: netdev@vger.kernel.org Cc: Michal Marek Cc: Edward Cree Link: http://lkml.kernel.org/r/1478841010-28605-7-git-send-email-nicolas.pitre@linaro.org Signed-off-by: Thomas Gleixner --- arch/alpha/kernel/osf_sys.c | 8 +++ drivers/char/Kconfig | 1 + drivers/ptp/Kconfig | 2 +- fs/exec.c | 2 + init/Kconfig | 17 ++++++ kernel/compat.c | 8 +++ kernel/exit.c | 11 +++- kernel/fork.c | 2 + kernel/signal.c | 6 +++ kernel/sys.c | 3 +- kernel/time/Makefile | 10 +++- kernel/time/alarmtimer.c | 6 ++- kernel/time/posix-stubs.c | 123 ++++++++++++++++++++++++++++++++++++++++++++ kernel/time/timer.c | 3 +- security/selinux/hooks.c | 11 ++-- 15 files changed, 200 insertions(+), 13 deletions(-) create mode 100644 kernel/time/posix-stubs.c (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index ffb93f499c83..56e427c7aa3c 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1029,11 +1029,16 @@ SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv, return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } +asmlinkage long sys_ni_posix_timers(void); + SYSCALL_DEFINE2(osf_getitimer, int, which, struct itimerval32 __user *, it) { struct itimerval kit; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + error = do_getitimer(which, &kit); if (!error && put_it32(it, &kit)) error = -EFAULT; @@ -1047,6 +1052,9 @@ SYSCALL_DEFINE3(osf_setitimer, int, which, struct itimerval32 __user *, in, struct itimerval kin, kout; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + if (in) { if (get_it32(&kin, in)) return -EFAULT; diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index dcc09739a54e..45ba878ae025 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -542,6 +542,7 @@ config HANGCHECK_TIMER config MMTIMER tristate "MMTIMER Memory mapped RTC for SGI Altix" depends on IA64_GENERIC || IA64_SGI_SN2 + depends on POSIX_TIMERS default y help The mmtimer device allows direct userspace access to the diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 0f7492f8ea23..bdce33291161 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -6,7 +6,7 @@ menu "PTP clock support" config PTP_1588_CLOCK tristate "PTP clock support" - depends on NET + depends on NET && POSIX_TIMERS select PPS select NET_PTP_CLASSIFY help diff --git a/fs/exec.c b/fs/exec.c index 4e497b9ee71e..923c57d96899 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1169,8 +1169,10 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; +#ifdef CONFIG_POSIX_TIMERS exit_itimers(sig); flush_itimer_signals(); +#endif if (atomic_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; diff --git a/init/Kconfig b/init/Kconfig index 34407f15e6d3..456e0b891238 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1445,6 +1445,23 @@ config SYSCTL_SYSCALL If unsure say N here. +config POSIX_TIMERS + bool "Posix Clocks & timers" if EXPERT + default y + help + This includes native support for POSIX timers to the kernel. + Some embedded systems have no use for them and therefore they + can be configured out to reduce the size of the kernel image. 
+ + When this option is disabled, the following syscalls won't be + available: timer_create, timer_gettime: timer_getoverrun, + timer_settime, timer_delete, clock_adjtime, getitimer, + setitimer, alarm. Furthermore, the clock_settime, clock_gettime, + clock_getres and clock_nanosleep syscalls will be limited to + CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_BOOTTIME only. + + If unsure say y. + config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT default y diff --git a/kernel/compat.c b/kernel/compat.c index 333d364be29d..b3a047f208a7 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -307,12 +307,17 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o, __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); } +asmlinkage long sys_ni_posix_timers(void); + COMPAT_SYSCALL_DEFINE2(getitimer, int, which, struct compat_itimerval __user *, it) { struct itimerval kit; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + error = do_getitimer(which, &kit); if (!error && put_compat_itimerval(it, &kit)) error = -EFAULT; @@ -326,6 +331,9 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, struct itimerval kin, kout; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + if (in) { if (get_compat_itimerval(&kin, in)) return -EFAULT; diff --git a/kernel/exit.c b/kernel/exit.c index d16bcdd89dbe..684de019b674 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -92,11 +92,10 @@ static void __exit_signal(struct task_struct *tsk) lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); +#ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) { posix_cpu_timers_exit_group(tsk); - tty = sig->tty; - sig->tty = NULL; } else { /* * This can only happen if the caller is de_thread(). 
@@ -105,7 +104,13 @@ static void __exit_signal(struct task_struct *tsk) */ if (unlikely(has_group_leader_pid(tsk))) posix_cpu_timers_exit_group(tsk); + } +#endif + if (group_dead) { + tty = sig->tty; + sig->tty = NULL; + } else { /* * If there is any task waiting for the group exit * then notify it: @@ -803,8 +808,10 @@ void __noreturn do_exit(long code) acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { +#ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); +#endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..17da35fa77e7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1342,8 +1342,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); +#ifdef CONFIG_POSIX_TIMERS hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; +#endif task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); diff --git a/kernel/signal.c b/kernel/signal.c index 75761acc77cf..29a410780aa9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -427,6 +427,7 @@ void flush_signals(struct task_struct *t) spin_unlock_irqrestore(&t->sighand->siglock, flags); } +#ifdef CONFIG_POSIX_TIMERS static void __flush_itimer_signals(struct sigpending *pending) { sigset_t signal, retain; @@ -460,6 +461,7 @@ void flush_itimer_signals(void) __flush_itimer_signals(&tsk->signal->shared_pending); spin_unlock_irqrestore(&tsk->sighand->siglock, flags); } +#endif void ignore_signals(struct task_struct *t) { @@ -567,6 +569,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); +#ifdef CONFIG_POSIX_TIMERS /* * itimer signal ? 
* @@ -590,6 +593,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) hrtimer_restart(tmr); } } +#endif } recalc_sigpending(); @@ -611,6 +615,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } +#ifdef CONFIG_POSIX_TIMERS if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* * Release the siglock to ensure proper locking order @@ -622,6 +627,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) do_schedule_next_timer(info); spin_lock(&tsk->sighand->siglock); } +#endif return signr; } diff --git a/kernel/sys.c b/kernel/sys.c index 89d5be418157..78c9fb7dd680 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1416,7 +1416,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, * applications, so we live with it */ if (!retval && new_rlim && resource == RLIMIT_CPU && - new_rlim->rlim_cur != RLIM_INFINITY) + new_rlim->rlim_cur != RLIM_INFINITY && + IS_ENABLED(CONFIG_POSIX_TIMERS)) update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: read_unlock(&tasklist_lock); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 49eca0beed32..976840d29a71 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,12 @@ -obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o +obj-y += time.o timer.o hrtimer.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o alarmtimer.o + +ifeq ($(CONFIG_POSIX_TIMERS),y) + obj-y += posix-timers.o posix-cpu-timers.o posix-clock.o itimer.o +else + obj-y += posix-stubs.o +endif obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 12dd190634ab..a15caa3d1721 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -846,8 +846,10 @@ static int __init alarmtimer_init(void) alarmtimer_rtc_timer_init(); - posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); - posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); + posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); + } /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c new file mode 100644 index 000000000000..cd6716e115e8 --- /dev/null +++ b/kernel/time/posix-stubs.c @@ -0,0 +1,123 @@ +/* + * Dummy stubs used when CONFIG_POSIX_TIMERS=n + * + * Created by: Nicolas Pitre, July 2016 + * Copyright: (C) 2016 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage long sys_ni_posix_timers(void) +{ + pr_err_once("process %d (%s) attempted a POSIX timer syscall " + "while CONFIG_POSIX_TIMERS is not set\n", + current->pid, current->comm); + return -ENOSYS; +} + +#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) + +SYS_NI(timer_create); +SYS_NI(timer_gettime); +SYS_NI(timer_getoverrun); +SYS_NI(timer_settime); +SYS_NI(timer_delete); +SYS_NI(clock_adjtime); +SYS_NI(getitimer); +SYS_NI(setitimer); +#ifdef __ARCH_WANT_SYS_ALARM +SYS_NI(alarm); +#endif + +/* + * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC + * as it is easy to remain compatible with little code. CLOCK_BOOTTIME + * is also included for convenience as at least systemd uses it. + */ + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct timespec __user *, tp) +{ + struct timespec new_tp; + + if (which_clock != CLOCK_REALTIME) + return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) + return -EFAULT; + return do_sys_settimeofday(&new_tp, NULL); +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *,tp) +{ + struct timespec kernel_tp; + + switch (which_clock) { + case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; + case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; + case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; + default: return -EINVAL; + } + if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) +{ + struct timespec rtn_tp = { + .tv_sec = 0, + .tv_nsec = hrtimer_resolution, + }; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp))) + return -EFAULT; + return 0; + default: + return -EINVAL; + } +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct timespec t; + + switch (which_clock) { + case CLOCK_REALTIME: + case CLOCK_MONOTONIC: + case CLOCK_BOOTTIME: + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + if (!timespec_valid(&t)) + return -EINVAL; + return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? 
+ HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); + default: + return -EINVAL; + } +} + +#ifdef CONFIG_COMPAT +long clock_nanosleep_restart(struct restart_block *restart_block) +{ + return hrtimer_nanosleep_restart(restart_block); +} +#endif diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 42d27aa242b9..e2892e454fe3 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1601,7 +1601,8 @@ void update_process_times(int user_tick) irq_work_tick(); #endif scheduler_tick(); - run_posix_cpu_timers(p); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(p); } /** diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 09fd6108e421..38b79d797aaf 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2525,7 +2525,8 @@ static void selinux_bprm_committing_creds(struct linux_binprm *bprm) rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } task_unlock(current); - update_rlimit_cpu(current, rlimit(RLIMIT_CPU)); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + update_rlimit_cpu(current, rlimit(RLIMIT_CPU)); } } @@ -2555,9 +2556,11 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) */ rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL); if (rc) { - memset(&itimer, 0, sizeof itimer); - for (i = 0; i < 3; i++) - do_setitimer(i, &itimer, NULL); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) { + memset(&itimer, 0, sizeof itimer); + for (i = 0; i < 3; i++) + do_setitimer(i, &itimer, NULL); + } spin_lock_irq(&current->sighand->siglock); if (!fatal_signal_pending(current)) { flush_sigqueue(&current->pending); -- cgit v1.2.3 From bf0d31c05411f030e7df9f98b5be4d0c6fd5cd39 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Oct 2016 11:03:12 +0200 Subject: locking/core, stop_machine: Yield the CPU during stop machine() Some time ago the following commit: 57f2ffe14fd125c2 ("s390: remove diag 44 calls from cpu_relax()") ... stopped cpu_relax() on s390 yielding to the hypervisor. As it turns out this made stop_machine() run really slow on virtualized overcommitted systems. For example the kprobes test during bootup took several seconds instead of just running unnoticed with large guests. Therefore, yielding was reintroduced with commit: 4d92f50249eb ("s390: reintroduce diag 44 calls for cpu_relax()") ... but in fact the stop machine code seems to be the only place where this yielding was really necessary. This place is probably the most important one as it makes all but one guest CPU wait for one guest CPU. As we now have cpu_relax_yield(), we can use this in multi_cpu_stop(). For now let's only add it here. We can add it later in other places when necessary.
Signed-off-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Noam Camus Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Cc: linuxppc-dev@lists.ozlabs.org Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1477386195-32736-3-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ec9ab2f01489..1eb82661ecdb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -194,7 +194,7 @@ static int multi_cpu_stop(void *data) /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax(); + cpu_relax_yield(); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { -- cgit v1.2.3 From f2f09a4cee3507dba0e24b87ba2961a5c377d3a7 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Oct 2016 11:03:14 +0200 Subject: locking/core: Remove cpu_relax_lowlatency() users With the s390 special case of a yielding cpu_relax() implementation gone, we can now remove all users of cpu_relax_lowlatency() and replace them with cpu_relax(). Signed-off-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Noam Camus Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Cc: linuxppc-dev@lists.ozlabs.org Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1477386195-32736-5-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- drivers/gpu/drm/i915/i915_gem_request.c | 2 +- drivers/vhost/net.c | 4 ++-- kernel/locking/mcs_spinlock.h | 4 ++-- kernel/locking/mutex.c | 4 ++-- kernel/locking/osq_lock.c | 6 +++--- kernel/locking/qrwlock.c | 6 +++--- kernel/locking/rwsem-xadd.c | 4 ++-- lib/lockref.c | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 8832f8ec1583..383d13416442 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -723,7 +723,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req, if (busywait_stop(timeout_us, cpu)) break; - cpu_relax_lowlatency(); + cpu_relax(); } while (!need_resched()); return false; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 5dc128a8da83..5dc34653274a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -342,7 +342,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net, endtime = busy_clock() + vq->busyloop_timeout; while (vhost_can_busy_poll(vq->dev, endtime) && vhost_vq_avail_empty(vq->dev, vq)) - cpu_relax_lowlatency(); + cpu_relax(); preempt_enable(); r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), out_num, in_num, NULL, NULL); @@ -533,7 +533,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) while (vhost_can_busy_poll(&net->dev, endtime) && !sk_has_rx_data(sk) && vhost_vq_avail_empty(&net->dev, vq)) - cpu_relax_lowlatency(); + cpu_relax(); preempt_enable(); diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index c835270f0c2f..6a385aabcce7 100644 
--- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -28,7 +28,7 @@ struct mcs_spinlock { #define arch_mcs_spin_lock_contended(l) \ do { \ while (!(smp_load_acquire(l))) \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } while (0) #endif @@ -108,7 +108,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) return; /* Wait until the next pointer is set */ while (!(next = READ_ONCE(node->next))) - cpu_relax_lowlatency(); + cpu_relax(); } /* Pass lock to next waiter. */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 17a88e929e6a..a65e09a046ac 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -369,7 +369,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) break; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); @@ -492,7 +492,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax_lowlatency(); + cpu_relax(); } if (!waiter) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 05a37857ab55..4ea2710b9d6c 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -75,7 +75,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, break; } - cpu_relax_lowlatency(); + cpu_relax(); } return next; @@ -122,7 +122,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) if (need_resched()) goto unqueue; - cpu_relax_lowlatency(); + cpu_relax(); } return true; @@ -148,7 +148,7 @@ unqueue: if (smp_load_acquire(&node->locked)) return true; - cpu_relax_lowlatency(); + cpu_relax(); /* * Or we race against a concurrent unqueue()'s step-B, in which diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 19248ddf37ce..cc3ed0ccdfa2 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -54,7 +54,7 @@ static __always_inline void rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { - cpu_relax_lowlatency(); + cpu_relax(); cnts = atomic_read_acquire(&lock->cnts); } } @@ -130,7 +130,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; - cpu_relax_lowlatency(); + cpu_relax(); } /* When no more readers, set the locked flag */ @@ -141,7 +141,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) _QW_LOCKED) == _QW_WAITING)) break; - cpu_relax_lowlatency(); + cpu_relax(); } unlock: arch_spin_unlock(&lock->wait_lock); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2337b4bb2366..2fa2e2e64950 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -368,7 +368,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) return false; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); out: @@ -423,7 +423,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. 
*/ - cpu_relax_lowlatency(); + cpu_relax(); } osq_unlock(&sem->osq); done: diff --git a/lib/lockref.c b/lib/lockref.c index 5a92189ad711..c4bfcb8836cd 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -20,7 +20,7 @@ if (likely(old.lock_count == prev.lock_count)) { \ SUCCESS; \ } \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } \ } while (0) -- cgit v1.2.3 From 527b0a76f41d062381adbb55c8eb61e32cb0bfc9 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 11 Nov 2016 15:27:49 +0100 Subject: sched/cpuacct: Avoid %lld seq_printf warning For s390 kernel builds I keep getting this warning: kernel/sched/cpuacct.c: In function 'cpuacct_stats_show': kernel/sched/cpuacct.c:298:25: warning: format '%lld' expects argument of type 'long long int', but argument 4 has type 'clock_t {aka long int}' [-Wformat=] seq_printf(sf, "%s %lld\n", Silence the warning by adding an explicit cast. Signed-off-by: Martin Schwidefsky Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161111142749.6545-1-schwidefsky@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index bc0b309c3f19..9add206b5608 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[stat], - cputime64_to_clock_t(val[stat])); + (long long)cputime64_to_clock_t(val[stat])); } return 0; -- cgit v1.2.3 From 104cb16d9eb684f071d5bf3aa87c0d01af259b7c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:07 +0100 Subject: sched/fair: Compute task/cpu utilization at wake-up correctly At task wake-up load-tracking isn't updated until the task is enqueued. The task's own view of its utilization contribution may therefore not be aligned with its contribution to the cfs_rq load-tracking which may have been updated in the meantime. Basically, the task's own utilization hasn't yet accounted for the sleep decay, while the cfs_rq may have (partially). Estimating the cfs_rq utilization in case the task is migrated at wake-up as task_rq(p)->cfs.avg.util_avg - p->se.avg.util_avg is therefore incorrect as the two load-tracking signals aren't time synchronized (different last update). To solve this problem, this patch synchronizes the task utilization with its previous rq before the task utilization is used in the wake-up path. Currently the update/synchronization is done _after_ the task has been placed by select_task_rq_fair(). The synchronization is done without having to take the rq lock using the existing mechanism used in remove_entity_load_avg(). 
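Distilled into a hedged sketch (the helper name is invented; the real logic is split across sync_entity_load_avg(), cpu_util_wake() and wake_cap() in the diff below):

    /* Illustrative only: p's utilization may be estimated and subtracted
     * from its previous CPU's only after p's signal has been decayed up
     * to that CPU's last load-tracking update. */
    static unsigned long cpu_util_without(int cpu, struct task_struct *p)
    {
            sync_entity_load_avg(&p->se);   /* apply the sleep decay */
            return max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
    }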
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3cf446c53043..b05d691bbda8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3198,6 +3198,19 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) } #endif +/* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). @@ -3205,7 +3218,6 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* * tasks cannot exit without having gone through wake_up_new_task() -> @@ -3217,9 +3229,7 @@ void remove_entity_load_avg(struct sched_entity *se) * calls this. */ - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -5582,6 +5592,24 @@ static inline int task_util(struct task_struct *p) return p->se.avg.util_avg; } +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, capacity; + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + /* * Disable WAKE_AFFINE in the case where task @p doesn't fit in the * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. @@ -5600,6 +5628,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) if (max_cap - min_cap < max_cap >> 3) return 0; + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + return min_cap * 1024 < task_util(p) * capacity_margin; } -- cgit v1.2.3 From 6a0b19c0f39a7a7b7fb77d3867a733136ff059a3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:08 +0100 Subject: sched/fair: Consider spare capacity in find_idlest_group() In low-utilization scenarios comparing relative loads in find_idlest_group() doesn't always lead to an optimal choice. Systems with groups containing different numbers of cpus and/or cpus of different compute capacity are significantly better off when considering spare capacity rather than relative load in those scenarios.
In addition to existing load based search an alternative spare capacity based candidate sched_group is found and selected instead if sufficient spare capacity exists. If not, existing behaviour is preserved. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b05d691bbda8..1ad37064c0c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5202,6 +5202,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return 1; } +static inline int task_util(struct task_struct *p); +static int cpu_util_wake(int cpu, struct task_struct *p); + +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5211,7 +5219,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *most_spare_sg = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -5219,7 +5229,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -5231,8 +5241,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -5242,6 +5256,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + spare_cap = capacity_spare_wake(i, p); + + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -5249,12 +5268,33 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_spare = max_spare_cap; + } else { + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. 
+ */ + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; -- cgit v1.2.3 From bf475ce0a3dd75b5d1df6c6c14ae25168caa15ac Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:09 +0100 Subject: sched/fair: Add per-CPU min capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all CPUs in the sched_group. Unless it is divided by the group_weight to get the average capacity per CPU, it hides differences in CPU capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers CPUs of different capacities. Instead, by extending struct sched_group_capacity to indicate min per-CPU capacity in the group a suitable group for a given task utilization can more easily be found such that CPUs with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- kernel/sched/fair.c | 17 ++++++++++++----- kernel/sched/sched.h | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3cfa0dd5b34..6bf1fd3514d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5708,7 +5708,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -6185,6 +6185,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. 
*/ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1ad37064c0c2..faf8f18616e6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6909,13 +6909,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6928,6 +6929,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -6952,11 +6954,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + min_capacity = min(capacity, min_capacity); } } else { /* @@ -6966,12 +6969,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + min_capacity = min(sgc->min_capacity, min_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->min_capacity = min_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 055f935d4421..345c1ccaba34 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -892,7 +892,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long min_capacity; /* Min per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ -- cgit v1.2.3 From 9e0994c0a1c1f82c705f1f66388e1bcffcee8bb9 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:10 +0100 Subject: sched/fair: Avoid pulling tasks from non-overloaded higher capacity groups For asymmetric CPU capacity systems it is counter-productive for throughput if low capacity CPUs are pulling tasks from non-overloaded CPUs with higher capacity. The assumption is that higher CPU capacity is preferred over running alone in a group with lower CPU capacity. This patch rejects higher CPU capacity groups with one or less task per CPU as potential busiest group which could otherwise lead to a series of failing load-balancing attempts leading to a force-migration. 
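The rejection rule can be sketched as a standalone check (a model only: capacity_margin matches the scheduler's 1280, and struct group stands in for the sg_lb_stats/sched_group_capacity fields used below):

#include <stdbool.h>
#include <stdio.h>

#define CAPACITY_MARGIN 1280	/* ~20%, as in the scheduler */

struct group { unsigned long min_capacity; unsigned int nr_running, weight; };

/* Mirrors group_smaller_cpu_capacity() from the patch below. */
static bool smaller_cpu_capacity(struct group *sg, struct group *ref)
{
	return sg->min_capacity * CAPACITY_MARGIN < ref->min_capacity * 1024;
}

static bool reject_as_busiest(struct group *local, struct group *sg)
{
	/* At most one task per CPU and clearly higher capacity:
	 * pulling from it would only hurt throughput. */
	return sg->nr_running <= sg->weight &&
	       smaller_cpu_capacity(local, sg);
}

int main(void)
{
	struct group little = { .min_capacity = 446 };
	struct group big = { .min_capacity = 1024, .nr_running = 2, .weight = 2 };

	printf("%d\n", reject_as_busiest(&little, &big));	/* 1: leave it alone */
	big.nr_running = 4;					/* now overloaded */
	printf("%d\n", reject_as_busiest(&little, &big));	/* 0: fair game */
	return 0;
}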
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index faf8f18616e6..ee39bfda5ae5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7073,6 +7073,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->min_capacity * capacity_margin < + ref->sgc->min_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7176,6 +7187,20 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; -- cgit v1.2.3 From 893c5d2279041afeb593f1fa8edd9d02edf5b7cb Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Oct 2016 14:41:12 +0100 Subject: sched/fair: Fix incorrect comment for capacity_margin The comment for capacity_margin introduced in: 3273163c6775 ("sched/fair: Let asymmetric CPU configurations balance at wake-up") ... got its usage the wrong way round - fix it. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: freedom.tan@mediatek.com Cc: keita.kobayashi.ym@renesas.com Cc: mgalbraith@suse.de Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1476452472-24740-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee39bfda5ae5..5e6c00ad2ac3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -109,7 +109,7 @@ unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; /* * The margin used when comparing utilization with CPU capacity: - * util * 1024 < capacity * margin + * util * margin < capacity * 1024 */ unsigned int capacity_margin = 1280; /* ~20% */ -- cgit v1.2.3 From df217913e72ec7e603d8b68cc4c70646cf7000db Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:42 +0100 Subject: sched/fair: Factorize attach/detach entity Factorize post_init_entity_util_avg() and part of attach_task_cfs_rq() in one function attach_entity_cfs_rq(). 
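Regarding the capacity_margin fix just above: the corrected inequality is easy to sanity-check in isolation (a throwaway userspace check, utilization numbers invented):

#include <stdbool.h>
#include <stdio.h>

#define CAPACITY_MARGIN 1280	/* ~20% */

/* A task "fits" a CPU if it leaves the margin free. */
static bool task_fits(unsigned long util, unsigned long capacity)
{
	return util * CAPACITY_MARGIN < capacity * 1024;
}

int main(void)
{
	printf("%d\n", task_fits(300, 446));	/* 1: fits a little CPU */
	printf("%d\n", task_fits(400, 446));	/* 0: needs a bigger CPU */
	return 0;
}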
Create symmetric detach_entity_cfs_rq() function. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e6c00ad2ac3..0731affbe81f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -701,9 +701,7 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); -static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); -static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); +static void attach_entity_cfs_rq(struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -735,7 +733,6 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; - u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -763,14 +760,12 @@ void post_init_entity_util_avg(struct sched_entity *se) * such that the next switched_to_fair() has the * expected state. */ - se->avg.last_update_time = now; + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); return; } } - update_cfs_rq_load_avg(now, cfs_rq, false); - attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + attach_entity_cfs_rq(se); } #else /* !CONFIG_SMP */ @@ -8783,30 +8778,19 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } -static void detach_task_cfs_rq(struct task_struct *p) +static void detach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } - /* Catch up with the cfs_rq and remove our load when we leave */ update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } -static void attach_task_cfs_rq(struct task_struct *p) +static void attach_entity_cfs_rq(struct sched_entity *se) { - struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -8818,10 +8802,35 @@ static void attach_task_cfs_rq(struct task_struct *p) se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - /* Synchronize task with its cfs_rq */ + /* Synchronize entity with its cfs_rq */ update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. 
+ */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } + + detach_entity_cfs_rq(se); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; -- cgit v1.2.3 From 9c2791f936ef5fd04a118b5c284f2c9a95f4a647 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:43 +0100 Subject: sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list Fix the insertion of cfs_rq in rq->leaf_cfs_rq_list to ensure that a child will always be called before its parent. The hierarchical order of the shares update list was introduced by commit: 67e86250f8ea ("sched: Introduce hierarchal order on shares update list") With the current implementation a child can still be put after its parent. Let's take the example of:

root
  \
   b
  / \
 c   d*
     |
     e*

with root -> b -> c already enqueued but not d -> e, so the leaf_cfs_rq_list looks like:

  head -> c -> b -> root -> tail

The branch d -> e will be added the first time they are enqueued, starting with e then d. When e is added, its parent is not already on the list, so e is put at the tail:

  head -> c -> b -> root -> e -> tail

Then d is added at the head because its parent is already on the list:

  head -> d -> c -> b -> root -> e -> tail

e is not placed at the right position and will be called last, whereas it should be called first. Because it follows the bottom-up enqueue sequence, we are sure that we will finish by adding either a cfs_rq without a parent or a cfs_rq whose parent is already on the list. We can use this event to detect when we have finished adding a new branch. For the others, whose parents are not already added, we have to ensure that they will be added after their children, which were inserted in the steps before, and after any potential parents that are already in the list. The easiest way is to put the cfs_rq just after the last inserted one and to keep track of it until the branch is fully added. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 1 + 3 files changed, 49 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6bf1fd3514d5..dc64bd71ed2b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7604,6 +7604,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get?
* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0731affbe81f..4a67026a2424 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -283,19 +283,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beg of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); + &rq->leaf_cfs_rq_list); + /* + * We have reach the beg of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the beg of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new beg + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 345c1ccaba34..36f30e0aa266 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -623,6 +623,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* -- cgit v1.2.3 From d31b1a66cbe0931733583ad9d9e8c6cfd710907d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:44 +0100 Subject: sched/fair: Factorize PELT update Every time we modify load/utilization of sched_entity, we start to sync it with its cfs_rq. This update is done in different ways: - when attaching/detaching a sched_entity, we update cfs_rq and then we sync the entity with the cfs_rq. - when enqueueing/dequeuing the sched_entity, we update both sched_entity and cfs_rq metrics to now. Use update_load_avg() everytime we have to update and sync cfs_rq and sched_entity before changing the state of a sched_enity. 
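The flag encoding is simple enough to model standalone (flag values are taken from the patch; the printed steps merely stand in for the real PELT arithmetic):

#include <stdio.h>

#define UPDATE_TG	0x1
#define SKIP_AGE_LOAD	0x2

static void update_load_avg(const char *site, int flags)
{
	if (!(flags & SKIP_AGE_LOAD))
		printf("%s: age entity load to now\n", site);
	printf("%s: sync cfs_rq load\n", site);
	if (flags & UPDATE_TG)
		printf("%s: update tg load avg\n", site);
}

int main(void)
{
	update_load_avg("enqueue", UPDATE_TG);
	/* attach path skips aging when ATTACH_AGE_LOAD is off */
	update_load_avg("attach", SKIP_AGE_LOAD);
	return 0;
}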
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 76 ++++++++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4a67026a2424..d707ad037b31 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3092,8 +3092,14 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return decayed || removed_load; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); @@ -3104,11 +3110,13 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) + if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); } @@ -3122,26 +3130,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - * - * Or we're fresh through post_init_entity_util_avg(). - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. 
- */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; @@ -3161,9 +3149,6 @@ skip_aging: */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); @@ -3178,34 +3163,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3289,7 +3260,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) return 0; } -static inline void update_load_avg(struct sched_entity *se, int not_used) +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1) { cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); } @@ -3434,6 +3408,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -3508,6 +3483,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se, flags); @@ -3595,7 +3571,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -3713,7 +3689,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated. 
*/ - update_load_avg(curr, 1); + update_load_avg(curr, UPDATE_TG); update_cfs_shares(cfs_rq); #ifdef CONFIG_SCHED_HRTICK @@ -4610,7 +4586,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -4669,7 +4645,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); update_cfs_shares(cfs_rq); } @@ -8821,10 +8797,9 @@ static inline bool vruntime_normalized(struct task_struct *p) static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } @@ -8832,7 +8807,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8843,7 +8817,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) #endif /* Synchronize entity with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); } -- cgit v1.2.3 From 09a43ace1f986b003c118fdf6ddf1fd685692d49 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:45 +0100 Subject: sched/fair: Propagate load during synchronous attach/detach When a task moves from/to a cfs_rq, we set a flag which is then used to propagate the change at parent level (sched_entity and cfs_rq) during next update. If the cfs_rq is throttled, the flag will stay pending until the cfs_rq is unthrottled. For propagating the utilization, we copy the utilization of group cfs_rq to the sched_entity. For propagating the load, we have to take into account the load of the whole task group in order to evaluate the load of the sched_entity. Similarly to what was done before the rewrite of PELT, we add a correction factor in case the task group's load is greater than its share so it will contribute the same load of a task of equal weight. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 1 + 2 files changed, 188 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d707ad037b31..8cf26fd7ce58 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2918,6 +2918,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. 
+ */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + #ifdef CONFIG_FAIR_GROUP_SCHED /** * update_tg_load_avg - update the tg's load avg @@ -2997,8 +3017,138 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } } + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of group cfs_rq is null, the load of the + * sched_entity will also be null so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and updated with current load*/ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight. A task with a weight equals to tg->shares will have + * a load less or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at parent level, can't have a load higher than + * scale_load_down(tg->shares). And the Sum of sched_entities' + * load must be <= scale_load_down(tg->shares). + */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* scale gcfs_rq's load into tg's shares*/ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. 
+ */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) @@ -3105,6 +3255,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) u64 now = cfs_rq_clock_task(cfs_rq); struct rq *rq = rq_of(cfs_rq); int cpu = cpu_of(rq); + int decayed; /* * Track task load average for carrying it to new CPU after migrated, and @@ -3116,7 +3267,10 @@ static inline void update_load_avg(struct sched_entity *se, int flags) cfs_rq->curr == se, NULL); } - if (update_cfs_rq_load_avg(now, cfs_rq, true) && (flags & UPDATE_TG)) + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); + + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); } @@ -3135,6 +3289,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -3154,6 +3309,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); cfs_rq_util_change(cfs_rq); } @@ -8794,6 +8950,31 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -8802,6 +8983,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) update_load_avg(se, 0); detach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } static void attach_entity_cfs_rq(struct sched_entity *se) @@ -8820,6 +9002,7 @@ 
static void attach_entity_cfs_rq(struct sched_entity *se) update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); } static void detach_task_cfs_rq(struct task_struct *p) @@ -8898,6 +9081,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 36f30e0aa266..d7e39317d688 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -404,6 +404,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT -- cgit v1.2.3 From 4e5160766fcc9f41bbd38bac11f92dce993644aa Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:46 +0100 Subject: sched/fair: Propagate asynchrous detach A task can be asynchronously detached from cfs_rq when migrating between CPUs. The load of the migrated task is then removed from source cfs_rq during its next update. We use this event to set propagation flag. During the load balance, we take advantage of the update of blocked load to propagate any pending changes. The propagation relies on patch: "sched: Fix hierarchical order in rq->leaf_cfs_rq_list" ... which orders children and parents, to ensure that it's done in one pass. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cf26fd7ce58..090a9bb51ab2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3219,6 +3219,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { @@ -3226,6 +3227,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -6872,6 +6874,10 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } -- cgit v1.2.3 From d03266910a533d874c01ef2ca8dc73009f2925fa Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Nov 2016 10:53:47 +0100 Subject: sched/fair: Fix task group initialization The moves of tasks are now propagated down to root and the utilization of cfs_rq reflects reality so it doesn't need to be estimated at init. 
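For reference, the underflow clamp these propagation patches rely on, the add_positive() helper added earlier in the series, can be verified in userspace; READ_ONCE()/WRITE_ONCE() are reduced to plain accesses so the snippet builds standalone:

#include <stdio.h>

#define add_positive(_ptr, _val) do {			\
	typeof(*(_ptr)) res, var = *(_ptr);		\
	typeof(_val) val = (_val);			\
							\
	res = var + val;				\
	if (val < 0 && res > var)	/* wrapped */	\
		res = 0;				\
	*(_ptr) = res;					\
} while (0)

int main(void)
{
	unsigned long load = 100;

	add_positive(&load, -250L);	/* would underflow: clamped */
	printf("%lu\n", load);		/* 0 */
	add_positive(&load, 300L);
	printf("%lu\n", load);		/* 300 */
	return 0;
}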
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: kernellwp@gmail.com Cc: pjt@google.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1478598827-32372-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 090a9bb51ab2..02605f2826a2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9198,7 +9198,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); + attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } -- cgit v1.2.3 From 2874aa2e467dbc0b4f7cb0ee5dc872e98e000a47 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Nov 2016 11:00:04 -0800 Subject: bpf: Fix compilation warning in __bpf_lru_list_rotate_inactive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc-6.2.1 gives the following warning: kernel/bpf/bpf_lru_list.c: In function ‘__bpf_lru_list_rotate_inactive.isra.3’: kernel/bpf/bpf_lru_list.c:201:28: warning: ‘next’ may be used uninitialized in this function [-Wmaybe-uninitialized] The "next" is currently initialized in the while() loop which must have >=1 iterations. This patch initializes next to get rid of the compiler warning. Fixes: 3a08c2fd7634 ("bpf: LRU List") Reported-by: David Miller Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/bpf_lru_list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index bfebff010ba9..89b7ef41c86b 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -170,7 +170,7 @@ static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; - struct list_head *cur, *next, *last; + struct list_head *cur, *last, *next = inactive; struct bpf_lru_node *node; unsigned int i = 0; -- cgit v1.2.3 From bfe130773862bb3a02cdc4d4c2169f7f0210a46b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2016 10:12:58 +0100 Subject: genirq/affinity: Take reserved vectors into account when spreading irqs The recent addition of reserved vectors at the beginning or the end of the vector space did not take the reserved vectors at the beginning into account for the various loop exit conditions. As a consequence the last vectors of the spread area are not included into the spread algorithm and are treated like the reserved vectors at the end of the vector space and get the default affinity mask assigned. Sum up the affinity vectors and the reserved vectors at the beginning and use the sum as exit condition. 
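The corrected exit condition is easy to sanity-check with a toy model (not the kernel code; the vector counts are made up):

#include <stdio.h>

int main(void)
{
	int nvecs = 16, pre = 2, post = 2;
	int affv = nvecs - pre - post;	/* 12 vectors to spread */
	int last_affv = affv + pre;	/* first index past the spread area: 14 */
	int curvec;

	for (curvec = pre; curvec < last_affv; curvec++)
		printf("vector %2d: spread affinity\n", curvec);
	/* Comparing against affv (12) would wrongly leave vectors 12 and
	 * 13 with the default mask; 0-1 and 14-15 stay reserved. */
	return 0;
}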
[ tglx: Fixed all conditions instead of only one and massaged changelog ] Signed-off-by: Christoph Hellwig Link: http://lkml.kernel.org/r/1479201178-29604-2-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 17360bd9619b..49eb38d48816 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -61,6 +61,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) { int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec; int affv = nvecs - affd->pre_vectors - affd->post_vectors; + int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; cpumask_var_t nmsk; @@ -87,7 +88,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, cpumask_of_node(n)); - if (++curvec == affv) + if (++curvec == last_affv) break; } goto done; @@ -107,7 +108,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); - for (v = 0; curvec < affv && v < vecs_to_assign; curvec++, v++) { + for (v = 0; curvec < last_affv && v < vecs_to_assign; + curvec++, v++) { cpus_per_vec = ncpus / vecs_to_assign; /* Account for extra vectors to compensate rounding errors */ @@ -119,7 +121,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec); } - if (curvec >= affv) + if (curvec >= last_affv) break; } -- cgit v1.2.3 From b6e5d5b947527558afac4aa0cdfa2ac586332e03 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Nov 2016 18:36:44 +0100 Subject: genirq/affinity: Use default affinity mask for reserved vectors The reserved vectors at the beginning and the end of the vector space get cpu_possible_mask assigned as their affinity mask. All other non-auto affine interrupts get the default irq affinity mask assigned. Using cpu_possible_mask breaks that rule. Treat them like any other interrupt and use irq_default_affinity as target mask. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig --- kernel/irq/affinity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 49eb38d48816..9be9bda7c1f9 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -75,7 +75,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) - cpumask_copy(masks + curvec, cpu_possible_mask); + cpumask_copy(masks + curvec, irq_default_affinity); /* Stabilize the cpumasks */ get_online_cpus(); @@ -130,7 +130,7 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) - cpumask_copy(masks + curvec, cpu_possible_mask); + cpumask_copy(masks + curvec, irq_default_affinity); out: free_cpumask_var(nmsk); return masks; -- cgit v1.2.3 From f23cc643f9baec7f71f2b74692da3cf03abbbfda Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Nov 2016 15:45:36 -0500 Subject: bpf: fix range arithmetic for bpf map access I made some invalid assumptions with BPF_AND and BPF_MOD that could result in invalid accesses to bpf map entries. Fix this up by doing a few things 1) Kill BPF_MOD support. 
This doesn't actually get used by the compiler in real life and just adds extra complexity. 2) Fix the logic for BPF_AND, don't allow AND of negative numbers and set the minimum value to 0 for positive AND's. 3) Don't do operations on the ranges if they are set to the limits, as they are by definition undefined, and allowing arithmetic operations on those values could make them appear valid when they really aren't. This fixes the testcase provided by Jann as well as a few other theoretical problems. Reported-by: Jann Horn Signed-off-by: Josef Bacik Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 5 ++-- kernel/bpf/verifier.c | 70 +++++++++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7035b997aaa5..6aaf425cebc3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -14,7 +14,7 @@ * are obviously wrong for any sort of memory access. */ #define BPF_REGISTER_MAX_RANGE (1024 * 1024 * 1024) -#define BPF_REGISTER_MIN_RANGE -(1024 * 1024 * 1024) +#define BPF_REGISTER_MIN_RANGE -1 struct bpf_reg_state { enum bpf_reg_type type; @@ -22,7 +22,8 @@ struct bpf_reg_state { * Used to determine if any memory access using this register will * result in a bad access. */ - u64 min_value, max_value; + s64 min_value; + u64 max_value; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 99a7e5b388f2..6a936159c6e0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -216,8 +216,8 @@ static void print_verifier_state(struct bpf_verifier_state *state) reg->map_ptr->key_size, reg->map_ptr->value_size); if (reg->min_value != BPF_REGISTER_MIN_RANGE) - verbose(",min_value=%llu", - (unsigned long long)reg->min_value); + verbose(",min_value=%lld", + (long long)reg->min_value); if (reg->max_value != BPF_REGISTER_MAX_RANGE) verbose(",max_value=%llu", (unsigned long long)reg->max_value); @@ -758,7 +758,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if ((s64)reg->min_value < 0) { + if (reg->min_value < 0) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -1468,7 +1468,8 @@ static void check_reg_overflow(struct bpf_reg_state *reg) { if (reg->max_value > BPF_REGISTER_MAX_RANGE) reg->max_value = BPF_REGISTER_MAX_RANGE; - if ((s64)reg->min_value < BPF_REGISTER_MIN_RANGE) + if (reg->min_value < BPF_REGISTER_MIN_RANGE || + reg->min_value > BPF_REGISTER_MAX_RANGE) reg->min_value = BPF_REGISTER_MIN_RANGE; } @@ -1476,7 +1477,8 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; - u64 min_val = BPF_REGISTER_MIN_RANGE, max_val = BPF_REGISTER_MAX_RANGE; + s64 min_val = BPF_REGISTER_MIN_RANGE; + u64 max_val = BPF_REGISTER_MAX_RANGE; bool min_set = false, max_set = false; u8 opcode = BPF_OP(insn->code); @@ -1512,22 +1514,43 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, return; } + /* If one of our values was at the end of our ranges then we can't just + * do our normal operations to the register, we need to set the values + * to the min/max since they are undefined. 
+ */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + switch (opcode) { case BPF_ADD: - dst_reg->min_value += min_val; - dst_reg->max_value += max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value += min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value += max_val; break; case BPF_SUB: - dst_reg->min_value -= min_val; - dst_reg->max_value -= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value -= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value -= max_val; break; case BPF_MUL: - dst_reg->min_value *= min_val; - dst_reg->max_value *= max_val; + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value *= min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value *= max_val; break; case BPF_AND: - /* & is special since it could end up with 0 bits set. */ - dst_reg->min_value &= min_val; + /* Disallow AND'ing of negative numbers, ain't nobody got time + * for that. Otherwise the minimum is 0 and the max is the max + * value we could AND against. + */ + if (min_val < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = 0; dst_reg->max_value = max_val; break; case BPF_LSH: @@ -1537,24 +1560,25 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, */ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else + else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value <<= min_val; if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; - else + else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value <<= max_val; break; case BPF_RSH: - dst_reg->min_value >>= min_val; - dst_reg->max_value >>= max_val; - break; - case BPF_MOD: - /* % is special since it is an unsigned modulus, so the floor - * will always be 0. + /* RSH by a negative number is undefined, and the BPF_RSH is an + * unsigned shift, so make the appropriate casts. */ - dst_reg->min_value = 0; - dst_reg->max_value = max_val - 1; + if (min_val < 0 || dst_reg->min_value < 0) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + else + dst_reg->min_value = + (u64)(dst_reg->min_value) >> min_val; + if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) + dst_reg->max_value >>= max_val; break; default: reset_reg_range_values(regs, insn->dst_reg); -- cgit v1.2.3 From 8e2ddb03643eb9d0bc4926946d7ce0d308eef0a5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:20 +0530 Subject: cpufreq: schedutil: Avoid indented labels Switch to the more common practice of writing labels. Suggested-by: Peter Zijlstra Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 69e06898997d..8c4e1652e895 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -454,17 +454,17 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - out: +out: mutex_unlock(&global_tunables_lock); cpufreq_enable_fast_switch(policy); return 0; - fail: +fail: policy->governor_data = NULL; sugov_tunables_free(tunables); - free_sg_policy: +free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); -- cgit v1.2.3 From 4a71ce4348bb61740d411822357061f8bf870f4c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:21 +0530 Subject: cpufreq: schedutil: enable fast switch earlier The fast_switch_enabled flag will be used by both sugov_policy_alloc() and sugov_policy_free() with a later patch. Prepare for that by moving the calls to enable and disable it to the beginning of sugov_init() and end of sugov_exit(). Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8c4e1652e895..68f21bb6bd44 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -416,9 +416,13 @@ static int sugov_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; + cpufreq_enable_fast_switch(policy); + sg_policy = sugov_policy_alloc(policy); - if (!sg_policy) - return -ENOMEM; + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } mutex_lock(&global_tunables_lock); @@ -456,8 +460,6 @@ static int sugov_init(struct cpufreq_policy *policy) out: mutex_unlock(&global_tunables_lock); - - cpufreq_enable_fast_switch(policy); return 0; fail: @@ -468,6 +470,10 @@ free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + pr_err("initialization failed (error %d)\n", ret); return ret; } @@ -478,8 +484,6 @@ static void sugov_exit(struct cpufreq_policy *policy) struct sugov_tunables *tunables = sg_policy->tunables; unsigned int count; - cpufreq_disable_fast_switch(policy); - mutex_lock(&global_tunables_lock); count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); @@ -490,6 +494,7 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + cpufreq_disable_fast_switch(policy); } static int sugov_start(struct cpufreq_policy *policy) -- cgit v1.2.3 From 02a7b1ee3baa15a98b541d8cfd156bbe1a091c20 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:22 +0530 Subject: cpufreq: schedutil: move slow path from workqueue to SCHED_FIFO task If slow path frequency changes are conducted in a SCHED_OTHER context then they may be delayed for some amount of time, including indefinitely, when real time or deadline activity is taking place. Move the slow path to a real time kernel thread. In the future the thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set to 50 for now. 
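The mechanics of the switch correspond roughly to the following userspace analogue (pthreads standing in for kthread_worker; the sched_setscheduler() step is omitted since it needs privileges):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int pending;

/* Dedicated worker: give this thread SCHED_FIFO and RT activity
 * can no longer starve the slow-path frequency update. */
static void *worker_fn(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!pending)
		pthread_cond_wait(&cond, &lock);
	pending = 0;
	pthread_mutex_unlock(&lock);
	printf("slow-path frequency update ran\n");
	return NULL;
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, worker_fn, NULL);

	pthread_mutex_lock(&lock);	/* "kthread_queue_work()" */
	pending = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);

	pthread_join(worker, NULL);
	return 0;
}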
Hackbench results on ARM Exynos, dual core A15 platform for 10 iterations of $ hackbench -s 100 -l 100 -g 10 -f 20:

	Before    After
	---------------
	1.808     1.603
	1.847     1.251
	2.229     1.590
	1.952     1.600
	1.947     1.257
	1.925     1.627
	2.694     1.620
	1.258     1.621
	1.919     1.632
	1.250     1.240
	---------------
	Average:
	1.8829    1.5041

Based on initial work by Steve Muckle. Signed-off-by: Steve Muckle Signed-off-by: Viresh Kumar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 85 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 68f21bb6bd44..f165ba0f0766 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,11 +12,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include "sched.h" +#define SUGOV_KTHREAD_PRIORITY 50 + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -35,8 +38,10 @@ struct sugov_policy { /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; - struct work_struct work; + struct kthread_work work; struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; bool work_in_progress; bool need_freq_update; @@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_unlock(&sg_policy->update_lock); } -static void sugov_work(struct work_struct *work) +static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); @@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work) struct sugov_policy *sg_policy; sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - schedule_work_on(smp_processor_id(), &sg_policy->work); + + /* + * For Real Time and Deadline tasks, schedutil governor shoots the + * frequency to maximum. And special care must be taken to ensure that + * this kthread doesn't result in that. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() and before that schedutil + * rejects all other frequency scaling requests. + * + * Though there is a very rare case where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now.
+ */ + kthread_queue_work(&sg_policy->worker, &sg_policy->work); } /************************** sysfs interface ************************/ @@ -372,7 +391,6 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) sg_policy->policy = policy; init_irq_work(&sg_policy->irq_work, sugov_irq_work); - INIT_WORK(&sg_policy->work, sugov_work); mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; @@ -384,6 +402,51 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) kfree(sg_policy); } +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + kthread_init_work(&sg_policy->work, sugov_work); + kthread_init_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + kthread_flush_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); +} + static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) { struct sugov_tunables *tunables; @@ -424,12 +487,16 @@ static int sugov_init(struct cpufreq_policy *policy) goto disable_fast_switch; } + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; + mutex_lock(&global_tunables_lock); if (global_tunables) { if (WARN_ON(have_governor_per_policy())) { ret = -EINVAL; - goto free_sg_policy; + goto stop_kthread; } policy->governor_data = sg_policy; sg_policy->tunables = global_tunables; @@ -441,7 +508,7 @@ static int sugov_init(struct cpufreq_policy *policy) tunables = sugov_tunables_alloc(sg_policy); if (!tunables) { ret = -ENOMEM; - goto free_sg_policy; + goto stop_kthread; } tunables->rate_limit_us = LATENCY_MULTIPLIER; @@ -466,6 +533,9 @@ fail: policy->governor_data = NULL; sugov_tunables_free(tunables); +stop_kthread: + sugov_kthread_stop(sg_policy); + free_sg_policy: mutex_unlock(&global_tunables_lock); @@ -493,6 +563,7 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); + sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); } @@ -541,7 +612,7 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); irq_work_sync(&sg_policy->irq_work); - cancel_work_sync(&sg_policy->work); + kthread_cancel_work_sync(&sg_policy->work); } static void sugov_limits(struct cpufreq_policy *policy) -- cgit v1.2.3 From 21ef57297b15a49b0c4dd4e7135c1a08e9a29a1c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Nov 2016 13:53:23 +0530 Subject: cpufreq: schedutil: irq-work and mutex are only used in slow path Execute the irq-work specific initialization/exit code only when the fast path isn't available. 
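As background, the irq_work that is now set up only for the slow path follows the usual pattern; this is an illustrative sketch, not code from the patch:

	#include <linux/irq_work.h>

	static struct irq_work demo_irq_work;

	static void demo_irq_work_fn(struct irq_work *work)
	{
		/* runs in hard interrupt context shortly after being queued */
	}

	static void demo_setup(void)
	{
		/* one-time setup, needed only when the slow path can be taken */
		init_irq_work(&demo_irq_work, demo_irq_work_fn);
	}

	static void demo_trigger(void)
	{
		/* safe from contexts that must not sleep, e.g. a scheduler hook */
		irq_work_queue(&demo_irq_work);
	}

	static void demo_teardown(void)
	{
		/* wait for a queued callback to finish before freeing anything */
		irq_work_sync(&demo_irq_work);
	}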
Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f165ba0f0766..42a220e78f00 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -390,15 +390,12 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) return NULL; sg_policy->policy = policy; - init_irq_work(&sg_policy->irq_work, sugov_irq_work); - mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; } static void sugov_policy_free(struct sugov_policy *sg_policy) { - mutex_destroy(&sg_policy->work_lock); kfree(sg_policy); } @@ -432,6 +429,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) sg_policy->thread = thread; kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + wake_up_process(thread); return 0; @@ -445,6 +445,7 @@ static void sugov_kthread_stop(struct sugov_policy *sg_policy) kthread_flush_worker(&sg_policy->worker); kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); } static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) @@ -611,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); - irq_work_sync(&sg_policy->irq_work); - kthread_cancel_work_sync(&sg_policy->work); + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } } static void sugov_limits(struct cpufreq_policy *policy) -- cgit v1.2.3 From f97960fbddd98e1b7bfba469b8510b49a62b4bc5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 17 Nov 2016 17:31:55 +0100 Subject: kernel/printk: Block cpuhotplug callback when tasks are frozen The recent conversion of the console hotplug notifier to the state machine missed the fact that the notifier only operated on the non-frozen transitions. As a consequence the console_lock/unlock() pair is also invoked during suspend, which results in a lockdep warning. Restore the previous state by making the lock/unlock conditional on !tasks_frozen. Fixes: 90b14889d2f9 ("kernel/printk: Convert to hotplug state machine") Reported-and-tested-by: Borislav Petkov Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1611171729320.3645@nanos Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior --- kernel/printk/printk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4487ffcd42d5..37893da9bae8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2194,8 +2194,10 @@ void resume_console(void) */ static int console_cpu_notify(unsigned int cpu) { - console_lock(); - console_unlock(); + if (!cpuhp_tasks_frozen) { + console_lock(); + console_unlock(); + } return 0; } -- cgit v1.2.3 From c7d03a00b56fc23c3a01a8353789ad257363e281 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 17 Nov 2016 04:58:21 +0300 Subject: netns: make struct pernet_operations::id unsigned int Make struct pernet_operations::id unsigned. There are 2 reasons to do so: 1) This field is really an index into a zero-based array and thus is an unsigned entity. Using a negative value is an out-of-bounds access by definition.
2) On x86_64, unsigned 32-bit data which are mixed with pointers via array indexing or offsets added to or subtracted from pointers are preferred to signed 32-bit data. "int" being used as an array index needs to be sign-extended to 64-bit before being used:

	void f(long *p, int i)
	{
		g(p[i]);
	}

roughly translates to

	movsx	rsi, esi
	mov	rdi, [rsi+...]
	call	g

MOVSX is a 3-byte instruction which isn't necessary if the variable is unsigned, because x86_64 is zero-extending by default. Now, there is the net_generic() function which, you guessed it right, uses "int" as an array index:

	static inline void *net_generic(const struct net *net, int id)
	{
		...
		ptr = ng->ptr[id - 1];
		...
	}

And this function is used a lot, so those sign extensions add up. The patch snipes ~1730 bytes on an allyesconfig kernel (without all the junk messing with code generation): add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) Unfortunately some functions actually grow bigger. This is a seemingly random artefact of code generation, with the register allocator being used differently: gcc decides that some variable needs to live in the new r8+ registers and every access now requires a REX prefix, or it is shifted into r12, so the [r12+0] addressing mode has to be used, which is longer than [r8]. However, the overall balance is in the negative direction:

	add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
	function                            old     new   delta
	nfsd4_lock                         3886    3959     +73
	tipc_link_build_proto_msg          1096    1140     +44
	mac80211_hwsim_new_radio           2776    2808     +32
	tipc_mon_rcv                       1032    1058     +26
	svcauth_gss_legacy_init            1413    1429     +16
	tipc_bcbase_select_primary          379     392     +13
	nfsd4_exchange_id                  1247    1260     +13
	nfsd4_setclientid_confirm           782     793     +11
	...
	put_client_renew_locked             494     480     -14
	ip_set_sockfn_get                   730     716     -14
	geneve_sock_add                     829     813     -16
	nfsd4_sequence_done                 721     703     -18
	nlmclnt_lookup_host                 708     686     -22
	nfsd4_lockt                        1085    1063     -22
	nfs_get_client                     1077    1050     -27
	tcf_bpf_init                       1106    1076     -30
	nfsd4_encode_fattr                 5997    5930     -67
	Total: Before=154856051, After=154854321, chg -0.00%

Signed-off-by: Alexey Dobriyan Signed-off-by: David S.
Miller --- drivers/infiniband/core/cma.c | 2 +- drivers/net/bonding/bond_main.c | 2 +- drivers/net/geneve.c | 2 +- drivers/net/gtp.c | 2 +- drivers/net/ppp/ppp_generic.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- drivers/net/vxlan.c | 2 +- drivers/net/wireless/mac80211_hwsim.c | 2 +- fs/lockd/netns.h | 2 +- fs/lockd/svc.c | 2 +- fs/nfs/inode.c | 2 +- fs/nfs/netns.h | 2 +- fs/nfs_common/grace.c | 2 +- fs/nfsd/netns.h | 2 +- fs/nfsd/nfsctl.c | 2 +- include/net/bonding.h | 2 +- include/net/ip_tunnels.h | 6 +++--- include/net/net_namespace.h | 2 +- include/net/netfilter/nf_conntrack_l4proto.h | 2 +- include/net/netfilter/nf_conntrack_synproxy.h | 2 +- include/net/netns/generic.h | 2 +- kernel/audit.c | 2 +- net/8021q/vlan.c | 2 +- net/8021q/vlan.h | 2 +- net/bridge/br_netfilter_hooks.c | 2 +- net/caif/caif_dev.c | 2 +- net/core/net_namespace.c | 7 +++---- net/core/pktgen.c | 2 +- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_tunnel.c | 4 ++-- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/sit.c | 2 +- net/ipv6/xfrm6_tunnel.c | 2 +- net/key/af_key.c | 2 +- net/netfilter/ipset/ip_set_core.c | 2 +- net/netfilter/ipvs/ip_vs_core.c | 2 +- net/netfilter/nf_conntrack_proto_dccp.c | 2 +- net/netfilter/nf_conntrack_proto_gre.c | 2 +- net/netfilter/nf_conntrack_proto_sctp.c | 2 +- net/netfilter/nf_conntrack_proto_udplite.c | 2 +- net/netfilter/nf_synproxy_core.c | 2 +- net/netfilter/nfnetlink_log.c | 2 +- net/netfilter/nfnetlink_queue.c | 2 +- net/netfilter/xt_hashlimit.c | 2 +- net/netfilter/xt_recent.c | 2 +- net/openvswitch/datapath.c | 2 +- net/openvswitch/datapath.h | 2 +- net/phonet/pn_dev.c | 2 +- net/rds/tcp.c | 2 +- net/sched/act_bpf.c | 2 +- net/sched/act_connmark.c | 2 +- net/sched/act_csum.c | 2 +- net/sched/act_gact.c | 2 +- net/sched/act_ife.c | 2 +- net/sched/act_ipt.c | 4 ++-- net/sched/act_mirred.c | 2 +- net/sched/act_nat.c | 2 +- net/sched/act_pedit.c | 2 +- net/sched/act_police.c | 2 +- net/sched/act_simple.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/act_skbmod.c | 2 +- net/sched/act_tunnel_key.c | 2 +- net/sched/act_vlan.c | 2 +- net/sunrpc/netns.h | 2 +- net/sunrpc/sunrpc_syms.c | 2 +- net/tipc/core.c | 2 +- net/tipc/core.h | 2 +- 73 files changed, 80 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 89a6b0546804..c68f4fe001d7 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -116,7 +116,7 @@ static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; -static int cma_pernet_id; +static unsigned int cma_pernet_id; struct cma_pernet { struct idr tcp_ps; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 5708f17e4cdf..8029dd4912b6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -199,7 +199,7 @@ MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where " atomic_t netpoll_block_tx = ATOMIC_INIT(0); #endif -int bond_net_id __read_mostly; +unsigned int bond_net_id __read_mostly; static __be32 arp_target[BOND_MAX_ARP_TARGETS]; static int arp_ip_count; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 85a423a66478..90dc6b188607 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -43,7 +43,7 @@ struct geneve_net { struct list_head sock_list; }; -static int geneve_net_id; 
+static unsigned int geneve_net_id; union geneve_addr { struct sockaddr_in sin; diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 719d19f35673..98f10c216521 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -77,7 +77,7 @@ struct gtp_dev { struct hlist_head *addr_hash; }; -static int gtp_net_id __read_mostly; +static unsigned int gtp_net_id __read_mostly; struct gtp_net { struct list_head gtp_dev_list; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 5489c0ec1d9a..3d3b1f4339ef 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -204,7 +204,7 @@ static atomic_t ppp_unit_count = ATOMIC_INIT(0); static atomic_t channel_count = ATOMIC_INIT(0); /* per-net private data for this module */ -static int ppp_net_id __read_mostly; +static unsigned int ppp_net_id __read_mostly; struct ppp_net { /* units to ppp mapping */ struct idr units_idr; diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 4ddae8118c85..f017c72bb7fd 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -95,7 +95,7 @@ static const struct proto_ops pppoe_ops; static const struct ppp_channel_ops pppoe_chan_ops; /* per-net private data for this module */ -static int pppoe_net_id __read_mostly; +static unsigned int pppoe_net_id __read_mostly; struct pppoe_net { /* * we could use _single_ hash table for all diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0a3fd675408f..21e92be6e56c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -52,7 +52,7 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -static int vxlan_net_id; +static unsigned int vxlan_net_id; static struct rtnl_link_ops vxlan_link_ops; static const u8 all_zeros_mac[ETH_ALEN + 2]; diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 8f366cc097e6..1293f8494985 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -250,7 +250,7 @@ static inline void hwsim_clear_chanctx_magic(struct ieee80211_chanctx_conf *c) cp->magic = 0; } -static int hwsim_net_id; +static unsigned int hwsim_net_id; static int hwsim_netgroup; diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5426189406c1..fb8cac88251a 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -15,6 +15,6 @@ struct lockd_net { struct list_head nsm_handles; }; -extern int lockd_net_id; +extern unsigned int lockd_net_id; #endif diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index fc4084ef4736..1c13dd80744f 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,7 +57,7 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; -int lockd_net_id; +unsigned int lockd_net_id; /* * These can be set at insmod time (useful for NFS as root filesystem), diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bf4ec5ecc97e..ce42dd00e4ee 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2015,7 +2015,7 @@ static void nfsiod_stop(void) destroy_workqueue(wq); } -int nfs_net_id; +unsigned int nfs_net_id; EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index fbce0d885d4c..5fbd2bde91ba 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -35,6 +35,6 @@ struct nfs_net { #endif }; -extern int nfs_net_id; +extern unsigned int nfs_net_id; #endif diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 
fd8c9a5bcac4..420d3a0ab258 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -9,7 +9,7 @@ #include #include -static int grace_net_id; +static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ee36efd5aece..3714231a9d0f 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -124,5 +124,5 @@ struct nfsd_net { /* Simple check to find out if a given net was properly initialized */ #define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) -extern int nfsd_net_id; +extern unsigned int nfsd_net_id; #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 36b2af931e06..2857e46d5cc5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1201,7 +1201,7 @@ static int create_proc_exports_entry(void) } #endif -int nfsd_net_id; +unsigned int nfsd_net_id; static __net_init int nfsd_init_net(struct net *net) { diff --git a/include/net/bonding.h b/include/net/bonding.h index f32f7ef8a23a..3c857778a6ca 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -681,7 +681,7 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) } /* exported from bond_main.c */ -extern int bond_net_id; +extern unsigned int bond_net_id; extern const struct bond_parm_tbl bond_lacp_tbl[]; extern const struct bond_parm_tbl xmit_hashtype_tbl[]; extern const struct bond_parm_tbl arp_validate_tbl[]; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 59557c07904b..e893fe43dd13 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -129,7 +129,7 @@ struct ip_tunnel { #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ - int ip_tnl_net_id; + unsigned int ip_tnl_net_id; struct gro_cells gro_cells; bool collect_md; bool ignore_df; @@ -248,7 +248,7 @@ void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); -int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, +int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); @@ -275,7 +275,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); -void ip_tunnel_setup(struct net_device *dev, int net_id); +void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fc4f757107df..d7149e93a60a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -291,7 +291,7 @@ struct pernet_operations { int (*init)(struct net *net); void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); - int *id; + unsigned int *id; size_t size; }; diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 2152b70626d5..e7b836590f0b 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -98,7 +98,7 @@ struct nf_conntrack_l4proto { const struct nla_policy *nla_policy; } ctnl_timeout; #endif - int *net_id; + unsigned 
int *net_id; /* Init l4proto pernet data */ int (*init_net)(struct net *net, u_int16_t proto); diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index e6937318546c..b0ca402c1f72 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -54,7 +54,7 @@ struct synproxy_net { struct synproxy_stats __percpu *stats; }; -extern int synproxy_net_id; +extern unsigned int synproxy_net_id; static inline struct synproxy_net *synproxy_pernet(struct net *net) { return net_generic(net, synproxy_net_id); diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index 70e158551704..d315786bcfd7 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -31,7 +31,7 @@ struct net_generic { void *ptr[0]; }; -static inline void *net_generic(const struct net *net, int id) +static inline void *net_generic(const struct net *net, unsigned int id) { struct net_generic *ng; void *ptr; diff --git a/kernel/audit.c b/kernel/audit.c index f1ca11613379..92c463d2d1c7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; -static int audit_net_id; +static unsigned int audit_net_id; /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index a79365574531..691f0ad7067d 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -44,7 +44,7 @@ /* Global VLAN variables */ -int vlan_net_id __read_mostly; +unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index cc1557978066..df8bd65dd370 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -159,7 +159,7 @@ void vlan_netlink_fini(void); extern struct rtnl_link_ops vlan_link_ops; -extern int vlan_net_id; +extern unsigned int vlan_net_id; struct proc_dir_entry; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 8155bd2a5138..83d937f4415e 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -46,7 +46,7 @@ #include #endif -static int brnf_net_id __read_mostly; +static unsigned int brnf_net_id __read_mostly; struct brnf_net { bool enabled; diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index d730a0f68f46..2d38b6e34203 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -52,7 +52,7 @@ struct caif_net { struct caif_device_entry_list caifdevs; }; -static int caif_net_id; +static unsigned int caif_net_id; static int q_high = 50; /* Percent */ struct cfcnfg *get_cfcnfg(struct net *net) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1309d78e2a64..35d37b196e67 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -55,7 +55,7 @@ static struct net_generic *net_alloc_generic(void) return ng; } -static int net_assign_generic(struct net *net, int id, void *data) +static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; @@ -122,8 +122,7 @@ out: static void ops_free(const struct pernet_operations *ops, struct net *net) { if (ops->id && ops->size) { - int id = *ops->id; - kfree(net_generic(net, id)); + kfree(net_generic(net, *ops->id)); } } @@ -881,7 +880,7 @@ again: } return error; } - max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); + 
max_gen_ptrs = max(max_gen_ptrs, *ops->id); } error = __register_pernet_operations(list, ops); if (error) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 306b8f0e03c1..8e69ce472236 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -413,7 +413,7 @@ struct pktgen_hdr { }; -static int pg_net_id __read_mostly; +static unsigned int pg_net_id __read_mostly; struct pktgen_net { struct net *net; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 576f705d8180..78fd62048335 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -113,8 +113,8 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static struct rtnl_link_ops ipgre_link_ops __read_mostly; static int ipgre_tunnel_init(struct net_device *dev); -static int ipgre_net_id __read_mostly; -static int gre_tap_net_id __read_mostly; +static unsigned int ipgre_net_id __read_mostly; +static unsigned int gre_tap_net_id __read_mostly; static void ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 12a92e3349ed..823abaef006b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -994,7 +994,7 @@ int ip_tunnel_get_iflink(const struct net_device *dev) } EXPORT_SYMBOL(ip_tunnel_get_iflink); -int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, +int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); @@ -1196,7 +1196,7 @@ void ip_tunnel_uninit(struct net_device *dev) EXPORT_SYMBOL_GPL(ip_tunnel_uninit); /* Do least required initialization, rest of init is done in tunnel_init call */ -void ip_tunnel_setup(struct net_device *dev, int net_id) +void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) { struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->ip_tnl_net_id = net_id; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5d7944f394d9..8b14f1404c8f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -46,7 +46,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly; -static int vti_net_id __read_mostly; +static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c9392589c415..79489f017854 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -121,7 +121,7 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -static int ipip_net_id __read_mostly; +static unsigned int ipip_net_id __read_mostly; static int ipip_tunnel_init(struct net_device *dev); static struct rtnl_link_ops ipip_link_ops __read_mostly; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 4a9e6db9df8d..e6e206fa86c8 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -62,7 +62,7 @@ struct clusterip_config { static const struct file_operations clusterip_proc_fops; #endif -static int clusterip_net_id __read_mostly; +static unsigned int clusterip_net_id __read_mostly; struct clusterip_net { struct list_head configs; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 710bc79f9113..75b6108234dd 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -64,7 +64,7 @@ MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); #define 
IP6_GRE_HASH_SIZE_SHIFT 5 #define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT) -static int ip6gre_net_id __read_mostly; +static unsigned int ip6gre_net_id __read_mostly; struct ip6gre_net { struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 259e8507d2cd..d3c619eda051 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -83,7 +83,7 @@ static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; -static int ip6_tnl_net_id __read_mostly; +static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index af3f0e011265..c476bb8e9cdb 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -64,7 +64,7 @@ static int vti6_dev_init(struct net_device *dev); static void vti6_dev_setup(struct net_device *dev); static struct rtnl_link_ops vti6_link_ops __read_mostly; -static int vti6_net_id __read_mostly; +static unsigned int vti6_net_id __read_mostly; struct vti6_net { /* the vti6 tunnel fallback device */ struct net_device *fb_tnl_dev; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index dc7a3449ffc1..0355231162b8 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -76,7 +76,7 @@ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, __be32 *v4dst); static struct rtnl_link_ops sit_link_ops __read_mostly; -static int sit_net_id __read_mostly; +static unsigned int sit_net_id __read_mostly; struct sit_net { struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE]; diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index e1c0bbe7996c..d7b731a78d09 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -44,7 +44,7 @@ struct xfrm6_tunnel_net { u32 spi; }; -static int xfrm6_tunnel_net_id __read_mostly; +static unsigned int xfrm6_tunnel_net_id __read_mostly; static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net) { return net_generic(net, xfrm6_tunnel_net_id); diff --git a/net/key/af_key.c b/net/key/af_key.c index f9c9ecb0cdd3..c6252ed42c1d 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -36,7 +36,7 @@ #define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) #define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) -static int pfkey_net_id __read_mostly; +static unsigned int pfkey_net_id __read_mostly; struct netns_pfkey { /* List of all pfkey sockets. 
*/ struct hlist_head table; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 23345d2d136a..c296f9b606d4 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -36,7 +36,7 @@ struct ip_set_net { bool is_destroyed; /* all sets are destroyed */ }; -static int ip_set_net_id __read_mostly; +static unsigned int ip_set_net_id __read_mostly; static inline struct ip_set_net *ip_set_pernet(struct net *net) { diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 2c1b498a7a27..db40050f8785 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -70,7 +70,7 @@ EXPORT_SYMBOL(ip_vs_get_debug_level); #endif EXPORT_SYMBOL(ip_vs_new_conn_out); -static int ip_vs_net_id __read_mostly; +static unsigned int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index ac8976964975..073b047314dc 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -385,7 +385,7 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = }; /* this module per-net specifics */ -static int dccp_net_id __read_mostly; +static unsigned int dccp_net_id __read_mostly; struct dccp_net { struct nf_proto_net pn; int dccp_loose; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index ff405c9183f1..87bb40a3feb5 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -53,7 +53,7 @@ static unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_REPLIED] = 180*HZ, }; -static int proto_gre_net_id __read_mostly; +static unsigned int proto_gre_net_id __read_mostly; struct netns_proto_gre { struct nf_proto_net nf; rwlock_t keymap_lock; diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 17c0ade23fd8..d096c2d6b87b 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -144,7 +144,7 @@ static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { } }; -static int sctp_net_id __read_mostly; +static unsigned int sctp_net_id __read_mostly; struct sctp_net { struct nf_proto_net pn; unsigned int timeouts[SCTP_CONNTRACK_MAX]; diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 8cdb4b1bf933..7808604c70a2 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -35,7 +35,7 @@ static unsigned int udplite_timeouts[UDPLITE_CT_MAX] = { [UDPLITE_CT_REPLIED] = 180*HZ, }; -static int udplite_net_id __read_mostly; +static unsigned int udplite_net_id __read_mostly; struct udplite_net { struct nf_proto_net pn; unsigned int timeouts[UDPLITE_CT_MAX]; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index c8a4a48bced9..7c6d1fbe38b9 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -24,7 +24,7 @@ #include #include -int synproxy_net_id; +unsigned int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); bool diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 7435505037b7..763cb4d54e8d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -80,7 +80,7 @@ struct nfulnl_instance { #define INSTANCE_BUCKETS 16 -static int 
nfnl_log_net_id __read_mostly; +static unsigned int nfnl_log_net_id __read_mostly; struct nfnl_log_net { spinlock_t instances_lock; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 1e33115b399f..be7627b80400 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -77,7 +77,7 @@ struct nfqnl_instance { typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); -static int nfnl_queue_net_id __read_mostly; +static unsigned int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 struct nfnl_queue_net { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index b89b688e9d01..10063408141d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -49,7 +49,7 @@ struct hashlimit_net { struct proc_dir_entry *ip6t_hashlimit; }; -static int hashlimit_net_id; +static unsigned int hashlimit_net_id; static inline struct hashlimit_net *hashlimit_pernet(struct net *net) { return net_generic(net, hashlimit_net_id); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index bf250000e084..1d89a4eaf841 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -95,7 +95,7 @@ struct recent_net { #endif }; -static int recent_net_id __read_mostly; +static unsigned int recent_net_id __read_mostly; static inline struct recent_net *recent_pernet(struct net *net) { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 1402f1be642d..2d4c4d3911c0 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -58,7 +58,7 @@ #include "vport-internal_dev.h" #include "vport-netdev.h" -int ovs_net_id __read_mostly; +unsigned int ovs_net_id __read_mostly; static struct genl_family dp_packet_genl_family; static struct genl_family dp_flow_genl_family; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index ab85c1cae255..1c6e9377436d 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -144,7 +144,7 @@ struct ovs_net { bool xt_label; }; -extern int ovs_net_id; +extern unsigned int ovs_net_id; void ovs_lock(void); void ovs_unlock(void); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index a58680016472..2cb4c5dfad6f 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -44,7 +44,7 @@ struct phonet_net { struct phonet_routes routes; }; -static int phonet_net_id __read_mostly; +static unsigned int phonet_net_id __read_mostly; static struct phonet_net *phonet_pernet(struct net *net) { diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 3296a6ac583a..1a0399dea764 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -366,7 +366,7 @@ struct rds_transport rds_tcp_transport = { .t_mp_capable = 1, }; -static int rds_tcp_netid; +static unsigned int rds_tcp_netid; /* per-network namespace private data for this module */ struct rds_tcp_net { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 9ff06cfbcdec..1aa4ecf41baf 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -33,7 +33,7 @@ struct tcf_bpf_cfg { bool is_ebpf; }; -static int bpf_net_id; +static unsigned int bpf_net_id; static struct tc_action_ops act_bpf_ops; static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index eae07a2e774d..ab8062909962 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -30,7 +30,7 @@ #define CONNMARK_TAB_MASK 3 -static int connmark_net_id; +static unsigned int connmark_net_id; static struct 
tc_action_ops act_connmark_ops; static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index e0defcef376d..a0edd80a44db 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -42,7 +42,7 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; -static int csum_net_id; +static unsigned int csum_net_id; static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e0aa30f83c6c..e6c874a2b283 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -25,7 +25,7 @@ #define GACT_TAB_MASK 15 -static int gact_net_id; +static unsigned int gact_net_id; static struct tc_action_ops act_gact_ops; #ifdef CONFIG_GACT_PROB diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 95c463cbb9a6..80b848d3f096 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -35,7 +35,7 @@ #define IFE_TAB_MASK 15 -static int ife_net_id; +static unsigned int ife_net_id; static int max_metacnt = IFE_META_MAX + 1; static struct tc_action_ops act_ife_ops; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index ce7ea6c1c50d..992ef8d624f1 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -30,10 +30,10 @@ #define IPT_TAB_MASK 15 -static int ipt_net_id; +static unsigned int ipt_net_id; static struct tc_action_ops act_ipt_ops; -static int xt_net_id; +static unsigned int xt_net_id; static struct tc_action_ops act_xt_ops; static int ipt_init_target(struct xt_entry_target *t, char *table, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6073a1132725..b2d417b8f46c 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -70,7 +70,7 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; -static int mirred_net_id; +static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; static bool dev_is_mac_header_xmit(const struct net_device *dev) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 8e8b0cc30704..9b6aec665495 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -31,7 +31,7 @@ #define NAT_TAB_MASK 15 -static int nat_net_id; +static unsigned int nat_net_id; static struct tc_action_ops act_nat_ops; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b54d56d4959b..eda322045e75 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -25,7 +25,7 @@ #define PEDIT_TAB_MASK 15 -static int pedit_net_id; +static unsigned int pedit_net_id; static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { diff --git a/net/sched/act_police.c b/net/sched/act_police.c index d1bd248fe146..c990b73a6c85 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -55,7 +55,7 @@ struct tc_police_compat { /* Each policer is serialized by its individual spinlock */ -static int police_net_id; +static unsigned int police_net_id; static struct tc_action_ops act_police_ops; static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 289af6f9bb3b..823a73ad0c60 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -26,7 +26,7 @@ #define SIMP_TAB_MASK 7 -static int simp_net_id; +static unsigned int 
simp_net_id; static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 024f3a3afeff..06ccae3c12ee 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -29,7 +29,7 @@ #define SKBEDIT_TAB_MASK 15 -static int skbedit_net_id; +static unsigned int skbedit_net_id; static struct tc_action_ops act_skbedit_ops; static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index e7d96381c908..3b7074e23024 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -22,7 +22,7 @@ #define SKBMOD_TAB_MASK 15 -static int skbmod_net_id; +static unsigned int skbmod_net_id; static struct tc_action_ops act_skbmod_ops; #define MAX_EDIT_LEN ETH_HLEN diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index edc720f11687..7af712526f01 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -22,7 +22,7 @@ #define TUNNEL_KEY_TAB_MASK 15 -static int tunnel_key_net_id; +static unsigned int tunnel_key_net_id; static struct tc_action_ops act_tunnel_key_ops; static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index b57fcbcefea1..19e0dba305ce 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -21,7 +21,7 @@ #define VLAN_TAB_MASK 15 -static int vlan_net_id; +static unsigned int vlan_net_id; static struct tc_action_ops act_vlan_ops; static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index df5826876535..394ce523174c 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -34,7 +34,7 @@ struct sunrpc_net { struct proc_dir_entry *use_gssp_proc; }; -extern int sunrpc_net_id; +extern unsigned int sunrpc_net_id; int ip_map_cache_create(struct net *); void ip_map_cache_destroy(struct net *); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index ee5d3d253102..d1c330a7953a 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -24,7 +24,7 @@ #include "netns.h" -int sunrpc_net_id; +unsigned int sunrpc_net_id; EXPORT_SYMBOL_GPL(sunrpc_net_id); static __net_init int sunrpc_init_net(struct net *net) diff --git a/net/tipc/core.c b/net/tipc/core.c index 236b043a4156..0b982d048fb9 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -47,7 +47,7 @@ #include /* configurable TIPC parameters */ -int tipc_net_id __read_mostly; +unsigned int tipc_net_id __read_mostly; int sysctl_tipc_rmem[3] __read_mostly; /* min/default/max */ static int __net_init tipc_init_net(struct net *net) diff --git a/net/tipc/core.h b/net/tipc/core.h index a1845fb27d80..5cc5398be722 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -74,7 +74,7 @@ struct tipc_monitor; #define MAX_BEARERS 3 #define TIPC_DEF_MON_THRESHOLD 32 -extern int tipc_net_id __read_mostly; +extern unsigned int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; -- cgit v1.2.3 From e245d99e6cc4a0b904b87b46b4f60d46fb405987 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 2 Nov 2016 09:36:33 -0700 Subject: lockdep: Limit static allocations if PROVE_LOCKING_SMALL is defined Reduce the size of the data structures for lockdep entries by half if PROVE_LOCKING_SMALL is defined. This is used only for sparc. Signed-off-by: Babu Moger Acked-by: Sam Ravnborg Signed-off-by: David S.
Miller --- kernel/locking/lockdep_internals.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 51c4b24b6328..c2b88490d857 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -45,6 +45,14 @@ enum { #define LOCKF_USED_IN_IRQ_READ \ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +/* + * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * .data and .bss to fit in required 32MB limit for the kernel. With + * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * So, reduce the static allocations for lockdeps related structures so that + * everything fits in current required size limit. + */ +#ifdef CONFIG_PROVE_LOCKING_SMALL /* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. @@ -54,18 +62,24 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ +#define MAX_LOCKDEP_ENTRIES 16384UL +#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_STACK_TRACE_ENTRIES 262144UL +#else #define MAX_LOCKDEP_ENTRIES 32768UL #define MAX_LOCKDEP_CHAINS_BITS 16 -#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) - -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the hash_lock. */ #define MAX_STACK_TRACE_ENTRIES 524288UL +#endif + +#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) extern struct list_head all_lock_classes; extern struct lock_chain lock_chains[]; -- cgit v1.2.3 From 833fc48d18ce3595990b405ae82a160b33028994 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 10 Nov 2016 01:41:14 -0500 Subject: audit: skip sessionid sentinel value when auto-incrementing The value (unsigned int)-1 is used as a sentinel to indicate the sessionID is unset. Skip this value when the session_id value wraps. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8c434318ec8d..d161b17ce8ce 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2025,8 +2025,11 @@ int audit_set_loginuid(kuid_t loginuid) goto out; /* are we setting or clearing? */ - if (uid_valid(loginuid)) + if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); + if (unlikely(sessionid == (unsigned int)-1)) + sessionid = (unsigned int)atomic_inc_return(&session_id); + } task->sessionid = sessionid; task->loginuid = loginuid; -- cgit v1.2.3 From c1e8f06d7a0eea232ce0767471e1b4756ccab70a Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Wed, 16 Nov 2016 16:14:33 -0500 Subject: audit: fix formatting of AUDIT_CONFIG_CHANGE events The AUDIT_CONFIG_CHANGE events sometimes use an op= field. The current code logs the value of the field with quotes. This field is documented not to be encoded, so it should not have quotes.
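The practical difference is only in the emitted record: audit_log_string() wraps its argument in double quotes, while a plain %s in audit_log_format() does not. An illustrative sketch of the two call styles, using remove_rule as the example value from the diff below:

	/* old: value ends up quoted in the record */
	audit_log_format(ab, " op=");
	audit_log_string(ab, action);		/* -> op="remove_rule" */

	/* new: plain %s, un-encoded as documented */
	audit_log_format(ab, " op=%s", action);	/* -> op=remove_rule */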
Signed-off-by: Steve Grubb Reviewed-by: Richard Guy Briggs [PM: reformatted commit description to make checkpatch.pl happy] Signed-off-by: Paul Moore --- kernel/audit_fsnotify.c | 5 ++--- kernel/audit_tree.c | 3 +-- kernel/audit_watch.c | 5 ++--- kernel/auditfilter.c | 3 +-- 4 files changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f84f8d06e1f6..f75154889aa9 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -130,10 +130,9 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=", + audit_log_format(ab, "auid=%u ses=%u op=%s", from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current)); - audit_log_string(ab, op); + audit_get_sessionid(current), op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, audit_mark->path); audit_log_key(ab, rule->filterkey); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 25772476fa4a..055f11b0a50f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -458,8 +458,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "op="); - audit_log_string(ab, "remove_rule"); + audit_log_format(ab, "op=remove_rule"); audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); audit_log_key(ab, rule->filterkey); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..686e068ec3da 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -242,10 +242,9 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; - audit_log_format(ab, "auid=%u ses=%u op=", + audit_log_format(ab, "auid=%u ses=%u op=%s", from_kuid(&init_user_ns, audit_get_loginuid(current)), - audit_get_sessionid(current)); - audit_log_string(ab, op); + audit_get_sessionid(current), op); audit_log_format(ab, " path="); audit_log_untrustedstring(ab, w->path); audit_log_key(ab, r->filterkey); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 85d9cac497e4..632e90d1005f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1074,8 +1074,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re return; audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); audit_log_task_context(ab); - audit_log_format(ab, " op="); - audit_log_string(ab, action); + audit_log_format(ab, " op=%s", action); audit_log_key(ab, rule->filterkey); audit_log_format(ab, " list=%d res=%d", rule->listnr, res); audit_log_end(ab); -- cgit v1.2.3 From 194a6b5b9cb6b91a5f7d86984165a3bc55188599 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 17 Nov 2016 11:46:38 -0500 Subject: sched/wake_q: Rename WAKE_Q to DEFINE_WAKE_Q Currently the wake_q data structure is defined by the WAKE_Q() macro. This macro, however, looks like a function doing something as "wake" is a verb. Even checkpatch.pl was confused as it reported warnings like WARNING: Missing a blank line after declarations #548: FILE: kernel/futex.c:3665: + int ret; + WAKE_Q(wake_q); This patch renames the WAKE_Q() macro to DEFINE_WAKE_Q() which clarifies what the macro is doing and eliminates the checkpatch.pl warnings. 
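For context, a minimal sketch of how the renamed macro is used; the function and lock here are illustrative, not from the patch:

	#include <linux/sched.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(demo_lock);

	static void demo_release_and_wake(struct task_struct *t)
	{
		DEFINE_WAKE_Q(wake_q);	/* declares and initializes the queue head */

		spin_lock(&demo_lock);
		wake_q_add(&wake_q, t);	/* only queues the task, no wakeup yet */
		spin_unlock(&demo_lock);

		wake_up_q(&wake_q);	/* the actual wakeups, outside the lock */
	}

The naming change matters because, as the sketch shows, the macro is a declaration, not an action: the wakeup itself only happens at wake_up_q().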
Signed-off-by: Waiman Long Acked-by: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1479401198-1765-1-git-send-email-longman@redhat.com [ Resolved conflict and added missing rename. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- ipc/mqueue.c | 4 ++-- ipc/msg.c | 8 ++++---- kernel/futex.c | 8 ++++---- kernel/locking/mutex.c | 2 +- kernel/locking/rtmutex.c | 2 +- kernel/locking/rwsem-xadd.c | 10 +++++----- 7 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index c1aa3b02f6ac..dc37cbe2b13c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -989,7 +989,7 @@ enum cpu_idle_type { * already in a wake queue, the wakeup will happen soon and the second * waker can just skip it. * - * The WAKE_Q macro declares and initializes the list head. + * The DEFINE_WAKE_Q macro declares and initializes the list head. * wake_up_q() does NOT reinitialize the list; it's expected to be * called near the end of a function, where the fact that the queue is * not used again will be easy to see by inspection. @@ -1009,7 +1009,7 @@ struct wake_q_head { #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) -#define WAKE_Q(name) \ +#define DEFINE_WAKE_Q(name) \ struct wake_q_head name = { WAKE_Q_TAIL, &name.first } extern void wake_q_add(struct wake_q_head *head, diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 8cbd6e6894d5..7a2d8f0c8ae5 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -967,7 +967,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, struct timespec ts; struct posix_msg_tree_node *new_leaf = NULL; int ret = 0; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (u_abs_timeout) { int res = prepare_timeout(u_abs_timeout, &expires, &ts); @@ -1151,7 +1151,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, msg_ptr = wait.msg; } } else { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); msg_ptr = msg_get(info); diff --git a/ipc/msg.c b/ipc/msg.c index e12307d0c920..32e9bd837cde 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -235,7 +235,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); expunge_all(msq, -EIDRM, &wake_q); ss_wakeup(msq, &wake_q, true); @@ -397,7 +397,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, goto out_up; case IPC_SET: { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (msqid64.msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { @@ -634,7 +634,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, struct msg_msg *msg; int err; struct ipc_namespace *ns; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; @@ -850,7 +850,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl struct msg_queue *msq; struct ipc_namespace *ns; struct msg_msg *msg, *copy = NULL; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; diff --git a/kernel/futex.c b/kernel/futex.c index 2c4be467fecd..9246d9f593d1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1298,7 +1298,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); bool 
deboost; int ret = 0; @@ -1415,7 +1415,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; int ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (!bitset) return -EINVAL; @@ -1469,7 +1469,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); @@ -1708,7 +1708,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (requeue_pi) { /* diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a65e09a046ac..c0731685603f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -840,7 +840,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne { struct task_struct *next = NULL; unsigned long owner, flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1ec0f48962b3..d8d7a153b711 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1382,7 +1382,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, struct wake_q_head *wqh)) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2fa2e2e64950..263e7449282a 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -225,7 +225,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; @@ -461,7 +461,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); @@ -495,7 +495,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* @@ -571,7 +571,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. 
@@ -625,7 +625,7 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); -- cgit v1.2.3 From e96271f3ed7e702fa36dd0605c0c5b5f065af816 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 18 Nov 2016 13:38:43 +0200 Subject: perf/core: Fix address filter parser The token table passed into match_token() must be null-terminated, which it currently is not in perf's address filter string parser, as caught by Vince's perf_fuzzer and KASAN. It doesn't blow up otherwise only because of the alignment padding of the table to the next element in the .rodata, which is pure luck. Fix this by adding a null terminator to the token table. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Alexander Shishkin Acked-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Thomas Gleixner Cc: dvyukov@google.com Cc: stable@vger.kernel.org # v4.7+ Fixes: 375637bc524 ("perf/core: Introduce address range filtering") Link: http://lkml.kernel.org/r/877f81f264.fsf@ashishki-desk.ger.corp.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ff230bb4a02e..6ee1febdf6ff 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8029,6 +8029,7 @@ restart: * if is not specified, the range is treated as a single address. */ enum { + IF_ACT_NONE = -1, IF_ACT_FILTER, IF_ACT_START, IF_ACT_STOP, @@ -8052,6 +8053,7 @@ static const match_table_t if_tokens = { { IF_SRC_KERNEL, "%u/%u" }, { IF_SRC_FILEADDR, "%u@%s" }, { IF_SRC_KERNELADDR, "%u" }, + { IF_ACT_NONE, NULL }, }; /* -- cgit v1.2.3 From 97bc402db7821259f6a722cb38e060aa9b35b6e8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 19 Nov 2016 01:45:00 +0100 Subject: bpf, mlx5: fix mlx5e_create_rq taking reference on prog In mlx5e_create_rq(), when creating a new queue, we call bpf_prog_add() but without checking the return value. bpf_prog_add() can fail since 92117d8443bc ("bpf: fix refcnt overflow"), so we really must check it. Take the reference right when we assign it to the rq from priv->xdp_prog, and just drop the reference on the error path. Destruction in mlx5e_destroy_rq() looks good, though. Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support") Signed-off-by: Daniel Borkmann Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 +++++++++---- kernel/bpf/syscall.c | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bd0732d5d219..54bae797b338 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -513,7 +513,13 @@ static int mlx5e_create_rq(struct mlx5e_channel *c, rq->channel = c; rq->ix = c->ix; rq->priv = c->priv; - rq->xdp_prog = priv->xdp_prog; + + rq->xdp_prog = priv->xdp_prog ? 
bpf_prog_inc(priv->xdp_prog) : NULL; + if (IS_ERR(rq->xdp_prog)) { + err = PTR_ERR(rq->xdp_prog); + rq->xdp_prog = NULL; + goto err_rq_wq_destroy; + } rq->buff.map_dir = DMA_FROM_DEVICE; if (rq->xdp_prog) @@ -590,12 +596,11 @@ static int mlx5e_create_rq(struct mlx5e_channel *c, rq->page_cache.head = 0; rq->page_cache.tail = 0; - if (rq->xdp_prog) - bpf_prog_add(rq->xdp_prog, 1); - return 0; err_rq_wq_destroy: + if (rq->xdp_prog) + bpf_prog_put(rq->xdp_prog); mlx5_wq_destroy(&rq->wq_ctrl); return err; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ce1b7de7d72c..eb15498b8d55 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -696,6 +696,7 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) { return bpf_prog_add(prog, 1); } +EXPORT_SYMBOL_GPL(bpf_prog_inc); static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) { -- cgit v1.2.3 From 4e201566402c878a225d4425df8a4a664c6f251e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 22 Nov 2016 09:21:16 +0000 Subject: genirq/msi: Drop artificial PCI dependency The generic MSI layer doesn't have any PCI ties anymore, and the build hack should have been removed some time ago. Fixes: d9109698be6e ("genirq: Introduce msi_domain_alloc/free_irqs()") Signed-off-by: Marc Zyngier Link: http://lkml.kernel.org/r/1479806476-20801-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- kernel/irq/msi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 8a3e872798f3..ee230063f033 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,9 +14,7 @@ #include #include #include - -/* Temparory solution for building, will be removed later */ -#include +#include /** * alloc_msi_entry - Allocate an initialize msi_entry -- cgit v1.2.3 From 18f649ef344127ef6de23a5a4272dbe2fdb73dde Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Nov 2016 19:46:09 +0100 Subject: sched/autogroup: Fix autogroup_move_group() to never skip sched_move_task() The PF_EXITING check in task_wants_autogroup() is no longer needed. Remove it, but see the next patch. However the comment is correct in that autogroup_move_group() must always change task_group() for every thread so the sysctl_ check is very wrong; we can race with cgroups and even sys_setsid() is not safe because a task running with task_group() == ag->tg must participate in refcounting: int main(void) { int sctl = open("/proc/sys/kernel/sched_autogroup_enabled", O_WRONLY); assert(sctl > 0); if (fork()) { wait(NULL); // destroy the child's ag/tg pause(); } assert(pwrite(sctl, "1\n", 2, 0) == 2); assert(setsid() > 0); if (fork()) pause(); kill(getppid(), SIGKILL); sleep(1); // The child has gone, the grandchild runs with kref == 1 assert(pwrite(sctl, "0\n", 2, 0) == 2); assert(setsid() > 0); // runs with the freed ag/tg for (;;) sleep(1); return 0; } crashes the kernel. It doesn't really need sleep(1), it doesn't matter if autogroup_move_group() actually frees the task_group or this happens later. 
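For readability, here is the reproducer quoted above as a standalone program; only the includes were added, the logic is unchanged:

	#include <assert.h>
	#include <fcntl.h>
	#include <signal.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		int sctl = open("/proc/sys/kernel/sched_autogroup_enabled", O_WRONLY);

		assert(sctl > 0);

		if (fork()) {
			wait(NULL);	// destroy the child's ag/tg
			pause();
		}

		assert(pwrite(sctl, "1\n", 2, 0) == 2);
		assert(setsid() > 0);
		if (fork())
			pause();

		kill(getppid(), SIGKILL);
		sleep(1);

		// The child has gone, the grandchild runs with kref == 1
		assert(pwrite(sctl, "0\n", 2, 0) == 2);
		assert(setsid() > 0);	// runs with the freed ag/tg
		for (;;)
			sleep(1);

		return 0;
	}
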
Reported-by: Vern Lovejoy Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hartsjc@redhat.com Cc: vbendel@redhat.com Link: http://lkml.kernel.org/r/20161114184609.GA15965@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index a5d966cb8891..ad2b19ad6ca0 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). */ - if (p->flags & PF_EXITING) - return false; - return true; } @@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. + */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } -- cgit v1.2.3 From 8e5bfa8c1f8471aa4a2d30be631ef2b50e10abaf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Nov 2016 19:46:12 +0100 Subject: sched/autogroup: Do not use autogroup->tg in zombie threads Exactly because for_each_thread() in autogroup_move_group() can't see it and update its ->sched_task_group before _put() and possibly free(). So the exiting task needs another sched_move_task() before exit_notify() and we need to re-introduce the PF_EXITING (or similar) check removed by the previous change for another reason. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hartsjc@redhat.com Cc: vbendel@redhat.com Cc: vlovejoy@redhat.com Link: http://lkml.kernel.org/r/20161114184612.GA15968@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/exit.c | 1 + kernel/sched/auto_group.c | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..e9c009dc3a4a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2567,6 +2567,7 @@ extern void sched_autogroup_create_attach(struct task_struct *p); extern void sched_autogroup_detach(struct task_struct *p); extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); +extern void sched_autogroup_exit_task(struct task_struct *p); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); @@ -2576,6 +2577,7 @@ static inline void sched_autogroup_create_attach(struct task_struct *p) { } static inline void sched_autogroup_detach(struct task_struct *p) { } static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } +static inline void sched_autogroup_exit_task(struct task_struct *p) { } #endif extern int yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/exit.c b/kernel/exit.c index 9d68c45ebbe3..3076f3089919 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -836,6 +836,7 @@ void __noreturn do_exit(long code) */ perf_event_exit_task(tsk); + sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index ad2b19ad6ca0..f1c8fd566246 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -115,10 +115,26 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) * If we race with autogroup_move_group() the caller can use the old * value of signal->autogroup but in this case sched_move_task() will * be called again before autogroup_kref_put(). + * + * However, there is no way sched_autogroup_exit_task() could tell us + * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case. */ + if (p->flags & PF_EXITING) + return false; + return true; } +void sched_autogroup_exit_task(struct task_struct *p) +{ + /* + * We are going to call exit_notify() and autogroup_move_group() can't + * see this thread after that: we can no longer use signal->autogroup. + * See the PF_EXITING check in task_wants_autogroup(). + */ + sched_move_task(p); +} + static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -142,6 +158,9 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * In the latter case for_each_thread() can not miss a migrating thread, * cpu_cgroup_attach() must not be possible after cgroup_exit() and it * can't be removed from thread list, we hold ->siglock. + * + * If an exiting thread was already removed from thread list we rely on + * sched_autogroup_exit_task(). 
*/ for_each_thread(p, t) sched_move_task(t); -- cgit v1.2.3 From 5aff60a191e579ae00ae5ca6ce16c13b687bc8a3 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 2 Nov 2016 05:08:29 -0400 Subject: locking/osq: Break out of spin-wait busy waiting loop for a preempted vCPU in osq_lock() An over-committed guest with more vCPUs than pCPUs has a heavy overload in osq_lock(). This is because if vCPU-A holds the osq lock and yields out, vCPU-B ends up waiting for per_cpu node->locked to be set. IOW, vCPU-B waits for vCPU-A to run and unlock the osq lock. Use the new vcpu_is_preempted(cpu) interface to detect if a vCPU is currently running or not, and break out of the spin-loop if so. test case: $ perf record -a perf bench sched messaging -g 400 -p && perf report before patch: 18.09% sched-messaging [kernel.vmlinux] [k] osq_lock 12.28% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner 5.27% sched-messaging [kernel.vmlinux] [k] mutex_unlock 3.89% sched-messaging [kernel.vmlinux] [k] wait_consider_task 3.64% sched-messaging [kernel.vmlinux] [k] _raw_write_lock_irq 3.41% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner.is 2.49% sched-messaging [kernel.vmlinux] [k] system_call after patch: 20.68% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner 8.45% sched-messaging [kernel.vmlinux] [k] mutex_unlock 4.12% sched-messaging [kernel.vmlinux] [k] system_call 3.01% sched-messaging [kernel.vmlinux] [k] system_call_common 2.83% sched-messaging [kernel.vmlinux] [k] copypage_power7 2.64% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner 2.00% sched-messaging [kernel.vmlinux] [k] osq_lock Suggested-by: Boqun Feng Tested-by: Juergen Gross Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Acked-by: Christian Borntraeger Acked-by: Paolo Bonzini Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: bsingharora@gmail.com Cc: dave@stgolabs.net Cc: kernellwp@gmail.com Cc: konrad.wilk@oracle.com Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: rkrcmar@redhat.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: xen-devel-request@lists.xenproject.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1478077718-37424-3-git-send-email-xinhui.pan@linux.vnet.ibm.com [ Translated to English. ] Signed-off-by: Ingo Molnar --- kernel/locking/osq_lock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 4ea2710b9d6c..a3167941093b 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr) return cpu_nr + 1; } +static inline int node_cpu(struct optimistic_spin_node *node) +{ + return node->cpu - 1; +} + static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) { int cpu_nr = encoded_cpu_val - 1; @@ -118,8 +123,10 @@ bool osq_lock(struct optimistic_spin_queue *lock) while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. 
+ * Use vcpu_is_preempted() to avoid waiting for a preempted + * lock holder: */ - if (need_resched()) + if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) goto unqueue; cpu_relax(); -- cgit v1.2.3 From 05ffc951392df57edecc2519327b169210c3df75 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 2 Nov 2016 05:08:30 -0400 Subject: locking/mutex: Break out of expensive busy-loop on {mutex,rwsem}_spin_on_owner() when owner vCPU is preempted An over-committed guest with more vCPUs than pCPUs suffers heavy overhead in the two {mutex,rwsem}_spin_on_owner() loops. This is caused by the lock-holder preemption issue. Break out of the loop if the owner's vCPU is preempted, i.e. if vcpu_is_preempted(cpu) returns true. test-case: perf record -a perf bench sched messaging -g 400 -p && perf report before patch: 20.68% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner 8.45% sched-messaging [kernel.vmlinux] [k] mutex_unlock 4.12% sched-messaging [kernel.vmlinux] [k] system_call 3.01% sched-messaging [kernel.vmlinux] [k] system_call_common 2.83% sched-messaging [kernel.vmlinux] [k] copypage_power7 2.64% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner 2.00% sched-messaging [kernel.vmlinux] [k] osq_lock after patch: 9.99% sched-messaging [kernel.vmlinux] [k] mutex_unlock 5.28% sched-messaging [unknown] [H] 0xc0000000000768e0 4.27% sched-messaging [kernel.vmlinux] [k] __copy_tofrom_user_power7 3.77% sched-messaging [kernel.vmlinux] [k] copypage_power7 3.24% sched-messaging [kernel.vmlinux] [k] _raw_write_lock_irq 3.02% sched-messaging [kernel.vmlinux] [k] system_call 2.69% sched-messaging [kernel.vmlinux] [k] wait_consider_task Tested-by: Juergen Gross Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Acked-by: Christian Borntraeger Acked-by: Paolo Bonzini Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: boqun.feng@gmail.com Cc: bsingharora@gmail.com Cc: dave@stgolabs.net Cc: kernellwp@gmail.com Cc: konrad.wilk@oracle.com Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: rkrcmar@redhat.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: xen-devel-request@lists.xenproject.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1478077718-37424-4-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 13 +++++++++++-- kernel/locking/rwsem-xadd.c | 14 +++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c0731685603f..9b349619f431 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -364,7 +364,11 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) */ barrier(); - if (!owner->on_cpu || need_resched()) { + /* + * Use vcpu_is_preempted to detect lock holder preemption issue. 
+ */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { ret = false; break; } @@ -389,8 +393,13 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) rcu_read_lock(); owner = __mutex_owner(lock); + + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ if (owner) - retval = owner->on_cpu; + retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); rcu_read_unlock(); /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 263e7449282a..631506004f9e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) goto done; } - ret = owner->on_cpu; + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ + ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); done: rcu_read_unlock(); return ret; @@ -362,8 +366,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) */ barrier(); - /* abort spinning when need_resched or owner is not running */ - if (!owner->on_cpu || need_resched()) { + /* + * abort spinning when need_resched or owner is not running or + * owner's cpu is preempted. + */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { rcu_read_unlock(); return false; } -- cgit v1.2.3 From bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 13 Oct 2016 21:23:16 -0500 Subject: mm: Add a user_ns owner to mm_struct and fix ptrace permission checks During exec dumpable is cleared if the file that is being executed is not readable by the user executing the file. A bug in ptrace_may_access allows reading the file if the executable happens to enter into a subordinate user namespace (aka clone(CLONE_NEWUSER), unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER)). This problem is fixed with only necessary userspace breakage by adding a user namespace owner to mm_struct, captured at the time of exec, so it is clear in which user namespace CAP_SYS_PTRACE must be present to be able to safely give read permission to the executable. The function ptrace_may_access is modified to verify that the ptracer has CAP_SYS_PTRACE in task->mm->user_ns instead of task->cred->user_ns. This ensures that if the task changes its cred into a subordinate user namespace it does not become ptraceable. The function ptrace_attach is modified to only set PT_PTRACE_CAP when CAP_SYS_PTRACE is held over task->mm->user_ns. The intent of PT_PTRACE_CAP is to be a flag to note that whatever permission changes the task might go through the tracer has sufficient permissions for it not to be an issue. task->cred->user_ns is always the same as, or a descendant of, mm->user_ns, which guarantees that having CAP_SYS_PTRACE over mm->user_ns is the worst case for the task's credentials. To prevent regressions, mm->dumpable and mm->user_ns are not considered when a task has no mm, as simply failing ptrace_may_access would cause regressions in privileged applications attempting to read things such as /proc//stat. Cc: stable@vger.kernel.org Acked-by: Kees Cook Tested-by: Cyrill Gorcunov Fixes: 8409cca70561 ("userns: allow ptrace from non-init user namespaces") Signed-off-by: "Eric W. 
Biederman" --- include/linux/mm_types.h | 1 + kernel/fork.c | 9 ++++++--- kernel/ptrace.c | 26 +++++++++++--------------- mm/init-mm.c | 2 ++ 4 files changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4a8acedf4b7d..08d947fc4c59 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -473,6 +473,7 @@ struct mm_struct { */ struct task_struct __rcu *owner; #endif + struct user_namespace *user_ns; /* store ref to file /proc//exe symlink points to */ struct file __rcu *exe_file; diff --git a/kernel/fork.c b/kernel/fork.c index 997ac1d584f7..ba8a01564985 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -745,7 +745,8 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + struct user_namespace *user_ns) { mm->mmap = NULL; mm->mm_rb = RB_ROOT; @@ -785,6 +786,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) if (init_new_context(p, mm)) goto fail_nocontext; + mm->user_ns = get_user_ns(user_ns); return mm; fail_nocontext: @@ -830,7 +832,7 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - return mm_init(mm, current); + return mm_init(mm, current, current_user_ns()); } /* @@ -845,6 +847,7 @@ void __mmdrop(struct mm_struct *mm) destroy_context(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); + put_user_ns(mm->user_ns); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1126,7 +1129,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); - if (!mm_init(mm, tsk)) + if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; err = dup_mmap(mm, oldmm); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e6474f7272ec..282821557183 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -220,7 +220,7 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; - int dumpable = 0; + struct mm_struct *mm; kuid_t caller_uid; kgid_t caller_gid; @@ -271,16 +271,11 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); - smp_rmb(); - if (task->mm) - dumpable = get_dumpable(task->mm); - rcu_read_lock(); - if (dumpable != SUID_DUMP_USER && - !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); + mm = task->mm; + if (mm && + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptrace_has_cap(mm->user_ns, mode))) + return -EPERM; return security_ptrace_access_check(task, mode); } @@ -331,6 +326,11 @@ static int ptrace_attach(struct task_struct *task, long request, task_lock(task); retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); + if (!retval) { + struct mm_struct *mm = task->mm; + if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE)) + flags |= PT_PTRACE_CAP; + } task_unlock(task); if (retval) goto unlock_creds; @@ -344,10 +344,6 @@ static int ptrace_attach(struct task_struct *task, long request, if (seize) flags |= PT_SEIZED; - rcu_read_lock(); - if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE)) - flags |= PT_PTRACE_CAP; - rcu_read_unlock(); task->ptrace = flags; __ptrace_link(task, current); diff --git a/mm/init-mm.c b/mm/init-mm.c index a56a851908d2..975e49f00f34 
100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -21,5 +22,6 @@ struct mm_struct init_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) }; -- cgit v1.2.3 From 64b875f7ac8a5d60a4e191479299e931ee949b67 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Nov 2016 18:48:07 -0600 Subject: ptrace: Capture the ptracer's creds not PT_PTRACE_CAP When the flag PT_PTRACE_CAP was added, the PTRACE_TRACEME path was overlooked. This can result in incorrect behavior when an application like strace traces an exec of a setuid executable. Further, PT_PTRACE_CAP does not have enough information for making good security decisions as it does not report which user namespace the capability is in. This has already allowed one mistake through insufficient granularity. I found this issue when I was testing another corner case of exec and discovered that I could not get strace to set PT_PTRACE_CAP even when running strace as root with a full set of caps. This change fixes the above issue with strace, allowing a setuid executable to be straced as root without disabling setuid. More fundamentally, this change allows what is allowable at all times, by using the correct information in its decision. Cc: stable@vger.kernel.org Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12") Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 2 +- include/linux/capability.h | 1 + include/linux/ptrace.h | 1 - include/linux/sched.h | 1 + kernel/capability.c | 20 ++++++++++++++++++++ kernel/ptrace.c | 12 +++++++----- 6 files changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 4e497b9ee71e..3cf2cfced97a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1406,7 +1406,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) unsigned n_fs; if (p->ptrace) { - if (p->ptrace & PT_PTRACE_CAP) + if (ptracer_capable(p, current_user_ns())) bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; else bprm->unsafe |= LSM_UNSAFE_PTRACE; diff --git a/include/linux/capability.h b/include/linux/capability.h index dbc21c719ce6..d6088e2a7668 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -242,6 +242,7 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) #endif /* CONFIG_MULTIUSER */ extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); +extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 504c98a278d4..e13bfdf7f314 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -19,7 +19,6 @@ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ -#define PT_PTRACE_CAP 0x00000004 /* ptracer can follow suid-exec */ #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..e9f693598e15 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1656,6 +1656,7 @@ struct task_struct { struct 
list_head cpu_timers[3]; /* process credentials */ + const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */ const struct cred __rcu *real_cred; /* objective and real subjective task * credentials (COW) */ const struct cred __rcu *cred; /* effective (overridable) subjective task diff --git a/kernel/capability.c b/kernel/capability.c index 00411c82dac5..dfa0e4528b0b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -473,3 +473,23 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) kgid_has_mapping(ns, inode->i_gid); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); + +/** + * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace + * @tsk: The task that may be ptraced + * @ns: The user namespace to search for CAP_SYS_PTRACE in + * + * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE + * in the specified user namespace. + */ +bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) +{ + int ret = 0; /* An absent tracer adds no restrictions */ + const struct cred *cred; + rcu_read_lock(); + cred = rcu_dereference(tsk->ptracer_cred); + if (cred) + ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + rcu_read_unlock(); + return (ret == 0); +} diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 282821557183..e82c15cadd6d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -39,6 +39,9 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) BUG_ON(!list_empty(&child->ptrace_entry)); list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; + rcu_read_lock(); + child->ptracer_cred = get_cred(__task_cred(new_parent)); + rcu_read_unlock(); } /** @@ -71,12 +74,16 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) */ void __ptrace_unlink(struct task_struct *child) { + const struct cred *old_cred; BUG_ON(!child->ptrace); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); child->parent = child->real_parent; list_del_init(&child->ptrace_entry); + old_cred = child->ptracer_cred; + child->ptracer_cred = NULL; + put_cred(old_cred); spin_lock(&child->sighand->siglock); child->ptrace = 0; @@ -326,11 +333,6 @@ static int ptrace_attach(struct task_struct *task, long request, task_lock(task); retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); - if (!retval) { - struct mm_struct *mm = task->mm; - if (mm && ns_capable(mm->user_ns, CAP_SYS_PTRACE)) - flags |= PT_PTRACE_CAP; - } task_unlock(task); if (retval) goto unlock_creds; -- cgit v1.2.3 From 84d77d3f06e7e8dea057d10e8ec77ad71f721be3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 22 Nov 2016 12:06:50 -0600 Subject: ptrace: Don't allow accessing an undumpable mm It is a reasonable expectation that if an executable file is not readable there will be no way for a user without special privileges to read the file. This is enforced in ptrace_attach, but if ptrace is already attached before exec there is no enforcement for read-only executables. As the only way to read such an mm is through access_process_vm, spin off a variant called ptrace_access_vm that will fail if the target process is not being ptraced by the current process, or the current process did not have sufficient privileges when ptracing began to read the target process's mm. In the ptrace implementations, replace access_process_vm by ptrace_access_vm. 
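For context, the path this hardens is the one ordinary PTRACE_PEEKDATA users hit (PTRACE_PEEKDATA -> generic_ptrace_peekdata() -> ptrace_access_vm()); a minimal, hypothetical userspace illustration of such a user (not part of the patch):

	#include <errno.h>
	#include <signal.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	static long secret = 0x1234abcd;

	int main(void)
	{
		long word;
		pid_t child = fork();

		if (child == 0) {
			/* Opt in to tracing by the parent, then stop. */
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			raise(SIGSTOP);
			_exit(0);
		}

		waitpid(child, NULL, 0);	/* child is now stopped */

		/* After this patch the read is served by ptrace_access_vm(),
		 * which succeeds only for the child's tracer with the
		 * privileges recorded at attach time. */
		errno = 0;
		word = ptrace(PTRACE_PEEKDATA, child, &secret, NULL);
		if (word == -1 && errno)
			perror("PTRACE_PEEKDATA");
		else
			printf("read 0x%lx from the child\n", word);

		kill(child, SIGKILL);
		return 0;
	}
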
There remain several ptrace sites that still use access_process_vm as they are reading the target executables instructions (for kernel consumption) or register stacks. As such it does not appear necessary to add a permission check to those calls. This bug has always existed in Linux. Fixes: v1.0 Cc: stable@vger.kernel.org Reported-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" --- arch/alpha/kernel/ptrace.c | 2 +- arch/blackfin/kernel/ptrace.c | 4 ++-- arch/cris/arch-v32/kernel/ptrace.c | 2 +- arch/ia64/kernel/ptrace.c | 2 +- arch/mips/kernel/ptrace32.c | 4 ++-- arch/powerpc/kernel/ptrace32.c | 4 ++-- include/linux/mm.h | 2 ++ include/linux/ptrace.h | 3 +++ kernel/ptrace.c | 42 ++++++++++++++++++++++++++++++++------ mm/memory.c | 2 +- mm/nommu.c | 2 +- 11 files changed, 52 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index 940dfb406591..04abdec7f496 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -283,7 +283,7 @@ long arch_ptrace(struct task_struct *child, long request, /* When I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ case PTRACE_PEEKDATA: - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), + copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); ret = -EIO; if (copied != sizeof(tmp)) diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index 8d79286ee4e8..360d99645163 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -270,7 +270,7 @@ long arch_ptrace(struct task_struct *child, long request, switch (bfin_mem_access_type(addr, to_copy)) { case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: - copied = access_process_vm(child, addr, &tmp, + copied = ptrace_access_vm(child, addr, &tmp, to_copy, FOLL_FORCE); if (copied) break; @@ -323,7 +323,7 @@ long arch_ptrace(struct task_struct *child, long request, switch (bfin_mem_access_type(addr, to_copy)) { case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: - copied = access_process_vm(child, addr, &data, + copied = ptrace_access_vm(child, addr, &data, to_copy, FOLL_FORCE | FOLL_WRITE); break; diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c index f0df654ac6fc..fe1f9cf7b391 100644 --- a/arch/cris/arch-v32/kernel/ptrace.c +++ b/arch/cris/arch-v32/kernel/ptrace.c @@ -147,7 +147,7 @@ long arch_ptrace(struct task_struct *child, long request, /* The trampoline page is globally mapped, no page table to traverse.*/ tmp = *(unsigned long*)addr; } else { - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); + copied = ptrace_access_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 31aa8c0f68e1..36f660da8124 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -1159,7 +1159,7 @@ arch_ptrace (struct task_struct *child, long request, case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: /* read word at location addr */ - if (access_process_vm(child, addr, &data, sizeof(data), + if (ptrace_access_vm(child, addr, &data, sizeof(data), FOLL_FORCE) != sizeof(data)) return -EIO; diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 7e71a4e0281b..5fcbdcd7abd0 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -69,7 +69,7 @@ long compat_arch_ptrace(struct task_struct *child, 
compat_long_t request, if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0) break; - copied = access_process_vm(child, (u64)addrOthers, &tmp, + copied = ptrace_access_vm(child, (u64)addrOthers, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; @@ -178,7 +178,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (get_user(addrOthers, (u32 __user * __user *) (unsigned long) addr) != 0) break; ret = 0; - if (access_process_vm(child, (u64)addrOthers, &data, + if (ptrace_access_vm(child, (u64)addrOthers, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE) == sizeof(data)) break; diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index 010b7b310237..1e887f3a61a6 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -73,7 +73,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (get_user(addrOthers, (u32 __user * __user *)addr) != 0) break; - copied = access_process_vm(child, (u64)addrOthers, &tmp, + copied = ptrace_access_vm(child, (u64)addrOthers, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; @@ -178,7 +178,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (get_user(addrOthers, (u32 __user * __user *)addr) != 0) break; ret = 0; - if (access_process_vm(child, (u64)addrOthers, &tmp, + if (ptrace_access_vm(child, (u64)addrOthers, &tmp, sizeof(tmp), FOLL_FORCE | FOLL_WRITE) == sizeof(tmp)) break; diff --git a/include/linux/mm.h b/include/linux/mm.h index a92c8d73aeaf..0b5b2e4df14e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1270,6 +1270,8 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void * unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); +extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index e13bfdf7f314..e0e539321ab9 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -8,6 +8,9 @@ #include /* For task_active_pid_ns. */ #include +extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); + /* * Ptrace flags * diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e82c15cadd6d..49ba7c1ade9d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -27,6 +27,35 @@ #include #include +/* + * Access another process' address space via ptrace. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + if (!tsk->ptrace || + (current != tsk->parent) || + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptracer_capable(tsk, mm->user_ns))) { + mmput(mm); + return 0; + } + + ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + mmput(mm); + + return ret; +} + /* * ptrace a task: make the debugger its new parent and @@ -535,7 +564,8 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? 
sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); + retval = ptrace_access_vm(tsk, src, buf, this_len, FOLL_FORCE); + if (!retval) { if (copied) break; @@ -562,7 +592,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, + retval = ptrace_access_vm(tsk, dst, buf, this_len, FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) @@ -1126,7 +1156,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); + copied = ptrace_access_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1137,7 +1167,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), + copied = ptrace_access_vm(tsk, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1155,7 +1185,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), + ret = ptrace_access_vm(child, addr, &word, sizeof(word), FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; @@ -1165,7 +1195,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), + ret = ptrace_access_vm(child, addr, &data, sizeof(data), FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/mm/memory.c b/mm/memory.c index e18c57bdc75c..cbb1e5e5f791 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3868,7 +3868,7 @@ EXPORT_SYMBOL_GPL(generic_access_phys); * Access another process' address space as given in mm. If non-NULL, use the * given task for page fault accounting. */ -static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, +int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; diff --git a/mm/nommu.c b/mm/nommu.c index 8b8faaf2a9e9..44265e00b701 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1808,7 +1808,7 @@ void filemap_map_pages(struct fault_env *fe, } EXPORT_SYMBOL(filemap_map_pages); -static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, +int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; -- cgit v1.2.3 From f84df2a6f268de584a201e8911384a2d244876e3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 16 Nov 2016 22:06:51 -0600 Subject: exec: Ensure mm->user_ns contains the execed files When the user namespace support was merged the need to prevent ptrace from revealing the contents of an unreadable executable was overlooked. Correct this oversight by ensuring that the executed file or files are in mm->user_ns, by adjusting mm->user_ns. Use the new function privileged_wrt_inode_uidgid to see if the executable is a member of the user namespace, and as such if having CAP_SYS_PTRACE in the user namespace should allow tracing the executable. 
If not update mm->user_ns to the parent user namespace until an appropriate parent is found. Cc: stable@vger.kernel.org Reported-by: Jann Horn Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.") Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 19 +++++++++++++++++-- include/linux/capability.h | 1 + kernel/capability.c | 16 ++++++++++++++-- 3 files changed, 32 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3cf2cfced97a..fdf53f0c421b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1275,8 +1275,22 @@ EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { - if (inode_permission(file_inode(file), MAY_READ) < 0) + struct inode *inode = file_inode(file); + if (inode_permission(inode, MAY_READ) < 0) { + struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + /* Ensure mm->user_ns contains the executable */ + user_ns = old = bprm->mm->user_ns; + while ((user_ns != &init_user_ns) && + !privileged_wrt_inode_uidgid(user_ns, inode)) + user_ns = user_ns->parent; + + if (old != user_ns) { + bprm->mm->user_ns = get_user_ns(user_ns); + put_user_ns(old); + } + } } EXPORT_SYMBOL(would_dump); @@ -1306,7 +1320,6 @@ void setup_new_exec(struct linux_binprm * bprm) !gid_eq(bprm->cred->gid, current_egid())) { current->pdeath_signal = 0; } else { - would_dump(bprm, bprm->file); if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) set_dumpable(current->mm, suid_dumpable); } @@ -1741,6 +1754,8 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out; + would_dump(bprm, bprm->file); + retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/include/linux/capability.h b/include/linux/capability.h index d6088e2a7668..6ffb67e10c06 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -240,6 +240,7 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) return true; } #endif /* CONFIG_MULTIUSER */ +extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode); extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); diff --git a/kernel/capability.c b/kernel/capability.c index dfa0e4528b0b..4984e1f552eb 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -456,6 +456,19 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, } EXPORT_SYMBOL(file_ns_capable); +/** + * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? + * @ns: The user namespace in question + * @inode: The inode in question + * + * Return true if the inode uid and gid are within the namespace. 
+ */ +bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode) +{ + return kuid_has_mapping(ns, inode->i_uid) && + kgid_has_mapping(ns, inode->i_gid); +} + /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question @@ -469,8 +482,7 @@ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); - return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) && - kgid_has_mapping(ns, inode->i_gid); + return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); -- cgit v1.2.3 From 31eff2434db542763a00074a8368d7bd78d14ea1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Nov 2016 19:35:34 +0100 Subject: sched/nohz: Convert to hotplug state machine Install the callbacks via the state machine. Signed-off-by: Sebastian Andrzej Siewior Cc: rt@linuxtronix.de Link: http://lkml.kernel.org/r/20161117183541.8588-14-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3bcb61b52f6c..71496a20e670 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -390,24 +390,16 @@ static int __init tick_nohz_full_setup(char *str) } __setup("nohz_full=", tick_nohz_full_setup); -static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int tick_nohz_cpu_down(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* - * The boot CPU handles housekeeping duty (unbound timers, - * workqueues, timekeeping, ...) on behalf of full dynticks - * CPUs. It must remain online when nohz full is enabled. - */ - if (tick_nohz_full_running && tick_do_timer_cpu == cpu) - return NOTIFY_BAD; - break; - } - return NOTIFY_OK; + /* + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. + */ + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) + return -EBUSY; + return 0; } static int tick_nohz_init_all(void) @@ -428,7 +420,7 @@ static int tick_nohz_init_all(void) void __init tick_nohz_init(void) { - int cpu; + int cpu, ret; if (!tick_nohz_full_running) { if (tick_nohz_init_all() < 0) @@ -469,7 +461,10 @@ void __init tick_nohz_init(void) for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); - cpu_notifier(tick_nohz_cpu_down_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "kernel/nohz:predown", NULL, + tick_nohz_cpu_down); + WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); -- cgit v1.2.3 From 478409dd683db76cbcfe7bf8332a37f01deb0a2d Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Mon, 21 Nov 2016 15:57:18 +0800 Subject: tracing: Add hook to function tracing for other subsystems to use Currently Function traces can be only exported to the ring buffer. This adds a trace_export concept which can process traces and export them to a registered destination as an addition to the current one that outputs to Ftrace - i.e. ring buffer. 
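As a sketch of the intended use, a minimal exporter module might look like this (hypothetical module; only struct trace_export and the register/unregister calls below come from this patch):

	#include <linux/module.h>
	#include <linux/trace.h>

	/* Hypothetical sink: a real exporter would copy each entry to its
	 * destination (an STM device, a network socket, ...). */
	static void demo_write(const void *entry, unsigned int size)
	{
		pr_debug("exported trace entry of %u bytes\n", size);
	}

	static struct trace_export demo_export = {
		.write = demo_write,
	};

	static int __init demo_init(void)
	{
		return register_ftrace_export(&demo_export);
	}

	static void __exit demo_exit(void)
	{
		unregister_ftrace_export(&demo_export);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
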
In this way, if we want function traces to be sent to other destinations rather than only to the ring buffer, we just need to register a new trace_export and implement its own .write() function for writing traces to storage. With this patch, only function tracing (trace type is TRACE_FN) is supported. Link: http://lkml.kernel.org/r/1479715043-6534-2-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt --- include/linux/trace.h | 28 +++++++++++ kernel/trace/trace.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 include/linux/trace.h (limited to 'kernel') diff --git a/include/linux/trace.h b/include/linux/trace.h new file mode 100644 index 000000000000..9330a58e2651 --- /dev/null +++ b/include/linux/trace.h @@ -0,0 +1,28 @@ +#ifndef _LINUX_TRACE_H +#define _LINUX_TRACE_H + +#ifdef CONFIG_TRACING +/* + * The trace export - an export of Ftrace output. The trace_export + * can process traces and export them to a registered destination as + * an addition to the current only output of Ftrace - i.e. ring buffer. + * + * If you want traces to be sent to some other place rather than ring + * buffer only, just need to register a new trace_export and implement + * its own .write() function for writing traces to the storage. + * + * next - pointer to the next trace_export + * write - copy traces which have been delt with ->commit() to + * the destination + */ +struct trace_export { + struct trace_export __rcu *next; + void (*write)(const void *, unsigned int); +}; + +int register_ftrace_export(struct trace_export *export); +int unregister_ftrace_export(struct trace_export *export); + +#endif /* CONFIG_TRACING */ + +#endif /* _LINUX_TRACE_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 57069e7f369c..edccdff8a36d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include "trace.h" @@ -2128,6 +2129,129 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_userstack(buffer, flags, pc); } +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event) +{ + struct trace_entry *entry; + unsigned int size = 0; + + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(entry, size); +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled); + +static inline void ftrace_exports_enable(void) +{ + static_branch_enable(&ftrace_exports_enabled); +} + +static inline void ftrace_exports_disable(void) +{ + static_branch_disable(&ftrace_exports_enabled); +} + +void ftrace_exports(struct ring_buffer_event *event) +{ + struct trace_export *export; + + preempt_disable_notrace(); + + export = rcu_dereference_raw_notrace(ftrace_exports_list); + while (export) { + trace_process_export(export, event); + export = rcu_dereference_raw_notrace(export->next); + } + + preempt_enable_notrace(); +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* + * We are entering export into the list but another + * CPU might be walking that list. We need to make sure + * the export->next pointer is valid before another CPU sees + * the export pointer included into the list. 
+ */ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + if (*list == NULL) + ftrace_exports_enable(); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + if (*list == NULL) + ftrace_exports_disable(); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + mutex_lock(&ftrace_export_lock); + + add_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + int ret; + + mutex_lock(&ftrace_export_lock); + + ret = rm_ftrace_export(&ftrace_exports_list, export); + + mutex_unlock(&ftrace_export_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, @@ -2146,8 +2270,11 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - if (!call_filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) { + if (static_branch_unlikely(&ftrace_exports_enabled)) + ftrace_exports(event); __buffer_unlock_commit(buffer, event); + } } #ifdef CONFIG_STACKTRACE -- cgit v1.2.3 From 7d436400223bb46e9f88e6bba6f8d867acf0d82c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 22 Nov 2016 18:32:03 -0500 Subject: tracing: Add error checks to creation of event files The creation of the set_event_pid file was assigned to a variable "entry" but that variable was never used. Ideally, it should be used to check if the file was created and warn if it was not. The files header_page, header_event should also be checked and a warning if they fail to be created. The "enable" file was moved up, as it is a more crucial file to have and a hard failure (return -ENOMEM) should be returned if it is not created. 
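For reference, a small userspace sketch that touches two of the files created here (assuming tracefs is mounted at /sys/kernel/debug/tracing; the mount point varies, and root is required):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *dir = "/sys/kernel/debug/tracing";
		char path[256], buf[32];
		int fd, n;

		/* Limit event tracing to this process via set_event_pid... */
		snprintf(path, sizeof(path), "%s/set_event_pid", dir);
		fd = open(path, O_WRONLY);
		if (fd < 0) { perror(path); return 1; }
		n = snprintf(buf, sizeof(buf), "%d\n", getpid());
		if (write(fd, buf, n) != n) perror("set_event_pid");
		close(fd);

		/* ...then enable all events via the crucial "enable" file. */
		snprintf(path, sizeof(path), "%s/events/enable", dir);
		fd = open(path, O_WRONLY);
		if (fd < 0) { perror(path); return 1; }
		if (write(fd, "1", 1) != 1) perror("enable");
		close(fd);
		return 0;
	}
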
Reported-by: David Binderman Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 03c0a48c3ac4..ba67ede48822 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2843,20 +2843,32 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } + entry = trace_create_file("enable", 0644, d_events, + tr, &ftrace_tr_enable_fops); + if (!entry) { + pr_warn("Could not create tracefs 'enable' entry\n"); + return -ENOMEM; + } + + /* There are not as crucial, just warn if they are not created */ + entry = tracefs_create_file("set_event_pid", 0644, parent, tr, &ftrace_set_event_pid_fops); + if (!entry) + pr_warn("Could not create tracefs 'set_event_pid' entry\n"); /* ring buffer internal formats */ - trace_create_file("header_page", 0444, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - trace_create_file("header_event", 0444, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); + entry = trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + if (!entry) + pr_warn("Could not create tracefs 'header_page' entry\n"); - trace_create_file("enable", 0644, d_events, - tr, &ftrace_tr_enable_fops); + entry = trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + if (!entry) + pr_warn("Could not create tracefs 'header_event' entry\n"); tr->event_dir = d_events; -- cgit v1.2.3 From 176cedc4ed143745708999155c11b5717cdebb35 Mon Sep 17 00:00:00 2001 From: "T.Zhou" Date: Wed, 23 Nov 2016 08:48:32 +0800 Subject: sched/dl: Fix comment in pick_next_task_dl() Fix cut & paste oversight: s/pull_rt_task/pull_dl_task Signed-off-by: T.Zhou Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: juri.lelli@gmail.com Link: http://lkml.kernel.org/r/20161123004832.GA2983@geo Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c61b461248a3..70ef2b1901e4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1137,7 +1137,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo pull_dl_task(rq); lockdep_repin_lock(&rq->lock, cookie); /* - * pull_rt_task() can drop (and re-acquire) rq->lock; this + * pull_dl_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to * re-start task selection. 
*/ -- cgit v1.2.3 From 2b4d5b2582deffb77b3b4b48a59cd36e9e1e14d9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Nov 2016 07:37:00 +0100 Subject: sched/fair: Clean up the tunable parameter definitions No change in functionality: - align the default values vertically to make them easier to scan - standardize the 'default:' lines - fix minor whitespace typos Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02605f2826a2..aa475896782d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -37,7 +37,6 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -46,31 +45,35 @@ * * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_latency = 6000000ULL; -unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; /* * The initial- and re-scaling of tunables is configurable - * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) * * Options are: - * SCHED_TUNABLESCALING_NONE - unscaled, always *1 - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) - * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ -enum sched_tunable_scaling sysctl_sched_tunable_scaling - = SCHED_TUNABLESCALING_LOG; +enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; /* * Minimal preemption granularity for CPU-bound tasks: + * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* - * is kept at sysctl_sched_latency / sysctl_sched_min_granularity + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity */ static unsigned int sched_nr_latency = 8; @@ -82,16 +85,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; /* * SCHED_OTHER wake-up granularity. - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
 #ifdef CONFIG_CFS_BANDWIDTH
 /*
@@ -102,16 +106,18 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+ * (default: 5 msec, units: microseconds)
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 
 /*
 * The margin used when comparing utilization with CPU capacity:
 * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
 */
-unsigned int capacity_margin = 1280; /* ~20% */
+unsigned int capacity_margin = 1280;
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
@@ -7174,8 +7180,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
 * Something like:
 *
- * { 0 1 2 3 } { 4 5 6 7 }
- * * * * *
+ * { 0 1 2 3 } { 4 5 6 7 }
+ * * * * *
 *
 * If we were to balance group-wise we'd place two tasks in the first group and
 * two tasks in the second group. Clearly this is undesired as it will overload
-- cgit v1.2.3

From 3e9a8aadca4807b4eadd33a50014c9b2767a4f1f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 11:29:58 -0500
Subject: tracing: Create an always_inlined __trace_buffer_lock_reserve()

As Andi Kleen pointed out in the Link below, the trace events have quite a
bit of code execution. A lot of that happens to be calling functions, where
some of them should simply be inlined. One of these functions happens to be
trace_buffer_lock_reserve() which is also a global, but it is used
throughout the file it is defined in. Create a
__trace_buffer_lock_reserve() that is always inlined so that the file can
benefit from it.
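A minimal sketch of the resulting split (signatures abridged from the hunks
below): the body moves into an __always_inline helper, and the old global
symbol becomes a thin wrapper, so callers inside trace.c inline the body
while external callers keep the same API:

	static __always_inline struct ring_buffer_event *
	__trace_buffer_lock_reserve(struct ring_buffer *buffer, int type,
				    unsigned long len, unsigned long flags,
				    int pc);

	struct ring_buffer_event *
	trace_buffer_lock_reserve(struct ring_buffer *buffer, int type,
				  unsigned long len, unsigned long flags,
				  int pc)
	{
		return __trace_buffer_lock_reserve(buffer, type, len, flags, pc);
	}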
Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org Reported-by: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 87 +++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index edccdff8a36d..490533726b54 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -739,6 +739,31 @@ static inline void ftrace_trace_stack(struct trace_array *tr, #endif +static __always_inline void +trace_event_setup(struct ring_buffer_event *event, + int type, unsigned long flags, int pc) +{ + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; +} + +static __always_inline struct ring_buffer_event * +__trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) + trace_event_setup(event, type, flags, pc); + + return event; +} + static void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) @@ -795,8 +820,8 @@ int __trace_puts(unsigned long ip, const char *str, int size) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, pc); if (!event) return 0; @@ -843,8 +868,8 @@ int __trace_bputs(unsigned long ip, const char *str) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - irq_flags, pc); + event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, pc); if (!event) return 0; @@ -1913,29 +1938,13 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); -static __always_inline void -trace_event_setup(struct ring_buffer_event *event, - int type, unsigned long flags, int pc) -{ - struct trace_entry *ent = ring_buffer_event_data(event); - - tracing_generic_entry_update(ent, flags, pc); - ent->type = type; -} - struct ring_buffer_event * trace_buffer_lock_reserve(struct ring_buffer *buffer, int type, unsigned long len, unsigned long flags, int pc) { - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(buffer, len); - if (event != NULL) - trace_event_setup(event, type, flags, pc); - - return event; + return __trace_buffer_lock_reserve(buffer, type, len, flags, pc); } DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); @@ -2090,8 +2099,8 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, this_cpu_dec(trace_buffered_event_cnt); } - entry = trace_buffer_lock_reserve(*current_rb, - type, len, flags, pc); + entry = __trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. 
Use the temp_buffer
@@ -2100,8 +2109,8 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 */
 if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
 *current_rb = temp_buffer;
- entry = trace_buffer_lock_reserve(*current_rb,
- type, len, flags, pc);
+ entry = __trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
 }
 return entry;
 }
@@ -2262,8 +2271,8 @@ trace_function(struct trace_array *tr,
 struct ring_buffer_event *event;
 struct ftrace_entry *entry;
 
- event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
+ flags, pc);
 if (!event)
 return;
 entry = ring_buffer_event_data(event);
@@ -2342,8 +2351,8 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 
 size *= sizeof(unsigned long);
 
- event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
+ sizeof(*entry) + size, flags, pc);
 if (!event)
 goto out;
 entry = ring_buffer_event_data(event);
@@ -2444,8 +2453,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 
 __this_cpu_inc(user_stack_count);
 
- event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
- sizeof(*entry), flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
+ sizeof(*entry), flags, pc);
 if (!event)
 goto out_drop_count;
 entry = ring_buffer_event_data(event);
@@ -2615,8 +2624,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 local_save_flags(flags);
 size = sizeof(*entry) + sizeof(u32) * len;
 buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
+ flags, pc);
 if (!event)
 goto out;
 entry = ring_buffer_event_data(event);
@@ -2671,8 +2680,8 @@ __trace_array_vprintk(struct ring_buffer *buffer,
 local_save_flags(flags);
 size = sizeof(*entry) + len + 1;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- flags, pc);
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ flags, pc);
 if (!event)
 goto out;
 entry = ring_buffer_event_data(event);
@@ -5732,8 +5741,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 local_save_flags(irq_flags);
 size = sizeof(*entry) + cnt + 2; /* possible \n added */
 buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
- irq_flags, preempt_count());
+ event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ irq_flags, preempt_count());
 if (!event) {
 /* Ring buffer disabled, return as if not open for write */
 written = -EBADF;
@@ -5810,8 +5819,8 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
 local_save_flags(irq_flags);
 size = sizeof(*entry) + cnt;
 buffer = tr->trace_buffer.buffer;
- event = trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
- irq_flags, preempt_count());
+ event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
+ irq_flags, preempt_count());
 if (!event) {
 /* Ring buffer disabled, return as if not open for write */
 written = -EBADF;
-- cgit v1.2.3

From fa7ffb39efccd574163ebc5dbfe4ff066186f261 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 11:36:30 -0500
Subject: ring-buffer: Make rb_reserve_next_event() always inlined

The function rb_reserve_next_event() is called by two functions:
ring_buffer_lock_reserve() and ring_buffer_write().
This is in a very hot path of the tracing code, and it is best that these
calls are not real function calls. The two callers are basically wrappers
for rb_reserve_next_event(). Removing the function calls can save execution
time in the hot path of tracing.

Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org

Reported-by: Andi Kleen
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c143739b8d7..1f3580cee6cc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2733,7 +2733,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 return event;
 }
 
-static struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
 rb_reserve_next_event(struct ring_buffer *buffer,
 struct ring_buffer_per_cpu *cpu_buffer,
 unsigned long length)
-- cgit v1.2.3

From 929ddbf3ef4e07fef67e93e998020d49d2533724 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 11:40:34 -0500
Subject: ring-buffer: Always inline rb_event_data()

rb_event_data() is the fast path of getting the ring buffer data from an
event. Externally, ring_buffer_event_data() is used to access this
function. But unfortunately, rb_event_data() is not inlined, and calling
ring_buffer_event_data() causes that function to be called again. Force
rb_event_data() to be inlined to lower the number of operations needed when
calling ring_buffer_event_data().

Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org

Reported-by: Andi Kleen
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1f3580cee6cc..2760aaca6d1b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -245,7 +245,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 
 /* inline for ring buffer fast paths */
-static void *
+static __always_inline void *
 rb_event_data(struct ring_buffer_event *event)
 {
 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
-- cgit v1.2.3

From 4239174570da080f3623724d97062bf55de7e36b Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 15:52:45 -0500
Subject: tracing: Make tracepoint_printk a static_key

Currently, when tracepoint_printk is set (enabled by the "tp_printk" kernel
command line option), it causes trace events to print via printk(). This is
a very dangerous operation, but is useful for debugging.

The issue is, it's seldom used, but it is always checked even if it's not
enabled by the kernel command line. Instead of having this feature called
by a branch against a variable, turn that variable into a static key, and
this will remove the test and jump.

To simplify things, the functions output_printk() and
trace_event_buffer_commit() were moved from trace_events.c to trace.c.
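The resulting pattern, sketched from the hunks below: the hot path tests
the static key, which compiles down to a no-op until the key is enabled,
and only the slow paths (boot code and the sysctl handler) flip it:

	static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key);

	/* hot path: no test and jump until the key is enabled */
	if (static_key_false(&tracepoint_printk_key.key))
		output_printk(fbuffer);

	/* slow path: patch the branch in or out */
	if (tracepoint_printk)
		static_key_enable(&tracepoint_printk_key.key);
	else
		static_key_disable(&tracepoint_printk_key.key);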
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 +++ kernel/sysctl.c | 2 +- kernel/trace/trace.c | 78 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 40 ----------------------- 4 files changed, 83 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b3d34d3e0e7e..8700049fd0e5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -945,6 +945,10 @@ extern int __disable_trace_on_warning; #define INIT_TRACE_RECURSION .trace_recursion = 0, #endif +int tracepoint_printk_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } #endif /* CONFIG_TRACING */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..6ccc60dfbc7a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -634,7 +634,7 @@ static struct ctl_table kern_table[] = { .data = &tracepoint_printk, .maxlen = sizeof(tracepoint_printk), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = tracepoint_printk_sysctl, }, #endif #ifdef CONFIG_KEXEC_CORE diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 490533726b54..725e8b2c453f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -69,6 +69,7 @@ bool __read_mostly tracing_selftest_disabled; /* Pipe tracepoints to printk */ struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; +static DEFINE_STATIC_KEY_FALSE(tracepoint_printk_key); /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { @@ -2116,6 +2117,81 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); +static DEFINE_SPINLOCK(tracepoint_iter_lock); +static DEFINE_MUTEX(tracepoint_printk_mutex); + +static void output_printk(struct trace_event_buffer *fbuffer) +{ + struct trace_event_call *event_call; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + /* We should never get here if iter is NULL */ + if (WARN_ON_ONCE(!iter)) + return; + + event_call = fbuffer->trace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + event = &fbuffer->trace_file->event_call->event; + + spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + +int tracepoint_printk_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int save_tracepoint_printk; + int ret; + + mutex_lock(&tracepoint_printk_mutex); + save_tracepoint_printk = tracepoint_printk; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + /* + * This will force exiting early, as tracepoint_printk + * is always zero when tracepoint_printk_iter is not allocated + */ + if (!tracepoint_print_iter) + tracepoint_printk = 0; + + if (save_tracepoint_printk == tracepoint_printk) + goto out; + + if (tracepoint_printk) + static_key_enable(&tracepoint_printk_key.key); + else + static_key_disable(&tracepoint_printk_key.key); + + out: + mutex_unlock(&tracepoint_printk_mutex); + + return ret; +} + +void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) +{ + if 
(static_key_false(&tracepoint_printk_key.key)) + output_printk(fbuffer); + + event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_commit); + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -7977,6 +8053,8 @@ void __init trace_init(void) kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); if (WARN_ON(!tracepoint_print_iter)) tracepoint_printk = 0; + else + static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); trace_event_init(); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ba67ede48822..d35fc2b0d304 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -283,46 +283,6 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer, } EXPORT_SYMBOL_GPL(trace_event_buffer_reserve); -static DEFINE_SPINLOCK(tracepoint_iter_lock); - -static void output_printk(struct trace_event_buffer *fbuffer) -{ - struct trace_event_call *event_call; - struct trace_event *event; - unsigned long flags; - struct trace_iterator *iter = tracepoint_print_iter; - - if (!iter) - return; - - event_call = fbuffer->trace_file->event_call; - if (!event_call || !event_call->event.funcs || - !event_call->event.funcs->trace) - return; - - event = &fbuffer->trace_file->event_call->event; - - spin_lock_irqsave(&tracepoint_iter_lock, flags); - trace_seq_init(&iter->seq); - iter->ent = fbuffer->entry; - event_call->event.funcs->trace(iter, 0, event); - trace_seq_putc(&iter->seq, 0); - printk("%s", iter->seq.buffer); - - spin_unlock_irqrestore(&tracepoint_iter_lock, flags); -} - -void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) -{ - if (tracepoint_printk) - output_printk(fbuffer); - - event_trigger_unlock_commit(fbuffer->trace_file, fbuffer->buffer, - fbuffer->event, fbuffer->entry, - fbuffer->flags, fbuffer->pc); -} -EXPORT_SYMBOL_GPL(trace_event_buffer_commit); - int trace_event_reg(struct trace_event_call *call, enum trace_reg type, void *data) { -- cgit v1.2.3 From 52ffabe3848a1ebd944cdf7801a77247b1cb46d5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 23 Nov 2016 20:28:38 -0500 Subject: tracing: Make __buffer_unlock_commit() always_inline The function __buffer_unlock_commit() is called in a few places outside of trace.c. But for the most part, it should really be inlined, as it is in the hot path of the trace_events. For the callers outside of trace.c, create a new function trace_buffer_unlock_commit_nostack(), as the reason it was used was to avoid the stack tracing that trace_buffer_unlock_commit() could do. 
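A sketch of the new wrapper (taken from the hunks below): callers outside
trace.c get a real function that skips the stack dump, while callers inside
trace.c now inline __buffer_unlock_commit() directly:

	void
	trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
					   struct ring_buffer_event *event)
	{
		__buffer_unlock_commit(buffer, event);
	}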
Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org Reported-by: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 41 +++++++++++++++++++++++------------- kernel/trace/trace.h | 4 ++-- kernel/trace/trace_branch.c | 2 +- kernel/trace/trace_functions_graph.c | 4 ++-- kernel/trace/trace_hwlat.c | 2 +- 5 files changed, 32 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 725e8b2c453f..60416bf7c591 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -794,6 +794,22 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); + +static __always_inline void +__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) +{ + __this_cpu_write(trace_cmdline_save, true); + + /* If this is the temp buffer, we need to commit fully */ + if (this_cpu_read(trace_buffered_event) == event) { + /* Length is in event->array[0] */ + ring_buffer_write(buffer, event->array[0], &event->array[1]); + /* Release the temp buffer */ + this_cpu_dec(trace_buffered_event_cnt); + } else + ring_buffer_unlock_commit(buffer, event); +} + /** * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller @@ -2059,21 +2075,6 @@ void trace_buffered_event_disable(void) preempt_enable(); } -void -__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) -{ - __this_cpu_write(trace_cmdline_save, true); - - /* If this is the temp buffer, we need to commit fully */ - if (this_cpu_read(trace_buffered_event) == event) { - /* Length is in event->array[0] */ - ring_buffer_write(buffer, event->array[0], &event->array[1]); - /* Release the temp buffer */ - this_cpu_dec(trace_buffered_event_cnt); - } else - ring_buffer_unlock_commit(buffer, event); -} - static struct ring_buffer *temp_buffer; struct ring_buffer_event * @@ -2214,6 +2215,16 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, ftrace_trace_userstack(buffer, flags, pc); } +/* + * Similar to trace_buffer_unlock_commit_regs() but do not dump stack. 
+ */
+void
+trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ __buffer_unlock_commit(buffer, event);
+}
+
 static void
 trace_process_export(struct trace_export *export,
 struct ring_buffer_event *event)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9294f8606ade..37602e722336 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -602,8 +602,8 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 int *ent_cpu, u64 *ent_ts);
 
-void __buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event);
+void trace_buffer_unlock_commit_nostack(struct ring_buffer *buffer,
+ struct ring_buffer_event *event);
 
 int trace_empty(struct trace_iterator *iter);
 
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 3a2a73716a5b..75489de546b6 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -81,7 +81,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 entry->correct = val == expect;
 
 if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
 
 out:
 current->trace_recursion &= ~TRACE_BRANCH_BIT;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4e480e870474..8e1a115439fa 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -358,7 +358,7 @@ int __trace_graph_entry(struct trace_array *tr,
 entry = ring_buffer_event_data(event);
 entry->graph_ent = *trace;
 if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
 
 return 1;
 }
@@ -469,7 +469,7 @@ void __trace_graph_return(struct trace_array *tr,
 entry = ring_buffer_event_data(event);
 entry->ret = *trace;
 if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace)
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b97286c48735..775569ec50d0 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -127,7 +127,7 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
 entry->nmi_count = sample->nmi_count;
 
 if (!call_filter_check_discard(call, entry, buffer, event))
- __buffer_unlock_commit(buffer, event);
+ trace_buffer_unlock_commit_nostack(buffer, event);
 }
 
 /* Macros to encapsulate the time capturing infrastructure */
-- cgit v1.2.3

From 2289d5672f99d36764cbf66e13b5401f700e7043 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 20:35:32 -0500
Subject: ring-buffer: Force inline of hotpath helper functions

There are several small helper functions in ring_buffer.c that are used in
the hot path. For some reason, even though they are marked inline, gcc
tends not to enforce it. Make sure these functions are always inlined.
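For reference, plain "inline" is only a hint that gcc may ignore, while
__always_inline expands to the attribute that forces the decision (abridged
from include/linux/compiler-gcc.h):

	#define __always_inline	inline __attribute__((always_inline))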
Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org

Reported-by: Andi Kleen
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2760aaca6d1b..04068c0ed927 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1829,48 +1829,48 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
 
-static inline void *
+static __always_inline void *
 __rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
 {
 return bpage->data + index;
 }
 
-static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
+static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
 {
 return bpage->page->data + index;
 }
 
-static inline struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
 rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
 {
 return __rb_page_index(cpu_buffer->reader_page,
 cpu_buffer->reader_page->read);
 }
 
-static inline struct ring_buffer_event *
+static __always_inline struct ring_buffer_event *
 rb_iter_head_event(struct ring_buffer_iter *iter)
 {
 return __rb_page_index(iter->head_page, iter->head);
 }
 
-static inline unsigned rb_page_commit(struct buffer_page *bpage)
+static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
 {
 return local_read(&bpage->page->commit);
 }
 
 /* Size is determined by what has been committed */
-static inline unsigned rb_page_size(struct buffer_page *bpage)
+static __always_inline unsigned rb_page_size(struct buffer_page *bpage)
 {
 return rb_page_commit(bpage);
 }
 
-static inline unsigned
+static __always_inline unsigned
 rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
 {
 return rb_page_commit(cpu_buffer->commit_page);
 }
 
-static inline unsigned
+static __always_inline unsigned
 rb_event_index(struct ring_buffer_event *event)
 {
 unsigned long addr = (unsigned long)event;
-- cgit v1.2.3

From babe3fce95e6490b88a2a21d90eb4ba9884edb82 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Wed, 23 Nov 2016 20:38:39 -0500
Subject: ring-buffer: Force rb_update_write_stamp() to be inlined

The function rb_update_write_stamp() is in the hot path of the ring buffer
recording. Make sure that it is inlined as well. There are not many places
that call it.
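One way to spot-check the result (a hypothetical build-tree check, not part
of the patch): a static function that has been inlined at every call site
emits no symbol in the object file, so after this change

	$ nm kernel/trace/ring_buffer.o | grep rb_update_write_stamp

should typically print nothing.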
Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org Reported-by: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 04068c0ed927..04579a0c7711 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2486,7 +2486,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event) event->time_delta = 1; } -static inline bool +static __always_inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -2500,7 +2500,7 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_commit_index(cpu_buffer) == index; } -static void +static __always_inline void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { -- cgit v1.2.3 From 38e11df134297ea3860c7aad8263ece27db01308 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 23 Nov 2016 20:42:31 -0500 Subject: ring-buffer: Force rb_end_commit() and rb_set_commit_to_write() inline Both rb_end_commit() and rb_set_commit_to_write() are in the fast path of the ring buffer recording. Make sure they are always inlined. Link: http://lkml.kernel.org/r/20161121183700.GW26852@two.firstfloor.org Reported-by: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 04579a0c7711..f17476f9d7f4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2386,7 +2386,7 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) local_inc(&cpu_buffer->commits); } -static void +static __always_inline void rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long max_count; @@ -2441,7 +2441,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) goto again; } -static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; -- cgit v1.2.3 From 83929cce95251cc77e5659bf493bd424ae0e7a67 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 23 Nov 2016 11:33:37 +0100 Subject: sched/autogroup: Fix 64-bit kernel nice level adjustment Michael Kerrisk reported: > Regarding the previous paragraph... My tests indicate > that writing *any* value to the autogroup [nice priority level] > file causes the task group to get a lower priority. Because autogroup didn't call the then meaningless scale_load()... Autogroup nice level adjustment has been broken ever since load resolution was increased for 64-bit kernels. Use scale_load() to scale group weight. Michael Kerrisk tested this patch to fix the problem: > Applied and tested against 4.9-rc6 on an Intel u7 (4 cores). > Test setup: > > Terminal window 1: running 40 CPU burner jobs > Terminal window 2: running 40 CPU burner jobs > Terminal window 1: running 1 CPU burner job > > Demonstrated that: > * Writing "0" to the autogroup file for TW1 now causes no change > to the rate at which the process on the terminal consume CPU. > * Writing -20 to the autogroup file for TW1 caused those processes > to get the lion's share of CPU while TW2 TW3 get a tiny amount. 
> * Writing -20 to the autogroup files for TW1 and TW3 allowed the > process on TW3 to get as much CPU as it was getting as when > the autogroup nice values for both terminals were 0. Reported-by: Michael Kerrisk Tested-by: Michael Kerrisk Signed-off-by: Mike Galbraith Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-man Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1479897217.4306.6.camel@gmx.de Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index f1c8fd566246..da39489d2d80 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -212,6 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; + unsigned long shares; int err; if (nice < MIN_NICE || nice > MAX_NICE) @@ -230,9 +231,10 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) next = HZ / 10 + jiffies; ag = autogroup_task_get(p); + shares = scale_load(sched_prio_to_weight[nice + 20]); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, shares); if (!err) ag->nice = nice; up_write(&ag->lock); -- cgit v1.2.3 From afe06efdf07c12fd9370d5cce5383398cedf6c90 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 22 Nov 2016 12:23:53 -0800 Subject: sched: Extend scheduler's asym packing We generalize the scheduler's asym packing to provide an ordering of the cpu beyond just the cpu number. This allows the use of the ASYM_PACKING scheduler machinery to move loads to preferred CPU in a sched domain. The preference is defined with the cpu priority given by arch_asym_cpu_priority(cpu). We also record the most preferred cpu in a sched group when we build the cpu's capacity for fast lookup of preferred cpu during load balancing. 
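A worked sketch of the new ordering (the helpers are taken from the hunks
below; the override at the end is hypothetical): sched_asym_prefer(a, b) is
true when CPU a has the higher arch priority, and the weak default of -cpu
preserves the old lowest-number-wins behavior, e.g. sched_asym_prefer(0, 2)
compares 0 > -2 and is true:

	static inline bool sched_asym_prefer(int a, int b)
	{
		return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
	}

	/* default: lower-numbered CPUs keep higher priority */
	int __weak arch_asym_cpu_priority(int cpu)
	{
		return -cpu;
	}

	/* hypothetical arch override, e.g. fed from firmware perf ranks */
	int arch_asym_cpu_priority(int cpu)
	{
		return per_cpu(cpu_perf_rank, cpu);	/* assumed per-cpu data */
	}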
Co-developed-by: Peter Zijlstra (Intel) Signed-off-by: Tim Chen Acked-by: Peter Zijlstra (Intel) Cc: linux-pm@vger.kernel.org Cc: jolsa@redhat.com Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: Srinivas Pandruvada Cc: bp@suse.de Link: http://lkml.kernel.org/r/0e73ae12737dfaafa46c07066cc7c5d3f1675e46.1479844244.git.tim.c.chen@linux.intel.com Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 15 +++++++++++++++ kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++++++++----------------- kernel/sched/sched.h | 6 ++++++ 4 files changed, 59 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 19abba04ceca..fe9a499d5aa4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1077,6 +1077,8 @@ static inline int cpu_numa_flags(void) } #endif +extern int arch_asym_cpu_priority(int cpu); + struct sched_domain_attr { int relax_domain_level; }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dc64bd71ed2b..393759bd526e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6303,7 +6303,22 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) WARN_ON(!sg); do { + int cpu, max_cpu = -1; + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + + if (!(sd->flags & SD_ASYM_PACKING)) + goto next; + + for_each_cpu(cpu, sched_group_cpus(sg)) { + if (max_cpu < 0) + max_cpu = cpu; + else if (sched_asym_prefer(cpu, max_cpu)) + max_cpu = cpu; + } + sg->asym_prefer_cpu = max_cpu; + +next: sg = sg->next; } while (sg != sd->groups); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa475896782d..18d9e75f1f6e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -97,6 +97,16 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SMP +/* + * For asym packing, by default the lower numbered cpu has higher priority. + */ +int __weak arch_asym_cpu_priority(int cpu) +{ + return -cpu; +} +#endif + #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool @@ -7388,16 +7398,18 @@ asym_packing: if (env->idle == CPU_NOT_IDLE) return true; /* - * ASYM_PACKING needs to move all the work to the lowest - * numbered CPUs in the group, therefore mark all groups - * higher than ourself as busy. + * ASYM_PACKING needs to move all the work to the highest + * prority CPUs in the group, therefore mark all groups + * of lower priority than ourself as busy. 
*/ - if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && + sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { if (!sds->busiest) return true; - /* Prefer to move from highest possible cpu's work */ - if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) + /* Prefer to move from lowest priority cpu's work */ + if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, + sg->asym_prefer_cpu)) return true; } @@ -7549,8 +7561,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (!sds->busiest) return 0; - busiest_cpu = group_first_cpu(sds->busiest); - if (env->dst_cpu > busiest_cpu) + busiest_cpu = sds->busiest->asym_prefer_cpu; + if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) return 0; env->imbalance = DIV_ROUND_CLOSEST( @@ -7888,10 +7900,11 @@ static int need_active_balance(struct lb_env *env) /* * ASYM_PACKING needs to force migrate tasks from busy but - * higher numbered CPUs in order to pack all tasks in the - * lowest numbered CPUs. + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. */ - if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) + if ((sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(env->dst_cpu, env->src_cpu)) return 1; } @@ -8740,7 +8753,7 @@ static inline bool nohz_kick_needed(struct rq *rq) unsigned long now = jiffies; struct sched_domain_shared *sds; struct sched_domain *sd; - int nr_busy, cpu = rq->cpu; + int nr_busy, i, cpu = rq->cpu; bool kick = false; if (unlikely(rq->idle_balance)) @@ -8791,12 +8804,18 @@ static inline bool nohz_kick_needed(struct rq *rq) } sd = rcu_dereference(per_cpu(sd_asym, cpu)); - if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) { - kick = true; - goto unlock; - } + if (sd) { + for_each_cpu(i, sched_domain_span(sd)) { + if (i == cpu || + !cpumask_test_cpu(i, nohz.idle_cpus_mask)) + continue; + if (sched_asym_prefer(i, cpu)) { + kick = true; + goto unlock; + } + } + } unlock: rcu_read_unlock(); return kick; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d7e39317d688..7b34c7826ca5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -540,6 +540,11 @@ struct dl_rq { #ifdef CONFIG_SMP +static inline bool sched_asym_prefer(int a, int b) +{ + return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); +} + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -908,6 +913,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* cpu of highest priority in group */ /* * The CPUs this group covers. -- cgit v1.2.3 From d06e622d3d9206e6a2cc45a0f9a3256da8773ff4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 24 Nov 2016 13:51:11 +0530 Subject: cpufreq: schedutil: Rectify comment in sugov_irq_work() function This patch rectifies a comment present in sugov_irq_work() function to follow proper grammar. Suggested-by: Ingo Molnar Signed-off-by: Viresh Kumar Acked-by: Ingo Molnar Signed-off-by: Rafael J. 
Wysocki
---
 kernel/sched/cpufreq_schedutil.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 42a220e78f00..fd4659313640 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -315,15 +315,15 @@ static void sugov_irq_work(struct irq_work *irq_work)
 sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
 
 /*
- * For Real Time and Deadline tasks, schedutil governor shoots the
- * frequency to maximum. And special care must be taken to ensure that
- * this kthread doesn't result in that.
+ * For RT and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't result in the same behavior.
 *
 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
- * updated only at the end of the sugov_work() and before that schedutil
- * rejects all other frequency scaling requests.
+ * updated only at the end of the sugov_work() function and before that
+ * the schedutil governor rejects all other frequency scaling requests.
 *
- * Though there is a very rare case where the RT thread yields right
+ * There is a very rare case though, where the RT thread yields right
 * after the work_in_progress flag is cleared. The effects of that are
 * neglected for now.
 */
-- cgit v1.2.3

From 3007098494bec614fb55dee7bc0410bb7db5ad18 Mon Sep 17 00:00:00 2001
From: Daniel Mack
Date: Wed, 23 Nov 2016 16:52:26 +0100
Subject: cgroup: add support for eBPF programs

This patch adds two sets of eBPF program pointers to struct cgroup: one set
for programs that are directly pinned to a cgroup, and one for those that
are effective for it.

To illustrate the logic behind that, assume the following example cgroup
hierarchy.

  A - B - C
        \ D - E

If only B has a program attached, it will be effective for B, C, D and E.
If D then attaches a program itself, that will be effective for both D and
E, and the program in B will only affect B and C. Only one program of a
given type is effective for a cgroup.

Attaching and detaching programs will be done through the bpf(2) syscall.
For now, ingress and egress inet socket filtering are the only supported
use-cases.

Signed-off-by: Daniel Mack
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 include/linux/bpf-cgroup.h | 79 +++++++++++++++++++++
 include/linux/cgroup-defs.h | 4 ++
 init/Kconfig | 12 ++++
 kernel/bpf/Makefile | 1 +
 kernel/bpf/cgroup.c | 167 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup.c | 18 +++++
 6 files changed, 281 insertions(+)
 create mode 100644 include/linux/bpf-cgroup.h
 create mode 100644 kernel/bpf/cgroup.c

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
new file mode 100644
index 000000000000..ec80d0c0953e
--- /dev/null
+++ b/include/linux/bpf-cgroup.h
@@ -0,0 +1,79 @@
+#ifndef _BPF_CGROUP_H
+#define _BPF_CGROUP_H
+
+#include
+#include
+#include
+
+struct sock;
+struct cgroup;
+struct sk_buff;
+
+#ifdef CONFIG_CGROUP_BPF
+
+extern struct static_key_false cgroup_bpf_enabled_key;
+#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+
+struct cgroup_bpf {
+ /*
+ * Store two sets of bpf_prog pointers, one for programs that are
+ * pinned directly to this cgroup, and one for those that are effective
+ * when this cgroup is accessed.
+ */ + struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE]; + struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE]; +}; + +void cgroup_bpf_put(struct cgroup *cgrp); +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent); + +void __cgroup_bpf_update(struct cgroup *cgrp, + struct cgroup *parent, + struct bpf_prog *prog, + enum bpf_attach_type type); + +/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */ +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type); + +int __cgroup_bpf_run_filter(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type); + +/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */ +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter(sk, skb, \ + BPF_CGROUP_INET_INGRESS); \ + \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + typeof(sk) __sk = sk_to_full_sk(sk); \ + if (sk_fullsock(__sk)) \ + __ret = __cgroup_bpf_run_filter(__sk, skb, \ + BPF_CGROUP_INET_EGRESS); \ + } \ + __ret; \ +}) + +#else + +struct cgroup_bpf {}; +static inline void cgroup_bpf_put(struct cgroup *cgrp) {} +static inline void cgroup_bpf_inherit(struct cgroup *cgrp, + struct cgroup *parent) {} + +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) + +#endif /* CONFIG_CGROUP_BPF */ + +#endif /* _BPF_CGROUP_H */ diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5b17de62c962..861b4677fc5b 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -300,6 +301,9 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work; + /* used to store eBPF programs */ + struct cgroup_bpf bpf; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/init/Kconfig b/init/Kconfig index 34407f15e6d3..405120b5f13e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1154,6 +1154,18 @@ config CGROUP_PERF Say N if unsure. +config CGROUP_BPF + bool "Support for eBPF programs attached to cgroups" + depends on BPF_SYSCALL && SOCK_CGROUP_DATA + help + Allow attaching eBPF programs to a cgroup using the bpf(2) + syscall command BPF_PROG_ATTACH. + + In which context these programs are accessed depends on the type + of attachment. For instance, programs that are attached using + BPF_CGROUP_INET_INGRESS will be executed on the ingress path of + inet sockets. + config CGROUP_DEBUG bool "Example controller" default n diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index c4d89d6e2058..1276474ac3cd 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif +obj-$(CONFIG_CGROUP_BPF) += cgroup.o diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c new file mode 100644 index 000000000000..a0ab43f264b0 --- /dev/null +++ b/kernel/bpf/cgroup.c @@ -0,0 +1,167 @@ +/* + * Functions to manage eBPF programs attached to cgroups + * + * Copyright (c) 2016 Daniel Mack + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. 
See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
+ struct bpf_prog *prog = cgrp->bpf.prog[type];
+
+ if (prog) {
+ bpf_prog_put(prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ }
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
+ struct bpf_prog *e;
+
+ e = rcu_dereference_protected(parent->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ rcu_assign_pointer(cgrp->bpf.effective[type], e);
+ }
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin
+ * @type: Type of pinning operation (ingress/egress)
+ *
+ * Each cgroup has a set of two pointers for bpf programs; one for eBPF
+ * programs it owns, and which is effective for execution.
+ *
+ * If @prog is not %NULL, this function attaches a new program to the cgroup and
+ * releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with own programs are left
+ * untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+void __cgroup_bpf_update(struct cgroup *cgrp,
+ struct cgroup *parent,
+ struct bpf_prog *prog,
+ enum bpf_attach_type type)
+{
+ struct bpf_prog *old_prog, *effective;
+ struct cgroup_subsys_state *pos;
+
+ old_prog = xchg(cgrp->bpf.prog + type, prog);
+
+ effective = (!prog && parent) ?
+ rcu_dereference_protected(parent->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)) : + prog; + + css_for_each_descendant_pre(pos, &cgrp->self) { + struct cgroup *desc = container_of(pos, struct cgroup, self); + + /* skip the subtree if the descendant has its own program */ + if (desc->bpf.prog[type] && desc != cgrp) + pos = css_rightmost_descendant(pos); + else + rcu_assign_pointer(desc->bpf.effective[type], + effective); + } + + if (prog) + static_branch_inc(&cgroup_bpf_enabled_key); + + if (old_prog) { + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + } +} + +/** + * __cgroup_bpf_run_filter() - Run a program for packet filtering + * @sk: The socken sending or receiving traffic + * @skb: The skb that is being sent or received + * @type: The type of program to be exectuted + * + * If no socket is passed, or the socket is not of type INET or INET6, + * this function does nothing and returns 0. + * + * The program type passed in via @type must be suitable for network + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if any if an attached program was found + * and if it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type) +{ + struct bpf_prog *prog; + struct cgroup *cgrp; + int ret = 0; + + if (!sk || !sk_fullsock(sk)) + return 0; + + if (sk->sk_family != AF_INET && + sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) { + unsigned int offset = skb->data - skb_network_header(skb); + + __skb_push(skb, offset); + ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; + __skb_pull(skb, offset); + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 85bc9beb046d..2ee9ec3051b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + + cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); + if (parent) + cgroup_bpf_inherit(cgrp, parent); + cgroup_propagate_control(cgrp); /* @cgrp doesn't have dir yet so the following will only create csses */ @@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void) } subsys_initcall(cgroup_namespaces_init); +#ifdef CONFIG_CGROUP_BPF +void cgroup_bpf_update(struct cgroup *cgrp, + struct bpf_prog *prog, + enum bpf_attach_type type) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + mutex_lock(&cgroup_mutex); + __cgroup_bpf_update(cgrp, parent, prog, type); + mutex_unlock(&cgroup_mutex); +} +#endif /* CONFIG_CGROUP_BPF */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit v1.2.3 From f4324551489e8781d838f941b7aee4208e52e8bf Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Wed, 23 Nov 2016 16:52:27 +0100 Subject: bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands Extend the bpf(2) syscall by two new commands, BPF_PROG_ATTACH and BPF_PROG_DETACH which allow attaching and detaching eBPF programs to a target. 
On the API level, the target could be anything that has an fd in userspace,
hence the field in union bpf_attr is called 'target_fd'.

When called with BPF_ATTACH_TYPE_CGROUP_INET_{E,IN}GRESS, the target is
expected to be a valid file descriptor of a cgroup v2 directory which has
the bpf controller enabled. These are the only use-cases implemented by
this patch at this point, but more can be added.

If a program of the given type already exists in the given cgroup, the
program is swapped atomically, so userspace does not have to drop an
existing program before installing a new one, which would otherwise leave a
gap in which no program is attached.

For more information on the propagation logic to subcgroups, please refer
to the bpf cgroup controller implementation.

The API is guarded by CAP_NET_ADMIN.

Signed-off-by: Daniel Mack
Acked-by: Alexei Starovoitov
Signed-off-by: David S. Miller
---
 include/uapi/linux/bpf.h | 8 +++++
 kernel/bpf/syscall.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5ae679fac993..1370a9d1456f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,8 @@ enum bpf_cmd {
 BPF_PROG_LOAD,
 BPF_OBJ_PIN,
 BPF_OBJ_GET,
+ BPF_PROG_ATTACH,
+ BPF_PROG_DETACH,
 };
 
 enum bpf_map_type {
@@ -159,6 +161,12 @@ union bpf_attr {
 __aligned_u64 pathname;
 __u32 bpf_fd;
 };
+
+ struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+ __u32 target_fd; /* container object to attach to */
+ __u32 attach_bpf_fd; /* eBPF program to attach */
+ __u32 attach_type;
+ };
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index eb15498b8d55..1090d16a31c1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -835,6 +835,77 @@ static int bpf_obj_get(const union bpf_attr *attr)
 return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
 }
 
+#ifdef CONFIG_CGROUP_BPF
+
+#define BPF_PROG_ATTACH_LAST_FIELD attach_type
+
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_ATTACH))
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ prog = bpf_prog_get_type(attr->attach_bpf_fd,
+ BPF_PROG_TYPE_CGROUP_SKB);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp)) {
+ bpf_prog_put(prog);
+ return PTR_ERR(cgrp);
+ }
+
+ cgroup_bpf_update(cgrp, prog, attr->attach_type);
+ cgroup_put(cgrp);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define BPF_PROG_DETACH_LAST_FIELD attach_type
+
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+ struct cgroup *cgrp;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_DETACH))
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ cgroup_bpf_update(cgrp, NULL, attr->attach_type);
+ cgroup_put(cgrp);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_CGROUP_BPF */
+
 SYSCALL_DEFINE3(bpf, int, cmd, union
bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; + +#ifdef CONFIG_CGROUP_BPF + case BPF_PROG_ATTACH: + err = bpf_prog_attach(&attr); + break; + case BPF_PROG_DETACH: + err = bpf_prog_detach(&attr); + break; +#endif + default: err = -EINVAL; break; -- cgit v1.2.3 From 7fd8329ba502ef76dd91db561c7aed696b2c7720 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 21 Sep 2016 13:47:22 +0200 Subject: taint/module: Clean up global and module taint flags handling The commit 66cc69e34e86a231 ("Fix: module signature vs tracepoints: add new TAINT_UNSIGNED_MODULE") updated module_taint_flags() to potentially print one more character. But it did not increase the size of the corresponding buffers in m_show() and print_modules(). We recently made the same mistake when adding a taint flag for livepatching, see https://lkml.kernel.org/r/cfba2c823bb984690b73572aaae1db596b54a082.1472137475.git.jpoimboe@redhat.com Also, struct module uses an incompatible type for the mod->taints flags. It survived from the commit 2bc2d61a9638dab670d ("[PATCH] list module taint flags in Oops/panic"). At that time, "int" was used for the global taint flags. But only the global taint flags were later changed to "unsigned long" by the commit 25ddbb18aae33ad2 ("Make the taint flags reliable"). This patch defines TAINT_FLAGS_COUNT that can be used to create arrays and buffers of the right size. Note that we could not use an enum because the taint flag indexes are also used in assembly code. Then it reworks the table that describes the taint flags. The TAINT_* numbers can now be used as the index. Instead of the bit number, each entry records whether the taint flag is also shown per-module. Finally, it uses "unsigned long", bit operations, and the updated taint_flags table also for mod->taints. It is not optimal because only a few taint flags can be printed by module_taint_flags(). But it is better to be on the safe side. IMHO, it is not worth the optimization and this is a good compromise.
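To see the resulting shape in one place, here is a standalone sketch (ordinary userspace C with assumed names and a shortened table; the kernel struct uses members called true/false, which stdbool.h reserves, so they are renamed here):

	#include <stdbool.h>
	#include <stdio.h>

	#define TAINT_FLAGS_COUNT 3

	struct taint_flag { char c_true, c_false; bool module; };

	static const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
		{ 'P', 'G', true  },	/* index 0: set by proprietary modules */
		{ 'W', ' ', false },	/* index 1: kernel-wide only */
		{ 'O', ' ', true  },	/* index 2: set by out-of-tree modules */
	};

	/* One loop serves both the global and the per-module output. */
	static void show_taint(unsigned long mask, bool module_only)
	{
		int i;

		for (i = 0; i < TAINT_FLAGS_COUNT; i++) {
			if (module_only && !taint_flags[i].module)
				continue;
			putchar(mask & (1UL << i) ? taint_flags[i].c_true
						  : taint_flags[i].c_false);
		}
		putchar('\n');
	}

Calling show_taint(mask, false) models print_tainted(), while show_taint(mask, true) models module_flags_taint() in the diff below.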
Signed-off-by: Petr Mladek Link: http://lkml.kernel.org/r/1474458442-21581-1-git-send-email-pmladek@suse.com [jeyu@redhat.com: fix broken lkml link in changelog] Signed-off-by: Jessica Yu --- include/linux/kernel.h | 9 +++++++++ include/linux/module.h | 2 +- kernel/module.c | 33 +++++++++++++------------------ kernel/panic.c | 53 ++++++++++++++++++++++++-------------------------- 4 files changed, 48 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index bc6ed52a39b9..441def77246d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -506,6 +506,15 @@ extern enum system_states { #define TAINT_UNSIGNED_MODULE 13 #define TAINT_SOFTLOCKUP 14 #define TAINT_LIVEPATCH 15 +#define TAINT_FLAGS_COUNT 16 + +struct taint_flag { + char true; /* character printed when tainted */ + char false; /* character printed when not tainted */ + bool module; /* also show as a per-module taint flag */ +}; + +extern const struct taint_flag taint_flags[TAINT_FLAGS_COUNT]; extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] diff --git a/include/linux/module.h b/include/linux/module.h index 0c3207d26ac0..f6ee569c62bb 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -399,7 +399,7 @@ struct module { /* Arch-specific module values */ struct mod_arch_specific arch; - unsigned int taints; /* same bits as kernel:tainted */ + unsigned long taints; /* same bits as kernel:taint_flags */ #ifdef CONFIG_GENERIC_BUG /* Support for BUG */ diff --git a/kernel/module.c b/kernel/module.c index f57dd63186e6..a4acd8f403ae 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -330,7 +330,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag, enum lockdep_ok lockdep_ok) { add_taint(flag, lockdep_ok); - mod->taints |= (1U << flag); + set_bit(flag, &mod->taints); } /* @@ -1138,24 +1138,13 @@ static inline int module_unload_init(struct module *mod) static size_t module_flags_taint(struct module *mod, char *buf) { size_t l = 0; + int i; + + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + if (taint_flags[i].module && test_bit(i, &mod->taints)) + buf[l++] = taint_flags[i].true; + } - if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) - buf[l++] = 'P'; - if (mod->taints & (1 << TAINT_OOT_MODULE)) - buf[l++] = 'O'; - if (mod->taints & (1 << TAINT_FORCED_MODULE)) - buf[l++] = 'F'; - if (mod->taints & (1 << TAINT_CRAP)) - buf[l++] = 'C'; - if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) - buf[l++] = 'E'; - if (mod->taints & (1 << TAINT_LIVEPATCH)) - buf[l++] = 'K'; - /* - * TAINT_FORCED_RMMOD: could be added. - * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't - * apply to modules. - */ return l; } @@ -4041,6 +4030,10 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, } #endif /* CONFIG_KALLSYMS */ +/* Maximum number of characters written by module_flags() */ +#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4) + +/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ static char *module_flags(struct module *mod, char *buf) { int bx = 0; @@ -4085,7 +4078,7 @@ static void m_stop(struct seq_file *m, void *p) static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); - char buf[8]; + char buf[MODULE_FLAGS_BUF_SIZE]; /* We always ignore unformed modules. 
*/ if (mod->state == MODULE_STATE_UNFORMED) @@ -4256,7 +4249,7 @@ EXPORT_SYMBOL_GPL(__module_text_address); void print_modules(void) { struct module *mod; - char buf[8]; + char buf[MODULE_FLAGS_BUF_SIZE]; printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ diff --git a/kernel/panic.c b/kernel/panic.c index e6480e20379e..c51edaa04fce 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -298,30 +298,27 @@ void panic(const char *fmt, ...) EXPORT_SYMBOL(panic); - -struct tnt { - u8 bit; - char true; - char false; -}; - -static const struct tnt tnts[] = { - { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, - { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' }, - { TAINT_FORCED_RMMOD, 'R', ' ' }, - { TAINT_MACHINE_CHECK, 'M', ' ' }, - { TAINT_BAD_PAGE, 'B', ' ' }, - { TAINT_USER, 'U', ' ' }, - { TAINT_DIE, 'D', ' ' }, - { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, - { TAINT_WARN, 'W', ' ' }, - { TAINT_CRAP, 'C', ' ' }, - { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, - { TAINT_OOT_MODULE, 'O', ' ' }, - { TAINT_UNSIGNED_MODULE, 'E', ' ' }, - { TAINT_SOFTLOCKUP, 'L', ' ' }, - { TAINT_LIVEPATCH, 'K', ' ' }, +/* + * TAINT_FORCED_RMMOD could be a per-module flag but the module + * is being removed anyway. + */ +const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { + { 'P', 'G', true }, /* TAINT_PROPRIETARY_MODULE */ + { 'F', ' ', true }, /* TAINT_FORCED_MODULE */ + { 'S', ' ', false }, /* TAINT_CPU_OUT_OF_SPEC */ + { 'R', ' ', false }, /* TAINT_FORCED_RMMOD */ + { 'M', ' ', false }, /* TAINT_MACHINE_CHECK */ + { 'B', ' ', false }, /* TAINT_BAD_PAGE */ + { 'U', ' ', false }, /* TAINT_USER */ + { 'D', ' ', false }, /* TAINT_DIE */ + { 'A', ' ', false }, /* TAINT_OVERRIDDEN_ACPI_TABLE */ + { 'W', ' ', false }, /* TAINT_WARN */ + { 'C', ' ', true }, /* TAINT_CRAP */ + { 'I', ' ', false }, /* TAINT_FIRMWARE_WORKAROUND */ + { 'O', ' ', true }, /* TAINT_OOT_MODULE */ + { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ + { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ + { 'K', ' ', true }, /* TAINT_LIVEPATCH */ }; /** @@ -348,16 +345,16 @@ static const struct tnt tnts[] = { */ const char *print_tainted(void) { - static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")]; + static char buf[TAINT_FLAGS_COUNT + sizeof("Tainted: ")]; if (tainted_mask) { char *s; int i; s = buf + sprintf(buf, "Tainted: "); - for (i = 0; i < ARRAY_SIZE(tnts); i++) { - const struct tnt *t = &tnts[i]; - *s++ = test_bit(t->bit, &tainted_mask) ? + for (i = 0; i < TAINT_FLAGS_COUNT; i++) { + const struct taint_flag *t = &taint_flags[i]; + *s++ = test_bit(i, &tainted_mask) ? t->true : t->false; } *s = 0; -- cgit v1.2.3 From 885a78d4a5b3ad2d7c41d1819b001d7957f442cd Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Thu, 20 Oct 2016 17:18:12 +0100 Subject: module: Ensure a module's state is set accordingly during module coming cleanup code In load_module(), in the event of an error (e.g. unknown module parameters), we go on to perform some module-coming cleanup operations. At this point the module is still in a "formed" state even though it is actually going away. This patch updates the module's state so that anyone on the module_notify_list waiting for a module-going-away notification will be notified accordingly.
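As a sketch of the kind of listener this helps (a hypothetical module using the standard notifier API), a callback can now trust mod->state during the failed-load cleanup notification:

	#include <linux/module.h>
	#include <linux/notifier.h>

	static int demo_module_event(struct notifier_block *nb,
				     unsigned long action, void *data)
	{
		struct module *mod = data;

		/* With this patch, action and mod->state agree here. */
		if (action == MODULE_STATE_GOING)
			pr_info("%s: going away (state=%d)\n",
				mod->name, mod->state);

		return NOTIFY_OK;
	}

	static struct notifier_block demo_module_nb = {
		.notifier_call = demo_module_event,
	};

	/* register_module_notifier(&demo_module_nb); would run at init time. */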
Signed-off-by: Aaron Tomlin Acked-by: Rusty Russell Reviewed-by: Miroslav Benes Link: http://lkml.kernel.org/r/1476980293-19062-2-git-send-email-atomlin@redhat.com Signed-off-by: Jessica Yu --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a4acd8f403ae..f082832ad3ad 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3697,6 +3697,7 @@ static int load_module(struct load_info *info, const char __user *uargs, sysfs_cleanup: mod_sysfs_teardown(mod); coming_cleanup: + mod->state = MODULE_STATE_GOING; blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); -- cgit v1.2.3 From 905dd707fc856bae57de144dcf873583881f9489 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Thu, 27 Oct 2016 10:36:06 +0100 Subject: module: When modifying a module's text ignore modules which are going away too By default, during the access permission modification of a module's core and init pages, we only ignore modules that are malformed. However, for a module which is going away, it does not make sense to change its text to RO since the module should be RW before deallocation. This patch makes set_all_modules_text_ro() skip modules which are going away too. Signed-off-by: Aaron Tomlin Acked-by: Rusty Russell Link: http://lkml.kernel.org/r/1477560966-781-1-git-send-email-atomlin@redhat.com [jeyu@redhat.com: add comment as suggested by Steven Rostedt] Signed-off-by: Jessica Yu --- kernel/module.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f082832ad3ad..927a67e30855 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1958,7 +1958,13 @@ void set_all_modules_text_ro(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { - if (mod->state == MODULE_STATE_UNFORMED) + /* + * Ignore going modules since it's possible that ro + * protection has already been disabled, otherwise we'll + * run into protection faults at module deallocation. + */ + if (mod->state == MODULE_STATE_UNFORMED || + mod->state == MODULE_STATE_GOING) continue; frob_text(&mod->core_layout, set_memory_ro); -- cgit v1.2.3 From 71d9f5079358c148e71eba930e436a7a0cb35d95 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 16 Nov 2016 16:45:48 +0100 Subject: module: Fix a comment above strong_try_module_get() The comment above the strong_try_module_get() function is no longer true. Its return values changed with commit c9a3ba55bb5d ("module: wait for dependent modules doing init."). Signed-off-by: Miroslav Benes Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1611161635330.12580@pobox.suse.cz [jeyu@redhat.com: style fixes to make checkpatch.pl happy] Signed-off-by: Jessica Yu --- kernel/module.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 927a67e30855..6281c70683d3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -313,8 +313,11 @@ struct load_info { } index; }; -/* We require a truly strong try_module_get(): 0 means failure due to ongoing or failed initialization etc. */ +/* + * We require a truly strong try_module_get(): 0 means success. + * Otherwise an error is returned due to ongoing or failed + * initialization etc.
+ */ static inline int strong_try_module_get(struct module *mod) { BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); -- cgit v1.2.3 From 39290b389ea2654f9190e3b48c57d27b24def83e Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 14 Nov 2016 15:15:05 +0900 Subject: module: extend 'rodata=off' boot cmdline parameter to module mappings The current "rodata=off" parameter disables read-only kernel mappings under CONFIG_DEBUG_RODATA: commit d2aa1acad22f ("mm/init: Add 'rodata=off' boot cmdline parameter to disable read-only kernel mappings") This patch is a logical extension to module mappings, i.e. read-only mappings at module loading can be disabled even if CONFIG_DEBUG_SET_MODULE_RONX is enabled (mainly for debug use). Please note, however, that it only affects RO/RW permissions, keeping NX set. This is the first step to make CONFIG_DEBUG_SET_MODULE_RONX mandatory (always-on) in the future, as CONFIG_DEBUG_RODATA already is on x86 and arm64. Suggested-by: and Acked-by: Mark Rutland Signed-off-by: AKASHI Takahiro Reviewed-by: Kees Cook Acked-by: Rusty Russell Link: http://lkml.kernel.org/r/20161114061505.15238-1-takahiro.akashi@linaro.org Signed-off-by: Jessica Yu --- include/linux/init.h | 3 +++ init/main.c | 7 +++++-- kernel/module.c | 20 +++++++++++++++++--- 3 files changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/init.h b/include/linux/init.h index e30104ceb86d..885c3e6d0f9d 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -126,6 +126,9 @@ void prepare_namespace(void); void __init load_default_modules(void); int __init init_rootfs(void); +#if defined(CONFIG_DEBUG_RODATA) || defined(CONFIG_DEBUG_SET_MODULE_RONX) +extern bool rodata_enabled; +#endif #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); #endif diff --git a/init/main.c b/init/main.c index 2858be732f6d..959a24242988 100644 --- a/init/main.c +++ b/init/main.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include @@ -914,14 +915,16 @@ static int try_to_run_init_process(const char *init_filename) static noinline void __init kernel_init_freeable(void); -#ifdef CONFIG_DEBUG_RODATA -static bool rodata_enabled = true; +#if defined(CONFIG_DEBUG_RODATA) || defined(CONFIG_DEBUG_SET_MODULE_RONX) +bool rodata_enabled __ro_after_init = true; static int __init set_debug_rodata(char *str) { return strtobool(str, &rodata_enabled); } __setup("rodata=", set_debug_rodata); +#endif +#ifdef CONFIG_DEBUG_RODATA static void mark_readonly(void) { if (rodata_enabled) diff --git a/kernel/module.c b/kernel/module.c index 6281c70683d3..039ce82803f7 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1902,6 +1902,9 @@ static void frob_writable_data(const struct module_layout *layout, /* livepatching wants to disable read-only so it can frob module.
*/ void module_disable_ro(const struct module *mod) { + if (!rodata_enabled) + return; + frob_text(&mod->core_layout, set_memory_rw); frob_rodata(&mod->core_layout, set_memory_rw); frob_ro_after_init(&mod->core_layout, set_memory_rw); @@ -1911,6 +1914,9 @@ void module_disable_ro(const struct module *mod) void module_enable_ro(const struct module *mod, bool after_init) { + if (!rodata_enabled) + return; + frob_text(&mod->core_layout, set_memory_ro); frob_rodata(&mod->core_layout, set_memory_ro); frob_text(&mod->init_layout, set_memory_ro); @@ -1943,6 +1949,9 @@ void set_all_modules_text_rw(void) { struct module *mod; + if (!rodata_enabled) + return; + mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) @@ -1959,6 +1968,9 @@ void set_all_modules_text_ro(void) { struct module *mod; + if (!rodata_enabled) + return; + mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { /* @@ -1978,10 +1990,12 @@ void set_all_modules_text_ro(void) static void disable_ro_nx(const struct module_layout *layout) { - frob_text(layout, set_memory_rw); - frob_rodata(layout, set_memory_rw); + if (rodata_enabled) { + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_ro_after_init(layout, set_memory_rw); + } frob_rodata(layout, set_memory_x); - frob_ro_after_init(layout, set_memory_rw); frob_ro_after_init(layout, set_memory_x); frob_writable_data(layout, set_memory_x); } -- cgit v1.2.3 From 88575199cc65de99a156888629a68180c830eff2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 26 Nov 2016 01:28:04 +0100 Subject: bpf: drop unnecessary context cast from BPF_PROG_RUN bpf_func has long since ceased to take only struct sk_buff * as input. Make the context generic as void *, so that callers don't need to cast it each time they call BPF_PROG_RUN(). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- include/linux/filter.h | 6 +++--- kernel/events/core.c | 2 +- kernel/seccomp.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index eb3715700c95..876ab3a92ad5 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1518,7 +1518,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, unsigned int len) xdp.data = data; xdp.data_end = data + len; - return BPF_PROG_RUN(prog, (void *)&xdp); + return BPF_PROG_RUN(prog, &xdp); } /** diff --git a/include/linux/filter.h b/include/linux/filter.h index 1f09c521adfe..7f246a281435 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -408,8 +408,8 @@ struct bpf_prog { enum bpf_prog_type type; /* Type of BPF program */ struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - unsigned int (*bpf_func)(const struct sk_buff *skb, - const struct bpf_insn *filter); + unsigned int (*bpf_func)(const void *ctx, + const struct bpf_insn *insn); /* Instructions for interpreter */ union { struct sock_filter insns[0]; @@ -504,7 +504,7 @@ static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, u32 ret; rcu_read_lock(); - ret = BPF_PROG_RUN(prog, (void *)xdp); + ret = BPF_PROG_RUN(prog, xdp); rcu_read_unlock(); return ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6ee1febdf6ff..22cc734aa1b2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7726,7 +7726,7 @@ static void bpf_overflow_handler(struct perf_event *event, if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; rcu_read_lock(); - ret = BPF_PROG_RUN(event->prog, (void *)&ctx); + ret = BPF_PROG_RUN(event->prog, &ctx); rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0db7c8a2afe2..bff9c774987a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -195,7 +195,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd) * value always takes priority (ignoring the DATA). */ for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); + u32 cur_ret = BPF_PROG_RUN(f->prog, sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; -- cgit v1.2.3 From 21116b7068b9b66ac16b2fe3675469f459968c3f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 26 Nov 2016 01:28:07 +0100 Subject: bpf: add owner_prog_type and accounted mem to array map's fdinfo Allow for checking the owner_prog_type of a program array map. In some cases bpf(2) can return -EINVAL /after/ the verifier passed and did all the rewrites of the bpf program. The reason we can fail at this late stage is that program array maps are incompatible. Allow users to inspect this earlier, after they have obtained the map fd through the BPF_OBJ_GET command. tc will get support for this. Also, display how much we charged the map with regard to RLIMIT_MEMLOCK. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- kernel/bpf/syscall.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1090d16a31c1..4caa18e6860a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -138,18 +138,31 @@ static int bpf_map_release(struct inode *inode, struct file *filp) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; + const struct bpf_array *array; + u32 owner_prog_type = 0; + + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + array = container_of(map, struct bpf_array, map); + owner_prog_type = array->owner_prog_type; + } seq_printf(m, "map_type:\t%u\n" "key_size:\t%u\n" "value_size:\t%u\n" "max_entries:\t%u\n" - "map_flags:\t%#x\n", + "map_flags:\t%#x\n" + "memlock:\t%llu\n", map->map_type, map->key_size, map->value_size, map->max_entries, - map->map_flags); + map->map_flags, + map->pages * 1ULL << PAGE_SHIFT); + + if (owner_prog_type) + seq_printf(m, "owner_prog_type:\t%u\n", + owner_prog_type); } #endif -- cgit v1.2.3 From a3af5f80010625a9ffbe8edd4bae615a7516b6bc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 26 Nov 2016 01:28:08 +0100 Subject: bpf: allow for mount options to specify permissions Since we recently converted the BPF filesystem over to use mount_nodev(), we now have the possibility to also hold mount options in sb's s_fs_info. This work implements mount options support for specifying permissions on the sb's inode, which will be used by tc when it manually needs to mount the fs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 2565809fbb34..0b030c9126d3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -364,15 +365,66 @@ static void bpf_evict_inode(struct inode *inode) static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, + .show_options = generic_show_options, .evict_inode = bpf_evict_inode, }; +enum { + OPT_MODE, + OPT_ERR, +}; + +static const match_table_t bpf_mount_tokens = { + { OPT_MODE, "mode=%o" }, + { OPT_ERR, NULL }, +}; + +struct bpf_mount_opts { + umode_t mode; +}; + +static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option, token; + char *ptr; + + opts->mode = S_IRWXUGO; + + while ((ptr = strsep(&data, ",")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, bpf_mount_tokens, args); + switch (token) { + case OPT_MODE: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* We might like to report bad mount options here, but + * traditionally we've ignored all mount options, so we'd + * better continue to ignore non-existing options for bpf. 
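+ * A stray option handed down by a generic mount helper, for example, + * should not fail the whole mount.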
+ */ + } + } + + return 0; +} + static int bpf_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr bpf_rfiles[] = { { "" } }; + struct bpf_mount_opts opts; struct inode *inode; int ret; + save_mount_options(sb, data); + + ret = bpf_parse_options(data, &opts); + if (ret) + return ret; + ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; @@ -382,7 +434,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent) inode = sb->s_root->d_inode; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; - inode->i_mode |= S_ISVTX | S_IRWXUGO; + inode->i_mode |= S_ISVTX | opts.mode; return 0; } -- cgit v1.2.3 From bb8313b603eb8fd52de48a079bfcd72dcab2ef1e Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 28 Nov 2016 23:03:04 -0800 Subject: cpuidle: Allow enforcing deepest idle state selection When idle injection is used to cap power, we need to override the governor's choice of idle states. For this reason, make it possible for the deepest idle state selection to be enforced by setting a flag on a given CPU to achieve the maximum potential power draw reduction. Signed-off-by: Jacob Pan [ rjw: Subject & changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle.c | 13 ++++++++++++- include/linux/cpuidle.h | 7 ++++++- kernel/sched/idle.c | 13 ++++++++----- 3 files changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207abb5a4..afc005b917fe 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,17 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -#ifdef CONFIG_SUSPEND +/* Set the current cpu to use the deepest idle state, override governors */ +void cpuidle_use_deepest_state(bool enable) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + dev->use_deepest_state = enable; + preempt_enable(); +} + /** * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU.
@@ -109,6 +119,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return find_deepest_state(drv, dev, UINT_MAX, 0, false); } +#ifdef CONFIG_SUSPEND static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 15deea449edc..da346f2817a8 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -74,6 +74,7 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; + unsigned int use_deepest_state:1; unsigned int cpu; int last_residency; @@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #endif -#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) +#ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev); +extern void cpuidle_use_deepest_state(bool enable); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) +{ +} #endif /* kernel/sched/idle.c */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1d8718d5300d..513e4dfeeae7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -164,11 +164,14 @@ static void cpuidle_idle_call(void) * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. */ - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state > 0) { - local_irq_enable(); - goto exit_idle; + + if (idle_should_freeze() || dev->use_deepest_state) { + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state > 0) { + local_irq_enable(); + goto exit_idle; + } } next_state = cpuidle_find_deepest_state(drv, dev); -- cgit v1.2.3 From c1de45ca831acee9b72c9320dde447edafadb43f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Nov 2016 23:03:05 -0800 Subject: sched/idle: Add support for tasks that inject idle Idle injection drivers such as Intel powerclamp and ACPI PAD drivers use realtime tasks to take control of a CPU and then inject idle. There are two issues with this approach: 1. Low efficiency: the injected idle task is treated as busy, so sched ticks do not stop during the injected idle period; the result of these unwanted wakeups can be a ~20% loss in power savings. 2. Idle accounting: injected idle time is presented to the user as busy. This patch addresses the issues by introducing a new PF_IDLE flag which allows any given task to be treated as an idle task while the flag is set. Therefore, idle injection tasks can run through the normal flow of NOHZ idle enter/exit to get the correct accounting as well as tick stop when possible. The implication is that the idle task is then no longer limited to PID == 0. Acked-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Jacob Pan Signed-off-by: Rafael J.
Wysocki --- include/linux/cpu.h | 2 + include/linux/sched.h | 3 +- kernel/fork.c | 2 +- kernel/sched/core.c | 1 + kernel/sched/idle.c | 162 +++++++++++++++++++++++++++++++------------------- 5 files changed, 107 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b886dc17f2f3..ac0efae38072 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -245,6 +245,8 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); +void play_idle(unsigned long duration_ms); + #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 348f51b0ec92..114c7fcb6af6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2254,6 +2254,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ @@ -2609,7 +2610,7 @@ extern struct task_struct *idle_task(int cpu); */ static inline bool is_idle_task(const struct task_struct *p) { - return p->pid == 0; + return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/fork.c b/kernel/fork.c index 623259fc794d..5074b2f0827b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1537,7 +1537,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..63b3a8a49884 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5285,6 +5285,7 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + idle->flags |= PF_IDLE; kasan_unpoison_task_stack(idle); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 513e4dfeeae7..6a4bae0a649d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -205,76 +205,65 @@ exit_idle: * * Called with polling cleared. */ -static void cpu_idle_loop(void) +static void do_idle(void) { - int cpu = smp_processor_id(); - - while (1) { - /* - * If the arch has a polling bit, we maintain an invariant: - * - * Our polling bit is clear if we're not scheduled (i.e. if - * rq->curr != rq->idle). This means that, if rq->idle has - * the polling bit set, then setting need_resched is - * guaranteed to cause the cpu to reschedule. - */ - - __current_set_polling(); - quiet_vmstat(); - tick_nohz_idle_enter(); + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if rq->curr != + * rq->idle). This means that, if rq->idle has the polling bit set, + * then setting need_resched is guaranteed to cause the CPU to + * reschedule. 
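+ * + * A remote wakeup can then skip the resched IPI entirely and rely on + * this polling loop to notice need_resched.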
+ */ - while (!need_resched()) { - check_pgt_cache(); - rmb(); + __current_set_polling(); + tick_nohz_idle_enter(); - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } + while (!need_resched()) { + check_pgt_cache(); + rmb(); - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) - cpu_idle_poll(); - else - cpuidle_idle_call(); - - arch_cpu_idle_exit(); + if (cpu_is_offline(smp_processor_id())) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); } - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - __current_clr_polling(); + local_irq_disable(); + arch_cpu_idle_enter(); /* - * We promise to call sched_ttwu_pending and reschedule - * if need_resched is set while polling is set. That - * means that clearing polling needs to be visible - * before doing these things. + * In poll mode we reenable interrupts and spin. Also if we + * detected in the wakeup from idle path that the tick + * broadcast device expired for us, we don't want to go deep + * idle as we know that the IPI is going to arrive right away. */ - smp_mb__after_atomic(); - - sched_ttwu_pending(); - schedule_preempt_disabled(); + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } + + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will not have had + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending() and reschedule if + * need_resched() is set while polling is set. That means that clearing + * polling needs to be visible before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); } bool cpu_in_idle(unsigned long pc) @@ -283,6 +272,56 @@ bool cpu_in_idle(unsigned long pc) pc < (unsigned long)__cpuidle_text_end; } +struct idle_timer { + struct hrtimer timer; + int done; +}; + +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) +{ + struct idle_timer *it = container_of(timer, struct idle_timer, timer); + + WRITE_ONCE(it->done, 1); + set_tsk_need_resched(current); + + return HRTIMER_NORESTART; +} + +void play_idle(unsigned long duration_ms) +{ + struct idle_timer it; + + /* + * Only FIFO tasks can disable the tick since they don't need the forced + * preemption. 
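+ * The WARN_ON_ONCE() checks below therefore insist on a pinned per-CPU + * FIFO kthread before any idle time is injected.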
+ */ WARN_ON_ONCE(current->policy != SCHED_FIFO); + WARN_ON_ONCE(current->nr_cpus_allowed != 1); + WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); + WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); + WARN_ON_ONCE(!duration_ms); + + rcu_sleep_check(); + preempt_disable(); + current->flags |= PF_IDLE; + cpuidle_use_deepest_state(true); + + it.done = 0; + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + + while (!READ_ONCE(it.done)) + do_idle(); + + cpuidle_use_deepest_state(false); + current->flags &= ~PF_IDLE; + + preempt_fold_need_resched(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(play_idle); + void cpu_startup_entry(enum cpuhp_state state) { /* @@ -302,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state) #endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); - cpu_idle_loop(); + while (1) + do_idle(); } -- cgit v1.2.3 From 948a5312f41658f7b76a598a139ef1f4dea09ca9 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:22 -0800 Subject: timekeeping: Add a fast and NMI safe boot clock This boot clock can be used as a tracing clock and will account for suspend time. To keep it NMI safe since we're accessing from tracing, we're not using a separate timekeeper with updates to monotonic clock and boot offset protected with seqlocks. This has the following minor side effects: (1) It's possible that a timestamp be taken after the boot offset is updated but before the timekeeper is updated. If this happens, the new boot offset is added to the old timekeeping making the clock appear to update slightly earlier: CPU 0 CPU 1 timekeeping_inject_sleeptime64() __timekeeping_inject_sleeptime(tk, delta); timestamp(); timekeeping_update(tk, TK_CLEAR_NTP...); (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be partially updated. Since the tk->offs_boot update is a rare event, this should be a rare occurrence which postprocessing should be able to handle. Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Reviewed-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Richard Cochran Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1480372524-15181-6-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 09168c52ab64..361f8bf1429d 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -249,6 +249,7 @@ static inline u64 ktime_get_raw_ns(void) extern u64 ktime_get_mono_fast_ns(void); extern u64 ktime_get_raw_fast_ns(void); +extern u64 ktime_get_boot_fast_ns(void); /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 37dec7e3db43..b2286e94c934 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -425,6 +425,35 @@ u64 ktime_get_raw_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); +/** + * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock. + * + * To keep it NMI safe since we're accessing from tracing, we're not using a + * separate timekeeper with updates to monotonic clock and boot offset + * protected with seqlocks.
This has the following minor side effects: + * + * (1) It's possible that a timestamp be taken after the boot offset is updated + * but before the timekeeper is updated. If this happens, the new boot offset + * is added to the old timekeeping making the clock appear to update slightly + * earlier: + * CPU 0 CPU 1 + * timekeeping_inject_sleeptime64() + * __timekeeping_inject_sleeptime(tk, delta); + * timestamp(); + * timekeeping_update(tk, TK_CLEAR_NTP...); + * + * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be + * partially updated. Since the tk->offs_boot update is a rare event, this + * should be a rare occurrence which postprocessing should be able to handle. + */ +u64 notrace ktime_get_boot_fast_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot)); +} +EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; -- cgit v1.2.3 From 80ec3552107ac16a836dbff4cf3c23fdd3256ee3 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Nov 2016 14:35:23 -0800 Subject: trace: Add an option for boot clock as trace clock Unlike the monotonic clock, the boot clock as a trace clock will account for time spent in suspend, which is useful for tracing suspend/resume. This uses the fast boot clock infrastructure introduced earlier. Signed-off-by: Joel Fernandes Signed-off-by: John Stultz Reviewed-by: Thomas Gleixner Acked-by: Steven Rostedt Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1480372524-15181-7-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8696ce6bf2f6..f7b64db285df 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1125,6 +1125,7 @@ static struct { { trace_clock, "perf", 1 }, { ktime_get_mono_fast_ns, "mono", 1 }, { ktime_get_raw_fast_ns, "mono_raw", 1 }, + { ktime_get_boot_fast_ns, "boot", 1 }, ARCH_TRACE_CLOCKS }; -- cgit v1.2.3 From 8fae47705685fcaa75a1fe4c8c3e18300a702979 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sun, 20 Nov 2016 16:47:55 -0500 Subject: audit: add support for session ID user filter Define AUDIT_SESSIONID in the uapi and add support for specifying user filters based on the session ID. Also add the new session ID filter to the feature bitmap so userspace knows it is available. https://github.com/linux-audit/audit-kernel/issues/4 RFE: add a session ID filter to the kernel's user filter Signed-off-by: Richard Guy Briggs [PM: combine multiple patches from Richard into this one] Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 5 ++++- kernel/auditfilter.c | 2 ++ kernel/auditsc.c | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 82e8aa59446b..c8dc97bc2c1b 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -254,6 +254,7 @@ #define AUDIT_OBJ_LEV_LOW 22 #define AUDIT_OBJ_LEV_HIGH 23 #define AUDIT_LOGINUID_SET 24 +#define AUDIT_SESSIONID 25 /* Session ID */ /* These are ONLY useful when checking * at syscall exit time (AUDIT_AT_EXIT).
*/ @@ -329,9 +330,11 @@ enum { #define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 #define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 #define AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH 0x00000004 +#define AUDIT_FEATURE_BITMAP_SESSIONID_FILTER 0x00000010 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \ AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \ - AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH) + AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH | \ + AUDIT_FEATURE_BITMAP_SESSIONID_FILTER) /* deprecated: AUDIT_VERSION_* */ #define AUDIT_VERSION_LATEST AUDIT_FEATURE_BITMAP_ALL diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 632e90d1005f..880519d6cf2a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -363,6 +363,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_EXIT: case AUDIT_SUCCESS: case AUDIT_INODE: + case AUDIT_SESSIONID: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; @@ -476,6 +477,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (!gid_valid(f->gid)) goto exit_free; break; + case AUDIT_SESSIONID: case AUDIT_ARCH: entry->rule.arch_f = f; break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d161b17ce8ce..f78cb1b3fa74 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -446,6 +446,7 @@ static int audit_filter_rules(struct task_struct *tsk, const struct cred *cred; int i, need_sid = 1; u32 sid; + unsigned int sessionid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); @@ -508,6 +509,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FSGID: result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; + case AUDIT_SESSIONID: + sessionid = audit_get_sessionid(current); + result = audit_comparator(sessionid, f->op, f->val); + break; case AUDIT_PERS: result = audit_comparator(tsk->personality, f->op, f->val); break; -- cgit v1.2.3 From faaae2a581435f32781a105dda3501df388fddcb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Nov 2016 15:20:14 -0800 Subject: Re-enable CONFIG_MODVERSIONS in a slightly weaker form This enables CONFIG_MODVERSIONS again, but allows for missing symbol CRC information in order to work around the issue that newer binutils versions seem to occasionally drop the CRC on the floor. binutils 2.26 seems to work fine, while binutils 2.27 seems to break MODVERSIONS of symbols that have been defined in assembler files. [ We've had random missing CRC's before - it may be an old problem that just is now reliably triggered with the weak asm symbols and a new version of binutils ] Some day I really do want to remove MODVERSIONS entirely. Sadly, today does not appear to be that day: Debian people apparently do want the option to enable MODVERSIONS to make it easier to have external modules across kernel versions, and this seems to be a fairly minimal fix for the annoying problem. Cc: Ben Hutchings Acked-by: Michal Marek Signed-off-by: Linus Torvalds --- init/Kconfig | 1 - kernel/module.c | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index c4fbc1e55c25..34407f15e6d3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1945,7 +1945,6 @@ config MODULE_FORCE_UNLOAD config MODVERSIONS bool "Module versioning support" - depends on BROKEN help Usually, you have to use modules compiled with your kernel. 
Saying Y here makes it sometimes possible to use modules diff --git a/kernel/module.c b/kernel/module.c index f57dd63186e6..0e54d5bf0097 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1301,8 +1301,9 @@ static int check_version(Elf_Shdr *sechdrs, goto bad_version; } - pr_warn("%s: no symbol version for %s\n", mod->name, symname); - return 0; + /* Broken toolchain. Warn once, then let it go.. */ + pr_warn_once("%s: no symbol version for %s\n", mod->name, symname); + return 1; bad_version: pr_warn("%s: disagrees about version of symbol %s\n", -- cgit v1.2.3 From 01ae87eab53675cbdabd5c4d727c4a35e397cce0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Mon, 28 Nov 2016 14:11:04 +0100 Subject: bpf: cgroup: fix documentation of __cgroup_bpf_update() There's a 'not' missing in one paragraph. Add it. Fixes: 3007098494be ("cgroup: add support for eBPF programs") Signed-off-by: Daniel Mack Reported-by: Rami Rosen Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index a0ab43f264b0..8c784f8c67cd 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -66,8 +66,8 @@ void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) * Each cgroup has a set of two pointers for bpf programs; one for eBPF * programs it owns, and which is effective for execution. * - * If @prog is %NULL, this function attaches a new program to the cgroup and - * releases the one that is currently attached, if any. @prog is then made + * If @prog is not %NULL, this function attaches a new program to the cgroup + * and releases the one that is currently attached, if any. @prog is then made * the effective program of type @type in that cgroup. * * If @prog is %NULL, the currently attached program of type @type is released, -- cgit v1.2.3 From f8319483f57f1ca22370f4150bb990aca7728a67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Nov 2016 14:32:25 +1100 Subject: locking/lockdep: Provide a type check for lock_is_held Christoph requested lockdep_assert_held() variants that distinguish between held-for-read or held-for-write. Provide: int lock_is_held_type(struct lockdep_map *lock, int read) which takes the same argument as lock_acquire(.read) and matches it to the held_lock instance. Use of this function should be gated by the debug_locks variable. When that is 0 the return value of the lock_is_held_type() function is undefined. This is done to allow both negative and positive tests for holding locks. By default we provide (positive) lockdep_assert_held{,_exclusive,_read}() macros. Requested-by: Christoph Hellwig Signed-off-by: Peter Zijlstra (Intel) Tested-by: Jens Axboe Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- include/linux/lockdep.h | 25 +++++++++++++++++++++++-- kernel/locking/lockdep.c | 20 ++++++++++++-------- 2 files changed, 35 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c1458fede1f9..1e327bb80838 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -338,9 +338,18 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); -#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) +/* + * Same "read" as for lock_acquire(), except -1 means any. 
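+ * (0 checks for a lock held for write, 1 for one held for read.)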
+ */ +extern int lock_is_held_type(struct lockdep_map *lock, int read); + +static inline int lock_is_held(struct lockdep_map *lock) +{ + return lock_is_held_type(lock, -1); +} -extern int lock_is_held(struct lockdep_map *lock); +#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) +#define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, @@ -372,6 +381,14 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); WARN_ON(debug_locks && !lockdep_is_held(l)); \ } while (0) +#define lockdep_assert_held_exclusive(l) do { \ + WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \ + } while (0) + +#define lockdep_assert_held_read(l) do { \ + WARN_ON(debug_locks && !lockdep_is_held_type(l, 1)); \ + } while (0) + #define lockdep_assert_held_once(l) do { \ WARN_ON_ONCE(debug_locks && !lockdep_is_held(l)); \ } while (0) @@ -428,7 +445,11 @@ struct lock_class_key { }; #define lockdep_depth(tsk) (0) +#define lockdep_is_held_type(l, r) (1) + #define lockdep_assert_held(l) do { (void)(l); } while (0) +#define lockdep_assert_held_exclusive(l) do { (void)(l); } while (0) +#define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_recursing(tsk) (0) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..cff580a6edf9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3188,7 +3188,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; } -static int __lock_is_held(struct lockdep_map *lock); +static int __lock_is_held(struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. 
@@ -3329,7 +3329,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock)) + if (nest_lock && !__lock_is_held(nest_lock, -1)) return print_lock_nested_lock_not_held(curr, hlock, ip); if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) @@ -3576,7 +3576,7 @@ found_it: return 1; } -static int __lock_is_held(struct lockdep_map *lock) +static int __lock_is_held(struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3584,8 +3584,12 @@ static int __lock_is_held(struct lockdep_map *lock) for (i = 0; i < curr->lockdep_depth; i++) { struct held_lock *hlock = curr->held_locks + i; - if (match_held_lock(hlock, lock)) - return 1; + if (match_held_lock(hlock, lock)) { + if (read == -1 || hlock->read == read) + return 1; + + return 0; + } } return 0; @@ -3769,7 +3773,7 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held(struct lockdep_map *lock) +int lock_is_held_type(struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; @@ -3781,13 +3785,13 @@ int lock_is_held(struct lockdep_map *lock) check_flags(flags); current->lockdep_recursion = 1; - ret = __lock_is_held(lock); + ret = __lock_is_held(lock, read); current->lockdep_recursion = 0; raw_local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(lock_is_held); +EXPORT_SYMBOL_GPL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { -- cgit v1.2.3 From 60fe3910bb029e3671ce7ac080a7acb7e032b9e0 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 29 Nov 2016 23:45:47 +1100 Subject: kexec_file: Allow arch-specific memory walking for kexec_add_buffer Allow architectures to specify a different memory walking function for kexec_add_buffer. x86 uses iomem to track reserved memory ranges, but PowerPC uses the memblock subsystem. Signed-off-by: Thiago Jung Bauermann Acked-by: Dave Young Acked-by: Balbir Singh Signed-off-by: Michael Ellerman --- include/linux/kexec.h | 29 ++++++++++++++++++++++++++++- kernel/kexec_file.c | 30 ++++++++++++++++++++++-------- kernel/kexec_internal.h | 16 ---------------- 3 files changed, 50 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 406c33dcae13..5e320ddaaa82 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -148,7 +148,34 @@ struct kexec_file_ops { kexec_verify_sig_t *verify_sig; #endif }; -#endif + +/** + * struct kexec_buf - parameters for finding a place for a buffer in memory + * @image: kexec image in which memory to search. + * @buffer: Contents which will be copied to the allocated memory. + * @bufsz: Size of @buffer. + * @mem: On return will have address of the buffer in memory. + * @memsz: Size for the buffer in memory. + * @buf_align: Minimum alignment needed. + * @buf_min: The buffer can't be placed below this address. + * @buf_max: The buffer can't be placed above this address. + * @top_down: Allocate from top of memory. 
+ */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long mem; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; +}; + +int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(u64, u64, void *)); +#endif /* CONFIG_KEXEC_FILE */ struct kimage { kimage_entry_t head; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 037c321c5618..f865674bff51 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -428,6 +428,27 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } +/** + * arch_kexec_walk_mem - call func(data) on free memory regions + * @kbuf: Context info for the search. Also passed to @func. + * @func: Function to call for each memory region. + * + * Return: The memory walk will stop when func returns a non-zero value + * and that value will be returned. If all free regions are visited without + * func returning non-zero, then zero will be returned. + */ +int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, + int (*func)(u64, u64, void *)) +{ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return walk_iomem_res_desc(crashk_res.desc, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, + kbuf, func); + else + return walk_system_ram_res(0, ULONG_MAX, kbuf, func); +} + /* * Helper function for placing a buffer in a kexec segment. This assumes * that kexec_mutex is held. @@ -474,14 +495,7 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, kbuf->top_down = top_down; /* Walk the RAM ranges and allocate a suitable range for the buffer */ - if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res_desc(crashk_res.desc, - IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); - else - ret = walk_system_ram_res(0, -1, kbuf, - locate_mem_hole_callback); + ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); if (ret != 1) { /* A suitable memory range could not be found for buffer */ return -EADDRNOTAVAIL; diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 0a52315d9c62..4cef7e4706b0 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -20,22 +20,6 @@ struct kexec_sha_region { unsigned long len; }; -/* - * Keeps track of buffer parameters as provided by caller for requesting - * memory placement of buffer. - */ -struct kexec_buf { - struct kimage *image; - char *buffer; - unsigned long bufsz; - unsigned long mem; - unsigned long memsz; - unsigned long buf_align; - unsigned long buf_min; - unsigned long buf_max; - bool top_down; /* allocate from top of memory hole */ -}; - void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } -- cgit v1.2.3 From ec2b9bfaac44ea889625a6b9473d33898b10d35f Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 29 Nov 2016 23:45:48 +1100 Subject: kexec_file: Change kexec_add_buffer to take kexec_buf as argument. This is done to simplify the kexec_add_buffer argument list. Adapt all callers to set up a kexec_buf to pass to kexec_add_buffer. In addition, change the type of kexec_buf.buffer from char * to void *. There is no particular reason for it to be a char *, and the change allows us to get rid of 3 existing casts to char * in the code. 
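For illustration, a sketch of the new caller pattern (hypothetical arch-side code; image, payload, payload_len and load_addr are assumed to exist in the surrounding function):

	struct kexec_buf kbuf = { .image = image, .buf_min = 0,
				  .buf_max = ULONG_MAX, .top_down = true };
	int ret;

	kbuf.buffer = payload;
	kbuf.bufsz = payload_len;
	kbuf.memsz = PAGE_ALIGN(payload_len);
	kbuf.buf_align = PAGE_SIZE;
	ret = kexec_add_buffer(&kbuf);
	if (!ret)
		load_addr = kbuf.mem;	/* placement chosen by the memory walker */

All parameters travel in the struct and the chosen address comes back in kbuf.mem, which is exactly the shape the x86 conversions below adopt.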
Signed-off-by: Thiago Jung Bauermann Acked-by: Dave Young Acked-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/x86/kernel/crash.c | 37 ++++++++-------- arch/x86/kernel/kexec-bzimage64.c | 48 +++++++++++---------- include/linux/kexec.h | 8 +--- kernel/kexec_file.c | 88 ++++++++++++++++++--------------------- 4 files changed, 87 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 650830e39e3a..3741461c63a0 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -631,9 +631,9 @@ static int determine_backup_region(u64 start, u64 end, void *arg) int crash_load_segments(struct kimage *image) { - unsigned long src_start, src_sz, elf_sz; - void *elf_addr; int ret; + struct kexec_buf kbuf = { .image = image, .buf_min = 0, + .buf_max = ULONG_MAX, .top_down = false }; /* * Determine and load a segment for backup area. First 640K RAM @@ -647,43 +647,44 @@ int crash_load_segments(struct kimage *image) if (ret < 0) return ret; - src_start = image->arch.backup_src_start; - src_sz = image->arch.backup_src_sz; - /* Add backup segment. */ - if (src_sz) { + if (image->arch.backup_src_sz) { + kbuf.buffer = &crash_zero_bytes; + kbuf.bufsz = sizeof(crash_zero_bytes); + kbuf.memsz = image->arch.backup_src_sz; + kbuf.buf_align = PAGE_SIZE; /* * Ideally there is no source for backup segment. This is * copied in purgatory after crash. Just add a zero filled * segment for now to make sure checksum logic works fine. */ - ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, - sizeof(crash_zero_bytes), src_sz, - PAGE_SIZE, 0, -1, 0, - &image->arch.backup_load_addr); + ret = kexec_add_buffer(&kbuf); if (ret) return ret; + image->arch.backup_load_addr = kbuf.mem; pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", - image->arch.backup_load_addr, src_start, src_sz); + image->arch.backup_load_addr, + image->arch.backup_src_start, kbuf.memsz); } /* Prepare elf headers and add a segment */ - ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); if (ret) return ret; - image->arch.elf_headers = elf_addr; - image->arch.elf_headers_sz = elf_sz; + image->arch.elf_headers = kbuf.buffer; + image->arch.elf_headers_sz = kbuf.bufsz; - ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, - ELF_CORE_HEADER_ALIGN, 0, -1, 0, - &image->arch.elf_load_addr); + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + ret = kexec_add_buffer(&kbuf); if (ret) { vfree((void *)image->arch.elf_headers); return ret; } + image->arch.elf_load_addr = kbuf.mem; pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_load_addr, elf_sz, elf_sz); + image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz); return ret; } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 3407b148c240..d0a814a9d96a 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -331,17 +331,17 @@ static void *bzImage64_load(struct kimage *image, char *kernel, struct setup_header *header; int setup_sects, kern16_size, ret = 0; - unsigned long setup_header_size, params_cmdline_sz, params_misc_sz; + unsigned long setup_header_size, params_cmdline_sz; struct boot_params *params; unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; unsigned long purgatory_load_addr; - unsigned long kernel_bufsz, kernel_memsz, kernel_align; - char *kernel_buf; struct bzimage64_data *ldata; 
struct kexec_entry64_regs regs64; void *stack; unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, + .top_down = true }; header = (struct setup_header *)(kernel + setup_hdr_offset); setup_sects = header->setup_sects; @@ -402,11 +402,11 @@ static void *bzImage64_load(struct kimage *image, char *kernel, params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); - params_misc_sz = params_cmdline_sz + efi_map_sz + + kbuf.bufsz = params_cmdline_sz + efi_map_sz + sizeof(struct setup_data) + sizeof(struct efi_setup_data); - params = kzalloc(params_misc_sz, GFP_KERNEL); + params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -418,37 +418,41 @@ static void *bzImage64_load(struct kimage *image, char *kernel, /* Is there a limit on setup header size? */ memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); - ret = kexec_add_buffer(image, (char *)params, params_misc_sz, - params_misc_sz, 16, MIN_BOOTPARAM_ADDR, - ULONG_MAX, 1, &bootparam_load_addr); + kbuf.buffer = params; + kbuf.memsz = kbuf.bufsz; + kbuf.buf_align = 16; + kbuf.buf_min = MIN_BOOTPARAM_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + bootparam_load_addr = kbuf.mem; pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - bootparam_load_addr, params_misc_sz, params_misc_sz); + bootparam_load_addr, kbuf.bufsz, kbuf.bufsz); /* Load kernel */ - kernel_buf = kernel + kern16_size; - kernel_bufsz = kernel_len - kern16_size; - kernel_memsz = PAGE_ALIGN(header->init_size); - kernel_align = header->kernel_alignment; - - ret = kexec_add_buffer(image, kernel_buf, - kernel_bufsz, kernel_memsz, kernel_align, - MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, - &kernel_load_addr); + kbuf.buffer = kernel + kern16_size; + kbuf.bufsz = kernel_len - kern16_size; + kbuf.memsz = PAGE_ALIGN(header->init_size); + kbuf.buf_align = header->kernel_alignment; + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + kernel_load_addr = kbuf.mem; pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kernel_load_addr, kernel_memsz, kernel_memsz); + kernel_load_addr, kbuf.bufsz, kbuf.memsz); /* Load initrd high */ if (initrd) { - ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, - PAGE_SIZE, MIN_INITRD_LOAD_ADDR, - ULONG_MAX, 1, &initrd_load_addr); + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.buf_min = MIN_INITRD_LOAD_ADDR; + ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; + initrd_load_addr = kbuf.mem; pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", initrd_load_addr, initrd_len, initrd_len); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 5e320ddaaa82..437ef1b47428 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -163,7 +163,7 @@ struct kexec_file_ops { */ struct kexec_buf { struct kimage *image; - char *buffer; + void *buffer; unsigned long bufsz; unsigned long mem; unsigned long memsz; @@ -175,6 +175,7 @@ struct kexec_buf { int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)); +extern int kexec_add_buffer(struct kexec_buf *kbuf); #endif /* CONFIG_KEXEC_FILE */ struct kimage { @@ -239,11 +240,6 @@ extern 
asmlinkage long sys_kexec_load(unsigned long entry, struct kexec_segment __user *segments, unsigned long flags); extern int kernel_kexec(void); -extern int kexec_add_buffer(struct kimage *image, char *buffer, - unsigned long bufsz, unsigned long memsz, - unsigned long buf_align, unsigned long buf_min, - unsigned long buf_max, bool top_down, - unsigned long *load_addr); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); extern int kexec_load_purgatory(struct kimage *image, unsigned long min, diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f865674bff51..efd2c094af7e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -449,25 +449,27 @@ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } -/* - * Helper function for placing a buffer in a kexec segment. This assumes - * that kexec_mutex is held. +/** + * kexec_add_buffer - place a buffer in a kexec segment + * @kbuf: Buffer contents and memory parameters. + * + * This function assumes that kexec_mutex is held. + * On successful return, @kbuf->mem will have the physical address of + * the buffer in memory. + * + * Return: 0 on success, negative errno on error. */ -int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, - unsigned long memsz, unsigned long buf_align, - unsigned long buf_min, unsigned long buf_max, - bool top_down, unsigned long *load_addr) +int kexec_add_buffer(struct kexec_buf *kbuf) { struct kexec_segment *ksegment; - struct kexec_buf buf, *kbuf; int ret; /* Currently adding segment this way is allowed only in file mode */ - if (!image->file_mode) + if (!kbuf->image->file_mode) return -EINVAL; - if (image->nr_segments >= KEXEC_SEGMENT_MAX) + if (kbuf->image->nr_segments >= KEXEC_SEGMENT_MAX) return -EINVAL; /* @@ -477,22 +479,14 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, * logic goes through list of segments to make sure there are * no destination overlaps. */ - if (!list_empty(&image->control_pages)) { + if (!list_empty(&kbuf->image->control_pages)) { WARN_ON(1); return -EINVAL; } - memset(&buf, 0, sizeof(struct kexec_buf)); - kbuf = &buf; - kbuf->image = image; - kbuf->buffer = buffer; - kbuf->bufsz = bufsz; - - kbuf->memsz = ALIGN(memsz, PAGE_SIZE); - kbuf->buf_align = max(buf_align, PAGE_SIZE); - kbuf->buf_min = buf_min; - kbuf->buf_max = buf_max; - kbuf->top_down = top_down; + /* Ensure minimum alignment needed for segments. 
*/ + kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); + kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); @@ -502,13 +496,12 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, } /* Found a suitable memory range */ - ksegment = &image->segment[image->nr_segments]; + ksegment = &kbuf->image->segment[kbuf->image->nr_segments]; ksegment->kbuf = kbuf->buffer; ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; - image->nr_segments++; - *load_addr = ksegment->mem; + kbuf->image->nr_segments++; return 0; } @@ -630,13 +623,15 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, unsigned long max, int top_down) { struct purgatory_info *pi = &image->purgatory_info; - unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; - unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned long align, bss_align, bss_sz, bss_pad; + unsigned long entry, load_addr, curr_load_addr, bss_addr, offset; unsigned char *buf_addr, *src; int i, ret = 0, entry_sidx = -1; const Elf_Shdr *sechdrs_c; Elf_Shdr *sechdrs = NULL; - void *purgatory_buf = NULL; + struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1, + .buf_min = min, .buf_max = max, + .top_down = top_down }; /* * sechdrs_c points to section headers in purgatory and are read @@ -702,9 +697,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, } /* Determine how much memory is needed to load relocatable object. */ - buf_align = 1; bss_align = 1; - buf_sz = 0; bss_sz = 0; for (i = 0; i < pi->ehdr->e_shnum; i++) { @@ -713,10 +706,10 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, align = sechdrs[i].sh_addralign; if (sechdrs[i].sh_type != SHT_NOBITS) { - if (buf_align < align) - buf_align = align; - buf_sz = ALIGN(buf_sz, align); - buf_sz += sechdrs[i].sh_size; + if (kbuf.buf_align < align) + kbuf.buf_align = align; + kbuf.bufsz = ALIGN(kbuf.bufsz, align); + kbuf.bufsz += sechdrs[i].sh_size; } else { /* bss section */ if (bss_align < align) @@ -728,32 +721,31 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min, /* Determine the bss padding required to align bss properly */ bss_pad = 0; - if (buf_sz & (bss_align - 1)) - bss_pad = bss_align - (buf_sz & (bss_align - 1)); + if (kbuf.bufsz & (bss_align - 1)) + bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1)); - memsz = buf_sz + bss_pad + bss_sz; + kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz; /* Allocate buffer for purgatory */ - purgatory_buf = vzalloc(buf_sz); - if (!purgatory_buf) { + kbuf.buffer = vzalloc(kbuf.bufsz); + if (!kbuf.buffer) { ret = -ENOMEM; goto out; } - if (buf_align < bss_align) - buf_align = bss_align; + if (kbuf.buf_align < bss_align) + kbuf.buf_align = bss_align; /* Add buffer to segment list */ - ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, - buf_align, min, max, top_down, - &pi->purgatory_load_addr); + ret = kexec_add_buffer(&kbuf); if (ret) goto out; + pi->purgatory_load_addr = kbuf.mem; /* Load SHF_ALLOC sections */ - buf_addr = purgatory_buf; + buf_addr = kbuf.buffer; load_addr = curr_load_addr = pi->purgatory_load_addr; - bss_addr = load_addr + buf_sz + bss_pad; + bss_addr = load_addr + kbuf.bufsz + bss_pad; for (i = 0; i < pi->ehdr->e_shnum; i++) { if (!(sechdrs[i].sh_flags & SHF_ALLOC)) @@ -799,11 +791,11 @@ static int 
__kexec_load_purgatory(struct kimage *image, unsigned long min, * Used later to identify which section is purgatory and skip it * from checksumming. */ - pi->purgatory_buf = purgatory_buf; + pi->purgatory_buf = kbuf.buffer; return ret; out: vfree(sechdrs); - vfree(purgatory_buf); + vfree(kbuf.buffer); return ret; } -- cgit v1.2.3 From e2e806f9e437b46a3fc8f3174a225c73f2e38c3d Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Tue, 29 Nov 2016 23:45:49 +1100 Subject: kexec_file: Factor out kexec_locate_mem_hole from kexec_add_buffer. kexec_locate_mem_hole will be used by the PowerPC kexec_file_load implementation to find free memory for the purgatory stack. Signed-off-by: Thiago Jung Bauermann Acked-by: Dave Young Signed-off-by: Michael Ellerman --- include/linux/kexec.h | 1 + kernel/kexec_file.c | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 437ef1b47428..a33f63351f86 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -176,6 +176,7 @@ struct kexec_buf { int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *)); extern int kexec_add_buffer(struct kexec_buf *kbuf); +int kexec_locate_mem_hole(struct kexec_buf *kbuf); #endif /* CONFIG_KEXEC_FILE */ struct kimage { diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index efd2c094af7e..0c2df7f73792 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -449,6 +449,23 @@ int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +/** + * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel + * @kbuf: Parameters for the memory search. + * + * On success, kbuf->mem will have the start address of the memory region found. + * + * Return: 0 on success, negative errno on error. + */ +int kexec_locate_mem_hole(struct kexec_buf *kbuf) +{ + int ret; + + ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); + + return ret == 1 ? 0 : -EADDRNOTAVAIL; +} + /** * kexec_add_buffer - place a buffer in a kexec segment * @kbuf: Buffer contents and memory parameters. @@ -489,11 +506,9 @@ int kexec_add_buffer(struct kexec_buf *kbuf) kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); /* Walk the RAM ranges and allocate a suitable range for the buffer */ - ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback); - if (ret != 1) { - /* A suitable memory range could not be found for buffer */ - return -EADDRNOTAVAIL; - } + ret = kexec_locate_mem_hole(kbuf); + if (ret) + return ret; /* Found a suitable memory range */ ksegment = &kbuf->image->segment[kbuf->image->nr_segments]; -- cgit v1.2.3 From e2d2afe15ed452f91797a80dbc0a17838ba03ed4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 Nov 2016 12:27:09 -0500 Subject: bpf: fix states equal logic for varlen access If we have a branch that looks something like this int foo = map->value; if (condition) { foo += blah; } else { foo = bar; } map->array[foo] = baz; We will incorrectly assume that the !condition branch is equal to the condition branch as the register for foo will be UNKNOWN_VALUE in both cases. We need to adjust this logic to only do this if we didn't do a varlen access after we processed the !condition branch, otherwise we have different ranges and need to check the other branch as well. 
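Condensed from the verifier diff below, the equivalence check effectively becomes:

	/* Condensed sketch: the UNKNOWN_VALUE shortcut for the old state
	 * only applies when no variable-length map access happened on
	 * this path; otherwise the ranges differ and the other branch
	 * must be explored as well.
	 */
	bool varlen_map_access = env->varlen_map_value_access;

	if (rold->type == NOT_INIT ||
	    (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
	     rcur->type != NOT_INIT))
		continue;	/* registers considered equivalent */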
Fixes: 484611357c19 ("bpf: allow access into map value arrays") Reported-by: Jann Horn Signed-off-by: Josef Bacik Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a936159c6e0..8199821f54cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2454,6 +2454,7 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { + bool varlen_map_access = env->varlen_map_value_access; struct bpf_reg_state *rold, *rcur; int i; @@ -2467,12 +2468,17 @@ static bool states_equal(struct bpf_verifier_env *env, /* If the ranges were not the same, but everything else was and * we didn't do a variable access into a map then we are a-ok. */ - if (!env->varlen_map_value_access && + if (!varlen_map_access && rold->type == rcur->type && rold->imm == rcur->imm) continue; + /* If we didn't map access then again we don't care about the + * mismatched range values and it's ok if our old type was + * UNKNOWN and we didn't go to a NOT_INIT'ed reg. + */ if (rold->type == NOT_INIT || - (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT)) + (!varlen_map_access && rold->type == UNKNOWN_VALUE && + rcur->type != NOT_INIT)) continue; if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET && -- cgit v1.2.3 From 4a057549d6044c2dea47e80f8369a76225ec9d90 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 28 Nov 2016 14:35:21 -0800 Subject: alarmtimer: Add tracepoints for alarm timers Alarm timers are one of the mechanisms to wake up a system from suspend, but there exist no tracepoints to analyse which process/thread armed an alarmtimer. Add tracepoints for start/cancel/expire of individual alarm timers and one for tracing the suspend time decision when to resume the system. The following trace excerpt illustrates the new mechanism: Binder:3292_2-3304 [000] d..2 149.981123: alarmtimer_cancel: alarmtimer:ffffffc1319a7800 type:REALTIME expires:1325463120000000000 now:1325376810370370245 Binder:3292_2-3304 [000] d..2 149.981136: alarmtimer_start: alarmtimer:ffffffc1319a7800 type:REALTIME expires:1325376840000000000 now:1325376810370384591 Binder:3292_9-3953 [000] d..2 150.212991: alarmtimer_cancel: alarmtimer:ffffffc1319a5a00 type:BOOTTIME expires:179552000000 now:150154008122 Binder:3292_9-3953 [000] d..2 150.213006: alarmtimer_start: alarmtimer:ffffffc1319a5a00 type:BOOTTIME expires:179551000000 now:150154025622 system_server-3000 [002] ...1 162.701940: alarmtimer_suspend: alarmtimer type:REALTIME expires:1325376840000000000 The wakeup time which is selected at suspend time allows to map it back to the task arming the timer: Binder:3292_2. [ tglx: Store alarm timer expiry time instead of some useless RTC relative information, add proper type information for wakeups which are handled via the clock_nanosleep/freezer and massage the changelog. 
] Signed-off-by: Baolin Wang Signed-off-by: John Stultz Acked-by: Steven Rostedt Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1480372524-15181-5-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/alarmtimer.h | 5 ++ include/trace/events/alarmtimer.h | 96 +++++++++++++++++++++++++++++++++++++++ kernel/time/alarmtimer.c | 53 +++++++++++++++++---- 3 files changed, 144 insertions(+), 10 deletions(-) create mode 100644 include/trace/events/alarmtimer.h (limited to 'kernel') diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 9d8031257a90..c70aac13244a 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -10,7 +10,12 @@ enum alarmtimer_type { ALARM_REALTIME, ALARM_BOOTTIME, + /* Supported types end here */ ALARM_NUMTYPE, + + /* Used for tracing information. No usable types. */ + ALARM_REALTIME_FREEZER, + ALARM_BOOTTIME_FREEZER, }; enum alarmtimer_restart { diff --git a/include/trace/events/alarmtimer.h b/include/trace/events/alarmtimer.h new file mode 100644 index 000000000000..a1c108c16c9c --- /dev/null +++ b/include/trace/events/alarmtimer.h @@ -0,0 +1,96 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM alarmtimer + +#if !defined(_TRACE_ALARMTIMER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ALARMTIMER_H + +#include +#include +#include + +TRACE_DEFINE_ENUM(ALARM_REALTIME); +TRACE_DEFINE_ENUM(ALARM_BOOTTIME); +TRACE_DEFINE_ENUM(ALARM_REALTIME_FREEZER); +TRACE_DEFINE_ENUM(ALARM_BOOTTIME_FREEZER); + +#define show_alarm_type(type) __print_flags(type, " | ", \ + { 1 << ALARM_REALTIME, "REALTIME" }, \ + { 1 << ALARM_BOOTTIME, "BOOTTIME" }, \ + { 1 << ALARM_REALTIME_FREEZER, "REALTIME Freezer" }, \ + { 1 << ALARM_BOOTTIME_FREEZER, "BOOTTIME Freezer" }) + +TRACE_EVENT(alarmtimer_suspend, + + TP_PROTO(ktime_t expires, int flag), + + TP_ARGS(expires, flag), + + TP_STRUCT__entry( + __field(s64, expires) + __field(unsigned char, alarm_type) + ), + + TP_fast_assign( + __entry->expires = expires.tv64; + __entry->alarm_type = flag; + ), + + TP_printk("alarmtimer type:%s expires:%llu", + show_alarm_type((1 << __entry->alarm_type)), + __entry->expires + ) +); + +DECLARE_EVENT_CLASS(alarm_class, + + TP_PROTO(struct alarm *alarm, ktime_t now), + + TP_ARGS(alarm, now), + + TP_STRUCT__entry( + __field(void *, alarm) + __field(unsigned char, alarm_type) + __field(s64, expires) + __field(s64, now) + ), + + TP_fast_assign( + __entry->alarm = alarm; + __entry->alarm_type = alarm->type; + __entry->expires = alarm->node.expires.tv64; + __entry->now = now.tv64; + ), + + TP_printk("alarmtimer:%p type:%s expires:%llu now:%llu", + __entry->alarm, + show_alarm_type((1 << __entry->alarm_type)), + __entry->expires, + __entry->now + ) +); + +DEFINE_EVENT(alarm_class, alarmtimer_fired, + + TP_PROTO(struct alarm *alarm, ktime_t now), + + TP_ARGS(alarm, now) +); + +DEFINE_EVENT(alarm_class, alarmtimer_start, + + TP_PROTO(struct alarm *alarm, ktime_t now), + + TP_ARGS(alarm, now) +); + +DEFINE_EVENT(alarm_class, alarmtimer_cancel, + + TP_PROTO(struct alarm *alarm, ktime_t now), + + TP_ARGS(alarm, now) +); + +#endif /* _TRACE_ALARMTIMER_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index a15caa3d1721..9b08ca391aed 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /** * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to 
the base @@ -40,7 +43,9 @@ static struct alarm_base { clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; -/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +/* freezer information to handle clock_nanosleep triggered wakeups */ +static enum alarmtimer_type freezer_alarmtype; +static ktime_t freezer_expires; static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); @@ -194,6 +199,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); + trace_alarmtimer_fired(alarm, base->gettime()); return ret; } @@ -218,15 +224,16 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); */ static int alarmtimer_suspend(struct device *dev) { - struct rtc_time tm; - ktime_t min, now; - unsigned long flags; + ktime_t min, now, expires; + int i, ret, type; struct rtc_device *rtc; - int i; - int ret; + unsigned long flags; + struct rtc_time tm; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; freezer_delta = ktime_set(0, 0); spin_unlock_irqrestore(&freezer_delta_lock, flags); @@ -247,8 +254,11 @@ static int alarmtimer_suspend(struct device *dev) if (!next) continue; delta = ktime_sub(next->expires, base->gettime()); - if (!min.tv64 || (delta.tv64 < min.tv64)) + if (!min.tv64 || (delta.tv64 < min.tv64)) { + expires = next->expires; min = delta; + type = i; + } } if (min.tv64 == 0) return 0; @@ -258,6 +268,8 @@ static int alarmtimer_suspend(struct device *dev) return -EBUSY; } + trace_alarmtimer_suspend(expires, type); + /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); rtc_read_time(rtc, &tm); @@ -295,15 +307,32 @@ static int alarmtimer_resume(struct device *dev) static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { - ktime_t delta; + struct alarm_base *base; unsigned long flags; - struct alarm_base *base = &alarm_bases[type]; + ktime_t delta; + + switch(type) { + case ALARM_REALTIME: + base = &alarm_bases[ALARM_REALTIME]; + type = ALARM_REALTIME_FREEZER; + break; + case ALARM_BOOTTIME: + base = &alarm_bases[ALARM_BOOTTIME]; + type = ALARM_BOOTTIME_FREEZER; + break; + default: + WARN_ONCE(1, "Invalid alarm type: %d\n", type); + return; + } delta = ktime_sub(absexp, base->gettime()); spin_lock_irqsave(&freezer_delta_lock, flags); - if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) + if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) { freezer_delta = delta; + freezer_expires = absexp; + freezer_alarmtype = type; + } spin_unlock_irqrestore(&freezer_delta_lock, flags); } @@ -342,6 +371,8 @@ void alarm_start(struct alarm *alarm, ktime_t start) alarmtimer_enqueue(base, alarm); hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_start(alarm, base->gettime()); } EXPORT_SYMBOL_GPL(alarm_start); @@ -390,6 +421,8 @@ int alarm_try_to_cancel(struct alarm *alarm) if (ret >= 0) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); + + trace_alarmtimer_cancel(alarm, base->gettime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); -- cgit v1.2.3 From 60602982720f3a77366ee3e493a6e3d15e7e84f5 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 29 Nov 2016 09:14:56 -0800 Subject: audit: remove useless synchronize_net() netlink kernel socket is protected by refcount, not RCU. Its rcv path is neither protected by RCU. So the synchronize_net() is just pointless. 
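As the diff below shows, teardown then reduces to releasing the refcounted socket. A condensed sketch (struct audit_net and its nlsk field as in kernel/audit.c):

	/* Condensed from the diff below: netlink_kernel_release() operates
	 * on the socket refcount, so no RCU grace period is needed before
	 * clearing the pointer.
	 */
	static void __net_exit audit_net_exit(struct net *net)
	{
		struct audit_net *aunet = net_generic(net, audit_net_id);
		struct sock *sock = aunet->nlsk;

		netlink_kernel_release(sock);
		aunet->nlsk = NULL;
	}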
Cc: Richard Guy Briggs Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- kernel/audit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 92c463d2d1c7..67b9fbd871be 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1172,9 +1172,8 @@ static void __net_exit audit_net_exit(struct net *net) audit_sock = NULL; } - RCU_INIT_POINTER(aunet->nlsk, NULL); - synchronize_net(); netlink_kernel_release(sock); + aunet->nlsk = NULL; } static struct pernet_operations audit_net_ops __net_initdata = { -- cgit v1.2.3 From b32614c03413f8a6025d8677c2b7c0ee976e63d4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sun, 27 Nov 2016 00:13:34 +0100 Subject: tracing/rb: Convert to hotplug state machine Install the callbacks via the state machine. The notifier in struct ring_buffer is replaced by the multi instance interface. Upon __ring_buffer_alloc() invocation, cpuhp_state_add_instance() will invoke the trace_rb_cpu_prepare() on each CPU. This callback may now fail. This means __ring_buffer_alloc() will fail and cleanup (like previously) and during a CPU up event this failure will not allow the CPU to come up. Signed-off-by: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161126231350.10321-7-bigeasy@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 1 + include/linux/ring_buffer.h | 6 ++ kernel/trace/ring_buffer.c | 135 +++++++++++++++----------------------------- kernel/trace/trace.c | 15 ++++- 4 files changed, 66 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e3771fb959c0..18bcfeb2463e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -62,6 +62,7 @@ enum cpuhp_state { CPUHP_TOPOLOGY_PREPARE, CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, + CPUHP_TRACE_RB_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_NOTF_ERR_INJ_PREPARE, CPUHP_MIPS_SOC_PREPARE, diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 4acc552e9279..b6d4568795a7 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -198,4 +198,10 @@ enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; +#ifdef CONFIG_RING_BUFFER +int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node); +#else +#define trace_rb_cpu_prepare NULL +#endif + #endif /* _LINUX_RING_BUFFER_H */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c143739b8d7..a7a055f167c7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -479,9 +479,7 @@ struct ring_buffer { struct ring_buffer_per_cpu **buffers; -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif + struct hlist_node node; u64 (*clock)(void); struct rb_irq_work irq_work; @@ -1274,11 +1272,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); -#endif - /** * __ring_buffer_alloc - allocate a new ring_buffer * @size: the size in bytes per cpu that is needed. 
@@ -1296,6 +1289,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, long nr_pages; int bsize; int cpu; + int ret; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1318,17 +1312,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (nr_pages < 2) nr_pages = 2; - /* - * In case of non-hotplug cpu, if the ring-buffer is allocated - * in early initcall, it will not be notified of secondary cpus. - * In that off case, we need to allocate for all possible cpus. - */ -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - cpumask_copy(buffer->cpumask, cpu_online_mask); -#else - cpumask_copy(buffer->cpumask, cpu_possible_mask); -#endif buffer->cpus = nr_cpu_ids; bsize = sizeof(void *) * nr_cpu_ids; @@ -1337,19 +1320,15 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; - for_each_buffer_cpu(buffer, cpu) { - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) - goto fail_free_buffers; - } + cpu = raw_smp_processor_id(); + cpumask_set_cpu(cpu, buffer->cpumask); + buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; -#ifdef CONFIG_HOTPLUG_CPU - buffer->cpu_notify.notifier_call = rb_cpu_notify; - buffer->cpu_notify.priority = 0; - __register_cpu_notifier(&buffer->cpu_notify); - cpu_notifier_register_done(); -#endif + ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); + if (ret < 0) + goto fail_free_buffers; mutex_init(&buffer->mutex); @@ -1364,9 +1343,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif fail_free_buffer: kfree(buffer); @@ -1383,18 +1359,11 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_begin(); - __unregister_cpu_notifier(&buffer->cpu_notify); -#endif + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); -#ifdef CONFIG_HOTPLUG_CPU - cpu_notifier_register_done(); -#endif - kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); @@ -4633,62 +4602,48 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_HOTPLUG_CPU -static int rb_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +/* + * We only allocate new buffers, never free them if the CPU goes down. + * If we were to free the buffer, then the user would lose any trace that was in + * the buffer. 
+ */ +int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) { - struct ring_buffer *buffer = - container_of(self, struct ring_buffer, cpu_notify); - long cpu = (long)hcpu; + struct ring_buffer *buffer; long nr_pages_same; int cpu_i; unsigned long nr_pages; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (cpumask_test_cpu(cpu, buffer->cpumask)) - return NOTIFY_OK; - - nr_pages = 0; - nr_pages_same = 1; - /* check if all cpu sizes are same */ - for_each_buffer_cpu(buffer, cpu_i) { - /* fill in the size from first enabled cpu */ - if (nr_pages == 0) - nr_pages = buffer->buffers[cpu_i]->nr_pages; - if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { - nr_pages_same = 0; - break; - } - } - /* allocate minimum pages, user can later expand it */ - if (!nr_pages_same) - nr_pages = 2; - buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, nr_pages, cpu); - if (!buffer->buffers[cpu]) { - WARN(1, "failed to allocate ring buffer on CPU %ld\n", - cpu); - return NOTIFY_OK; + buffer = container_of(node, struct ring_buffer, node); + if (cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; } - smp_wmb(); - cpumask_set_cpu(cpu, buffer->cpumask); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* - * Do nothing. - * If we were to free the buffer, then the user would - * lose any trace that was in the buffer. - */ - break; - default: - break; } - return NOTIFY_OK; + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %u\n", + cpu); + return -ENOMEM; + } + smp_wmb(); + cpumask_set_cpu(cpu, buffer->cpumask); + return 0; } -#endif #ifdef CONFIG_RING_BUFFER_STARTUP_TEST /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8696ce6bf2f6..465d56febc5b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7659,10 +7659,21 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); + /* + * The prepare callbacks allocates some memory for the ring buffer. We + * don't free the buffer if the if the CPU goes down. If we were to free + * the buffer, then the user would lose any trace that was in the + * buffer. The memory will be removed once the "instance" is removed. 
+ */ + ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, + "trace/RB:preapre", trace_rb_cpu_prepare, + NULL); + if (ret < 0) + goto out_free_cpumask; /* Used for event triggers */ temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) - goto out_free_cpumask; + goto out_rm_hp_state; if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; @@ -7723,6 +7734,8 @@ out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: ring_buffer_free(temp_buffer); +out_rm_hp_state: + cpuhp_remove_multi_state(CPUHP_TRACE_RB_PREPARE); out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: -- cgit v1.2.3 From dbb26055defd03d59f678cb5f2c992abe05b064a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:41 +0000 Subject: locking/rtmutex: Prevent dequeue vs. unlock race David reported a futex/rtmutex state corruption. It's caused by the following problem: CPU0 CPU1 CPU2 l->owner=T1 rt_mutex_lock(l) lock(l->wait_lock) l->owner = T1 | HAS_WAITERS; enqueue(T2) boost() unlock(l->wait_lock) schedule() rt_mutex_lock(l) lock(l->wait_lock) l->owner = T1 | HAS_WAITERS; enqueue(T3) boost() unlock(l->wait_lock) schedule() signal(->T2) signal(->T3) lock(l->wait_lock) dequeue(T2) deboost() unlock(l->wait_lock) lock(l->wait_lock) dequeue(T3) ===> wait list is now empty deboost() unlock(l->wait_lock) lock(l->wait_lock) fixup_rt_mutex_waiters() if (wait_list_empty(l)) { owner = l->owner & ~HAS_WAITERS; l->owner = owner ==> l->owner = T1 } lock(l->wait_lock) rt_mutex_unlock(l) fixup_rt_mutex_waiters() if (wait_list_empty(l)) { owner = l->owner & ~HAS_WAITERS; cmpxchg(l->owner, T1, NULL) ===> Success (l->owner = NULL) l->owner = owner ==> l->owner = T1 } That means the problem is caused by fixup_rt_mutex_waiters() which does the RMW to clear the waiters bit unconditionally when there are no waiters in the rtmutexes rbtree. This can be fatal: A concurrent unlock can release the rtmutex in the fastpath because the waiters bit is not set. If the cmpxchg() gets in the middle of the RMW operation then the previous owner, which just unlocked the rtmutex is set as the owner again when the write takes place after the successfull cmpxchg(). The solution is rather trivial: verify that the owner member of the rtmutex has the waiters bit set before clearing it. This does not require a cmpxchg() or other atomic operations because the waiters bit can only be set and cleared with the rtmutex wait_lock held. It's also safe against the fast path unlock attempt. The unlock attempt via cmpxchg() will either see the bit set and take the slowpath or see the bit cleared and release it atomically in the fastpath. It's remarkable that the test program provided by David triggers on ARM64 and MIPS64 really quick, but it refuses to reproduce on x86-64, while the problem exists there as well. That refusal might explain that this got not discovered earlier despite the bug existing from day one of the rtmutex implementation more than 10 years ago. Thanks to David for meticulously instrumenting the code and providing the information which allowed to decode this subtle problem. 
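Condensed, the repaired fixup_rt_mutex_waiters() (full version with commentary in the diff below) reads the owner word once and clears the waiters bit only when it is still set, so it can no longer overwrite a concurrent fastpath cmpxchg() unlock:

	/* Core of the fix: both tests and the RMW are serialized by
	 * lock->wait_lock, and a fastpath unlock's cmpxchg() simply
	 * fails while the waiters bit is still set.
	 */
	static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
	{
		unsigned long owner, *p = (unsigned long *) &lock->owner;

		if (rt_mutex_has_waiters(lock))
			return;

		owner = READ_ONCE(*p);
		if (owner & RT_MUTEX_HAS_WAITERS)
			WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
	}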
Reported-by: David Daney Tested-by: David Daney Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Will Deacon Cc: stable@vger.kernel.org Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core") Link: http://lkml.kernel.org/r/20161130210030.351136722@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1ec0f48962b3..2c49d76f96c3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) static void fixup_rt_mutex_waiters(struct rt_mutex *lock) { - if (!rt_mutex_has_waiters(lock)) - clear_rt_mutex_waiters(lock); + unsigned long owner, *p = (unsigned long *) &lock->owner; + + if (rt_mutex_has_waiters(lock)) + return; + + /* + * The rbtree has no waiters enqueued, now make sure that the + * lock->owner still has the waiters bit set, otherwise the + * following can happen: + * + * CPU 0 CPU 1 CPU2 + * l->owner=T1 + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T2) + * boost() + * unlock(l->lock) + * block() + * + * rt_mutex_lock(l) + * lock(l->lock) + * l->owner = T1 | HAS_WAITERS; + * enqueue(T3) + * boost() + * unlock(l->lock) + * block() + * signal(->T2) signal(->T3) + * lock(l->lock) + * dequeue(T2) + * deboost() + * unlock(l->lock) + * lock(l->lock) + * dequeue(T3) + * ==> wait list is empty + * deboost() + * unlock(l->lock) + * lock(l->lock) + * fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * l->owner = owner + * owner = l->owner & ~HAS_WAITERS; + * ==> l->owner = T1 + * } + * lock(l->lock) + * rt_mutex_unlock(l) fixup_rt_mutex_waiters() + * if (wait_list_empty(l) { + * owner = l->owner & ~HAS_WAITERS; + * cmpxchg(l->owner, T1, NULL) + * ===> Success (l->owner = NULL) + * + * l->owner = owner + * ==> l->owner = T1 + * } + * + * With the check for the waiter bit in place T3 on CPU2 will not + * overwrite. All tasks fiddling with the waiters bit are + * serialized by l->lock, so nothing else can modify the waiters + * bit. If the bit is set then nothing can change l->owner either + * so the simple RMW is safe. The cmpxchg() will simply fail if it + * happens in the middle of the RMW because the waiters bit is + * still set. + */ + owner = READ_ONCE(*p); + if (owner & RT_MUTEX_HAS_WAITERS) + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); } /* -- cgit v1.2.3 From 1be5d4fa0af34fb7bafa205aeb59f5c7cc7a089d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:42 +0000 Subject: locking/rtmutex: Use READ_ONCE() in rt_mutex_owner() While debugging the rtmutex unlock vs. dequeue race Will suggested to use READ_ONCE() in rt_mutex_owner() as it might race against the cmpxchg_release() in unlock_rt_mutex_safe(). Will: "It's a minor thing which will most likely not matter in practice" Careful search did not unearth an actual problem in todays code, but it's better to be safe than surprised. 
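The hardened accessor (shown in the diff below) then reads lock->owner exactly once before masking:

	static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
	{
		unsigned long owner = (unsigned long) READ_ONCE(lock->owner);

		return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL);
	}

READ_ONCE() keeps the compiler from refetching lock->owner, so the mask is guaranteed to be applied to the same snapshot that was loaded.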
Suggested-by: Will Deacon Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: David Daney Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Link: http://lkml.kernel.org/r/20161130210030.431379999@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex_common.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 4f5f83c7d2d3..e317e1cbb3eb 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -75,8 +75,9 @@ task_top_pi_waiter(struct task_struct *p) static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL); } /* -- cgit v1.2.3 From b5016e8203003c44264ec88fe2276ff54a51f689 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:44 +0000 Subject: locking/rtmutex: Get rid of RT_MUTEX_OWNER_MASKALL This is a left over from the original rtmutex implementation which used both bit0 and bit1 in the owner pointer. Commit: 8161239a8bcc ("rtmutex: Simplify PI algorithm and make highest prio task get lock") ... removed the usage of bit1, but kept the extra mask around. This is confusing at best. Remove it and just use RT_MUTEX_HAS_WAITERS for the masking. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: David Daney Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Will Deacon Link: http://lkml.kernel.org/r/20161130210030.509567906@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex_common.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index e317e1cbb3eb..990134617b4c 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -71,13 +71,12 @@ task_top_pi_waiter(struct task_struct *p) * lock->owner state tracking: */ #define RT_MUTEX_HAS_WAITERS 1UL -#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL); + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } /* -- cgit v1.2.3 From 84d82ec5b9046ecdf16031d3e93a66ef50257402 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:45 +0000 Subject: locking/rtmutex: Explain locking rules for rt_mutex_proxy_unlock()/init_proxy_locked() While debugging the unlock vs. dequeue race which resulted in state corruption of futexes the lockless nature of rt_mutex_proxy_unlock() caused some confusion. Add commentry to explain why it is safe to do this lockless. Add matching comments to rt_mutex_init_proxy_locked() for completeness sake. 
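For illustration only (hypothetical call sites, not part of the patch), the PI-futex lifecycle these comments document looks roughly like this:

	/* Hypothetical sketch: the pi_state embedding the rtmutex is
	 * either not yet visible to other tasks (init) or about to be
	 * freed (unlock), so no serialization is needed around these
	 * calls.
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, owner);	/* pi_state still private */
	...
	rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);	/* pi_state going away */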
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: David Daney Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Will Deacon Link: http://lkml.kernel.org/r/20161130210030.591941927@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6e6cab7ac12f..2f443ed2320a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1619,11 +1619,15 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a * proxy owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @proxy_owner:the task to set as owner * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This initializes the rtmutex and + * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not + * possible at this point because the pi_state which contains the rtmutex + * is not yet visible to other tasks. */ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) @@ -1637,10 +1641,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, /** * rt_mutex_proxy_unlock - release a lock on behalf of owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This merrily cleans up the rtmutex + * (debugging) state. Concurrent operations on this rt_mutex are not + * possible because it belongs to the pi_state which is about to be freed + * and it is not longer visible to other tasks. */ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) -- cgit v1.2.3 From 3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 30 Nov 2016 17:10:10 +0100 Subject: bpf: BPF for lightweight tunnel infrastructure Registers new BPF program types which correspond to the LWT hooks: - BPF_PROG_TYPE_LWT_IN => dst_input() - BPF_PROG_TYPE_LWT_OUT => dst_output() - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit() The separate program types are required to differentiate between the capabilities each LWT hook allows: * Programs attached to dst_input() or dst_output() are restricted and may only read the data of an skb. This prevent modification and possible invalidation of already validated packet headers on receive and the construction of illegal headers while the IP headers are still being assembled. * Programs attached to lwtunnel_xmit() are allowed to modify packet content as well as prepending an L2 header via a newly introduced helper bpf_skb_change_head(). This is safe as lwtunnel_xmit() is invoked after the IP header has been assembled completely. All BPF programs receive an skb with L3 headers attached and may return one of the following error codes: BPF_OK - Continue routing as per nexthop BPF_DROP - Drop skb and return EPERM BPF_REDIRECT - Redirect skb to device as per redirect() helper. (Only valid in lwtunnel_xmit() context) The return codes are binary compatible with their TC_ACT_ relatives to ease compatibility. Signed-off-by: Thomas Graf Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 32 +++- include/uapi/linux/lwtunnel.h | 23 +++ kernel/bpf/verifier.c | 14 +- net/Kconfig | 8 + net/core/Makefile | 1 + net/core/filter.c | 173 ++++++++++++++++++ net/core/lwt_bpf.c | 396 ++++++++++++++++++++++++++++++++++++++++++ net/core/lwtunnel.c | 2 + 9 files changed, 646 insertions(+), 5 deletions(-) create mode 100644 net/core/lwt_bpf.c (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7f246a281435..7ba644626553 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -438,7 +438,7 @@ struct xdp_buff { }; /* compute the linear packet data range [data, data_end) which - * will be accessed by cls_bpf and act_bpf programs + * will be accessed by cls_bpf, act_bpf and lwt programs */ static inline void bpf_compute_data_end(struct sk_buff *skb) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1370a9d1456f..22ac82792687 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,6 +101,9 @@ enum bpf_prog_type { BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, }; enum bpf_attach_type { @@ -409,6 +412,16 @@ union bpf_attr { * * int bpf_get_numa_node_id() * Return: Id of current NUMA node. + * + * int bpf_skb_change_head() + * Grows headroom of skb and adjusts MAC header offset accordingly. + * Will extends/reallocae as required automatically. + * May change skb data pointer and will thus invalidate any check + * performed for direct packet access. + * @skb: pointer to skb + * @len: length of header to be pushed in front + * @flags: Flags (unused for now) + * Return: 0 on success or negative error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -453,7 +466,8 @@ union bpf_attr { FN(skb_pull_data), \ FN(csum_update), \ FN(set_hash_invalid), \ - FN(get_numa_node_id), + FN(get_numa_node_id), \ + FN(skb_change_head), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -537,6 +551,22 @@ struct bpf_tunnel_key { __u32 tunnel_label; }; +/* Generic BPF return codes which all BPF program types may support. + * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled seprately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes */ +}; + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. 
Unknown return codes will result diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 453cc6215bfd..92724cba1eba 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -10,6 +10,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_ILA, LWTUNNEL_ENCAP_IP6, LWTUNNEL_ENCAP_SEG6, + LWTUNNEL_ENCAP_BPF, __LWTUNNEL_ENCAP_MAX, }; @@ -43,4 +44,26 @@ enum lwtunnel_ip6_t { #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) +enum { + LWT_BPF_PROG_UNSPEC, + LWT_BPF_PROG_FD, + LWT_BPF_PROG_NAME, + __LWT_BPF_PROG_MAX, +}; + +#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1) + +enum { + LWT_BPF_UNSPEC, + LWT_BPF_IN, + LWT_BPF_OUT, + LWT_BPF_XMIT, + LWT_BPF_XMIT_HEADROOM, + __LWT_BPF_MAX, +}; + +#define LWT_BPF_MAX (__LWT_BPF_MAX - 1) + +#define LWT_BPF_MAX_HEADROOM 256 + #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8740c5fa02fc..8135cb1077ee 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, #define MAX_PACKET_OFF 0xffff static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, - const struct bpf_call_arg_meta *meta) + const struct bpf_call_arg_meta *meta, + enum bpf_access_type t) { switch (env->prog->type) { + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + /* dst_input() and dst_output() can't write for now */ + if (t == BPF_WRITE) + return false; case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_XMIT: if (meta) return meta->pkt_access; @@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, err = check_stack_read(state, off, size, value_regno); } } else if (state->regs[regno].type == PTR_TO_PACKET) { - if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) { + if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; } @@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) { + if (type == PTR_TO_PACKET && + !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; } diff --git a/net/Kconfig b/net/Kconfig index 7b6cd340b72b..a1005007224c 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -402,6 +402,14 @@ config LWTUNNEL weight tunnel endpoint. Tunnel encapsulation parameters are stored with light weight tunnel state associated with fib routes. +config LWTUNNEL_BPF + bool "Execute BPF program as route nexthop action" + depends on LWTUNNEL + default y if LWTUNNEL=y + ---help--- + Allows to run BPF programs as a nexthop action following a route + lookup for incoming and outgoing packets. 
+ config DST_CACHE bool default n diff --git a/net/core/Makefile b/net/core/Makefile index d6508c2ddca5..f6761b6e3b29 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o +obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o diff --git a/net/core/filter.c b/net/core/filter.c index 698a262b8ebb..1c4d0faf22c8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1689,6 +1689,12 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { + /* Verify that a link layer header is carried */ + if (unlikely(skb->mac_header >= skb->network_header)) { + kfree_skb(skb); + return -ERANGE; + } + bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); @@ -2188,12 +2194,53 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + u32 max_len = __bpf_skb_max_len(skb); + u32 new_len = skb->len + head_room; + int ret; + + if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + new_len < skb->len)) + return -EINVAL; + + ret = skb_cow(skb, head_room); + if (likely(!ret)) { + /* Idea for this helper is that we currently only + * allow to expand on mac header. This means that + * skb->protocol network header, etc, stay as is. + * Compared to bpf_skb_change_tail(), we're more + * flexible due to not needing to linearize or + * reset GSO. Intention for this helper is to be + * used by an L3 skb that needs to push mac header + * for redirection into L2 device. 
+ */ + __skb_push(skb, head_room); + memset(skb->data, 0, head_room); + skb_reset_mac_header(skb); + } + + bpf_compute_data_end(skb); + return 0; +} + +static const struct bpf_func_proto bpf_skb_change_head_proto = { + .func = bpf_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || func == bpf_skb_store_bytes || func == bpf_skb_change_proto || + func == bpf_skb_change_head || func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || @@ -2639,6 +2686,68 @@ cg_skb_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +lwt_inout_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_get_route_realm: + return &bpf_get_route_realm_proto; + case BPF_FUNC_get_hash_recalc: + return &bpf_get_hash_recalc_proto; + case BPF_FUNC_perf_event_output: + return &bpf_skb_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_skb_under_cgroup: + return &bpf_skb_under_cgroup_proto; + default: + return sk_filter_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +lwt_xmit_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_redirect: + return &bpf_redirect_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; + case BPF_FUNC_l3_csum_replace: + return &bpf_l3_csum_replace_proto; + case BPF_FUNC_l4_csum_replace: + return &bpf_l4_csum_replace_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; + default: + return lwt_inout_func_proto(func_id); + } +} + static bool __is_valid_access(int off, int size, enum bpf_access_type type) { if (off < 0 || off >= sizeof(struct __sk_buff)) @@ -2676,6 +2785,39 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool lwt_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + return false; + } + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, priority): + case offsetof(struct __sk_buff, cb[0]) ... 
+ offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + + return __is_valid_access(off, size, type); +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -3007,6 +3149,19 @@ static const struct bpf_verifier_ops cg_skb_ops = { .convert_ctx_access = sk_filter_convert_ctx_access, }; +static const struct bpf_verifier_ops lwt_inout_ops = { + .get_func_proto = lwt_inout_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + +static const struct bpf_verifier_ops lwt_xmit_ops = { + .get_func_proto = lwt_xmit_func_proto, + .is_valid_access = lwt_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3032,6 +3187,21 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = { .type = BPF_PROG_TYPE_CGROUP_SKB, }; +static struct bpf_prog_type_list lwt_in_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_IN, +}; + +static struct bpf_prog_type_list lwt_out_type __read_mostly = { + .ops = &lwt_inout_ops, + .type = BPF_PROG_TYPE_LWT_OUT, +}; + +static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { + .ops = &lwt_xmit_ops, + .type = BPF_PROG_TYPE_LWT_XMIT, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); @@ -3039,6 +3209,9 @@ static int __init register_sk_filter_ops(void) bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&lwt_in_type); + bpf_register_prog_type(&lwt_out_type); + bpf_register_prog_type(&lwt_xmit_type); return 0; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c new file mode 100644 index 000000000000..71bb3e2eca08 --- /dev/null +++ b/net/core/lwt_bpf.c @@ -0,0 +1,396 @@ +/* Copyright (c) 2016 Thomas Graf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +struct bpf_lwt_prog { + struct bpf_prog *prog; + char *name; +}; + +struct bpf_lwt { + struct bpf_lwt_prog in; + struct bpf_lwt_prog out; + struct bpf_lwt_prog xmit; + int family; +}; + +#define MAX_PROG_NAME 256 + +static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct bpf_lwt *)lwt->data; +} + +#define NO_REDIRECT false +#define CAN_REDIRECT true + +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, + struct dst_entry *dst, bool can_redirect) +{ + int ret; + + /* Preempt disable is needed to protect per-cpu redirect_info between + * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and + * access to maps strictly require a rcu_read_lock() for protection, + * mixing with BH RCU lock doesn't work. 
+ */ + preempt_disable(); + rcu_read_lock(); + bpf_compute_data_end(skb); + ret = bpf_prog_run_save_cb(lwt->prog, skb); + rcu_read_unlock(); + + switch (ret) { + case BPF_OK: + break; + + case BPF_REDIRECT: + if (unlikely(!can_redirect)) { + pr_warn_once("Illegal redirect return code in prog %s\n", + lwt->name ? : ""); + ret = BPF_OK; + } else { + ret = skb_do_redirect(skb); + if (ret == 0) + ret = BPF_REDIRECT; + } + break; + + case BPF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + + default: + pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); + kfree_skb(skb); + ret = -EINVAL; + break; + } + + preempt_enable(); + + return ret; +} + +static int bpf_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->in.prog) { + ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_input)) { + pr_warn_once("orig_input not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_input(skb); +} + +static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + int ret; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->out.prog) { + ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); + if (ret < 0) + return ret; + } + + if (unlikely(!dst->lwtstate->orig_output)) { + pr_warn_once("orig_output not set on dst for prog %s\n", + bpf->out.name); + kfree_skb(skb); + return -EINVAL; + } + + return dst->lwtstate->orig_output(net, sk, skb); +} + +static int xmit_check_hhlen(struct sk_buff *skb) +{ + int hh_len = skb_dst(skb)->dev->hard_header_len; + + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) + return -ENOMEM; + } + + return 0; +} + +static int bpf_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct bpf_lwt *bpf; + + bpf = bpf_lwt_lwtunnel(dst->lwtstate); + if (bpf->xmit.prog) { + int ret; + + ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); + switch (ret) { + case BPF_OK: + /* If the header was expanded, headroom might be too + * small for L2 header to come, expand as needed. 
+ */ + ret = xmit_check_hhlen(skb); + if (unlikely(ret)) + return ret; + + return LWTUNNEL_XMIT_CONTINUE; + case BPF_REDIRECT: + return LWTUNNEL_XMIT_DONE; + default: + return ret; + } + } + + return LWTUNNEL_XMIT_CONTINUE; +} + +static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) +{ + if (prog->prog) + bpf_prog_put(prog->prog); + + kfree(prog->name); +} + +static void bpf_destroy_state(struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + bpf_lwt_prog_destroy(&bpf->in); + bpf_lwt_prog_destroy(&bpf->out); + bpf_lwt_prog_destroy(&bpf->xmit); +} + +static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { + [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, + [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, + .len = MAX_PROG_NAME }, +}; + +static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, + enum bpf_prog_type type) +{ + struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; + struct bpf_prog *p; + int ret; + u32 fd; + + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) + return -EINVAL; + + prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL); + if (!prog->name) + return -ENOMEM; + + fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); + p = bpf_prog_get_type(fd, type); + if (IS_ERR(p)) + return PTR_ERR(p); + + prog->prog = p; + + return 0; +} + +static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { + [LWT_BPF_IN] = { .type = NLA_NESTED, }, + [LWT_BPF_OUT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, + [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, +}; + +static int bpf_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct nlattr *tb[LWT_BPF_MAX + 1]; + struct lwtunnel_state *newts; + struct bpf_lwt *bpf; + int ret; + + if (family != AF_INET && family != AF_INET6) + return -EAFNOSUPPORT; + + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); + if (ret < 0) + return ret; + + if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*bpf)); + if (!newts) + return -ENOMEM; + + newts->type = LWTUNNEL_ENCAP_BPF; + bpf = bpf_lwt_lwtunnel(newts); + + if (tb[LWT_BPF_IN]) { + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, + BPF_PROG_TYPE_LWT_IN); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_OUT]) { + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, + BPF_PROG_TYPE_LWT_OUT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT]) { + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, + BPF_PROG_TYPE_LWT_XMIT); + if (ret < 0) + goto errout; + } + + if (tb[LWT_BPF_XMIT_HEADROOM]) { + u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); + + if (headroom > LWT_BPF_MAX_HEADROOM) { + ret = -ERANGE; + goto errout; + } + + newts->headroom = headroom; + } + + bpf->family = family; + *ts = newts; + + return 0; + +errout: + bpf_destroy_state(newts); + kfree(newts); + return ret; +} + +static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, + struct bpf_lwt_prog *prog) +{ + struct nlattr *nest; + + if (!prog->prog) + return 0; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (prog->name && + nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) + return -EMSGSIZE; + + return 
nla_nest_end(skb, nest); +} + +static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) +{ + struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); + + if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || + bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) + return -EMSGSIZE; + + return 0; +} + +static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + int nest_len = nla_total_size(sizeof(struct nlattr)) + + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ + 0; + + return nest_len + /* LWT_BPF_IN */ + nest_len + /* LWT_BPF_OUT */ + nest_len + /* LWT_BPF_XMIT */ + 0; +} + +int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +{ + /* FIXME: + * The LWT state is currently rebuilt for delete requests which + * results in a new bpf_prog instance. Comparing names for now. + */ + if (!a->name && !b->name) + return 0; + + if (!a->name || !b->name) + return 1; + + return strcmp(a->name, b->name); +} + +static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); + struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); + + return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || + bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || + bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); +} + +static const struct lwtunnel_encap_ops bpf_encap_ops = { + .build_state = bpf_build_state, + .destroy_state = bpf_destroy_state, + .input = bpf_input, + .output = bpf_output, + .xmit = bpf_xmit, + .fill_encap = bpf_fill_encap_info, + .get_encap_size = bpf_encap_nlsize, + .cmp_encap = bpf_encap_cmp, +}; + +static int __init bpf_lwt_init(void) +{ + return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); +} + +subsys_initcall(bpf_lwt_init) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 03976e939818..a5d4e866ce88 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "ILA"; case LWTUNNEL_ENCAP_SEG6: return "SEG6"; + case LWTUNNEL_ENCAP_BPF: + return "BPF"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: -- cgit v1.2.3 From b2cd12574aa3e1625f471ff57cde7f628a18a46b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 1 Dec 2016 08:48:03 -0800 Subject: bpf: Refactor cgroups code in prep for new type Code move and rename only; no functional change intended. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 46 +++++++++++++++++++++++----------------------- kernel/bpf/cgroup.c | 10 +++++----- kernel/bpf/syscall.c | 28 +++++++++++++++------------- 3 files changed, 43 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 0cf1adfadd2d..af2ca8b432c0 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -36,31 +36,31 @@ void cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, enum bpf_attach_type type); -int __cgroup_bpf_run_filter(struct sock *sk, - struct sk_buff *skb, - enum bpf_attach_type type); - -/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. 
*/ -#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter(sk, skb, \ - BPF_CGROUP_INET_INGRESS); \ - \ - __ret; \ +int __cgroup_bpf_run_filter_skb(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type); + +/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ +#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ + BPF_CGROUP_INET_INGRESS); \ + \ + __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) \ -({ \ - int __ret = 0; \ - if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ - typeof(sk) __sk = sk_to_full_sk(sk); \ - if (sk_fullsock(__sk)) \ - __ret = __cgroup_bpf_run_filter(__sk, skb, \ - BPF_CGROUP_INET_EGRESS); \ - } \ - __ret; \ +#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + typeof(sk) __sk = sk_to_full_sk(sk); \ + if (sk_fullsock(__sk)) \ + __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ + BPF_CGROUP_INET_EGRESS); \ + } \ + __ret; \ }) #else diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8c784f8c67cd..8fe55ffd109d 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -118,7 +118,7 @@ void __cgroup_bpf_update(struct cgroup *cgrp, } /** - * __cgroup_bpf_run_filter() - Run a program for packet filtering + * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received * @type: The type of program to be executed @@ -132,9 +132,9 @@ void __cgroup_bpf_update(struct cgroup *cgrp, * This function will return %-EPERM if an attached program was found * and it returned != 1 during execution. In all other cases, 0 is returned.
*/ -int __cgroup_bpf_run_filter(struct sock *sk, - struct sk_buff *skb, - enum bpf_attach_type type) +int __cgroup_bpf_run_filter_skb(struct sock *sk, + struct sk_buff *skb, + enum bpf_attach_type type) { struct bpf_prog *prog; struct cgroup *cgrp; @@ -164,4 +164,4 @@ int __cgroup_bpf_run_filter(struct sock *sk, return ret; } -EXPORT_SYMBOL(__cgroup_bpf_run_filter); +EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4caa18e6860a..5518a6839ab1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -856,6 +856,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) { struct bpf_prog *prog; struct cgroup *cgrp; + enum bpf_prog_type ptype; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -866,25 +867,26 @@ static int bpf_prog_attach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_CGROUP_SKB); - if (IS_ERR(prog)) - return PTR_ERR(prog); - - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) { - bpf_prog_put(prog); - return PTR_ERR(cgrp); - } - - cgroup_bpf_update(cgrp, prog, attr->attach_type); - cgroup_put(cgrp); + ptype = BPF_PROG_TYPE_CGROUP_SKB; break; default: return -EINVAL; } + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) { + bpf_prog_put(prog); + return PTR_ERR(cgrp); + } + + cgroup_bpf_update(cgrp, prog, attr->attach_type); + cgroup_put(cgrp); + return 0; } -- cgit v1.2.3 From 61023658760032e97869b07d54be9681d2529e77 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 1 Dec 2016 08:48:04 -0800 Subject: bpf: Add new cgroup attach type to enable sock modifications Add a new cgroup-based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to BPF_PROG_TYPE_CGROUP_SKB, programs can be attached to a cgroup and run any time a process in the cgroup opens an AF_INET or AF_INET6 socket. Currently only sk_bound_dev_if is exported to userspace for modification by a bpf program. This allows a cgroup to be configured such that AF_INET{6} sockets opened by processes are automatically bound to a specific device. In turn, this enables the running of programs that do not support SO_BINDTODEVICE in a specific VRF context / L3 domain. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf-cgroup.h | 14 +++++++++++ include/uapi/linux/bpf.h | 6 +++++ kernel/bpf/cgroup.c | 33 ++++++++++++++++++++++++ kernel/bpf/syscall.c | 5 +++- net/core/filter.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 12 ++++++++- net/ipv6/af_inet6.c | 8 ++++++ 7 files changed, 138 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index af2ca8b432c0..7b6e5d168c95 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -40,6 +40,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled.
*/ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -63,6 +66,16 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled && sk) { \ + __ret = __cgroup_bpf_run_filter_sk(sk, \ + BPF_CGROUP_INET_SOCK_CREATE); \ + } \ + __ret; \ +}) + #else struct cgroup_bpf {}; @@ -72,6 +85,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #endif /* CONFIG_CGROUP_BPF */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 22ac82792687..bfe5e31a1288 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -101,6 +101,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_XDP, BPF_PROG_TYPE_PERF_EVENT, BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, BPF_PROG_TYPE_LWT_IN, BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, @@ -109,6 +110,7 @@ enum bpf_attach_type { BPF_CGROUP_INET_INGRESS, BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, __MAX_BPF_ATTACH_TYPE }; @@ -567,6 +569,10 @@ enum bpf_ret_code { /* >127 are reserved for prog type specific return codes */ }; +struct bpf_sock { + __u32 bound_dev_if; +}; + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. Unknown return codes will result diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8fe55ffd109d..a515f7b007c6 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -165,3 +165,36 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); + +/** + * __cgroup_bpf_run_filter_sk() - Run a program on a sock + * @sk: sock structure to manipulate + * @type: The type of program to be executed + * + * The socket passed is expected to be of type INET or INET6. + * + * The program type passed in via @type must be suitable for sock + * filtering. No further check is performed to assert that. + * + * This function will return %-EPERM if an attached program was found + * and it returned != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sk(struct sock *sk, + enum bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog *prog; + int ret = 0; + + + rcu_read_lock(); + + prog = rcu_dereference(cgrp->bpf.effective[type]); + if (prog) + ret = BPF_PROG_RUN(prog, sk) == 1 ?
0 : -EPERM; + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5518a6839ab1..85af86c496cd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -869,7 +869,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_EGRESS: ptype = BPF_PROG_TYPE_CGROUP_SKB; break; - + case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; default: return -EINVAL; } @@ -905,6 +907,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) return PTR_ERR(cgrp); diff --git a/net/core/filter.c b/net/core/filter.c index 1c4d0faf22c8..0ab252e462aa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2818,6 +2818,32 @@ static bool lwt_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + enum bpf_reg_type *reg_type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + break; + default: + return false; + } + } + + if (off < 0 || off + size > sizeof(struct bpf_sock)) + return false; + + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + + if (size != sizeof(__u32)) + return false; + + return true; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -3076,6 +3102,30 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, + int dst_reg, int src_reg, + int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct bpf_sock, bound_dev_if): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sock, sk_bound_dev_if)); + break; + } + + return insn - insn_buf; +} + static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -3162,6 +3212,12 @@ static const struct bpf_verifier_ops lwt_xmit_ops = { .gen_prologue = tc_cls_act_prologue, }; +static const struct bpf_verifier_ops cg_sock_ops = { + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, + .convert_ctx_access = sock_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, @@ -3202,6 +3258,11 @@ static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { .type = BPF_PROG_TYPE_LWT_XMIT, }; +static struct bpf_prog_type_list cg_sock_type __read_mostly = { + .ops = &cg_sock_ops, + .type = BPF_PROG_TYPE_CGROUP_SOCK +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); @@ -3209,6 +3270,7 @@ static int __init register_sk_filter_ops(void) bpf_register_prog_type(&sched_act_type); bpf_register_prog_type(&xdp_type); bpf_register_prog_type(&cg_skb_type); + bpf_register_prog_type(&cg_sock_type); 
bpf_register_prog_type(&lwt_in_type); bpf_register_prog_type(&lwt_out_type); bpf_register_prog_type(&lwt_xmit_type); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5ddf5cda07f4..24d2550492ee 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -374,8 +374,18 @@ lookup_protocol: if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) + if (err) { + sk_common_release(sk); + goto out; + } + } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { sk_common_release(sk); + goto out; + } } out: return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d424f3a3737a..237e654ba717 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -258,6 +258,14 @@ lookup_protocol: goto out; } } + + if (!kern) { + err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); + if (err) { + sk_common_release(sk); + goto out; + } + } out: return err; out_rcu_unlock: -- cgit v1.2.3 From 450630975da9e7dffe540753e169dc4da5fe7c29 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Dec 2016 18:24:56 -0500 Subject: don't open-code file_inode() Signed-off-by: Al Viro --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 ++-- drivers/staging/greybus/camera.c | 4 ++-- drivers/staging/greybus/es2.c | 6 +++--- drivers/staging/greybus/svc.c | 6 +++--- drivers/staging/greybus/timesync.c | 2 +- drivers/target/target_core_configfs.c | 2 +- fs/aio.c | 6 +++--- fs/autofs4/inode.c | 2 +- fs/fcntl.c | 2 +- fs/orangefs/file.c | 2 +- fs/orangefs/orangefs-debugfs.c | 6 ++++-- fs/overlayfs/copy_up.c | 2 +- kernel/audit_watch.c | 4 ++-- kernel/events/core.c | 2 +- kernel/locking/qspinlock_stat.h | 12 ++---------- security/smack/smack_lsm.c | 2 +- 18 files changed, 40 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3161d77bf299..3c2858972217 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2501,7 +2501,7 @@ static void amdgpu_debugfs_remove_files(struct amdgpu_device *adev) static ssize_t amdgpu_debugfs_regs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; bool pm_pg_lock, use_bank; @@ -2570,7 +2570,7 @@ end: static ssize_t amdgpu_debugfs_regs_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2601,7 +2601,7 @@ static ssize_t amdgpu_debugfs_regs_write(struct file *f, const char __user *buf, static ssize_t amdgpu_debugfs_regs_pcie_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2628,7 +2628,7 @@ static ssize_t amdgpu_debugfs_regs_pcie_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_regs_pcie_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2656,7 +2656,7 @@ static ssize_t amdgpu_debugfs_regs_pcie_write(struct file *f, const char __user static ssize_t 
amdgpu_debugfs_regs_didt_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2683,7 +2683,7 @@ static ssize_t amdgpu_debugfs_regs_didt_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_regs_didt_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2711,7 +2711,7 @@ static ssize_t amdgpu_debugfs_regs_didt_write(struct file *f, const char __user static ssize_t amdgpu_debugfs_regs_smc_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2738,7 +2738,7 @@ static ssize_t amdgpu_debugfs_regs_smc_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_regs_smc_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -2766,7 +2766,7 @@ static ssize_t amdgpu_debugfs_regs_smc_write(struct file *f, const char __user * static ssize_t amdgpu_debugfs_gca_config_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; uint32_t *config, no_regs = 0; @@ -2836,7 +2836,7 @@ static ssize_t amdgpu_debugfs_gca_config_read(struct file *f, char __user *buf, static ssize_t amdgpu_debugfs_sensor_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; int idx, r; int32_t value; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 3cb5e903cd62..2636f619569e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -283,7 +283,7 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring) static ssize_t amdgpu_debugfs_ring_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_ring *ring = (struct amdgpu_ring*)f->f_inode->i_private; + struct amdgpu_ring *ring = file_inode(f)->i_private; int r, i; uint32_t value, result, early[3]; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index dcaf691f56b5..a5d00ef2b421 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1412,7 +1412,7 @@ static const struct drm_info_list amdgpu_ttm_debugfs_list[] = { static ssize_t amdgpu_ttm_vram_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; @@ -1456,7 +1456,7 @@ static const struct file_operations amdgpu_ttm_vram_fops = { static ssize_t amdgpu_ttm_gtt_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { - struct amdgpu_device *adev = f->f_inode->i_private; + struct amdgpu_device *adev = file_inode(f)->i_private; ssize_t result = 0; int r; diff --git a/drivers/staging/greybus/camera.c b/drivers/staging/greybus/camera.c index 
491bdd720c0c..cebb76e7d55c 100644 --- a/drivers/staging/greybus/camera.c +++ b/drivers/staging/greybus/camera.c @@ -1091,7 +1091,7 @@ static ssize_t gb_camera_debugfs_read(struct file *file, char __user *buf, size_t len, loff_t *offset) { const struct gb_camera_debugfs_entry *op = file->private_data; - struct gb_camera *gcam = file->f_inode->i_private; + struct gb_camera *gcam = file_inode(file)->i_private; struct gb_camera_debugfs_buffer *buffer; ssize_t ret; @@ -1113,7 +1113,7 @@ static ssize_t gb_camera_debugfs_write(struct file *file, loff_t *offset) { const struct gb_camera_debugfs_entry *op = file->private_data; - struct gb_camera *gcam = file->f_inode->i_private; + struct gb_camera *gcam = file_inode(file)->i_private; ssize_t ret; char *kbuf; diff --git a/drivers/staging/greybus/es2.c b/drivers/staging/greybus/es2.c index baab460eeaa3..d367cdffaa10 100644 --- a/drivers/staging/greybus/es2.c +++ b/drivers/staging/greybus/es2.c @@ -1250,7 +1250,7 @@ static int apb_log_poll(void *data) static ssize_t apb_log_read(struct file *f, char __user *buf, size_t count, loff_t *ppos) { - struct es2_ap_dev *es2 = f->f_inode->i_private; + struct es2_ap_dev *es2 = file_inode(f)->i_private; ssize_t ret; size_t copied; char *tmp_buf; @@ -1304,7 +1304,7 @@ static void usb_log_disable(struct es2_ap_dev *es2) static ssize_t apb_log_enable_read(struct file *f, char __user *buf, size_t count, loff_t *ppos) { - struct es2_ap_dev *es2 = f->f_inode->i_private; + struct es2_ap_dev *es2 = file_inode(f)->i_private; int enable = !IS_ERR_OR_NULL(es2->apb_log_task); char tmp_buf[3]; @@ -1317,7 +1317,7 @@ static ssize_t apb_log_enable_write(struct file *f, const char __user *buf, { int enable; ssize_t retval; - struct es2_ap_dev *es2 = f->f_inode->i_private; + struct es2_ap_dev *es2 = file_inode(f)->i_private; retval = kstrtoint_from_user(buf, count, 10, &enable); if (retval) diff --git a/drivers/staging/greybus/svc.c b/drivers/staging/greybus/svc.c index 550055ec27a5..8779270cadc1 100644 --- a/drivers/staging/greybus/svc.c +++ b/drivers/staging/greybus/svc.c @@ -757,7 +757,7 @@ static int gb_svc_version_request(struct gb_operation *op) static ssize_t pwr_debugfs_voltage_read(struct file *file, char __user *buf, size_t len, loff_t *offset) { - struct svc_debugfs_pwrmon_rail *pwrmon_rails = file->f_inode->i_private; + struct svc_debugfs_pwrmon_rail *pwrmon_rails = file_inode(file)->i_private; struct gb_svc *svc = pwrmon_rails->svc; int ret, desc; u32 value; @@ -780,7 +780,7 @@ static ssize_t pwr_debugfs_voltage_read(struct file *file, char __user *buf, static ssize_t pwr_debugfs_current_read(struct file *file, char __user *buf, size_t len, loff_t *offset) { - struct svc_debugfs_pwrmon_rail *pwrmon_rails = file->f_inode->i_private; + struct svc_debugfs_pwrmon_rail *pwrmon_rails = file_inode(file)->i_private; struct gb_svc *svc = pwrmon_rails->svc; int ret, desc; u32 value; @@ -803,7 +803,7 @@ static ssize_t pwr_debugfs_current_read(struct file *file, char __user *buf, static ssize_t pwr_debugfs_power_read(struct file *file, char __user *buf, size_t len, loff_t *offset) { - struct svc_debugfs_pwrmon_rail *pwrmon_rails = file->f_inode->i_private; + struct svc_debugfs_pwrmon_rail *pwrmon_rails = file_inode(file)->i_private; struct gb_svc *svc = pwrmon_rails->svc; int ret, desc; u32 value; diff --git a/drivers/staging/greybus/timesync.c b/drivers/staging/greybus/timesync.c index 2e68af7dea6d..09ce00484b3f 100644 --- a/drivers/staging/greybus/timesync.c +++ b/drivers/staging/greybus/timesync.c @@ -921,7 +921,7 @@ 
EXPORT_SYMBOL_GPL(gb_timesync_schedule_asynchronous); static ssize_t gb_timesync_ping_read(struct file *file, char __user *ubuf, size_t len, loff_t *offset, bool ktime) { - struct gb_timesync_svc *timesync_svc = file->f_inode->i_private; + struct gb_timesync_svc *timesync_svc = file_inode(file)->i_private; char *buf; ssize_t ret = 0; diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 2001005bef45..a35a347ec357 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -143,7 +143,7 @@ static ssize_t target_core_item_dbroot_store(struct config_item *item, pr_err("db_root: cannot open: %s\n", db_root_stage); return -EINVAL; } - if (!S_ISDIR(fp->f_inode->i_mode)) { + if (!S_ISDIR(file_inode(fp)->i_mode)) { filp_close(fp, 0); mutex_unlock(&g_tf_lock); pr_err("db_root: not a directory: %s\n", db_root_stage); diff --git a/fs/aio.c b/fs/aio.c index 428484f2f841..8edf253484af 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -277,10 +277,10 @@ static void put_aio_ring_file(struct kioctx *ctx) struct address_space *i_mapping; if (aio_ring_file) { - truncate_setsize(aio_ring_file->f_inode, 0); + truncate_setsize(file_inode(aio_ring_file), 0); /* Prevent further access to the kioctx from migratepages */ - i_mapping = aio_ring_file->f_inode->i_mapping; + i_mapping = aio_ring_file->f_mapping; spin_lock(&i_mapping->private_lock); i_mapping->private_data = NULL; ctx->aio_ring_file = NULL; @@ -483,7 +483,7 @@ static int aio_setup_ring(struct kioctx *ctx) for (i = 0; i < nr_pages; i++) { struct page *page; - page = find_or_create_page(file->f_inode->i_mapping, + page = find_or_create_page(file->f_mapping, i, GFP_HIGHUSER | __GFP_ZERO); if (!page) break; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 438b5bf675b6..09e7d68dff02 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -94,7 +94,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",indirect"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) - seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino); + seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); else seq_printf(m, ",pipe_ino=-1"); #endif diff --git a/fs/fcntl.c b/fs/fcntl.c index 350a2c8cfd28..6e2771c210f6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -52,7 +52,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ - if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) { + if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) { if (!filp->f_mapping || !filp->f_mapping->a_ops || !filp->f_mapping->a_ops->direct_IO) return -EINVAL; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 02cc6139ec90..e6bbc8083d77 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -724,7 +724,7 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) { int rc = -EINVAL; - if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { + if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { if (cmd == F_GETLK) { rc = 0; posix_test_lock(filp, fl); diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 38887cc5577f..f1e824979aad 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -434,6 +434,7 @@ static ssize_t orangefs_debug_write(struct file *file, char *debug_string; struct orangefs_kernel_op_s *new_op = NULL; struct client_debug_mask c_mask = { NULL, 0, 0 }; 
+ char *s; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_write: %pD\n", @@ -521,8 +522,9 @@ static ssize_t orangefs_debug_write(struct file *file, } mutex_lock(&orangefs_debug_lock); - memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); - sprintf((char *)file->f_inode->i_private, "%s\n", debug_string); + s = file_inode(file)->i_private; + memset(s, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); + sprintf(s, "%s\n", debug_string); mutex_unlock(&orangefs_debug_lock); *ppos += count; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 36795eed40b0..2838bddb1f91 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -33,7 +33,7 @@ static int ovl_check_fd(const void *data, struct file *f, unsigned int fd) { const struct dentry *dentry = data; - if (f->f_inode == d_inode(dentry)) + if (file_inode(f) == d_inode(dentry)) pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n", f, fd, current->pid, current->comm); return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..99401efef7ac 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -548,8 +548,8 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) exe_file = get_task_exe_file(tsk); if (!exe_file) return 0; - ino = exe_file->f_inode->i_ino; - dev = exe_file->f_inode->i_sb->s_dev; + ino = file_inode(exe_file)->i_ino; + dev = file_inode(exe_file)->i_sb->s_dev; fput(exe_file); return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/events/core.c b/kernel/events/core.c index 6ee1febdf6ff..5134a1c17186 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6701,7 +6701,7 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, struct file *file, unsigned long offset, unsigned long size) { - if (filter->inode != file->f_inode) + if (filter->inode != file_inode(file)) return false; if (filter->offset > offset + size) diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index eb0a599fcf58..e852be4851fc 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -108,11 +108,7 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - counter = (long)(file->f_inode->i_private); + counter = (long)file_inode(file)->i_private; if (counter >= qstat_num) return -EBADF; @@ -177,11 +173,7 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf, /* * Get the counter ID stored in file->f_inode->i_private */ - if (!file->f_inode) { - WARN_ON_ONCE(1); - return -EBADF; - } - if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + if ((long)file_inode(file)->i_private != qstat_reset_cnts) return count; for_each_possible_cpu(cpu) { diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 1cb060293505..ad170310a856 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -225,7 +225,7 @@ static int smk_bu_credfile(const struct cred *cred, struct file *file, { struct task_smack *tsp = cred->security; struct smack_known *sskp = tsp->smk_task; - struct inode *inode = file->f_inode; + struct inode *inode = file_inode(file); struct inode_smack *isp = inode->i_security; char acc[SMK_NUM_ACCESS_TYPE + 1]; -- cgit v1.2.3 From 3c839744b33782b930c5c61df35511ede5e5a574 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: 
Sat, 3 Dec 2016 12:31:33 -0800 Subject: bpf: Preserve const register type on const OR alu ops Occasionally, clang (e.g. version 3.8.1) translates a sum between two constant operands using a BPF_OR instead of a BPF_ADD. The verifier is currently not handling this scenario, and the destination register type becomes UNKNOWN_VALUE even if it's still storing a constant. As a result, the destination register cannot be used as argument to a helper function expecting a ARG_CONST_STACK_*, limiting some use cases. Modify the verifier to handle this case, and add a few tests to make sure all combinations are supported, and stack boundaries are still verified even with BPF_OR. Signed-off-by: Gianluca Borello Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 9 ++++- tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/test_verifier.c | 60 +++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e742210750e..38d05da84a49 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1481,14 +1481,19 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env, struct bpf_reg_state *src_reg = ®s[insn->src_reg]; u8 opcode = BPF_OP(insn->code); - /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn. - * Don't care about overflow or negative values, just add them + /* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or' + * insn. Don't care about overflow or negative values, just add them */ if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) dst_reg->imm += insn->imm; else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X && src_reg->type == CONST_IMM) dst_reg->imm += src_reg->imm; + else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) + dst_reg->imm |= insn->imm; + else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X && + src_reg->type == CONST_IMM) + dst_reg->imm |= src_reg->imm; else mark_reg_unknown_value(regs, insn->dst_reg); return 0; diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 3c59f96e3ed8..071431bedde8 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -1,2 +1,3 @@ test_verifier test_maps +test_lru_map diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 5da2e9d7689c..8d71e44b319d 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2683,6 +2683,66 @@ static struct bpf_test tests[] = { .errstr_unpriv = "R0 pointer arithmetic prohibited", .result_unpriv = REJECT, }, + { + "constant register |= constant should keep constant type", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48), + BPF_MOV64_IMM(BPF_REG_2, 34), + BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 13), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "constant register |= constant should not bypass stack boundary checks", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48), + BPF_MOV64_IMM(BPF_REG_2, 34), + BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 24), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-48 
access_size=58", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "constant register |= constant register should keep constant type", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48), + BPF_MOV64_IMM(BPF_REG_2, 34), + BPF_MOV64_IMM(BPF_REG_4, 13), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "constant register |= constant register should not bypass stack boundary checks", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48), + BPF_MOV64_IMM(BPF_REG_2, 34), + BPF_MOV64_IMM(BPF_REG_4, 24), + BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack type R1 off=-48 access_size=58", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From cbbd26b8b1a6af9c02e2b6523e12bd50cc765059 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 1 Nov 2016 22:09:04 -0400 Subject: [iov_iter] new primitives - copy_from_iter_full() and friends copy_from_iter_full(), copy_from_iter_full_nocache() and csum_and_copy_from_iter_full() - counterparts of copy_from_iter() et.al., advancing iterator only in case of successful full copy and returning whether it had been successful or not. Convert some obvious users. *NOTE* - do not blindly assume that something is a good candidate for those unless you are sure that not advancing iov_iter in failure case is the right thing in this case. Anything that does short read/short write kind of stuff (or is in a loop, etc.) is unlikely to be a good one. 
Signed-off-by: Al Viro --- drivers/bluetooth/hci_vhci.c | 2 +- drivers/net/macvtap.c | 4 +- drivers/net/tun.c | 7 +-- drivers/usb/gadget/function/f_fs.c | 2 +- drivers/usb/gadget/legacy/inode.c | 2 +- drivers/vhost/scsi.c | 3 +- drivers/vhost/vhost.c | 3 +- fs/ncpfs/file.c | 2 +- fs/orangefs/devorangefs-req.c | 13 ++--- include/linux/uio.h | 3 ++ kernel/printk/printk.c | 2 +- lib/iov_iter.c | 98 +++++++++++++++++++++++++++++++++++++- net/atm/common.c | 2 +- net/bluetooth/l2cap_core.c | 6 +-- net/packet/af_packet.c | 5 +- net/tipc/msg.c | 4 +- security/keys/keyctl.c | 2 +- 17 files changed, 121 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index c4a75a18dcae..233e850fdac7 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -181,7 +181,7 @@ static inline ssize_t vhci_get_user(struct vhci_data *data, if (!skb) return -ENOMEM; - if (copy_from_iter(skb_put(skb, len), len, from) != len) { + if (!copy_from_iter_full(skb_put(skb, len), len, from)) { kfree_skb(skb); return -EFAULT; } diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 070e3290aa6e..19d81ca3fb49 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -673,7 +673,6 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, int depth; bool zerocopy = false; size_t linear; - ssize_t n; if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = q->vnet_hdr_sz; @@ -684,8 +683,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, len -= vnet_hdr_len; err = -EFAULT; - n = copy_from_iter(&vnet_hdr, sizeof(vnet_hdr), from); - if (n != sizeof(vnet_hdr)) + if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from)) goto err; iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr)); if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8093e39ae263..4fa2d756548a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1171,7 +1171,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, bool zerocopy = false; int err; u32 rxhash; - ssize_t n; if (!(tun->dev->flags & IFF_UP)) return -EIO; @@ -1181,8 +1180,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, return -EINVAL; len -= sizeof(pi); - n = copy_from_iter(&pi, sizeof(pi), from); - if (n != sizeof(pi)) + if (!copy_from_iter_full(&pi, sizeof(pi), from)) return -EFAULT; } @@ -1191,8 +1189,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, return -EINVAL; len -= tun->vnet_hdr_sz; - n = copy_from_iter(&gso, sizeof(gso), from); - if (n != sizeof(gso)) + if (!copy_from_iter_full(&gso, sizeof(gso), from)) return -EFAULT; if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 17989b72cdae..0bfd1e25b431 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -949,7 +949,7 @@ static ssize_t ffs_epfile_io(struct file *file, struct ffs_io_data *io_data) goto error_mutex; } if (!io_data->read && - copy_from_iter(data, data_len, &io_data->data) != data_len) { + !copy_from_iter_full(data, data_len, &io_data->data)) { ret = -EFAULT; goto error_mutex; } diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index bd82dd12deff..10b2576f8b6a 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -667,7 +667,7 @@ 
ep_write_iter(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; } - if (unlikely(copy_from_iter(buf, len, from) != len)) { + if (unlikely(!copy_from_iter_full(buf, len, from))) { value = -EFAULT; goto out; } diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 6e29d053843d..b296985fda0d 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -922,8 +922,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) */ iov_iter_init(&out_iter, WRITE, vq->iov, out, out_size); - ret = copy_from_iter(req, req_size, &out_iter); - if (unlikely(ret != req_size)) { + if (unlikely(!copy_from_iter_full(req, req_size, &out_iter))) { vq_err(vq, "Faulted on copy_from_iter\n"); vhost_scsi_send_bad_target(vs, vq, head, out); continue; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c6f2d89c0e97..06e8b81b6253 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1862,8 +1862,7 @@ static int get_indirect(struct vhost_virtqueue *vq, i, count); return -EINVAL; } - if (unlikely(copy_from_iter(&desc, sizeof(desc), &from) != - sizeof(desc))) { + if (unlikely(!copy_from_iter_full(&desc, sizeof(desc), &from))) { vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); return -EINVAL; diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index dd38ca1f2ecb..83ca77231707 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -203,7 +203,7 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) bufsize - (pos % bufsize), iov_iter_count(from)); - if (copy_from_iter(bouncebuffer, to_write, from) != to_write) { + if (!copy_from_iter_full(bouncebuffer, to_write, from)) { errno = -EFAULT; break; } diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index 516ffb4dc9a0..b0ced669427e 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -355,7 +355,6 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, __u64 tag; } head; int total = ret = iov_iter_count(iter); - int n; int downcall_size = sizeof(struct orangefs_downcall_s); int head_size = sizeof(head); @@ -372,8 +371,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return -EFAULT; } - n = copy_from_iter(&head, head_size, iter); - if (n < head_size) { + if (!copy_from_iter_full(&head, head_size, iter)) { gossip_err("%s: failed to copy head.\n", __func__); return -EFAULT; } @@ -407,8 +405,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return ret; } - n = copy_from_iter(&op->downcall, downcall_size, iter); - if (n != downcall_size) { + if (!copy_from_iter_full(&op->downcall, downcall_size, iter)) { gossip_err("%s: failed to copy downcall.\n", __func__); goto Efault; } @@ -462,10 +459,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, goto Enomem; } memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size); - n = copy_from_iter(op->downcall.trailer_buf, - op->downcall.trailer_size, - iter); - if (n != op->downcall.trailer_size) { + if (!copy_from_iter_full(op->downcall.trailer_buf, + op->downcall.trailer_size, iter)) { gossip_err("%s: failed to copy trailer.\n", __func__); vfree(op->downcall.trailer_buf); goto Efault; diff --git a/include/linux/uio.h b/include/linux/uio.h index 6e22b544d039..e57c0ccd61c6 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -89,7 +89,9 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_to_iter(const void *addr, size_t 
bytes, struct iov_iter *i); size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); +bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i); size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); +bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i); size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); @@ -155,6 +157,7 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) } size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); +bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); int import_iovec(int type, const struct iovec __user * uvector, unsigned nr_segs, unsigned fast_segs, diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f7a55e9ff2f7..f6bda15396df 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -748,7 +748,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; buf[len] = '\0'; - if (copy_from_iter(buf, len, from) != len) { + if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f2bd21b93dfc..83c00b7f59b5 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -568,6 +568,31 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(copy_from_iter); +bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) \ + return false; + + iterate_all_kinds(i, bytes, v, ({ + if (__copy_from_user((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len)) + return false; + 0;}), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(copy_from_iter_full); + size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; @@ -587,6 +612,30 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(copy_from_iter_nocache); +bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) \ + return false; + iterate_all_kinds(i, bytes, v, ({ + if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len)) + return false; + 0;}), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(copy_from_iter_full_nocache); + size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { @@ -1008,7 +1057,7 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, } iterate_and_advance(i, bytes, v, ({ int err = 0; - next = csum_and_copy_from_user(v.iov_base, + next = csum_and_copy_from_user(v.iov_base, (to += v.iov_len) - v.iov_len, v.iov_len, 0, &err); if (!err) { @@ -1037,6 +1086,51 @@ size_t csum_and_copy_from_iter(void 
*addr, size_t bytes, __wsum *csum, } EXPORT_SYMBOL(csum_and_copy_from_iter); +bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *to = addr; + __wsum sum, next; + size_t off = 0; + sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) + return false; + iterate_all_kinds(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_from_user(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0, &err); + if (err) + return false; + sum = csum_block_add(sum, next, off); + off += v.iov_len; + 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(csum_and_copy_from_iter_full); + size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { @@ -1051,7 +1145,7 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, - v.iov_base, + v.iov_base, v.iov_len, 0, &err); if (!err) { sum = csum_block_add(sum, next, off); diff --git a/net/atm/common.c b/net/atm/common.c index 6dc12305799e..a3ca922d307b 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -630,7 +630,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) goto out; skb->dev = NULL; /* for paths shared with net_device interfaces */ ATM_SKB(skb)->atm_options = vcc->atm_options; - if (copy_from_iter(skb_put(skb, size), size, &m->msg_iter) != size) { + if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { kfree_skb(skb); error = -EFAULT; goto out; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 577f1c01454a..ce0b5dd01953 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2127,7 +2127,7 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan, struct sk_buff **frag; int sent = 0; - if (copy_from_iter(skb_put(skb, count), count, &msg->msg_iter) != count) + if (!copy_from_iter_full(skb_put(skb, count), count, &msg->msg_iter)) return -EFAULT; sent += count; @@ -2147,8 +2147,8 @@ static inline int l2cap_skbuff_fromiovec(struct l2cap_chan *chan, *frag = tmp; - if (copy_from_iter(skb_put(*frag, count), count, - &msg->msg_iter) != count) + if (!copy_from_iter_full(skb_put(*frag, count), count, + &msg->msg_iter)) return -EFAULT; sent += count; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d2238b204691..588ec202d5ba 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2432,14 +2432,11 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, struct virtio_net_hdr *vnet_hdr) { - int n; - if (*len < sizeof(*vnet_hdr)) return -EINVAL; *len -= sizeof(*vnet_hdr); - n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter); - if (n != sizeof(*vnet_hdr)) + if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; return __packet_snd_vnet_parse(vnet_hdr, *len); diff --git a/net/tipc/msg.c b/net/tipc/msg.c 
index 17201aa8423d..a22be502f1bd 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -268,7 +268,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, __skb_queue_tail(list, skb); skb_copy_to_linear_data(skb, mhdr, mhsz); pktpos = skb->data + mhsz; - if (copy_from_iter(pktpos, dsz, &m->msg_iter) == dsz) + if (copy_from_iter_full(pktpos, dsz, &m->msg_iter)) return dsz; rc = -EFAULT; goto error; @@ -299,7 +299,7 @@ int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, if (drem < pktrem) pktrem = drem; - if (copy_from_iter(pktpos, pktrem, &m->msg_iter) != pktrem) { + if (!copy_from_iter_full(pktpos, pktrem, &m->msg_iter)) { rc = -EFAULT; goto error; } diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index d580ad06b792..f89f1900e58d 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1074,7 +1074,7 @@ long keyctl_instantiate_key_common(key_serial_t id, } ret = -EFAULT; - if (copy_from_iter(payload, plen, from) != plen) + if (!copy_from_iter_full(payload, plen, from)) goto error2; } -- cgit v1.2.3 From 7bd509e311f408f7a5132fcdde2069af65fa05ae Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 4 Dec 2016 23:19:41 +0100 Subject: bpf: add prog_digest and expose it via fdinfo/netlink When loading a BPF program via bpf(2), calculate the digest over the program's instruction stream and store it in struct bpf_prog's digest member. This is done at a point in time before any instructions are rewritten by the verifier. Any unstable map file descriptor number part of the imm field will be zeroed for the hash. fdinfo example output for progs: # cat /proc/1590/fdinfo/5 pos: 0 flags: 02000002 mnt_id: 11 prog_type: 1 prog_jited: 1 prog_digest: b27e8b06da22707513aa97363dfb11c7c3675d28 memlock: 4096 When programs are pinned and retrieved by an ELF loader, the loader can check the program's digest through fdinfo and compare it against one that was generated over the ELF file's program section to see if the program needs to be reloaded. Furthermore, this can also be exposed through other means such as netlink in case of a tc cls/act dump (or xdp in future), but also through tracepoints or other facilities to identify the program. Other than that, the digest can also serve as a base name for the work in progress kallsyms support of programs. The digest doesn't depend/select the crypto layer, since we need to keep dependencies to a minimum. iproute2 will get support for this facility. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 1 + include/linux/filter.h | 7 +++- include/uapi/linux/pkt_cls.h | 1 + include/uapi/linux/tc_act/tc_bpf.h | 1 + kernel/bpf/core.c | 65 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 24 +++++++++++++- kernel/bpf/verifier.c | 2 ++ net/sched/act_bpf.c | 9 ++++++ net/sched/cls_bpf.c | 8 +++++ 9 files changed, 116 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 69d0a7f12a3b..8796ff03f472 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,6 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +void bpf_prog_calc_digest(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index 97338134398f..f078d2b1cff6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -14,6 +14,7 @@ #include #include #include +#include <linux/cryptohash.h> #include @@ -56,6 +57,9 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 +/* Maximum BPF program size in bytes. */ +#define MAX_BPF_SIZE (BPF_MAXINSNS * sizeof(struct bpf_insn)) + /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ @@ -404,8 +408,9 @@ struct bpf_prog { cb_access:1, /* Is control block accessed? */ dst_needed:1; /* Do we need dst entry? */ kmemcheck_bitfield_end(meta); - u32 len; /* Number of filter blocks */ enum bpf_prog_type type; /* Type of BPF program */ + u32 len; /* Number of filter blocks */ + u32 digest[SHA_DIGEST_WORDS]; /* Program digest */ struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ unsigned int (*bpf_func)(const void *ctx, diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 86786d45ee66..1adc0b654996 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -397,6 +397,7 @@ enum { TCA_BPF_NAME, TCA_BPF_FLAGS, TCA_BPF_FLAGS_GEN, + TCA_BPF_DIGEST, __TCA_BPF_MAX, }; diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 063d9d465119..a6b88a6f7f71 100644 --- a/include/uapi/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h @@ -27,6 +27,7 @@ enum { TCA_ACT_BPF_FD, TCA_ACT_BPF_NAME, TCA_ACT_BPF_PAD, + TCA_ACT_BPF_DIGEST, __TCA_ACT_BPF_MAX, }; #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 82a04143368e..bdcc9f4ba767 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,6 +136,71 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } +#define SHA_BPF_RAW_SIZE \ + round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) + +/* Called under verifier mutex. */ +void bpf_prog_calc_digest(struct bpf_prog *fp) +{ + const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); + static u32 ws[SHA_WORKSPACE_WORDS]; + static u8 raw[SHA_BPF_RAW_SIZE]; + struct bpf_insn *dst = (void *)raw; + u32 i, bsize, psize, blocks; + bool was_ld_map; + u8 *todo = raw; + __be32 *result; + __be64 *bits; + + sha_init(fp->digest); + memset(ws, 0, sizeof(ws)); + + /* We need to take out the map fd for the digest calculation + * since they are unstable from user space side.
+ */ + for (i = 0, was_ld_map = false; i < fp->len; i++) { + dst[i] = fp->insnsi[i]; + if (!was_ld_map && + dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && + dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + was_ld_map = true; + dst[i].imm = 0; + } else if (was_ld_map && + dst[i].code == 0 && + dst[i].dst_reg == 0 && + dst[i].src_reg == 0 && + dst[i].off == 0) { + was_ld_map = false; + dst[i].imm = 0; + } else { + was_ld_map = false; + } + } + + psize = fp->len * sizeof(struct bpf_insn); + memset(&raw[psize], 0, sizeof(raw) - psize); + raw[psize++] = 0x80; + + bsize = round_up(psize, SHA_MESSAGE_BYTES); + blocks = bsize / SHA_MESSAGE_BYTES; + if (bsize - psize >= sizeof(__be64)) { + bits = (__be64 *)(todo + bsize - sizeof(__be64)); + } else { + bits = (__be64 *)(todo + bsize + bits_offset); + blocks++; + } + *bits = cpu_to_be64((psize - 1) << 3); + + while (blocks--) { + sha_transform(fp->digest, todo, ws); + todo += SHA_MESSAGE_BYTES; + } + + result = (__force __be32 *)fp->digest; + for (i = 0; i < SHA_DIGEST_WORDS; i++) + result[i] = cpu_to_be32(fp->digest[i]); +} + static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85af86c496cd..c0d2b423ce93 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -662,8 +662,30 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_prog *prog = filp->private_data; + char prog_digest[sizeof(prog->digest) * 2 + 1] = { }; + + bin2hex(prog_digest, prog->digest, sizeof(prog->digest)); + seq_printf(m, + "prog_type:\t%u\n" + "prog_jited:\t%u\n" + "prog_digest:\t%s\n" + "memlock:\t%llu\n", + prog->type, + prog->jited, + prog_digest, + prog->pages * 1ULL << PAGE_SHIFT); +} +#endif + static const struct file_operations bpf_prog_fops = { - .release = bpf_prog_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_prog_show_fdinfo, +#endif + .release = bpf_prog_release, }; int bpf_prog_new_fd(struct bpf_prog *prog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 38d05da84a49..cb37339ca0da 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3176,6 +3176,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } + bpf_prog_calc_digest(env->prog); + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 84c1d2da4f8b..1c60317f0121 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -117,10 +117,19 @@ static int tcf_bpf_dump_bpf_info(const struct tcf_bpf *prog, static int tcf_bpf_dump_ebpf_info(const struct tcf_bpf *prog, struct sk_buff *skb) { + struct nlattr *nla; + if (prog->bpf_name && nla_put_string(skb, TCA_ACT_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; + nla = nla_reserve(skb, TCA_ACT_BPF_DIGEST, + sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index f70e03d2d2c8..adc776048d1a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -549,10 +549,18 @@ static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, struct sk_buff *skb) { + struct nlattr *nla; + if (prog->bpf_name && nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) 
return -EMSGSIZE; + nla = nla_reserve(skb, TCA_BPF_DIGEST, sizeof(prog->filter->digest)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->filter->digest, nla_len(nla)); + return 0; } -- cgit v1.2.3 From 3cd5eca8d7a2fe43098df4c33a1272fe6945cac9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:19:09 -0500 Subject: fsnotify: constify 'data' passed to ->handle_event() Signed-off-by: Al Viro --- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fanotify/fanotify.c | 8 ++++---- fs/notify/fanotify/fanotify.h | 2 +- fs/notify/inotify/inotify.h | 2 +- fs/notify/inotify/inotify_fsnotify.c | 4 ++-- include/linux/fsnotify_backend.h | 2 +- kernel/audit_fsnotify.c | 10 +++++----- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 8 ++++---- 9 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 6faaf710e563..5a4ec309e283 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -85,7 +85,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e0e5f7c3c99f..bbc175d4213d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -90,10 +90,10 @@ static int fanotify_get_response(struct fsnotify_group *group, static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, u32 event_mask, - void *data, int data_type) + const void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; - struct path *path = data; + const struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, @@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path) + const struct path *path) { struct fanotify_event_info *event; @@ -177,7 +177,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { int ret = 0; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 2a5fb14115df..4500a74f8d38 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -47,4 +47,4 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path); + const struct path *path); diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index ed855ef6f077..a6f5907a3fee 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -26,7 +26,7 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops 
inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 2cd900c2c737..19e7ec109a75 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -66,7 +66,7 @@ int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; @@ -80,7 +80,7 @@ int inotify_handle_event(struct fsnotify_group *group, if ((inode_mark->mask & FS_EXCL_UNLINK) && (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; + const struct path *path = data; if (d_unlinked(path->dentry)) return 0; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 79467b239fcf..d357041bbec8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -96,7 +96,7 @@ struct fsnotify_ops { struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index f84f8d06e1f6..1173f7cc7ba3 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -74,7 +74,7 @@ int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_ } static void audit_update_mark(struct audit_fsnotify_mark *audit_mark, - struct inode *inode) + const struct inode *inode) { audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET; audit_mark->ino = inode ? 
inode->i_ino : AUDIT_INO_UNSET; @@ -168,11 +168,11 @@ static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { struct audit_fsnotify_mark *audit_mark; - struct inode *inode = NULL; + const struct inode *inode = NULL; audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark); @@ -180,10 +180,10 @@ static int audit_mark_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = ((struct path *)data)->dentry->d_inode; + inode = ((const struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 25772476fa4a..3a2f5dfe8093 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -948,7 +948,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0d302a87f21b..f476c46b9c20 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -472,10 +472,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie) { - struct inode *inode; + const struct inode *inode; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); @@ -484,10 +484,10 @@ static int audit_watch_handle_event(struct fsnotify_group *group, switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = d_backing_inode(((struct path *)data)->dentry); + inode = d_backing_inode(((const struct path *)data)->dentry); break; case (FSNOTIFY_EVENT_INODE): - inode = (struct inode *)data; + inode = (const struct inode *)data; break; default: BUG(); -- cgit v1.2.3 From 8bd107633b64195a0748b05236c3d14db0a8bed4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 20:36:51 -0500 Subject: audit_log_{name,link_denied}: constify struct path Signed-off-by: Al Viro --- include/linux/audit.h | 2 +- kernel/audit.c | 4 ++-- kernel/audit.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 9d4443f93db6..f51fca8d0b6f 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -147,7 +147,7 @@ extern void audit_log_d_path(struct audit_buffer *ab, extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_link_denied(const char *operation, - struct path *link); + const struct path *link); extern void audit_log_lost(const char *message); #ifdef CONFIG_SECURITY extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); diff --git a/kernel/audit.c b/kernel/audit.c index f1ca11613379..06008c422bd5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1760,7 +1760,7 @@ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, * @call_panic: optional pointer to int that will be updated if secid fails 
*/ void audit_log_name(struct audit_context *context, struct audit_names *n, - struct path *path, int record_num, int *call_panic) + const struct path *path, int record_num, int *call_panic) { struct audit_buffer *ab; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); @@ -1948,7 +1948,7 @@ EXPORT_SYMBOL(audit_log_task_info); * @operation: specific link operation * @link: the path that triggered the restriction */ -void audit_log_link_denied(const char *operation, struct path *link) +void audit_log_link_denied(const char *operation, const struct path *link) { struct audit_buffer *ab; struct audit_names *name; diff --git a/kernel/audit.h b/kernel/audit.h index 431444c3708b..960d49c9db5e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -212,7 +212,7 @@ extern void audit_copy_inode(struct audit_names *name, extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, - struct audit_names *n, struct path *path, + struct audit_names *n, const struct path *path, int record_num, int *call_panic); extern int audit_pid; -- cgit v1.2.3 From 8fc31ce8896fc3cea1d79688c8ff950ad4e73afe Mon Sep 17 00:00:00 2001 From: David Carrillo-Cisneros Date: Sun, 4 Dec 2016 00:46:17 -0800 Subject: perf/core: Remove invalid warning from list_update_cgroup_event() The warning introduced in commit: 864c2357ca89 ("perf/core: Do not set cpuctx->cgrp for unscheduled cgroups") assumed that a cgroup switch always precedes list_del_event. This is not the case. Remove the warning. Make sure that cpuctx->cgrp is NULL until a cgroup event is sched in or ctx->nr_cgroups == 0. Signed-off-by: David Carrillo-Cisneros Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Fenghua Yu Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Nilay Vaish Cc: Paul Turner Cc: Peter Zijlstra Cc: Ravi V Shankar Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vegard Nossum Cc: Vikas Shivappa Cc: Vince Weaver Link: http://lkml.kernel.org/r/1480841177-27299-1-git-send-email-davidcc@google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6ee1febdf6ff..02c8421f8c01 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -903,17 +903,14 @@ list_update_cgroup_event(struct perf_event *event, */ cpuctx = __get_cpu_context(ctx); - /* Only set/clear cpuctx->cgrp if current task uses event->cgrp. */ - if (perf_cgroup_from_task(current, ctx) != event->cgrp) { - /* - * We are removing the last cpu event in this context. - * If that event is not active in this cpu, cpuctx->cgrp - * should've been cleared by perf_cgroup_switch. - */ - WARN_ON_ONCE(!add && cpuctx->cgrp); - return; - } - cpuctx->cgrp = add ? event->cgrp : NULL; + /* + * cpuctx->cgrp is NULL until a cgroup event is sched in or + * ctx->nr_cgroup == 0 . + */ + if (add && perf_cgroup_from_task(current, ctx) == event->cgrp) + cpuctx->cgrp = event->cgrp; + else if (!add) + cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From f943fe0faf27991d256e10b5a85f175385c64cdc Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 28 Nov 2016 15:24:43 +0100 Subject: lockdep: Fix report formatting Since commit: 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") printk() requires KERN_CONT to continue log messages.
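[editor: a minimal illustration of the new rule, not part of the patch; "nr" is a made-up variable. A message assembled from several printk() calls must now mark each continuation explicitly:]

	printk(KERN_INFO "requests");
	printk(KERN_CONT " in flight: %d\n", nr);	/* continues the same record */

Without the KERN_CONT marker the second call starts a new log record and the line is split in two.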
Lots of printk() in lockdep.c and print_ip_sym() don't have it. As the result lockdep reports are completely messed up. Add missing KERN_CONT and inline print_ip_sym() where necessary. Example of a messed up report: 0-rc5+ #41 Not tainted ------------------------------------------------------- syz-executor0/5036 is trying to acquire lock: ( rtnl_mutex ){+.+.+.} , at: [] rtnl_lock+0x1c/0x20 but task is already holding lock: ( &net->packet.sklist_lock ){+.+...} , at: [] packet_diag_dump+0x1a6/0x1920 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 ( &net->packet.sklist_lock +.+...} ... Without this patch all scripts that parse kernel bug reports are broken. Signed-off-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andreyknvl@google.com Cc: aryabinin@virtuozzo.com Cc: joe@perches.com Cc: syzkaller@googlegroups.com Link: http://lkml.kernel.org/r/1480343083-48731-1-git-send-email-dvyukov@google.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 111 ++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..4d7ffc0a0d00 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -506,13 +506,13 @@ static void __print_lock_name(struct lock_class *class) name = class->name; if (!name) { name = __get_key_name(class->key, str); - printk("%s", name); + printk(KERN_CONT "%s", name); } else { - printk("%s", name); + printk(KERN_CONT "%s", name); if (class->name_version > 1) - printk("#%d", class->name_version); + printk(KERN_CONT "#%d", class->name_version); if (class->subclass) - printk("/%d", class->subclass); + printk(KERN_CONT "/%d", class->subclass); } } @@ -522,9 +522,9 @@ static void print_lock_name(struct lock_class *class) get_usage_chars(class, usage); - printk(" ("); + printk(KERN_CONT " ("); __print_lock_name(class); - printk("){%s}", usage); + printk(KERN_CONT "){%s}", usage); } static void print_lockdep_cache(struct lockdep_map *lock) @@ -536,7 +536,7 @@ static void print_lockdep_cache(struct lockdep_map *lock) if (!name) name = __get_key_name(lock->key->subkeys, str); - printk("%s", name); + printk(KERN_CONT "%s", name); } static void print_lock(struct held_lock *hlock) @@ -551,13 +551,13 @@ static void print_lock(struct held_lock *hlock) barrier(); if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { - printk("\n"); + printk(KERN_CONT "\n"); return; } print_lock_name(lock_classes + class_idx - 1); - printk(", at: "); - print_ip_sym(hlock->acquire_ip); + printk(KERN_CONT ", at: [<%p>] %pS\n", + (void *)hlock->acquire_ip, (void *)hlock->acquire_ip); } static void lockdep_print_held_locks(struct task_struct *curr) @@ -792,8 +792,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); + printk(KERN_CONT "#%d", class->name_version); + printk(KERN_CONT "\n"); dump_stack(); if (!graph_lock()) { @@ -1071,7 +1071,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) return 0; printk("\n-> #%u", depth); print_lock_name(target->class); - printk(":\n"); + printk(KERN_CONT ":\n"); print_stack_trace(&target->trace, 6); return 0; @@ -1102,11 +1102,11 @@ print_circular_lock_scenario(struct held_lock *src, if (parent != source) { 
printk("Chain exists of:\n "); __print_lock_name(source); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(parent); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(target); - printk("\n\n"); + printk(KERN_CONT "\n\n"); } printk(" Possible unsafe locking scenario:\n\n"); @@ -1114,16 +1114,16 @@ print_circular_lock_scenario(struct held_lock *src, printk(" ---- ----\n"); printk(" lock("); __print_lock_name(target); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(parent); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(target); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(source); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1359,22 +1359,22 @@ static void print_lock_class_header(struct lock_class *class, int depth) printk("%*s->", depth, ""); print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); + printk(KERN_CONT " ops: %lu", class->ops); + printk(KERN_CONT " {\n"); for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { if (class->usage_mask & (1 << bit)) { int len = depth; len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); + len += printk(KERN_CONT " at:\n"); print_stack_trace(class->usage_traces + bit, len); } } printk("%*s }\n", depth, ""); - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); + printk("%*s ... key at: [<%p>] %pS\n", + depth, "", class->key, class->key); } /* @@ -1437,11 +1437,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry, if (middle_class != unsafe_class) { printk("Chain exists of:\n "); __print_lock_name(safe_class); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(middle_class); - printk(" --> "); + printk(KERN_CONT " --> "); __print_lock_name(unsafe_class); - printk("\n\n"); + printk(KERN_CONT "\n\n"); } printk(" Possible interrupt unsafe locking scenario:\n\n"); @@ -1449,18 +1449,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk(" ---- ----\n"); printk(" lock("); __print_lock_name(unsafe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" local_irq_disable();\n"); printk(" lock("); __print_lock_name(safe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" lock("); __print_lock_name(middle_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" \n"); printk(" lock("); __print_lock_name(safe_class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -1497,9 +1497,9 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock(prev); printk("which would create a new lock dependency:\n"); print_lock_name(hlock_class(prev)); - printk(" ->"); + printk(KERN_CONT " ->"); print_lock_name(hlock_class(next)); - printk("\n"); + printk(KERN_CONT "\n"); printk("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); @@ -1521,8 +1521,7 @@ print_bad_irq_dependency(struct task_struct *curr, lockdep_print_held_locks(curr); - printk("\nthe dependencies between %s-irq-safe lock", irqclass); - printk(" and the holding lock:\n"); + printk("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) return 0; print_shortest_lock_dependencies(backwards_entry, prev_root); @@ -1694,10 +1693,10 @@ print_deadlock_scenario(struct held_lock *nxt, printk(" ----\n"); printk(" lock("); __print_lock_name(prev); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" 
lock("); __print_lock_name(next); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); printk(" May be due to missing lock nesting notation\n\n"); } @@ -1891,9 +1890,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); - printk(" => "); + printk(KERN_CONT " => "); print_lock_name(hlock_class(next)); - printk("\n"); + printk(KERN_CONT "\n"); dump_stack(); return graph_lock(); } @@ -2343,11 +2342,11 @@ print_usage_bug_scenario(struct held_lock *lock) printk(" ----\n"); printk(" lock("); __print_lock_name(class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk(" \n"); printk(" lock("); __print_lock_name(class); - printk(");\n"); + printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2522,14 +2521,18 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, void print_irqtrace_events(struct task_struct *curr) { printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); - print_ip_sym(curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); - print_ip_sym(curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); - print_ip_sym(curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); - print_ip_sym(curr->softirq_disable_ip); + printk("hardirqs last enabled at (%u): [<%p>] %pS\n", + curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip, + (void *)curr->hardirq_enable_ip); + printk("hardirqs last disabled at (%u): [<%p>] %pS\n", + curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip, + (void *)curr->hardirq_disable_ip); + printk("softirqs last enabled at (%u): [<%p>] %pS\n", + curr->softirq_enable_event, (void *)curr->softirq_enable_ip, + (void *)curr->softirq_enable_ip); + printk("softirqs last disabled at (%u): [<%p>] %pS\n", + curr->softirq_disable_event, (void *)curr->softirq_disable_ip, + (void *)curr->softirq_disable_ip); } static int HARDIRQ_verbose(struct lock_class *class) @@ -3235,8 +3238,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (very_verbose(class)) { printk("\nacquire class [%p] %s", class->key, class->name); if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); + printk(KERN_CONT "#%d", class->name_version); + printk(KERN_CONT "\n"); dump_stack(); } @@ -3378,7 +3381,7 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(") at:\n"); + printk(KERN_CONT ") at:\n"); print_ip_sym(ip); printk("but there are no more locks to release!\n"); printk("\nother info that might help us debug this:\n"); @@ -3871,7 +3874,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); - printk(") at:\n"); + printk(KERN_CONT ") at:\n"); print_ip_sym(ip); printk("but there are no locks held!\n"); printk("\nother info that might help us debug this:\n"); -- cgit v1.2.3 From b18cc3de00ec3442cf40ac60787dbe0703b99e24 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 7 Dec 2016 14:31:33 +0100 Subject: tracing/rb: Init the CPU mask on allocation Before commit b32614c03413 ("tracing/rb: Convert to 
hotplug state machine") the allocated cpumask was initialized to the mask of online or possible CPUs. After the CPU hotplug changes the buffer initialization moved to trace_rb_cpu_prepare() but the cpumask is allocated with alloc_cpumask() and therefor has random content. As a consequence the cpu buffers are not initialized and a later access dereferences a NULL pointer. Use zalloc_cpumask() instead so trace_rb_cpu_prepare() initializes the buffers properly. Fixes: b32614c03413 ("tracing/rb: Convert to hotplug state machine") Reported-by: Tetsuo Handa Signed-off-by: Sebastian Andrzej Siewior Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20161207133133.hzkcqfllxcdi3joz@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a7a055f167c7..89a2611a1635 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1297,7 +1297,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer) return NULL; - if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); -- cgit v1.2.3 From 5304121adae9fc59f4b640f82200868112edd0bd Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Tue, 6 Dec 2016 18:00:43 -0600 Subject: clocksource: export the clocks_calc_mult_shift to use by timestamp code The CPSW CPTS driver is capable of doing timestamping on tx/rx packets and requires to know mult and shift factors for timestamp conversion from raw value to nanoseconds (ptp clock). Now these mult and shift factors are calculated manually and provided through DT, which makes very hard to support of a lot number of platforms, especially if CPTS refclk is not the same for some kind of boards and depends on efuse settings (Keystone 2 platforms). Hence, export clocks_calc_mult_shift() to allow drivers like CPSW CPTS (and other ptp drivesr) to benefit from automaitc calculation of mult and shift factors. Cc: John Stultz Signed-off-by: Murali Karicheri Signed-off-by: Grygorii Strashko Acked-by: Thomas Gleixner Signed-off-by: David S. Miller --- kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7e4fad75acaa..150242ccfcd2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -89,6 +89,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) *mult = tmp; *shift = sft; } +EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); /*[Clocksource internal variables]--------- * curr_clocksource: -- cgit v1.2.3 From ef0915cacd04c9e35be5f9d62a4e4b5b4b9bcfd1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 7 Dec 2016 01:15:44 +0100 Subject: bpf: fix loading of BPF_MAXINSNS sized programs General assumption is that single program can hold up to BPF_MAXINSNS, that is, 4096 number of instructions. It is the case with cBPF and that limit was carried over to eBPF. When recently testing digest, I noticed that it's actually not possible to feed 4096 instructions via bpf(2). The check for > BPF_MAXINSNS was added back then to bpf_check() in cbd357008604 ("bpf: verifier (add ability to receive verification log)"). 
However, 09756af46893 ("bpf: expand BPF syscall with program load/unload") added yet another check that comes before that into bpf_prog_load(), but this time bails out already in case of >= BPF_MAXINSNS. Fix it up and perform the check early in bpf_prog_load(), so we can drop the second one in bpf_check(). It makes sense because a 0-insn program is useless as well, and we don't want to waste any resources doing work up to the bpf_check() point. The existing bpf(2) man page documents E2BIG as the official error for such cases, so just stick with it as well. Fixes: 09756af46893 ("bpf: expand BPF syscall with program load/unload") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 4 ++-- kernel/bpf/verifier.c | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0d2b423ce93..88f609f1c0c3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -786,8 +786,8 @@ static int bpf_prog_load(union bpf_attr *attr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt >= BPF_MAXINSNS) - return -EINVAL; + if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + return -E2BIG; if (type == BPF_PROG_TYPE_KPROBE && attr->kern_version != LINUX_VERSION_CODE) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cb37339ca0da..da9fb2a9b7eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3133,9 +3133,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) struct bpf_verifier_env *env; int ret = -EINVAL; - if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) - return -E2BIG; - /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ -- cgit v1.2.3 From 166ad0e1e2132ff0cda08b94af8301655fcabbcd Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Dec 2016 14:44:36 -0800 Subject: kcov: add missing #include <linux/sched.h> In __sanitizer_cov_trace_pc we use task_struct and fields within it, but as we haven't included <linux/sched.h>, it is not guaranteed to be defined. While we usually happen to acquire the definition through a transitive include, this is fragile (and hasn't been true in the past, causing issues with backports). Include <linux/sched.h> to avoid any fragility. [mark.rutland@arm.com: rewrote changelog] Link: http://lkml.kernel.org/r/1481007384-27529-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mark Rutland Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 30e6d05aa5a9..3cbb0c879705 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -7,6 +7,7 @@ #include #include #include +#include <linux/sched.h> #include #include #include -- cgit v1.2.3 From 777c6e0daebb3fcefbbd6f620410a946b07ef6d0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 7 Dec 2016 14:54:38 +0100 Subject: hotplug: Make register and unregister notifier API symmetric Yu Zhao has noticed that __unregister_cpu_notifier only unregisters its notifiers when HOTPLUG_CPU=y while the registration might succeed even when HOTPLUG_CPU=n if MODULE is enabled. This means that e.g. zswap might keep a stale notifier on the list on the manual clean up during the pool tear down and thus corrupt the list.
Resulting in the following [ 144.964346] BUG: unable to handle kernel paging request at ffff880658a2be78 [ 144.971337] IP: [] raw_notifier_chain_register+0x1b/0x40 [ 145.122628] Call Trace: [ 145.125086] [] __register_cpu_notifier+0x18/0x20 [ 145.131350] [] zswap_pool_create+0x273/0x400 [ 145.137268] [] __zswap_param_set+0x1fc/0x300 [ 145.143188] [] ? trace_hardirqs_on+0xd/0x10 [ 145.149018] [] ? kernel_param_lock+0x28/0x30 [ 145.154940] [] ? __might_fault+0x4f/0xa0 [ 145.160511] [] zswap_compressor_param_set+0x17/0x20 [ 145.167035] [] param_attr_store+0x5c/0xb0 [ 145.172694] [] module_attr_store+0x1d/0x30 [ 145.178443] [] sysfs_kf_write+0x4f/0x70 [ 145.183925] [] kernfs_fop_write+0x149/0x180 [ 145.189761] [] __vfs_write+0x18/0x40 [ 145.194982] [] vfs_write+0xb2/0x1a0 [ 145.200122] [] SyS_write+0x52/0xa0 [ 145.205177] [] entry_SYSCALL_64_fastpath+0x12/0x17 This can be even triggered manually by changing /sys/module/zswap/parameters/compressor multiple times. Fix this issue by making unregister APIs symmetric to the register so there are no surprises. Fixes: 47e627bc8c9a ("[PATCH] hotplug: Allow modules to use the cpu hotplug notifiers even if !CONFIG_HOTPLUG_CPU") Reported-and-tested-by: Yu Zhao Signed-off-by: Michal Hocko Cc: linux-mm@kvack.org Cc: Andrew Morton Cc: Dan Streetman Link: http://lkml.kernel.org/r/20161207135438.4310-1-mhocko@kernel.org Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 15 ++++----------- kernel/cpu.c | 2 +- 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index b886dc17f2f3..e571128ad99a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -93,22 +93,16 @@ extern bool cpuhp_tasks_frozen; { .notifier_call = fn, .priority = pri }; \ __register_cpu_notifier(&fn##_nb); \ } -#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ -#define cpu_notifier(fn, pri) do { (void)(fn); } while (0) -#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) -#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ -#ifdef CONFIG_HOTPLUG_CPU extern int register_cpu_notifier(struct notifier_block *nb); extern int __register_cpu_notifier(struct notifier_block *nb); extern void unregister_cpu_notifier(struct notifier_block *nb); extern void __unregister_cpu_notifier(struct notifier_block *nb); -#else -#ifndef MODULE -extern int register_cpu_notifier(struct notifier_block *nb); -extern int __register_cpu_notifier(struct notifier_block *nb); -#else +#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ +#define cpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) + static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; @@ -118,7 +112,6 @@ static inline int __register_cpu_notifier(struct notifier_block *nb) { return 0; } -#endif static inline void unregister_cpu_notifier(struct notifier_block *nb) { diff --git a/kernel/cpu.c b/kernel/cpu.c index 29de1a9352c0..217fd2e7f435 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -659,7 +659,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); } -#ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) @@ -676,6 +675,7 @@ void __unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(__unregister_cpu_notifier); +#ifdef CONFIG_HOTPLUG_CPU /** * clear_tasks_mm_cpumask - Safely clear 
tasks' mm_cpumask for a CPU * @cpu: a CPU id -- cgit v1.2.3 From 1da5c46fa965ff90f5ffc080b6ab3fae5e227bc3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:50:57 +0100 Subject: kthread: Make struct kthread kmalloc'ed commit 23196f2e5f5d "kthread: Pin the stack via try_get_task_stack() / put_task_stack() in to_live_kthread() function" is a workaround for the fragile design of struct kthread being allocated on the task stack. struct kthread in its current form should be removed, but this needs cleanups outside of kthread.c. As a first step move struct kthread away from the task stack by making it kmalloc'ed. This allows access to kthread.exited without the magic of trying to pin the task stack and the try logic in to_live_kthread(). Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Chunming Zhou Cc: Roman Pen Cc: Petr Mladek Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175057.GA5330@redhat.com Signed-off-by: Thomas Gleixner --- include/linux/kthread.h | 1 + kernel/fork.c | 2 ++ kernel/kthread.c | 58 ++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 48 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index a6e82a69c363..c1c3e63d52c1 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -48,6 +48,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), __k; \ }) +void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); int kthread_stop(struct task_struct *k); diff --git a/kernel/fork.c b/kernel/fork.c index 600e93b5e539..7ffa16033ded 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -354,6 +354,8 @@ void free_task(struct task_struct *tsk) ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); arch_release_task_struct(tsk); + if (tsk->flags & PF_KTHREAD) + free_kthread_struct(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); diff --git a/kernel/kthread.c b/kernel/kthread.c index be2cc1f9dd57..9d64b6526d0b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -53,14 +53,38 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define __to_kthread(vfork) \ - container_of(vfork, struct kthread, exited) +static inline void set_kthread_struct(void *kthread) +{ + /* + * We abuse ->set_child_tid to avoid the new member and because it + * can't be wrongly copied by copy_process(). We also rely on fact + * that the caller can't exec, so PF_KTHREAD can't be cleared. + */ + current->set_child_tid = (__force void __user *)kthread; +} static inline struct kthread *to_kthread(struct task_struct *k) { - return __to_kthread(k->vfork_done); + WARN_ON(!(k->flags & PF_KTHREAD)); + return (__force void *)k->set_child_tid; +} + +void free_kthread_struct(struct task_struct *k) +{ + /* + * Can be NULL if this kthread was created by kernel_thread() + * or if kmalloc() in kthread() failed. + */ + kfree(to_kthread(k)); } +#define __to_kthread(vfork) \ + container_of(vfork, struct kthread, exited) + +/* + * TODO: kill it and use to_kthread(). But we still need the users + * like kthread_stop() which has to sync with the exiting kthread. + */ static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); @@ -181,14 +205,11 @@ static int kthread(void *_create) int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; - struct kthread self; + struct kthread *self; int ret; - self.flags = 0; - self.data = data; - init_completion(&self.exited); - init_completion(&self.parked); - current->vfork_done = &self.exited; + self = kmalloc(sizeof(*self), GFP_KERNEL); + set_kthread_struct(self); /* If user was SIGKILLed, I release the structure. */ done = xchg(&create->done, NULL); @@ -196,6 +217,19 @@ static int kthread(void *_create) kfree(create); do_exit(-EINTR); } + + if (!self) { + create->result = ERR_PTR(-ENOMEM); + complete(done); + do_exit(-ENOMEM); + } + + self->flags = 0; + self->data = data; + init_completion(&self->exited); + init_completion(&self->parked); + current->vfork_done = &self->exited; + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; @@ -203,12 +237,10 @@ static int kthread(void *_create) schedule(); ret = -EINTR; - - if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { - __kthread_parkme(&self); + if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { + __kthread_parkme(self); ret = threadfn(data); } - /* we can't just return, we must preserve "self" on stack */ do_exit(ret); } -- cgit v1.2.3 From eff9662547f358239b98dfc4a8e6905b494e14d6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:51:00 +0100 Subject: Revert "kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function" This reverts commit 23196f2e5f5d810578a772785807dcdc2b9fdce9. Now that struct kthread is kmalloc'ed and no longer on the task stack, there is no need anymore to pin the stack.
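[editor: for reference, a condensed sketch of the heap-based lookup this revert relies on, taken from the previous patch:]

	/* struct kthread is now kmalloc'ed; the pointer is stashed in the
	 * otherwise-unused ->set_child_tid field, so it stays reachable
	 * without pinning the task stack. */
	static inline struct kthread *to_kthread(struct task_struct *k)
	{
		WARN_ON(!(k->flags & PF_KTHREAD));
		return (__force void *)k->set_child_tid;
	}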
Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Cc: Chunming Zhou Cc: Roman Pen Cc: Petr Mladek Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175100.GA5333@redhat.com Signed-off-by: Thomas Gleixner --- kernel/kthread.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9d64b6526d0b..7891a940007d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -88,7 +88,7 @@ void free_kthread_struct(struct task_struct *k) static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork) && try_get_task_stack(k)) + if (likely(vfork)) return __to_kthread(vfork); return NULL; } @@ -473,10 +473,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_live_kthread(k); - if (kthread) { + if (kthread) __kthread_unpark(k, kthread); - put_task_stack(k); - } } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -505,7 +503,6 @@ int kthread_park(struct task_struct *k) wait_for_completion(&kthread->parked); } } - put_task_stack(k); ret = 0; } return ret; @@ -541,7 +538,6 @@ int kthread_stop(struct task_struct *k) __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); - put_task_stack(k); } ret = k->exit_code; put_task_struct(k); -- cgit v1.2.3 From efb29fbfa50c490dac64a9418ebe553be82df781 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:51:03 +0100 Subject: kthread: Don't use to_live_kthread() in kthread_stop() kthread_stop() had to use to_live_kthread() simply because it was not possible to access kthread->exited after the exiting task clears task_struct->vfork_done. Now that to_kthread() is always valid, wake_up_process() + wait_for_completion() can be done unconditionally. It's not an issue anymore if the task has already issued complete_vfork_done() or died. The exiting task can get the spurious wakeup after mm_release() but this is possible without this change too and is fine; do_task_dead() ensures that this can't do any harm. As a further enhancement this could be converted to task_work_add() later, so ->vfork_done can be avoided completely.
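[editor: abridged from the diff below; with kthread->exited off the task stack, the stop path needs no liveness check:]

	kthread = to_kthread(k);
	set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
	__kthread_unpark(k, kthread);
	wake_up_process(k);			/* at worst a spurious wakeup */
	wait_for_completion(&kthread->exited);	/* valid even after exit */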
Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Chunming Zhou Cc: Roman Pen Cc: Petr Mladek Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175103.GA5336@redhat.com Signed-off-by: Thomas Gleixner --- kernel/kthread.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 7891a940007d..4dcbc8b5d6b6 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -532,13 +532,11 @@ int kthread_stop(struct task_struct *k) trace_sched_kthread_stop(k); get_task_struct(k); - kthread = to_live_kthread(k); - if (kthread) { - set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - __kthread_unpark(k, kthread); - wake_up_process(k); - wait_for_completion(&kthread->exited); - } + kthread = to_kthread(k); + set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); + __kthread_unpark(k, kthread); + wake_up_process(k); + wait_for_completion(&kthread->exited); ret = k->exit_code; put_task_struct(k); -- cgit v1.2.3 From cf380a4a96e2260742051fa7fc831596bb26cc8b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:51:07 +0100 Subject: kthread: Don't use to_live_kthread() in kthread_[un]park() Now that to_kthread() is always valid, change kthread_park() and kthread_unpark() to use it and kill to_live_kthread(). The conversion of kthread_unpark() is trivial. If KTHREAD_IS_PARKED is set then the task has called complete(&self->parked) and therefore the function cannot race against a concurrent kthread_stop() and exit. kthread_park() is more tricky, because its semantics are not well defined. It returns -ENOSYS if the thread exited, but this can never happen, and as Roman pointed out kthread_park() can obviously block forever if it would race with the exiting kthread. The usage of kthread_park() in cpuhp code (cpu.c, smpboot.c, stop_machine.c) is fine. It can never see an exiting/exited kthread, smpboot_destroy_threads() clears *ht->store, smpboot_park_thread() checks it is not NULL under the same smpboot_threads_lock. cpuhp_threads and cpu_stop_threads never exit, so other callers are fine too. But it has two more users: - watchdog_park_threads(): The code is actually correct, get_online_cpus() ensures that kthread_park() can't race with itself (note that kthread_park() can't handle this race correctly), but it should not use kthread_park() directly. - drivers/gpu/drm/amd/scheduler/gpu_scheduler.c should not use kthread_park() either. kthread_park() must not be called after amd_sched_fini() which does kthread_stop(), otherwise even to_live_kthread() is not safe because task_struct can be already freed and sched->thread can point to nowhere. The usage of kthread_park/unpark should either be restricted to core code which is properly protected against the exit race or made more robust so it is safe to use it in drivers. To catch possible exit issues, add a WARN_ON(PF_EXITING) for now.
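[editor: a hypothetical misuse to make the race concrete; "worker" is a made-up task pointer, not from the patch:]

	kthread_stop(worker);
	/* The thread can no longer call complete(&kthread->parked), so
	 * before this patch the park below could block forever; with the
	 * new WARN_ON(PF_EXITING) check it warns and returns -ENOSYS. */
	kthread_park(worker);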
Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Chunming Zhou Cc: Roman Pen Cc: Petr Mladek Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175107.GA5339@redhat.com Signed-off-by: Thomas Gleixner --- kernel/kthread.c | 69 ++++++++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 4dcbc8b5d6b6..01d27164e5b7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -78,21 +78,6 @@ void free_kthread_struct(struct task_struct *k) kfree(to_kthread(k)); } -#define __to_kthread(vfork) \ - container_of(vfork, struct kthread, exited) - -/* - * TODO: kill it and use to_kthread(). But we still need the users - * like kthread_stop() which has to sync with the exiting kthread. - */ -static struct kthread *to_live_kthread(struct task_struct *k) -{ - struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) - return __to_kthread(vfork); - return NULL; -} - /** * kthread_should_stop - should this kthread return now? * @@ -441,8 +426,18 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; } -static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) +/** + * kthread_unpark - unpark a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_park() for @k to return false, wakes it, and + * waits for it to return. If the thread is marked percpu then its + * bound to the cpu again. + */ +void kthread_unpark(struct task_struct *k) { + struct kthread *kthread = to_kthread(k); + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * We clear the IS_PARKED bit here as we don't wait @@ -460,22 +455,6 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) wake_up_state(k, TASK_PARKED); } } - -/** - * kthread_unpark - unpark a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * - * Sets kthread_should_park() for @k to return false, wakes it, and - * waits for it to return. If the thread is marked percpu then its - * bound to the cpu again. 
- */ -void kthread_unpark(struct task_struct *k) -{ - struct kthread *kthread = to_live_kthread(k); - - if (kthread) - __kthread_unpark(k, kthread); -} EXPORT_SYMBOL_GPL(kthread_unpark); /** @@ -492,20 +471,20 @@ EXPORT_SYMBOL_GPL(kthread_unpark); */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = to_live_kthread(k); - int ret = -ENOSYS; - - if (kthread) { - if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - if (k != current) { - wake_up_process(k); - wait_for_completion(&kthread->parked); - } + struct kthread *kthread = to_kthread(k); + + if (WARN_ON(k->flags & PF_EXITING)) + return -ENOSYS; + + if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + if (k != current) { + wake_up_process(k); + wait_for_completion(&kthread->parked); } - ret = 0; } - return ret; + + return 0; } EXPORT_SYMBOL_GPL(kthread_park); @@ -534,7 +513,7 @@ int kthread_stop(struct task_struct *k) get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - __kthread_unpark(k, kthread); + kthread_unpark(k); wake_up_process(k); wait_for_completion(&kthread->exited); ret = k->exit_code; -- cgit v1.2.3 From 8fb9dcbdc3619741c10c573199d804161c34c89a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Nov 2016 18:51:10 +0100 Subject: kthread: Don't abuse kthread_create_on_cpu() in __kthread_create_worker() kthread_create_on_cpu() sets KTHREAD_IS_PER_CPU and kthread->cpu; this only makes sense if this kthread can be parked/unparked by cpuhp code. kthread workers never call kthread_parkme() so this has no effect. Change __kthread_create_worker() to simply call kthread_bind(task, cpu). The very fact that kthread_create_on_cpu() doesn't accept a generic fmt shows that it should not be used outside of smpboot.c. Now, the only reason we cannot unexport this helper and move it into smpboot.c is that it sets kthread->cpu and struct kthread is not exported. And the only reason we cannot kill kthread->cpu is that kthread_unpark() is used by drivers/gpu/drm/amd/scheduler/gpu_scheduler.c and thus we cannot turn _unpark into kthread_unpark(struct smp_hotplug_thread *, cpu). Signed-off-by: Oleg Nesterov Tested-by: Petr Mladek Acked-by: Peter Zijlstra (Intel) Reviewed-by: Petr Mladek Cc: Chunming Zhou Cc: Roman Pen Cc: Andy Lutomirski Cc: Tejun Heo Cc: Andy Lutomirski Cc: Alex Deucher Cc: Andrew Morton Link: http://lkml.kernel.org/r/20161129175110.GA5342@redhat.com Signed-off-by: Thomas Gleixner --- kernel/kthread.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 01d27164e5b7..956495f0efaf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -641,6 +641,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; + int node = -1; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -648,25 +649,17 @@ __kthread_create_worker(int cpu, unsigned int flags, kthread_init_worker(worker); - if (cpu >= 0) { - char name[TASK_COMM_LEN]; - - /* - * kthread_create_worker_on_cpu() allows to pass a generic - * namefmt in compare with kthread_create_on_cpu. We need - * to format it here.
- */ - vsnprintf(name, sizeof(name), namefmt, args); - task = kthread_create_on_cpu(kthread_worker_fn, worker, - cpu, name); - } else { - task = __kthread_create_on_node(kthread_worker_fn, worker, - -1, namefmt, args); - } + if (cpu >= 0) + node = cpu_to_node(cpu); + task = __kthread_create_on_node(kthread_worker_fn, worker, + node, namefmt, args); if (IS_ERR(task)) goto fail_task; + if (cpu >= 0) + kthread_bind(task, cpu); + worker->flags = flags; worker->task = task; wake_up_process(task); -- cgit v1.2.3 From d2a4dd37f6b41fbcad76efbf63124eb3126c66fe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 7 Dec 2016 10:57:59 -0800 Subject: bpf: fix state equivalence Commits 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") and 484611357c19 ("bpf: allow access into map value arrays") by themselves are correct, but in combination they make state equivalence ignore 'id' field of the register state which can lead to accepting an invalid program. Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") Fixes: 484611357c19 ("bpf: allow access into map value arrays") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 14 +++++++------- kernel/bpf/verifier.c | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7453c1281531..a13b031dc6b8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -18,13 +18,6 @@ struct bpf_reg_state { enum bpf_reg_type type; - /* - * Used to determine if any memory access using this register will - * result in a bad access. - */ - s64 min_value; - u64 max_value; - u32 id; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; @@ -40,6 +33,13 @@ struct bpf_reg_state { */ struct bpf_map *map_ptr; }; + u32 id; + /* Used to determine if any memory access using this register will + * result in a bad access. These two fields must be last. + * See states_equal() + */ + s64 min_value; + u64 max_value; }; enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index da9fb2a9b7eb..5b14f85f45c6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2528,7 +2528,7 @@ static bool states_equal(struct bpf_verifier_env *env, * we didn't do a variable access into a map then we are a-ok. */ if (!varlen_map_access && - rold->type == rcur->type && rold->imm == rcur->imm) + memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0) continue; /* If we didn't map access then again we don't care about the -- cgit v1.2.3 From 17bedab2723145d17b14084430743549e6943d03 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 7 Dec 2016 15:53:11 -0800 Subject: bpf: xdp: Allow head adjustment in XDP prog This patch allows XDP prog to extend/remove the packet data at the head (like adding or removing a header). It is done by adding a new XDP helper bpf_xdp_adjust_head(). It also renames bpf_helper_changes_skb_data() to bpf_helper_changes_pkt_data() to better reflect that XDP prog does not work on skb. This patch adds one "xdp_adjust_head" bit to bpf_prog for the XDP-capable driver to check if the XDP prog requires bpf_xdp_adjust_head() support. The driver can then decide to error out during XDP_SETUP_PROG. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S.
Miller --- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- arch/s390/net/bpf_jit_comp.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 5 ++++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 ++++ .../net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++++ drivers/net/ethernet/qlogic/qede/qede_main.c | 5 ++++ include/linux/filter.h | 6 +++-- include/uapi/linux/bpf.h | 11 ++++++++- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 2 ++ kernel/bpf/verifier.c | 2 +- net/core/filter.c | 28 ++++++++++++++++++++-- 13 files changed, 67 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0fe98a567125..73a5cf18fd84 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -766,7 +766,7 @@ emit_clear: func = (u8 *) __bpf_call_base + imm; /* Save skb pointer if we need to re-cache skb data */ - if (bpf_helper_changes_skb_data(func)) + if (bpf_helper_changes_pkt_data(func)) PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -775,7 +775,7 @@ emit_clear: PPC_MR(b2p[BPF_REG_0], 3); /* refresh skb cache */ - if (bpf_helper_changes_skb_data(func)) { + if (bpf_helper_changes_pkt_data(func)) { /* reload skb pointer to r3 */ PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_skb_loads(image, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bee281f3163d..167b31b186c1 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); - if (bpf_helper_changes_skb_data((void *)func)) { + if (bpf_helper_changes_pkt_data((void *)func)) { jit->seen |= SEEN_SKB_CHANGE; /* lg %b1,ST_OFF_SKBP(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fe04a04dab8e..e76d1af60f7a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_skb_data(func); + reload_skb_data = bpf_helper_changes_pkt_data(func); if (reload_skb_data) { EMIT1(0x57); /* push %rdi */ jmp_offset += 22; /* pop, mov, sub, mov */ diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 49a81f1fc1d6..f441eda63bec 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2686,6 +2686,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog) int err; int i; + if (prog && prog->xdp_adjust_head) { + en_err(priv, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + xdp_ring_num = prog ? 
priv->rx_ring_num : 0; /* No need to reconfigure buffers when simply swapping the diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 07020276fe73..cbfa38fc72c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3183,6 +3183,11 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) bool reset, was_opened; int i; + if (prog && prog->xdp_adjust_head) { + netdev_err(netdev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + mutex_lock(&priv->state_lock); if ((netdev->features & NETIF_F_LRO) && prog) { diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 00d9a03be31d..e8d448109e03 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2946,6 +2946,10 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog) }; int err; + if (prog && prog->xdp_adjust_head) { + nn_err(nn, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } if (!prog && !nn->xdp_prog) return 0; if (prog && nn->xdp_prog) { diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index cf1dd1436d93..aecdd1c5c0ea 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2507,6 +2507,11 @@ static int qede_xdp_set(struct qede_dev *edev, struct bpf_prog *prog) { struct qede_reload_args args; + if (prog && prog->xdp_adjust_head) { + DP_ERR(edev, "Does not support bpf_xdp_adjust_head()\n"); + return -EOPNOTSUPP; + } + /* If we're called, there was already a bpf reference increment */ args.func = &qede_xdp_reload_func; args.u.new_prog = prog; diff --git a/include/linux/filter.h b/include/linux/filter.h index f078d2b1cff6..6a1658308612 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -406,7 +406,8 @@ struct bpf_prog { u16 jited:1, /* Is our filter JIT'ed? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? */ + xdp_adjust_head:1; /* Adjusting pkt head? 
*/ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ @@ -440,6 +441,7 @@ struct bpf_skb_data_end { struct xdp_buff { void *data; void *data_end; + void *data_hard_start; }; /* compute the linear packet data range [data, data_end) which @@ -595,7 +597,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); -bool bpf_helper_changes_skb_data(void *func); +bool bpf_helper_changes_pkt_data(void *func); struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6123d9b8e828..0eb0e87dbe9f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -424,6 +424,12 @@ union bpf_attr { * @len: length of header to be pushed in front * @flags: Flags (unused for now) * Return: 0 on success or negative error + * + * int bpf_xdp_adjust_head(xdp_md, delta) + * Adjust the xdp_md.data by delta + * @xdp_md: pointer to xdp_md + * @delta: An positive/negative integer to be added to xdp_md.data + * Return: 0 on success or negative on error */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -469,7 +475,8 @@ union bpf_attr { FN(csum_update), \ FN(set_hash_invalid), \ FN(get_numa_node_id), \ - FN(skb_change_head), + FN(skb_change_head), \ + FN(xdp_adjust_head), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -576,6 +583,8 @@ struct bpf_sock { __u32 protocol; }; +#define XDP_PACKET_HEADROOM 256 + /* User return codes for XDP prog type. * A valid XDP program must return one of these defined values. All other * return codes are reserved for future use. 
Unknown return codes will result diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bdcc9f4ba767..83e0d153b0b4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1143,7 +1143,7 @@ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) return prog; } -bool __weak bpf_helper_changes_skb_data(void *func) +bool __weak bpf_helper_changes_pkt_data(void *func) { return false; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 88f609f1c0c3..4819ec9d95f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -579,6 +579,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_xdp_adjust_head) + prog->xdp_adjust_head = 1; if (insn->imm == BPF_FUNC_tail_call) { /* mark bpf_tail_call as different opcode * to avoid conditional branch in diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5b14f85f45c6..d28f9a3380a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1216,7 +1216,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) return -EINVAL; } - changes_data = bpf_helper_changes_skb_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(fn->func); memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; diff --git a/net/core/filter.c b/net/core/filter.c index b751202e12f8..b1461708a977 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2234,7 +2234,28 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_skb_data(void *func) +BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) +{ + void *data = xdp->data + offset; + + if (unlikely(data < xdp->data_hard_start || + data > xdp->data_end - ETH_HLEN)) + return -EINVAL; + + xdp->data = data; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { + .func = bpf_xdp_adjust_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || @@ -2244,7 +2265,8 @@ bool bpf_helper_changes_skb_data(void *func) func == bpf_skb_change_tail || func == bpf_skb_pull_data || func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace) + func == bpf_l4_csum_replace || + func == bpf_xdp_adjust_head) return true; return false; @@ -2670,6 +2692,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_xdp_adjust_head: + return &bpf_xdp_adjust_head_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 9c1645727b8fa90d07256fdfcc45bf831242a3ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 8 Dec 2016 20:49:32 +0000 Subject: timekeeping: Force unsigned clocksource to nanoseconds conversion The clocksource delta to nanoseconds conversion is using signed math, but the delta is unsigned. This makes the conversion space smaller than necessary and in case of a multiplication overflow the conversion can become negative.
The conversion is done with scaled math: s64 nsec_delta = ((s64)clkdelta * clk->mult) >> clk->shift; Shifting a signed integer right obviously preserves the sign, which has interesting consequences: - Time jumps backwards - __iter_div_u64_rem() which is used in one of the calling code paths will take forever to piecewise calculate the seconds/nanoseconds part. This has been reported by several people with different scenarios: David observed that when stopping a VM with a debugger: "It was essentially the stopped by debugger case. I forget exactly why, but the guest was being explicitly stopped from outside, it wasn't just scheduling lag. I think it was something in the vicinity of 10 minutes stopped." When lifting the stop the machine went dead. The stopped by debugger case is not really interesting, but nevertheless it would be a good thing not to die completely. But this was also observed on a live system by Liav: "When the OS is too overloaded, delta will get a high enough value for the msb of the sum delta * tkr->mult + tkr->xtime_nsec to be set, and so after the shift the nsec variable will gain a value similar to 0xffffffffff000000." Unfortunately this has been reintroduced recently with commit 6bd58f09e1d8 ("time: Add cycles to nanoseconds translation"). It had been fixed a year ago already in commit 35a4933a8959 ("time: Avoid signed overflow in timekeeping_get_ns()"). Though it's not surprising that the issue has been reintroduced because the function itself and the whole call chain use s64 for the result and the propagation of it. The change in this recent commit is subtle: s64 nsec; - nsec = (d * m + n) >> s; + nsec = d * m + n; + nsec >>= s; d being of type cycle_t adds another level of obfuscation. This wouldn't have happened if the previous change to unsigned computation had made the 'nsec' variable u64 right away and a follow up patch had cleaned up the whole call chain. There have been patches submitted which basically did a revert of the above patch leaving everything else unchanged as signed. Back to square one. This spawned an admittedly pointless discussion about potential users which rely on the unsigned behaviour until someone pointed out that it had been fixed before. The changelogs of said patches added further confusion as they ultimately made false claims about the consequences for potential users which expect signed results. Despite delta being cycle_t, aka. u64, it's very well possible to hand in a signed negative value and the signed computation will happily return the correct result. But nobody actually sat down and analyzed the code which was added as user after the probably unintended signed conversion. Though in sensitive code like this it's better to analyze it properly and make sure that nothing relies on this than hunting the subtle wreckage half a year later. After analyzing all call chains it turns out that no caller can hand in a negative value (which actually would work due to the s64 cast) and rely on the signed math to do the right thing. Change the conversion function to unsigned math. The conversion of all call chains is done in a follow up patch. This solves the starvation issue, which was caused by the negative result, but it does not solve the underlying problem. It merely postpones it. When the timekeeper update is deferred long enough that the unsigned multiplication overflows, then time going backwards is observable again.
Neither does it solve the issue of clocksources with a small counter width which will wrap around possibly several times and cause random time stamps to be generated. But those are usually not found on systems used for virtualization, so this is likely a non-issue. I took the liberty to claim authorship for this simply because analyzing all callsites and writing the changelog took substantially more time than just making the simple s/s64/u64/ change and ignoring the rest. Fixes: 6bd58f09e1d8 ("time: Add cycles to nanoseconds translation") Reported-by: David Gibson Reported-by: Liav Rehana Signed-off-by: Thomas Gleixner Reviewed-by: David Gibson Acked-by: Peter Zijlstra (Intel) Cc: Parit Bhargava Cc: Laurent Vivier Cc: "Christopher S. Hall" Cc: Chris Metcalf Cc: Richard Cochran Cc: John Stultz Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20161208204228.688545601@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b2286e94c934..bfe589e929e8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -299,10 +299,10 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, +static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, cycle_t delta) { - s64 nsec; + u64 nsec; nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; -- cgit v1.2.3 From acc89612a70e370a5640fd77a83f15b7b94d85e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 8 Dec 2016 20:49:34 +0000 Subject: timekeeping: Make the conversion call chain consistently unsigned Propagating an unsigned value through signed variables and functions makes absolutely no sense and is just prone to (re)introduce subtle signed vs. unsigned issues as happened recently. Clean it up. Signed-off-by: Thomas Gleixner Reviewed-by: David Gibson Acked-by: Peter Zijlstra (Intel) Cc: Parit Bhargava Cc: Laurent Vivier Cc: "Christopher S.
Hall" Cc: Chris Metcalf Cc: Richard Cochran Cc: Liav Rehana Cc: John Stultz Link: http://lkml.kernel.org/r/20161208204228.765843099@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bfe589e929e8..5244821643a4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -311,7 +311,7 @@ static inline u64 timekeeping_delta_to_ns(struct tk_read_base *tkr, return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) +static inline u64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; @@ -319,8 +319,8 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) return timekeeping_delta_to_ns(tkr, delta); } -static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, - cycle_t cycles) +static inline u64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) { cycle_t delta; @@ -652,7 +652,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) { struct clocksource *clock = tk->tkr_mono.clock; cycle_t cycle_now, delta; - s64 nsec; + u64 nsec; cycle_now = tk->tkr_mono.read(clock); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); @@ -681,7 +681,7 @@ int __getnstimeofday64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; - s64 nsecs = 0; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -721,7 +721,7 @@ ktime_t ktime_get(void) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; - s64 nsecs; + u64 nsecs; WARN_ON(timekeeping_suspended); @@ -764,7 +764,7 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; - s64 nsecs; + u64 nsecs; WARN_ON(timekeeping_suspended); @@ -808,7 +808,7 @@ ktime_t ktime_get_raw(void) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base; - s64 nsecs; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); @@ -833,8 +833,8 @@ void ktime_get_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 tomono; - s64 nsec; unsigned int seq; + u64 nsec; WARN_ON(timekeeping_suspended); @@ -922,8 +922,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) unsigned long seq; ktime_t base_raw; ktime_t base_real; - s64 nsec_raw; - s64 nsec_real; + u64 nsec_raw; + u64 nsec_real; cycle_t now; WARN_ON_ONCE(timekeeping_suspended); @@ -1081,7 +1081,7 @@ int get_device_system_crosststamp(int (*get_time_fn) cycle_t cycles, now, interval_start; unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; - s64 nsec_real, nsec_raw; + u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; unsigned long seq; bool do_interp; @@ -1394,7 +1394,7 @@ void getrawmonotonic64(struct timespec64 *ts) struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 ts64; unsigned long seq; - s64 nsecs; + u64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); -- cgit v1.2.3 From cbd99e3b289e43000c29aa4aa9b94b394cdc68bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 8 Dec 2016 20:49:36 +0000 Subject: timekeeping: Get rid of pointless typecasts cycle_t is defined as u64, so casting it to u64 is a pointless and confusing exercise. 
cycle_t should simply go away and be replaced with a plain u64 to avoid further confusion. Signed-off-by: Thomas Gleixner Reviewed-by: David Gibson Acked-by: Peter Zijlstra (Intel) Cc: Parit Bhargava Cc: Laurent Vivier Cc: "Christopher S. Hall" Cc: Chris Metcalf Cc: Richard Cochran Cc: Liav Rehana Cc: John Stultz Link: http://lkml.kernel.org/r/20161208204228.844699737@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5244821643a4..82e1b5cbebbb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -258,10 +258,9 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->cycle_interval = interval; /* Go back from cycles -> shifted ns */ - tk->xtime_interval = (u64) interval * clock->mult; + tk->xtime_interval = interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; - tk->raw_interval = - ((u64) interval * clock->mult) >> clock->shift; + tk->raw_interval = (interval * clock->mult) >> clock->shift; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { -- cgit v1.2.3 From c029a2bec66e42e57538cb65e28618baf6a4b311 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 8 Dec 2016 20:49:38 +0000 Subject: timekeeping: Use mul_u64_u32_shr() instead of open coding it The resume code must deal with a clocksource delta which is potentially big enough to overflow the 64bit mult. Replace the open coded handling with the proper function. Signed-off-by: Thomas Gleixner Reviewed-by: David Gibson Acked-by: Peter Zijlstra (Intel) Cc: Parit Bhargava Cc: Laurent Vivier Cc: "Christopher S. Hall" Cc: Chris Metcalf Cc: Richard Cochran Cc: Liav Rehana Cc: John Stultz Link: http://lkml.kernel.org/r/20161208204228.921674404@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 82e1b5cbebbb..da233cdf89b0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1644,7 +1644,7 @@ void timekeeping_resume(void) struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; - cycle_t cycle_now, cycle_delta; + cycle_t cycle_now; sleeptime_injected = false; read_persistent_clock64(&ts_new); @@ -1670,27 +1670,11 @@ void timekeeping_resume(void) cycle_now = tk->tkr_mono.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->tkr_mono.cycle_last) { - u64 num, max = ULLONG_MAX; - u32 mult = clock->mult; - u32 shift = clock->shift; - s64 nsec = 0; - - cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, - tk->tkr_mono.mask); - - /* - * "cycle_delta * mutl" may cause 64 bits overflow, if the - * suspended time is too long. 
In that case we need do the - * 64 bits math carefully */ - do_div(max, mult); - if (cycle_delta > max) { - num = div64_u64(cycle_delta, max); - nsec = (((u64) max * mult) >> shift) * num; - cycle_delta -= num * max; - } - nsec += ((u64) cycle_delta * mult) >> shift; + u64 nsec, cyc_delta; + cyc_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, + tk->tkr_mono.mask); + nsec = mul_u64_u32_shr(cyc_delta, clock->mult, clock->shift); ts_delta = ns_to_timespec64(nsec); sleeptime_injected = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { -- cgit v1.2.3 From 8cf868affdc459beee1a941df0cfaba1673740e3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Nov 2016 13:03:21 -0500 Subject: tracing: Have the reg function allow to fail Some tracepoints have a registration function that is called when the tracepoint is enabled. There may be cases where the registration function must fail (for example, can't allocate enough memory). In this case, the tracepoint should also fail to register, otherwise the user would not know why the tracepoint is not working. Cc: David Howells Cc: Seiji Aguchi Cc: Anton Blanchard Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/powerpc/include/asm/trace.h | 4 ++-- arch/powerpc/platforms/powernv/opal-tracepoints.c | 6 ++++-- arch/powerpc/platforms/pseries/lpar.c | 6 ++++-- arch/x86/include/asm/trace/exceptions.h | 2 +- arch/x86/include/asm/trace/irq_vectors.h | 2 +- arch/x86/kernel/tracepoint.c | 3 ++- drivers/i2c/i2c-core.c | 3 ++- include/linux/tracepoint-defs.h | 2 +- include/linux/tracepoint.h | 2 +- include/trace/events/i2c.h | 2 +- kernel/trace/trace_benchmark.c | 3 ++- kernel/trace/trace_benchmark.h | 2 +- kernel/tracepoint.c | 12 +++++++++--- samples/trace_events/trace-events-sample.c | 3 ++- samples/trace_events/trace-events-sample.h | 2 +- 15 files changed, 34 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index 32e36b16773f..c05cef6ee06c 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -54,7 +54,7 @@ DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit, ); #ifdef CONFIG_PPC_PSERIES -extern void hcall_tracepoint_regfunc(void); +extern int hcall_tracepoint_regfunc(void); extern void hcall_tracepoint_unregfunc(void); TRACE_EVENT_FN_COND(hcall_entry, @@ -104,7 +104,7 @@ TRACE_EVENT_FN_COND(hcall_exit, #endif #ifdef CONFIG_PPC_POWERNV -extern void opal_tracepoint_regfunc(void); +extern int opal_tracepoint_regfunc(void); extern void opal_tracepoint_unregfunc(void); TRACE_EVENT_FN(opal_entry, diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c index 1e496b780efd..3c447002edff 100644 --- a/arch/powerpc/platforms/powernv/opal-tracepoints.c +++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c @@ -6,9 +6,10 @@ #ifdef HAVE_JUMP_LABEL struct static_key opal_tracepoint_key = STATIC_KEY_INIT; -void opal_tracepoint_regfunc(void) +int opal_tracepoint_regfunc(void) { static_key_slow_inc(&opal_tracepoint_key); + return 0; } void opal_tracepoint_unregfunc(void) @@ -25,9 +26,10 @@ void opal_tracepoint_unregfunc(void) /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ extern long opal_tracepoint_refcount; -void opal_tracepoint_regfunc(void) +int opal_tracepoint_regfunc(void) { opal_tracepoint_refcount++; + return 0; } void opal_tracepoint_unregfunc(void) diff --git
a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index aa35245d8d6d..c0423ce3955c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -661,9 +661,10 @@ EXPORT_SYMBOL(arch_free_page); #ifdef HAVE_JUMP_LABEL struct static_key hcall_tracepoint_key = STATIC_KEY_INIT; -void hcall_tracepoint_regfunc(void) +int hcall_tracepoint_regfunc(void) { static_key_slow_inc(&hcall_tracepoint_key); + return 0; } void hcall_tracepoint_unregfunc(void) @@ -680,9 +681,10 @@ void hcall_tracepoint_unregfunc(void) /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ extern long hcall_tracepoint_refcount; -void hcall_tracepoint_regfunc(void) +int hcall_tracepoint_regfunc(void) { hcall_tracepoint_refcount++; + return 0; } void hcall_tracepoint_unregfunc(void) diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index 2fbc66c7885b..2422b14c50a7 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -6,7 +6,7 @@ #include -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_exceptions, diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h index 38a09a13a9bc..32dd6a9e343c 100644 --- a/arch/x86/include/asm/trace/irq_vectors.h +++ b/arch/x86/include/asm/trace/irq_vectors.h @@ -6,7 +6,7 @@ #include -extern void trace_irq_vector_regfunc(void); +extern int trace_irq_vector_regfunc(void); extern void trace_irq_vector_unregfunc(void); DECLARE_EVENT_CLASS(x86_irq_vector, diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1c113db9ed57..15515132bf0d 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -34,7 +34,7 @@ static void switch_idt(void *arg) local_irq_restore(flags); } -void trace_irq_vector_regfunc(void) +int trace_irq_vector_regfunc(void) { mutex_lock(&irq_vector_mutex); if (!trace_irq_vector_refcount) { @@ -44,6 +44,7 @@ void trace_irq_vector_regfunc(void) } trace_irq_vector_refcount++; mutex_unlock(&irq_vector_mutex); + return 0; } void trace_irq_vector_unregfunc(void) diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index b432b64e307a..6a2b995d7fc4 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -77,9 +77,10 @@ static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); static struct static_key i2c_trace_msg = STATIC_KEY_INIT_FALSE; static bool is_registered; -void i2c_transfer_trace_reg(void) +int i2c_transfer_trace_reg(void) { static_key_slow_inc(&i2c_trace_msg); + return 0; } void i2c_transfer_trace_unreg(void) diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 4ac89acb6136..a03192052066 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ struct static_key key; - void (*regfunc)(void); + int (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; }; diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index be586c632a0c..f72fcfe0e66a 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -81,7 +81,7 @@ static inline void tracepoint_synchronize_unregister(void) } #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -extern void syscall_regfunc(void); +extern int 
syscall_regfunc(void); extern void syscall_unregfunc(void); #endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */ diff --git a/include/trace/events/i2c.h b/include/trace/events/i2c.h index fe17187df65d..4abb8eab34d3 100644 --- a/include/trace/events/i2c.h +++ b/include/trace/events/i2c.h @@ -20,7 +20,7 @@ /* * drivers/i2c/i2c-core.c */ -extern void i2c_transfer_trace_reg(void); +extern int i2c_transfer_trace_reg(void); extern void i2c_transfer_trace_unreg(void); /* diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 0f109c4130d3..f76d0416dd83 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -164,11 +164,12 @@ static int benchmark_event_kthread(void *arg) * When the benchmark tracepoint is enabled, it calls this * function and the thread that calls the tracepoint is created. */ -void trace_benchmark_reg(void) +int trace_benchmark_reg(void) { bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); WARN_ON(!bm_event_thread); + return 0; } /* diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index 3c1df1df4e29..ebdbfc2f2a64 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -6,7 +6,7 @@ #include -extern void trace_benchmark_reg(void); +extern int trace_benchmark_reg(void); extern void trace_benchmark_unreg(void); #define BENCHMARK_EVENT_STRLEN 128 diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d0639d917899..1f9a31f934a4 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -194,9 +194,13 @@ static int tracepoint_add_func(struct tracepoint *tp, struct tracepoint_func *func, int prio) { struct tracepoint_func *old, *tp_funcs; + int ret; - if (tp->regfunc && !static_key_enabled(&tp->key)) - tp->regfunc(); + if (tp->regfunc && !static_key_enabled(&tp->key)) { + ret = tp->regfunc(); + if (ret < 0) + return ret; + } tp_funcs = rcu_dereference_protected(tp->funcs, lockdep_is_held(&tracepoints_mutex)); @@ -529,7 +533,7 @@ EXPORT_SYMBOL_GPL(for_each_kernel_tracepoint); /* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; -void syscall_regfunc(void) +int syscall_regfunc(void) { struct task_struct *p, *t; @@ -541,6 +545,8 @@ void syscall_regfunc(void) read_unlock(&tasklist_lock); } sys_tracepoint_refcount++; + + return 0; } void syscall_unregfunc(void) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 880a7d1d27d2..30e282d33d4d 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -79,7 +79,7 @@ static int simple_thread_fn(void *arg) static DEFINE_MUTEX(thread_mutex); -void foo_bar_reg(void) +int foo_bar_reg(void) { pr_info("Starting thread for foo_bar_fn\n"); /* @@ -90,6 +90,7 @@ void foo_bar_reg(void) mutex_lock(&thread_mutex); simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn"); mutex_unlock(&thread_mutex); + return 0; } void foo_bar_unreg(void) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index d6b75bb495b3..76a75ab7a608 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -354,7 +354,7 @@ TRACE_EVENT_CONDITION(foo_bar_with_cond, TP_printk("foo %s %d", __get_str(foo), __entry->bar) ); -void foo_bar_reg(void); +int foo_bar_reg(void); void foo_bar_unreg(void); /* -- cgit v1.2.3 From 1dd349ab74d278cee50fc24dc26c023f3b149642 Mon Sep 17 00:00:00 2001 
From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Nov 2016 13:17:25 -0500 Subject: tracing: Do not start benchmark on boot up Trace events are enabled very early on boot up via the boot command line parameter. The benchmark tool creates a new thread to perform the trace event benchmarking. But at start up, it is called before scheduling is set up, and because it creates a new thread before the init thread is created, this crashes the kernel. Have the benchmark fail to register when started via the kernel command line. Also, since the registering of a tracepoint can now handle failure cases, return -ENOMEM instead of warning if the thread cannot be created. Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index f76d0416dd83..2bc7dc3e8ff8 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -166,9 +166,18 @@ static int benchmark_event_kthread(void *arg) */ int trace_benchmark_reg(void) { + if (system_state != SYSTEM_RUNNING) { + pr_warning("trace benchmark cannot be started via kernel command line\n"); + return -EBUSY; + } + bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); - WARN_ON(!bm_event_thread); + if (!bm_event_thread) { + pr_warning("trace benchmark failed to create kernel thread\n"); + return -ENOMEM; + } + return 0; } @@ -183,6 +192,7 @@ void trace_benchmark_unreg(void) return; kthread_stop(bm_event_thread); + bm_event_thread = NULL; strcpy(bm_str, "START"); bm_total = 0; -- cgit v1.2.3 From 989a0a3d248192b6f5d16cc2aea95faed89bb7ce Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Nov 2016 13:54:57 -0500 Subject: tracing: Have system enable return error if one of the events fail If one of the events within a system fails to enable when "1" is written to the system "enable" file, it should return an error. Note, some events may still be enabled, but the user should know that something did go wrong. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index d35fc2b0d304..93116549a284 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -702,6 +702,7 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, struct trace_event_call *call; const char *name; int ret = -EINVAL; + int eret = 0; list_for_each_entry(file, &tr->events, list) { @@ -725,9 +726,17 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, if (event && strcmp(event, name) != 0) continue; - ftrace_event_enable_disable(file, set); + ret = ftrace_event_enable_disable(file, set); - ret = 0; + /* + * Save the first error and return that. Some events + * may still have been enabled, but let the user + * know that something went wrong. + */ + if (ret && !eret) + eret = ret; + + ret = eret; } return ret; -- cgit v1.2.3 From 9c1f6bb8c88ab6c2779bdaf12d4f3407d336f085 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 28 Nov 2016 17:48:07 -0500 Subject: tracing: Allow benchmark to be enabled at early_initcall() The trace event start up selftests fail when the trace benchmark is enabled, because it is disabled during boot. It really only needs to be disabled before scheduling is set up, as it creates a thread.
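Reduced to its essentials, the gating used in the diff below is an early_initcall() flipping a flag that the registration function checks. A sketch with hypothetical names (my_reg, my_allow_reg); only trace_benchmark_reg() in the actual diff is real:

static bool ok_to_run;

int my_reg(void)		/* hypothetical tracepoint reg function */
{
	if (!ok_to_run)		/* boot command line enables events before initcalls run */
		return -EBUSY;
	/* safe to kthread_run() a worker thread here */
	return 0;
}

static __init int my_allow_reg(void)
{
	ok_to_run = true;	/* early_initcall()s run once kthread creation works */
	return 0;
}
early_initcall(my_allow_reg);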
Signed-off-by: Steven Rostedt --- kernel/trace/trace_benchmark.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 2bc7dc3e8ff8..e3b488825ae3 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -21,6 +21,8 @@ static u64 bm_stddev; static unsigned int bm_avg; static unsigned int bm_std; +static bool ok_to_run; + /* * This gets called in a loop recording the time it took to write * the tracepoint. What it writes is the time statistics of the last @@ -166,7 +168,7 @@ static int benchmark_event_kthread(void *arg) */ int trace_benchmark_reg(void) { - if (system_state != SYSTEM_RUNNING) { + if (!ok_to_run) { pr_warning("trace benchmark cannot be started via kernel command line\n"); return -EBUSY; } @@ -207,3 +209,12 @@ void trace_benchmark_unreg(void) bm_avg = 0; bm_stddev = 0; } + +static __init int ok_to_run_trace_benchmark(void) +{ + ok_to_run = true; + + return 0; +} + +early_initcall(ok_to_run_trace_benchmark); -- cgit v1.2.3 From 656c7f0d2d2b3237a31b105d5ed217a65350104f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 8 Dec 2016 12:40:18 -0500 Subject: tracing: Replace kmap with copy_from_user() in trace_marker writing Instead of using get_user_pages_fast() and kmap_atomic() when writing to the trace_marker file, just allocate enough space on the ring buffer directly, and write into it via copy_from_user(). Writing into the trace_marker file used to allocate a temporary buffer to perform the copy_from_user(), as we didn't want to write into the ring buffer if the copy failed. But as a trace_marker write is supposed to be extremely fast, and allocating memory causes other tracepoints to trigger, Peter Zijlstra suggested using get_user_pages_fast() and kmap_atomic() to keep the user space pages in memory and read them directly. But Henrik Austad had issues with this because it required taking the mm->mmap_sem, causing long delays with the write. Instead, just allocate the space in the ring buffer and use copy_from_user() directly. If it faults, return -EFAULT and write "" into the ring buffer. Link: http://lkml.kernel.org/r/20161208124018.72dd0f86@gandalf.local.home Cc: Ingo Molnar Cc: Henrik Austad Cc: Peter Zijlstra Updates: d696b58ca2c3ca "tracing: Do not allocate buffer for trace_marker" Suggested-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 139 ++++++++++++++------------------------------------- 1 file changed, 37 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60416bf7c591..6f420d7b703b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5738,61 +5738,6 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static inline int lock_user_pages(const char __user *ubuf, size_t cnt, - struct page **pages, void **map_page, - int *offset) -{ - unsigned long addr = (unsigned long)ubuf; - int nr_pages = 1; - int ret; - int i; - - /* - * Userspace is injecting traces into the kernel trace buffer. - * We want to be as non intrusive as possible. - * To do so, we do not want to allocate any special buffers - * or take any locks, but instead write the userspace data - * straight into the ring buffer. - * - * First we need to pin the userspace buffer into memory, - * which, most likely it is, because it just referenced it. - * But there's no guarantee that it is.
By using get_user_pages_fast() - * and kmap_atomic/kunmap_atomic() we can get access to the - * pages directly. We then write the data directly into the - * ring buffer. - */ - - /* check if we cross pages */ - if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) - nr_pages = 2; - - *offset = addr & (PAGE_SIZE - 1); - addr &= PAGE_MASK; - - ret = get_user_pages_fast(addr, nr_pages, 0, pages); - if (ret < nr_pages) { - while (--ret >= 0) - put_page(pages[ret]); - return -EFAULT; - } - - for (i = 0; i < nr_pages; i++) - map_page[i] = kmap_atomic(pages[i]); - - return nr_pages; -} - -static inline void unlock_user_pages(struct page **pages, - void **map_page, int nr_pages) -{ - int i; - - for (i = nr_pages - 1; i >= 0; i--) { - kunmap_atomic(map_page[i]); - put_page(pages[i]); - } -} - static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) @@ -5802,14 +5747,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; - struct page *pages[2]; - void *map_page[2]; - int nr_pages = 1; + const char faulted[] = ""; ssize_t written; - int offset; int size; int len; +/* Used in tracing_mark_raw_write() as well */ +#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ + if (tracing_disabled) return -EINVAL; @@ -5821,30 +5766,31 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, &offset); - if (nr_pages < 0) - return nr_pages; - local_save_flags(irq_flags); - size = sizeof(*entry) + cnt + 2; /* possible \n added */ + size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */ + + /* If less than "", then make sure we can still add that */ + if (cnt < FAULTED_SIZE) + size += FAULTED_SIZE - cnt; + buffer = tr->trace_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); - if (!event) { + if (unlikely(!event)) /* Ring buffer disabled, return as if not open for write */ - written = -EBADF; - goto out_unlock; - } + return -EBADF; entry = ring_buffer_event_data(event); entry->ip = _THIS_IP_; - if (nr_pages == 2) { - len = PAGE_SIZE - offset; - memcpy(&entry->buf, map_page[0] + offset, len); - memcpy(&entry->buf[len], map_page[1], cnt - len); + len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + if (len) { + memcpy(&entry->buf, faulted, FAULTED_SIZE); + cnt = FAULTED_SIZE; + written = -EFAULT; } else - memcpy(&entry->buf, map_page[0] + offset, cnt); + written = cnt; + len = cnt; if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; @@ -5854,12 +5800,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); - written = cnt; - - *fpos += written; - - out_unlock: - unlock_user_pages(pages, map_page, nr_pages); + if (written > 0) + *fpos += written; return written; } @@ -5875,15 +5817,14 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, struct ring_buffer_event *event; struct ring_buffer *buffer; struct raw_data_entry *entry; + const char faulted[] = ""; unsigned long irq_flags; - struct page *pages[2]; - void *map_page[2]; - int nr_pages = 1; ssize_t written; - int offset; int size; int len; +#define FAULT_SIZE_ID (FAULTED_SIZE + sizeof(int)) + if (tracing_disabled) return -EINVAL; @@ -5899,38 +5840,32 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, BUILD_BUG_ON(TRACE_BUF_SIZE >= 
PAGE_SIZE); - nr_pages = lock_user_pages(ubuf, cnt, pages, map_page, &offset); - if (nr_pages < 0) - return nr_pages; - local_save_flags(irq_flags); size = sizeof(*entry) + cnt; + if (cnt < FAULT_SIZE_ID) + size += FAULT_SIZE_ID - cnt; + buffer = tr->trace_buffer.buffer; event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size, irq_flags, preempt_count()); - if (!event) { + if (!event) /* Ring buffer disabled, return as if not open for write */ - written = -EBADF; - goto out_unlock; - } + return -EBADF; entry = ring_buffer_event_data(event); - if (nr_pages == 2) { - len = PAGE_SIZE - offset; - memcpy(&entry->id, map_page[0] + offset, len); - memcpy(((char *)&entry->id) + len, map_page[1], cnt - len); + len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + if (len) { + entry->id = -1; + memcpy(&entry->buf, faulted, FAULTED_SIZE); + written = -EFAULT; } else - memcpy(&entry->id, map_page[0] + offset, cnt); + written = cnt; __buffer_unlock_commit(buffer, event); - written = cnt; - - *fpos += written; - - out_unlock: - unlock_user_pages(pages, map_page, nr_pages); + if (written > 0) + *fpos += written; return written; } -- cgit v1.2.3 From 794de08a16cf1fc1bf785dc48f66d36218cf6d88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 8 Dec 2016 20:54:49 -0500 Subject: fgraph: Handle a case where a tracer ignores set_graph_notrace Both the wakeup and irqsoff tracers can use the function graph tracer when the display-graph option is set. The problem is that they ignore the notrace file, and record the entry of functions that would be ignored by the function_graph tracer. This causes the trace->depth to be recorded into the ring buffer. The set_graph_notrace uses a trick by adding a large negative number to the trace->depth when a graph function is to be ignored. On trace output, the graph function uses the depth to record a stack of functions. But since the depth is negative, it accesses the array with a negative number and causes an out of bounds access that can cause a kernel oops or corrupt data. Have the print functions handle cases where a tracer still records functions even when they are in set_graph_notrace. Also add warnings if the depth is below zero before accessing the array. Note, the function graph logic will still prevent the return of these functions from being recorded, which means that they will be left hanging without a return. For example: # echo '*spin*' > set_graph_notrace # echo 1 > options/display-graph # echo wakeup > current_tracer # cat trace [...] _raw_spin_lock() { preempt_count_add() { do_raw_spin_lock() { update_rq_clock(); Where it should look like: _raw_spin_lock() { preempt_count_add(); do_raw_spin_lock(); } update_rq_clock(); Cc: stable@vger.kernel.org Cc: Namhyung Kim Fixes: 29ad23b00474 ("ftrace: Add set_graph_notrace filter") Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8e1a115439fa..566f7327c3aa 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -842,6 +842,10 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data = per_cpu_ptr(data->cpu_data, cpu); + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + /* * Comments display at + 1 to depth. 
Since * this is a leaf function, keep the comments @@ -850,7 +854,8 @@ print_graph_entry_leaf(struct trace_iterator *iter, cpu_data->depth = call->depth - 1; /* No need to keep this function around for this depth */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = 0; } @@ -880,11 +885,16 @@ print_graph_entry_nested(struct trace_iterator *iter, struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; + /* If a graph tracer ignored set_graph_notrace */ + if (call->depth < -1) + call->depth += FTRACE_NOTRACE_DEPTH; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); cpu_data->depth = call->depth; /* Save this function pointer to see if the exit matches */ - if (call->depth < FTRACE_RETFUNC_DEPTH) + if (call->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(call->depth < 0)) cpu_data->enter_funcs[call->depth] = call->func; } @@ -1114,7 +1124,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, */ cpu_data->depth = trace->depth - 1; - if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (trace->depth < FTRACE_RETFUNC_DEPTH && + !WARN_ON_ONCE(trace->depth < 0)) { if (cpu_data->enter_funcs[trace->depth] != trace->func) func_match = 0; cpu_data->enter_funcs[trace->depth] = 0; -- cgit v1.2.3 From 1a41442864e35bff859582fe9c5d051d0b1040ba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 8 Dec 2016 19:28:28 -0500 Subject: tracing/fgraph: Have wakeup and irqsoff tracers ignore graph functions too Currently both the wakeup and irqsoff tracers do not handle set_graph_notrace well. The ftrace infrastructure will ignore the return paths of all functions leaving them hanging without an end: # echo '*spin*' > set_graph_notrace # cat trace [...] _raw_spin_lock() { preempt_count_add() { do_raw_spin_lock() { update_rq_clock(); Where the '*spin*' functions should have looked like this: _raw_spin_lock() { preempt_count_add(); do_raw_spin_lock(); } update_rq_clock(); Instead, have the wakeup and irqsoff tracers ignore the functions that are set by set_graph_notrace, like the function_graph tracer does. Move the logic in the function_graph tracer into a header to allow wakeup and irqsoff tracers to use it as well. Cc: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 11 +++++++++++ kernel/trace/trace_functions_graph.c | 14 +++++++------- kernel/trace/trace_irqsoff.c | 12 ++++++++++++ kernel/trace/trace_sched_wakeup.c | 12 ++++++++++++ 4 files changed, 42 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 37602e722336..c2234494f40c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -846,6 +846,17 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr) return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ + +extern unsigned int fgraph_max_depth; + +static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) +{ + /* trace it when it is-nested-in or is a function enabled.
*/ + return !(trace->depth || ftrace_graph_addr(trace->func)) || + (trace->depth < 0) || + (fgraph_max_depth && trace->depth >= fgraph_max_depth); +} + #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 566f7327c3aa..d56123cdcc89 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -65,7 +65,7 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 -static unsigned int max_depth; +unsigned int fgraph_max_depth; static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -384,10 +384,10 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (!ftrace_trace_task(tr)) return 0; - /* trace it when it is-nested-in or is a function enabled. */ - if ((!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) || (trace->depth < 0) || - (max_depth && trace->depth >= max_depth)) + if (ftrace_graph_ignore_func(trace)) + return 0; + + if (ftrace_graph_ignore_irqs()) return 0; /* @@ -1500,7 +1500,7 @@ graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret) return ret; - max_depth = val; + fgraph_max_depth = val; *ppos += cnt; @@ -1514,7 +1514,7 @@ graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt, char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/ int n; - n = sprintf(buf, "%d\n", max_depth); + n = sprintf(buf, "%d\n", fgraph_max_depth); return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 03cdff84d026..86654d7e1afe 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -175,6 +175,18 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) int ret; int pc; + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + if (!func_prolog_dec(tr, &data, &flags)) return 0; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 1bf2324dc682..5d0bb025bb21 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -239,6 +239,18 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) unsigned long flags; int pc, ret = 0; + if (ftrace_graph_ignore_func(trace)) + return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. 
+ */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + if (!func_prolog_preempt_disable(tr, &data, &pc)) return 0; -- cgit v1.2.3 From f519a3f1c6b7a990e5aed37a8f853c6ecfdee945 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 8 Dec 2016 17:56:53 +0100 Subject: sched/core: Fix find_idlest_group() for fork During fork, the utilization of a task is initialized once the rq has been selected, because the current utilization level of the rq is used to set the utilization of the forked task. As the task's utilization is still 0 at this step of the fork sequence, it doesn't make sense to look for some spare capacity that can fit the task's utilization. Furthermore, I can see perf regressions for the test: hackbench -P -g 1 because the least loaded policy is always bypassed and tasks are not spread during fork. With this patch and the fix below, we are back to the same performance as v4.8. The fix below is only a temporary one used for the test until a smarter solution is found, because we can't simply remove the test, which is useful for other benchmarks | @@ -5708,13 +5708,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t | | avg_cost = this_sd->avg_scan_cost; | | - /* | - * Due to large variance we need a large fuzz factor; hackbench in | - * particularly is sensitive here. | - */ | - if ((avg_idle / 512) < avg_cost) | - return -1; | - | time = local_clock(); | | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { Tested-by: Matt Fleming Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Acked-by: Morten Rasmussen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: kernellwp@gmail.com Cc: umgwanakikbuti@gmail.com Cc: yuyang.du@intel.comc Link: http://lkml.kernel.org/r/1481216215-24651-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 18d9e75f1f6e..ebb815f6bda7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5473,13 +5473,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * utilized systems if we require spare_capacity > task_util(p), * so we allow for some task stuffing by using * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + if (this_spare > task_util(p) / 2 && imbalance*this_spare > 100*most_spare) return NULL; else if (most_spare > task_util(p) / 2) return most_spare_sg; +skip_spare: if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; -- cgit v1.2.3 From 6b94780e45c17b83e3e75f8aaca5a328db583c74 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 8 Dec 2016 17:56:54 +0100 Subject: sched/core: Use load_avg for selecting idlest group find_idlest_group() only compares the runnable_load_avg when looking for the least loaded group. But on fork-intensive use cases like hackbench, where tasks block quickly after the fork, this can lead to selecting the same CPU instead of other CPUs, which have similar runnable load but a lower load_avg. When the runnable_load_avg values of 2 CPUs are close, we now take into account the amount of blocked load as a 2nd selection factor. There are now 3 zones for the runnable_load of the rq: - [0 ..
(runnable_load - imbalance)]: Select the new rq which has significantly less runnable_load - [(runnable_load - imbalance) .. (runnable_load + imbalance)]: The runnable loads are close, so we use load_avg to choose between the 2 rqs - [(runnable_load + imbalance) .. ULONG_MAX]: Keep the current rq which has significantly less runnable_load The scale factor that is currently used for comparing runnable_load doesn't work well with small values. As an example, the use of a scaling factor fails as soon as this_runnable_load == 0, because we always select the local rq even if min_runnable_load is only 1, which doesn't really make sense because they are just the same. So instead of a scaling factor, we use an absolute margin for runnable_load to detect CPUs with similar runnable_load, and we keep using the scaling factor for blocked load. For use cases like hackbench, this enables the scheduler to select different CPUs during the fork sequence and to spread tasks across the system. Tests have been done on a Hikey board (ARM-based octo cores) for several kernels. The result below gives min, max, avg and stdev values of 18 runs with each configuration. The patches depend on the "no missing update_rq_clock()" work. hackbench -P -g 1 ea86cb4b7621 7dc603c9028e v4.8 v4.8+patches min 0.049 0.050 0.051 0.048 avg 0.057 0.057(0%) 0.057(0%) 0.055(+5%) max 0.066 0.068 0.070 0.063 stdev +/-9% +/-9% +/-8% +/-9% More performance numbers here: https://lkml.kernel.org/r/20161203214707.GI20785@codeblueprint.co.uk Tested-by: Matt Fleming Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: kernellwp@gmail.com Cc: umgwanakikbuti@gmail.com Cc: yuyang.du@intel.comc Link: http://lkml.kernel.org/r/1481216215-24651-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ebb815f6bda7..6559d197e08a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5405,16 +5405,20 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; if (sd_flag & SD_BALANCE_WAKE) load_idx = sd->wake_idx; do { - unsigned long load, avg_load, spare_cap, max_spare_cap; + unsigned long load, avg_load, runnable_load; + unsigned long spare_cap, max_spare_cap; int local_group; int i; @@ -5431,6 +5435,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * the group containing the CPU with most spare capacity.
*/ avg_load = 0; + runnable_load = 0; max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { @@ -5440,7 +5445,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, else load = target_load(i, load_idx); - avg_load += load; + runnable_load += load; + + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); spare_cap = capacity_spare_wake(i, p); @@ -5449,14 +5456,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; if (local_group) { - this_load = avg_load; + this_runnable_load = runnable_load; + this_avg_load = avg_load; this_spare = max_spare_cap; } else { - if (avg_load < min_load) { - min_load = avg_load; + if (min_runnable_load > (runnable_load + imbalance)) { + /* + * The runnable load is significantly smaller + * so we can pick this new cpu + */ + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + /* + * The runnable loads are close so take the + * blocked load into account through avg_load. + */ + min_avg_load = avg_load; idlest = group; } @@ -5482,14 +5506,23 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, goto skip_spare; if (this_spare > task_util(p) / 2 && - imbalance*this_spare > 100*most_spare) + imbalance_scale*this_spare > 100*most_spare) return NULL; - else if (most_spare > task_util(p) / 2) + + if (most_spare > task_util(p) / 2) return most_spare_sg; skip_spare: - if (!idlest || 100*this_load < imbalance*min_load) + if (!idlest) + return NULL; + + if (min_runnable_load > (this_runnable_load + imbalance)) return NULL; + + if ((this_runnable_load < (min_runnable_load + imbalance)) && + (100*this_avg_load < imbalance_scale*min_avg_load)) + return NULL; + return idlest; } -- cgit v1.2.3 From c59f29cb144a6a0dfac16ede9dc8eafc02dc56ca Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Fri, 9 Dec 2016 21:50:17 +0530 Subject: tracing: Use SOFTIRQ_OFFSET for softirq detection for more accurate results The 's' flag is supposed to indicate that a softirq is running. This can be detected by testing the preempt_count with SOFTIRQ_OFFSET. The current code tests the preempt_count with SOFTIRQ_MASK, which would be true even when softirqs are disabled but not serving a softirq. Link: http://lkml.kernel.org/r/1481300417-3564-1-git-send-email-pkondeti@codeaurora.org Signed-off-by: Pavankumar Kondeti Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6f420d7b703b..970aafe80b49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1949,7 +1949,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | - ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | (test_preempt_need_resched() ?
TRACE_FLAG_PREEMPT_RESCHED : 0); } -- cgit v1.2.3 From 99e6f6e8134b0c9d5d82eb2af1068a57199125e4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 7 Dec 2016 14:31:33 +0100 Subject: tracing/rb: Init the CPU mask on allocation Before commit b32614c03413 ("tracing/rb: Convert to hotplug state machine") the allocated cpumask was initialized to the mask of ONLINE or POSSIBLE CPUs. After the CPU hotplug changes, the buffer initialisation moved to trace_rb_cpu_prepare(), but I forgot to initially set the cpumask to zero. This is done now. Link: http://lkml.kernel.org/r/20161207133133.hzkcqfllxcdi3joz@linutronix.de Fixes: b32614c03413 ("tracing/rb: Convert to hotplug state machine") Reported-by: Tetsuo Handa Tested-by: Tetsuo Handa Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f17476f9d7f4..7edfd41d506c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1303,7 +1303,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!buffer) return NULL; - if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); -- cgit v1.2.3 From f18f97ac43d72ae84fe012dd4f19fe3f8f901469 Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Fri, 9 Dec 2016 15:19:37 +0100 Subject: tracing/kprobes: Add a helper method to return number of probe hits The number of probe hits is stored in a percpu variable and therefore can't be read directly. Add a helper method trace_kprobe_nhit() that performs the required calculation. It will be used in a follow-up commit that changes the kprobe selftests to verify the number of probe hits.
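For readers unfamiliar with percpu counters, the general pattern the helper follows looks like this (an illustrative sketch with a hypothetical counter name, not the exact kernel code; the real helper appears in the diff below):

/* Sum a percpu counter across all possible CPUs; each CPU owns its
 * own slot, so a plain read of the variable would only see one CPU's
 * share of the total.
 */
static unsigned long sum_percpu_counter(unsigned long __percpu *counter)
{
	unsigned long total = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		total += *per_cpu_ptr(counter, cpu);

	return total;
}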
Link: http://lkml.kernel.org/r/1481293178-3128-1-git-send-email-marcin.nowakowski@imgtec.com Acked-by: Masami Hiramatsu Signed-off-by: Marcin Nowakowski Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index eb6c9f1d3a93..a2af1bceca29 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -73,6 +73,17 @@ static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk) return !!strchr(trace_kprobe_symbol(tk), ':'); } +static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) +{ + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); + + return nhit; +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -882,14 +893,10 @@ static const struct file_operations kprobe_events_ops = { static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; - unsigned long nhit = 0; - int cpu; - - for_each_possible_cpu(cpu) - nhit += *per_cpu_ptr(tk->nhit, cpu); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), nhit, + trace_event_name(&tk->tp.call), + trace_kprobe_nhit(tk), tk->rp.kp.nmissed); return 0; -- cgit v1.2.3 From d4d7ccc834fe2401084e01fb043ad70ac410b19d Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Fri, 9 Dec 2016 15:19:38 +0100 Subject: kprobes/trace: Fix kprobe selftest for newer gcc Commit 265a5b7ee3eb ("kprobes/trace: Fix kprobe selftest for gcc 4.6") has added __used attribute to kprobe_trace_selftest_target to ensure that the method is listed in kallsyms table. However, even though the method remains in the kernel image, the actual call is optimized away as there are no side effects and the return value is never checked. Add a return value check and a 'noinline' attribute to ensure that an inlined copy of the method is not used by the caller. Also add checks that verify that the kprobe was really hit, as at the moment the tests show positive results despite the test method being optimized away. Finally, add __init annotations to find_trace_probe_file() and kprobe_trace_selftest_target() as they are only called from within an __init method. Link: http://lkml.kernel.org/r/1481293178-3128-2-git-send-email-marcin.nowakowski@imgtec.com Acked-by: Masami Hiramatsu Signed-off-by: Marcin Nowakowski Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a2af1bceca29..a133ecd741e4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1361,18 +1361,18 @@ fs_initcall(init_kprobe_trace); #ifdef CONFIG_FTRACE_STARTUP_TEST - /* * The "__used" keeps gcc from removing the function symbol - * from the kallsyms table. + * from the kallsyms table. 
'noinline' makes sure that there + * isn't an inlined version used by the test method below */ -static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, - int a4, int a5, int a6) +static __used __init noinline int +kprobe_trace_selftest_target(int a1, int a2, int a3, int a4, int a5, int a6) { return a1 + a2 + a3 + a4 + a5 + a6; } -static struct trace_event_file * +static struct __init trace_event_file * find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { struct trace_event_file *file; @@ -1450,12 +1450,25 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); + /* + * Not expecting an error here, the check is only to prevent the + * optimizer from removing the call to target() as otherwise there + * are no side-effects and the call is never performed. + */ + if (ret != 21) + warn++; + /* Disable trace points before removing it */ tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting test probe.\n"); warn++; } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe hits\n"); + warn++; + } + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); @@ -1469,6 +1482,11 @@ static __init int kprobe_trace_self_tests_init(void) pr_warn("error on getting 2nd test probe.\n"); warn++; } else { + if (trace_kprobe_nhit(tk) != 1) { + pr_warn("incorrect number of testprobe2 hits\n"); + warn++; + } + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); -- cgit v1.2.3 From c0b942a76361e08fc9fb17989e0f266e64ff0688 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Mon, 12 Dec 2016 16:40:39 -0800 Subject: kthread: add __printf attributes When commit fbae2d44aa1d ("kthread: add kthread_create_worker*()") introduced some kthread_create_...() functions taking printf-like parameters, it added __printf attributes to some of them (e.g. kthread_create_worker()). Nevertheless, some new functions were forgotten (they have been detected thanks to the -Wmissing-format-attribute warning flag). Add the missing __printf attributes to the newly-introduced functions in order to detect formatting issues at build time with the -Wformat flag.
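To illustrate what the attribute buys (a hypothetical caller, for illustration only, not part of this patch): once __printf(3, 4) is applied to kthread_create_worker_on_cpu(), gcc's -Wformat checking catches format/argument mismatches at compile time:

/* Hypothetical caller: with the __printf(3, 4) annotation in place,
 * gcc -Wformat warns that "%s" expects a char * while an int is
 * passed; without the annotation the mismatch would go unnoticed
 * until run time.
 */
static void example(int cpu)
{
	kthread_create_worker_on_cpu(cpu, 0, "kworker/%s", cpu);
}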
Link: http://lkml.kernel.org/r/20161126193543.22672-1-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Reviewed-by: Petr Mladek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 2 +- kernel/kthread.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c1c3e63d52c1..4fec8b775895 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -175,7 +175,7 @@ __printf(2, 3) struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...); -struct kthread_worker * +__printf(3, 4) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[], ...); diff --git a/kernel/kthread.c b/kernel/kthread.c index 956495f0efaf..2318fba86277 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -261,7 +261,8 @@ static void create_kthread(struct kthread_create_info *create) } } -static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), +static __printf(4, 0) +struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) @@ -635,7 +636,7 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); -static struct kthread_worker * +static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { -- cgit v1.2.3 From 3fb4afd9a504c2386b8435028d43283216bf588e Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskiy Date: Mon, 12 Dec 2016 16:40:42 -0800 Subject: prctl: remove one-shot limitation for changing exe link This limitation came with the reason to remove "another way for malicious code to obscure a compromised program and masquerade as a benign process" by allowing "security-concious program can use this prctl once during its early initialization to ensure the prctl cannot later be abused for this purpose": http://marc.info/?l=linux-kernel&m=133160684517468&w=2 This explanation doesn't look sufficient. The only thing the "exe" link indicates is the file used for execve, which is basically nothing and not reliable immediately after the process has returned from the execve system call. Moreover, to use this feature, all the mappings to the previous exe file have to be unmapped and all the new exe file permissions must be satisfied. This means that changing the exe link is very similar to calling execve on the binary. The need to remove this limitation comes from migration of an NFS mount point, which is not accessible during restore and is replaced by another file system. Because of this, the exe link has to be changed twice. [akpm@linux-foundation.org: fix up comment] Link: http://lkml.kernel.org/r/20160927153755.9337.69650.stgit@localhost.localdomain Signed-off-by: Stanislav Kinsburskiy Acked-by: Oleg Nesterov Acked-by: Cyrill Gorcunov Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Michal Hocko Cc: Kees Cook Cc: Andy Lutomirski Cc: John Stultz Cc: Matt Helsley Cc: Pavel Emelyanov Cc: Vlastimil Babka Cc: Eric W.
Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 +++++- kernel/sys.c | 10 ---------- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7551d3e2ab70..0e90f2973719 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -540,7 +540,11 @@ static inline int get_dumpable(struct mm_struct *mm) /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ -#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +/* + * This one-shot flag is dropped due to necessity of changing exe once again + * on NFS restore + */ +//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ diff --git a/kernel/sys.c b/kernel/sys.c index 89d5be418157..fd6f50809b6e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1696,16 +1696,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) fput(exe_file); } - /* - * The symlink can be changed only once, just to disallow arbitrary - * transitions malicious software might bring in. This means one - * could make a snapshot over all processes running and monitor - * /proc/pid/exe changes to notice unusual activity if needed. - */ - err = -EPERM; - if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit; - err = 0; /* set the new file, lockless */ get_file(exe.file); -- cgit v1.2.3 From 0f110a9b956c1678b53986b003d59794604807ba Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 12 Dec 2016 16:44:14 -0800 Subject: kernel/fork: use vfree_atomic() to free thread stack vfree() is going to use a sleeping lock. The thread stack is freed in atomic context, therefore we must use vfree_atomic() here. Link: http://lkml.kernel.org/r/1479474236-4139-6-git-send-email-hch@lst.de Signed-off-by: Andrey Ryabinin Signed-off-by: Christoph Hellwig Cc: Joel Fernandes Cc: Jisheng Zhang Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7ffa16033ded..00492b22adfe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -229,7 +229,7 @@ static inline void free_thread_stack(struct task_struct *tsk) } local_irq_restore(flags); - vfree(tsk->stack); + vfree_atomic(tsk->stack); return; } #endif -- cgit v1.2.3 From 4ca5ede07c9871c13ae422c96d6d08dbd0df5eda Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 12 Dec 2016 16:45:35 -0800 Subject: hung_task: decrement sysctl_hung_task_warnings only if it is positive Since sysctl_hung_task_warnings == -1 is allowed (infinite warnings), commit 48a6d64edadb ("hung_task: allow hung_task_panic when hung_task_warnings is 0") should decrement it only when it is not -1.
This prevents the kernel from ceasing warnings after the first 4294967295 ;) Signed-off-by: Tetsuo Handa Cc: John Siddle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2b59c82cc3e1..40c07e4fa116 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -106,7 +106,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * complain: */ if (sysctl_hung_task_warnings) { - sysctl_hung_task_warnings--; + if (sysctl_hung_task_warnings > 0) + sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", t->comm, t->pid, timeout); pr_err(" %s %s %.*s\n", -- cgit v1.2.3 From 4a998e322abc935e95efc1a8108e6102be636a43 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:40 -0800 Subject: printk/NMI: fix up handling of the full nmi log buffer vsnprintf() adds the trailing '\0' but it does not count it into the number of printed characters. The result is that there is one byte less space for the real characters in the buffer. The broken check for the free space might cause us to repeatedly try to print 1 character into the buffer, never reach the full buffer, and never count the messages as missed. Also vsnprintf() returns the number of characters that would be printed if the buffer was big enough. As a result, s->len might be bigger than the size of the buffer[*]. And the printk() function might return a bigger len than it really printed. Both problems are fixed by using vscnprintf() instead. Note that I thought about increasing the number of missed messages even when the message was shrunken. But it made the code even more complicated. I think that it is not worth it. Shrunken messages are usually easy to recognize. And it should be a corner case. [*] The overflown s->len value is crazy and unexpected. I "made a mistake" and reported this situation as an internal error when I fixed handling of PR_CONT headers in some other patch. Link: http://lkml.kernel.org/r/20161208174912.GA17042@linux.suse Signed-off-by: Petr Mladek Cc: Sergey Senozhatsky Cc: Chris Mason Cc: David Sterba Cc: Jason Wessel Cc: Josef Bacik Cc: Joe Perches Cc: Jaroslav Kysela Cc: Steven Rostedt Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 16bab471c7e2..152533edc56f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args) again: len = atomic_read(&s->len); - if (len >= sizeof(s->buffer)) { + /* The trailing '\0' is not counted into len. */ + if (len >= sizeof(s->buffer) - 1) { atomic_inc(&nmi_message_lost); return 0; } @@ -79,7 +80,7 @@ again: if (!len) smp_rmb(); - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); /* * Do it once again if the buffer has been flushed in the meantime. -- cgit v1.2.3 From 22c2c7b2ef7864389b1b75f9fd604da14b21e2c2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:44 -0800 Subject: printk/NMI: handle continuous lines and missing newline Commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") added back the KERN_CONT message header.
As a result it might appear in the middle of the line when the parts are squashed via the temporary NMI buffer. A reasonable solution seems to be to split the text in the NMI temporary buffer not only by newlines but also by the message headers. Another solution would be to filter out KERN_CONT when writing to the temporary buffer. But this would complicate the lockless handling. Also, it would not solve problems with a missing newline that existed even before the KERN_CONT stuff. This patch moves the temporary buffer handling into a separate function. I played with it and it seems that using the char pointers makes the code easier to read. Also it prints the final newline as a continuation line. Finally, it moves handling of the s->len overflow into the paranoid check. And it allows us to recover from the disaster. Link: http://lkml.kernel.org/r/1478695291-12169-2-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Cc: Joe Perches Cc: Steven Rostedt Cc: Jason Wessel Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 78 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 152533edc56f..f011aaef583c 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -114,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len) } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, - int start, int end) +/* printk part of the temporary buffer line by line */ +static int printk_nmi_flush_buffer(const char *start, size_t len) { - const char *buf = s->buffer + start; + const char *c, *end; + bool header; + + c = start; + end = start + len; + header = true; + + /* Print line by line. */ + while (c < end) { + if (*c == '\n') { + printk_nmi_flush_line(start, c - start + 1); + start = ++c; + header = true; + continue; + } + + /* Handle continuous lines or missing new line. */ + if ((c + 1 < end) && printk_get_level(c)) { + if (header) { + c = printk_skip_level(c); + continue; + } + + printk_nmi_flush_line(start, c - start); + start = c++; + header = true; + continue; + } + + header = false; + c++; + } - printk_nmi_flush_line(buf, (end - start) + 1); + /* Check if there was a partial line. Ignore pure header. */ + if (start < end && !header) { + static const char newline[] = KERN_CONT "\n"; + + printk_nmi_flush_line(start, end - start); + printk_nmi_flush_line(newline, strlen(newline)); + } + + return len; } /* @@ -136,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work) __RAW_SPIN_LOCK_INITIALIZER(read_lock); struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); unsigned long flags; - size_t len, size; - int i, last_i; + size_t len; + int i; /* * The lock has two functions. First, one reader has to flush all * ... @@ more: /* * This is just a paranoid check that nobody has manipulated * the buffer an unexpected way. If we printed something then - * @len must only increase. + * @len must only increase. Also it should never overflow the + * buffer size.
*/ - if (i && i >= len) { + if ((i && i >= len) || len > sizeof(s->buffer)) { const char *msg = "printk_nmi_flush: internal error\n"; printk_nmi_flush_line(msg, strlen(msg)); + len = 0; } if (!len) @@ -168,22 +205,7 @@ more: /* Make sure that data has been written up to the @len */ smp_rmb(); - - size = min(len, sizeof(s->buffer)); - last_i = i; - - /* Print line by line. */ - for (; i < size; i++) { - if (s->buffer[i] == '\n') { - printk_nmi_flush_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < size) { - printk_nmi_flush_seq_line(s, last_i, size - 1); - printk_nmi_flush_line("\n", strlen("\n")); - } + i += printk_nmi_flush_buffer(s->buffer + i, len - i); /* * Check that nothing has got added in the meantime and truncate -- cgit v1.2.3 From 497957576cf8a2150d723aedd74ea60b5d498bfe Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:47 -0800 Subject: printk/kdb: handle more message headers Commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") allows defining more message headers for a single message. The motivation is that continuous lines might get mixed. Therefore it makes sense to define the right log level for every piece of a cont line. This patch introduces printk_skip_headers(), which skips all headers, and uses it in the kdb code instead of printk_skip_level(). This approach helps to fix other printk_skip_level() users independently. Link: http://lkml.kernel.org/r/1478695291-12169-3-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Joe Perches Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Wessel Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 8 ++++++++ kernel/debug/kdb/kdb_io.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index eac1af8502bb..a0859e169bc3 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -31,6 +31,14 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +static inline const char *printk_skip_headers(const char *buffer) +{ + while (printk_get_level(buffer)) + buffer = printk_skip_level(buffer); + + return buffer; +} + #define CONSOLE_EXT_LOG_MAX 8192 /* printk's without a loglevel use this.. */ diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index fc1ef736253c..98c9011eac78 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -697,7 +697,7 @@ kdb_printit: * Write to all consoles. */ retlen = strlen(kdb_buffer); - cp = (char *) printk_skip_level(kdb_buffer); + cp = (char *) printk_skip_headers(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { -- cgit v1.2.3 From d06505b2a95efba25dc635b64a481e3197e9369a Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 1 Nov 2016 23:31:50 +0100 Subject: Remove last traces of ikconfig.h The build system stopped generating ikconfig.h in v2.6.8. Remove an entry for it in dontdiff. There's also a reference to it in a small comment. Remove that comment too, as it is of little help in any case.
Signed-off-by: Paul Bolle Signed-off-by: Jiri Kosina --- Documentation/dontdiff | 1 - kernel/Makefile | 2 -- 2 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 5385cba941d2..a23edccd2059 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -139,7 +139,6 @@ hpet_example hugepage-mmap hugepage-shm ihex2fw -ikconfig.h* inat-tables.c initramfs_list int16.c diff --git a/kernel/Makefile b/kernel/Makefile index eb26e12c6c2a..eaee9de224bd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,8 +115,6 @@ obj-$(CONFIG_HAS_IOMEM) += memremap.o $(obj)/configs.o: $(obj)/config_data.h -# config_data.h contains the same information as ikconfig.h but gzipped. -# Info from config_data can be extracted from /proc/config* targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) -- cgit v1.2.3 From 55a6f170a413cd8dc7a3a52e5a326e1a87579b4f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 29 Nov 2016 16:53:23 -0500 Subject: audit: move kaudit thread start from auditd registration to kaudit init (#2) Richard made this change some time ago but Eric backed it out because the rest of the supporting code wasn't ready. In order to move the netlink multicast send to kauditd_thread we need to ensure the kauditd_thread is always running, so restore commit 6ff5e459 ("audit: move kaudit thread start from auditd registration to kaudit init"). Signed-off-by: Richard Guy Briggs [PM: brought forward and merged based on Richard's old patch] Signed-off-by: Paul Moore --- kernel/audit.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a8a91bd2b2a9..d4c78ba5c4f9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -832,16 +832,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; - /* As soon as there's any sign of userspace auditd, - * start kauditd to talk to it */ - if (!kauditd_task) { - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; - } - } seq = nlh->nlmsg_seq; data = nlmsg_data(nlh); @@ -1190,6 +1180,10 @@ static int __init audit_init(void) audit_default ? "enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) + return PTR_ERR(kauditd_task); + skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); audit_initialized = AUDIT_INITIALIZED; -- cgit v1.2.3 From 6c9255645350ce2aecb7c3cd032d0e93d4a7a71a Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:23 -0500 Subject: audit: fixup audit_init() Make sure everything is initialized before we start the kauditd_thread and don't emit the "initialized" record until everything is finished. We also panic with a descriptive message if we can't start the kauditd_thread. Signed-off-by: Paul Moore --- kernel/audit.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d4c78ba5c4f9..b61642b1934f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1180,21 +1180,23 @@ static int __init audit_init(void) audit_default ? 
"enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); - kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) - return PTR_ERR(kauditd_task); - skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); - for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); + kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); + if (IS_ERR(kauditd_task)) { + int err = PTR_ERR(kauditd_task); + panic("audit: failed to start the kauditd thread (%d)\n", err); + } + + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + return 0; } __initcall(audit_init); -- cgit v1.2.3 From 4aa83872d346806d9b54768aa0d1329050542bad Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:24 -0500 Subject: audit: queue netlink multicast sends just like we do for unicast sends Sending audit netlink multicast messages is bad for all the same reasons that sending audit netlink unicast messages is bad, so this patch reworks things so that we don't do the multicast send in audit_log_end(), we do it from the dedicated kauditd_thread thread just as we do for unicast messages. See the GitHub issues below for more information/history: * https://github.com/linux-audit/audit-kernel/issues/23 * https://github.com/linux-audit/audit-kernel/issues/22 Signed-off-by: Paul Moore --- kernel/audit.c | 70 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b61642b1934f..801247a6c9e5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -511,26 +511,46 @@ static void flush_hold_queue(void) static int kauditd_thread(void *dummy) { + struct sk_buff *skb; + struct nlmsghdr *nlh; + set_freezable(); while (!kthread_should_stop()) { - struct sk_buff *skb; - flush_hold_queue(); skb = skb_dequeue(&audit_skb_queue); - if (skb) { - if (!audit_backlog_limit || - (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)) - wake_up(&audit_backlog_wait); + nlh = nlmsg_hdr(skb); + + /* if nlh->nlmsg_len is zero then we haven't attempted + * to send the message to userspace yet, if nlmsg_len + * is non-zero then we have attempted to send it to + * the multicast listeners as well as auditd; keep + * trying to send to auditd but don't repeat the + * multicast send */ + if (nlh->nlmsg_len == 0) { + nlh->nlmsg_len = skb->len; + kauditd_send_multicast_skb(skb, GFP_KERNEL); + + /* see the note in kauditd_send_multicast_skb + * regarding the nlh->nlmsg_len value and why + * it differs between the multicast and unicast + * clients */ + nlh->nlmsg_len -= NLMSG_HDRLEN; + } + if (audit_pid) kauditd_send_skb(skb); else audit_printk_skb(skb); - continue; + } else { + /* we have flushed the backlog so wake everyone up who + * is blocked and go to sleep until we have something + * in the backlog again */ + wake_up(&audit_backlog_wait); + wait_event_freezable(kauditd_wait, + skb_queue_len(&audit_skb_queue)); } - - wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue)); } return 0; } @@ -1969,10 +1989,10 @@ out: * audit_log_end - end one audit record * @ab: the audit_buffer * - * netlink_unicast() cannot be called inside an irq context because it blocks - * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed - * on a queue and a 
tasklet is scheduled to remove them from the queue outside - * the irq context. May be called in any context. + * We can not do a netlink send inside an irq context because it blocks (last + * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a + * queue and a tasklet is scheduled to remove them from the queue outside the + * irq context. May be called in any context. */ void audit_log_end(struct audit_buffer *ab) { @@ -1981,28 +2001,8 @@ void audit_log_end(struct audit_buffer *ab) if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - - nlh->nlmsg_len = ab->skb->len; - kauditd_send_multicast_skb(ab->skb, ab->gfp_mask); - - /* - * The original kaudit unicast socket sends up messages with - * nlmsg_len set to the payload length rather than the entire - * message length. This breaks the standard set by netlink. - * The existing auditd daemon assumes this breakage. Fixing - * this would require co-ordinating a change in the established - * protocol between the kaudit kernel subsystem and the auditd - * userspace code. - */ - nlh->nlmsg_len -= NLMSG_HDRLEN; - - if (audit_pid) { - skb_queue_tail(&audit_skb_queue, ab->skb); - wake_up_interruptible(&kauditd_wait); - } else { - audit_printk_skb(ab->skb); - } + skb_queue_tail(&audit_skb_queue, ab->skb); + wake_up_interruptible(&kauditd_wait); ab->skb = NULL; } audit_buffer_free(ab); -- cgit v1.2.3 From af8b824f283de5acc9b9ae8dbb60e4adacff721b Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:24 -0500 Subject: audit: rename the queues and kauditd related functions The audit queue names can be shortened and the record sending helpers associated with the kauditd task could be named better, do these small cleanups now to make life easier once we start reworking the queues and kauditd code. Signed-off-by: Paul Moore --- kernel/audit.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 801247a6c9e5..6ac1df116c0b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -138,9 +138,9 @@ static DEFINE_SPINLOCK(audit_freelist_lock); static int audit_freelist_count; static LIST_HEAD(audit_freelist); -static struct sk_buff_head audit_skb_queue; +static struct sk_buff_head audit_queue; /* queue of skbs to send to auditd when/if it comes back */ -static struct sk_buff_head audit_skb_hold_queue; +static struct sk_buff_head audit_hold_queue; static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); @@ -377,8 +377,8 @@ static void audit_hold_skb(struct sk_buff *skb) { if (audit_default && (!audit_backlog_limit || - skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)) - skb_queue_tail(&audit_skb_hold_queue, skb); + skb_queue_len(&audit_hold_queue) < audit_backlog_limit)) + skb_queue_tail(&audit_hold_queue, skb); else kfree_skb(skb); } @@ -387,7 +387,7 @@ static void audit_hold_skb(struct sk_buff *skb) * For one reason or another this nlh isn't getting delivered to the userspace * audit daemon, just send it to printk. 
*/ -static void audit_printk_skb(struct sk_buff *skb) +static void kauditd_printk_skb(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); char *data = nlmsg_data(nlh); @@ -402,7 +402,7 @@ static void audit_printk_skb(struct sk_buff *skb) audit_hold_skb(skb); } -static void kauditd_send_skb(struct sk_buff *skb) +static void kauditd_send_unicast_skb(struct sk_buff *skb) { int err; int attempts = 0; @@ -493,13 +493,13 @@ static void flush_hold_queue(void) if (!audit_default || !audit_pid) return; - skb = skb_dequeue(&audit_skb_hold_queue); + skb = skb_dequeue(&audit_hold_queue); if (likely(!skb)) return; while (skb && audit_pid) { - kauditd_send_skb(skb); - skb = skb_dequeue(&audit_skb_hold_queue); + kauditd_send_unicast_skb(skb); + skb = skb_dequeue(&audit_hold_queue); } /* @@ -518,7 +518,7 @@ static int kauditd_thread(void *dummy) while (!kthread_should_stop()) { flush_hold_queue(); - skb = skb_dequeue(&audit_skb_queue); + skb = skb_dequeue(&audit_queue); if (skb) { nlh = nlmsg_hdr(skb); @@ -540,16 +540,16 @@ static int kauditd_thread(void *dummy) } if (audit_pid) - kauditd_send_skb(skb); + kauditd_send_unicast_skb(skb); else - audit_printk_skb(skb); + kauditd_printk_skb(skb); } else { /* we have flushed the backlog so wake everyone up who * is blocked and go to sleep until we have something * in the backlog again */ wake_up(&audit_backlog_wait); wait_event_freezable(kauditd_wait, - skb_queue_len(&audit_skb_queue)); + skb_queue_len(&audit_queue)); } } return 0; @@ -865,7 +865,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); - s.backlog = skb_queue_len(&audit_skb_queue); + s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time_master; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); @@ -1200,8 +1200,8 @@ static int __init audit_init(void) audit_default ? 
"enabled" : "disabled"); register_pernet_subsys(&audit_net_ops); - skb_queue_head_init(&audit_skb_queue); - skb_queue_head_init(&audit_skb_hold_queue); + skb_queue_head_init(&audit_queue); + skb_queue_head_init(&audit_hold_queue); audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; @@ -1357,7 +1357,7 @@ static long wait_for_auditd(long sleep_time) DECLARE_WAITQUEUE(wait, current); if (audit_backlog_limit && - skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { + skb_queue_len(&audit_queue) > audit_backlog_limit) { add_wait_queue_exclusive(&audit_backlog_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); sleep_time = schedule_timeout(sleep_time); @@ -1406,7 +1406,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } while (audit_backlog_limit - && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { + && skb_queue_len(&audit_queue) > audit_backlog_limit + reserve) { if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; @@ -1419,7 +1419,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } if (audit_rate_check() && printk_ratelimit()) pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", - skb_queue_len(&audit_skb_queue), + skb_queue_len(&audit_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); audit_backlog_wait_time = 0; @@ -2001,7 +2001,7 @@ void audit_log_end(struct audit_buffer *ab) if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { - skb_queue_tail(&audit_skb_queue, ab->skb); + skb_queue_tail(&audit_queue, ab->skb); wake_up_interruptible(&kauditd_wait); ab->skb = NULL; } -- cgit v1.2.3 From c6480207fdf7b61de216ee23e93eac0a6878fa74 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:25 -0500 Subject: audit: rework the audit queue handling The audit record backlog queue has always been a bit of a mess, and the moving the multicast send into kauditd_thread() from audit_log_end() only makes things worse. This patch attempts to fix the backlog queue with a better design that should hold up better under load and have less of a performance impact at syscall invocation time. While it looks like there is a log going on in this patch, the main change is the move from a single backlog queue to three queues: * A queue for holding records generated from audit_log_end() that haven't been consumed by kauditd_thread() (audit_queue). * A queue for holding records that have been sent via multicast but had a temporary failure when sending via unicast and need a resend (audit_retry_queue). * A queue for holding records that haven't been sent via unicast because no one is listening (audit_hold_queue). Special care is taken in this patch to ensure that the proper record ordering is preserved, e.g. we send everything in the hold queue first, then the retry queue, and finally the main queue. 
Signed-off-by: Paul Moore --- kernel/audit.c | 347 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 226 insertions(+), 121 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6ac1df116c0b..f4056bc331fc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -138,11 +138,18 @@ static DEFINE_SPINLOCK(audit_freelist_lock); static int audit_freelist_count; static LIST_HEAD(audit_freelist); +/* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; -/* queue of skbs to send to auditd when/if it comes back */ +/* queue msgs due to temporary unicast send problems */ +static struct sk_buff_head audit_retry_queue; +/* queue msgs waiting for new auditd connection */ static struct sk_buff_head audit_hold_queue; + +/* queue servicing thread */ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); + +/* waitqueue for callers who are blocked on the audit backlog */ static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, @@ -364,25 +371,6 @@ static int audit_set_failure(u32 state) return audit_do_config_change("audit_failure", &audit_failure, state); } -/* - * Queue skbs to be sent to auditd when/if it comes back. These skbs should - * already have been sent via prink/syslog and so if these messages are dropped - * it is not a huge concern since we already passed the audit_log_lost() - * notification and stuff. This is just nice to get audit messages during - * boot before auditd is running or messages generated while auditd is stopped. - * This only holds messages is audit_default is set, aka booting with audit=1 - * or building your kernel that way. - */ -static void audit_hold_skb(struct sk_buff *skb) -{ - if (audit_default && - (!audit_backlog_limit || - skb_queue_len(&audit_hold_queue) < audit_backlog_limit)) - skb_queue_tail(&audit_hold_queue, skb); - else - kfree_skb(skb); -} - /* * For one reason or another this nlh isn't getting delivered to the userspace * audit daemon, just send it to printk. @@ -398,58 +386,114 @@ static void kauditd_printk_skb(struct sk_buff *skb) else audit_log_lost("printk limit exceeded"); } +} + +/** + * kauditd_hold_skb - Queue an audit record, waiting for auditd + * @skb: audit record + * + * Description: + * Queue the audit record, waiting for an instance of auditd. When this + * function is called we haven't given up yet on sending the record, but things + * are not looking good. The first thing we want to do is try to write the + * record via printk and then see if we want to try and hold on to the record + * and queue it, if we have room. If we want to hold on to the record, but we + * don't have room, record a record lost message. + */ +static void kauditd_hold_skb(struct sk_buff *skb) +{ + /* at this point it is uncertain if we will ever send this to auditd so + * try to send the message via printk before we go any further */ + kauditd_printk_skb(skb); + + /* can we just silently drop the message? 
*/ + if (!audit_default) { + kfree_skb(skb); + return; + } + + /* if we have room, queue the message */ + if (!audit_backlog_limit || + skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_hold_queue, skb); + return; + } - audit_hold_skb(skb); + /* we have no other options - drop the message */ + audit_log_lost("kauditd hold queue overflow"); + kfree_skb(skb); } -static void kauditd_send_unicast_skb(struct sk_buff *skb) +/** + * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd + * @skb: audit record + * + * Description: + * Not as serious as kauditd_hold_skb() as we still have a connected auditd, + * but for some reason we are having problems sending it audit records so + * queue the given record and attempt to resend. + */ +static void kauditd_retry_skb(struct sk_buff *skb) { - int err; - int attempts = 0; -#define AUDITD_RETRIES 5 + /* NOTE: because records should only live in the retry queue for a + * short period of time, before either being sent or moved to the hold + * queue, we don't currently enforce a limit on this queue */ + skb_queue_tail(&audit_retry_queue, skb); +} + +/** + * auditd_reset - Disconnect the auditd connection + * + * Description: + * Break the auditd/kauditd connection and move all the records in the retry + * queue into the hold queue in case auditd reconnects. + */ +static void auditd_reset(void) +{ + struct sk_buff *skb; + + /* break the connection */ + audit_pid = 0; + audit_sock = NULL; + + /* flush all of the retry queue to the hold queue */ + while ((skb = skb_dequeue(&audit_retry_queue))) + kauditd_hold_skb(skb); +} + +/** + * kauditd_send_unicast_skb - Send a record via unicast to auditd + * @skb: audit record + */ +static int kauditd_send_unicast_skb(struct sk_buff *skb) +{ + int rc; -restart: - /* take a reference in case we can't send it and we want to hold it */ + /* get an extra skb reference in case we fail to send */ skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); - if (err < 0) { - pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n", - audit_pid, err); - if (audit_pid) { - if (err == -ECONNREFUSED || err == -EPERM - || ++attempts >= AUDITD_RETRIES) { - char s[32]; - - snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid); - audit_log_lost(s); - audit_pid = 0; - audit_sock = NULL; - } else { - pr_warn("re-scheduling(#%d) write to audit_pid=%d\n", - attempts, audit_pid); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - goto restart; - } - } - /* we might get lucky and get this in the next auditd */ - audit_hold_skb(skb); - } else - /* drop the extra reference if sent ok */ + rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); + if (rc >= 0) { consume_skb(skb); + rc = 0; + } + + return rc; } /* - * kauditd_send_multicast_skb - send the skb to multicast userspace listeners + * kauditd_send_multicast_skb - Send a record to any multicast listeners + * @skb: audit record * + * Description: * This function doesn't consume an skb as might be expected since it has to * copy it anyways. 
*/ -static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) +static void kauditd_send_multicast_skb(struct sk_buff *skb) { - struct sk_buff *copy; - struct audit_net *aunet = net_generic(&init_net, audit_net_id); - struct sock *sock = aunet->nlsk; + struct sk_buff *copy; + struct audit_net *aunet = net_generic(&init_net, audit_net_id); + struct sock *sock = aunet->nlsk; + struct nlmsghdr *nlh; if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG)) return; @@ -464,94 +508,155 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb, gfp_t gfp_mask) * no reason for new multicast clients to continue with this * non-compliance. */ - copy = skb_copy(skb, gfp_mask); + copy = skb_copy(skb, GFP_KERNEL); if (!copy) return; + nlh = nlmsg_hdr(copy); + nlh->nlmsg_len = skb->len; - nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, gfp_mask); + nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL); } -/* - * flush_hold_queue - empty the hold queue if auditd appears - * - * If auditd just started, drain the queue of messages already - * sent to syslog/printk. Remember loss here is ok. We already - * called audit_log_lost() if it didn't go out normally. so the - * race between the skb_dequeue and the next check for audit_pid - * doesn't matter. +/** + * kauditd_wake_condition - Return true when it is time to wake kauditd_thread * - * If you ever find kauditd to be too slow we can get a perf win - * by doing our own locking and keeping better track if there - * are messages in this queue. I don't see the need now, but - * in 5 years when I want to play with this again I'll see this - * note and still have no friggin idea what i'm thinking today. + * Description: + * This function is for use by the wait_event_freezable() call in + * kauditd_thread(). 
*/ -static void flush_hold_queue(void) +static int kauditd_wake_condition(void) { - struct sk_buff *skb; - - if (!audit_default || !audit_pid) - return; - - skb = skb_dequeue(&audit_hold_queue); - if (likely(!skb)) - return; + static int pid_last = 0; + int rc; + int pid = audit_pid; - while (skb && audit_pid) { - kauditd_send_unicast_skb(skb); - skb = skb_dequeue(&audit_hold_queue); - } + /* wake on new messages or a change in the connected auditd */ + rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last); + if (rc) + pid_last = pid; - /* - * if auditd just disappeared but we - * dequeued an skb we need to drop ref - */ - consume_skb(skb); + return rc; } static int kauditd_thread(void *dummy) { + int rc; + int auditd = 0; + int reschedule = 0; struct sk_buff *skb; struct nlmsghdr *nlh; +#define UNICAST_RETRIES 5 +#define AUDITD_BAD(x,y) \ + ((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES) + + /* NOTE: we do invalidate the auditd connection flag on any sending + * errors, but we only "restore" the connection flag at specific places + * in the loop in order to help ensure proper ordering of audit + * records */ + set_freezable(); while (!kthread_should_stop()) { - flush_hold_queue(); + /* NOTE: possible area for future improvement is to look at + * the hold and retry queues, since only this thread + * has access to these queues we might be able to do + * our own queuing and skip some/all of the locking */ + + /* NOTE: it might be a fun experiment to split the hold and + * retry queue handling to another thread, but the + * synchronization issues and other overhead might kill + * any performance gains */ + + /* attempt to flush the hold queue */ + while (auditd && (skb = skb_dequeue(&audit_hold_queue))) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + /* requeue to the same spot */ + skb_queue_head(&audit_hold_queue, skb); + + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + auditd_reset(); + reschedule = 0; + } + } else + /* we were able to send successfully */ + reschedule = 0; + } + + /* attempt to flush the retry queue */ + while (auditd && (skb = skb_dequeue(&audit_retry_queue))) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + kauditd_hold_skb(skb); + auditd_reset(); + reschedule = 0; + } else + /* temporary problem (we hope), queue + * to the same spot and retry */ + skb_queue_head(&audit_retry_queue, skb); + } else + /* we were able to send successfully */ + reschedule = 0; + } + /* standard queue processing, try to be as quick as possible */ +quick_loop: skb = skb_dequeue(&audit_queue); if (skb) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ nlh = nlmsg_hdr(skb); - - /* if nlh->nlmsg_len is zero then we haven't attempted - * to send the message to userspace yet, if nlmsg_len - * is non-zero then we have attempted to send it to - * the multicast listeners as well as auditd; keep - * trying to send to auditd but don't repeat the - * multicast send */ - if (nlh->nlmsg_len == 0) { - nlh->nlmsg_len = skb->len; - kauditd_send_multicast_skb(skb, GFP_KERNEL); - - /* see the note in kauditd_send_multicast_skb - * regarding the nlh->nlmsg_len value and why - * it differs between the multicast and unicast - * clients */ - nlh->nlmsg_len -= NLMSG_HDRLEN; - } - - if (audit_pid) - kauditd_send_unicast_skb(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* attempt to send to any multicast listeners */ + kauditd_send_multicast_skb(skb); + + /* 
attempt to send to auditd, queue on failure */ + if (auditd) { + rc = kauditd_send_unicast_skb(skb); + if (rc) { + auditd = 0; + if (AUDITD_BAD(rc, reschedule)) { + auditd_reset(); + reschedule = 0; + } + + /* move to the retry queue */ + kauditd_retry_skb(skb); + } else + /* everything is working so go fast! */ + goto quick_loop; + } else if (reschedule) + /* we are currently having problems, move to + * the retry queue */ + kauditd_retry_skb(skb); else - kauditd_printk_skb(skb); + /* dump the message via printk and hold it */ + kauditd_hold_skb(skb); } else { - /* we have flushed the backlog so wake everyone up who - * is blocked and go to sleep until we have something - * in the backlog again */ + /* we have flushed the backlog so wake everyone */ wake_up(&audit_backlog_wait); - wait_event_freezable(kauditd_wait, - skb_queue_len(&audit_queue)); + + /* if everything is okay with auditd (if present), go + * to sleep until there is something new in the queue + * or we have a change in the connected auditd; + * otherwise simply reschedule to give things a chance + * to recover */ + if (reschedule) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } else + wait_event_freezable(kauditd_wait, + kauditd_wake_condition()); + + /* update the auditd connection status */ + auditd = (audit_pid ? 1 : 0); } } + return 0; } @@ -616,6 +721,7 @@ static int audit_send_reply_thread(void *arg) kfree(reply); return 0; } + /** * audit_send_reply - send an audit reply message via netlink * @request_skb: skb of request we are replying to (used to target the reply) @@ -1171,10 +1277,8 @@ static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); struct sock *sock = aunet->nlsk; - if (sock == audit_sock) { - audit_pid = 0; - audit_sock = NULL; - } + if (sock == audit_sock) + auditd_reset(); RCU_INIT_POINTER(aunet->nlsk, NULL); synchronize_net(); @@ -1201,6 +1305,7 @@ static int __init audit_init(void) register_pernet_subsys(&audit_net_ops); skb_queue_head_init(&audit_queue); + skb_queue_head_init(&audit_retry_queue); skb_queue_head_init(&audit_hold_queue); audit_initialized = AUDIT_INITIALIZED; audit_enabled = audit_default; -- cgit v1.2.3 From 3197542482df22c2a131d4a813280bd7c54cedf5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:25 -0500 Subject: audit: rework audit_log_start() The backlog queue handling in audit_log_start() is a little odd with some questionable design decisions, this patch attempts to rectify this with the following changes: * Never make auditd wait, ignore any backlog limits as we need auditd awake so it can drain the backlog queue. * When we hit a backlog limit and start dropping records, don't wake all the tasks sleeping on the backlog, that's silly. Instead, let kauditd_thread() take care of waking everyone once it has had a chance to drain the backlog queue. * Don't keep a global backlog timeout countdown, make it per-task. A per-task timer means we won't have all the sleeping tasks waking at the same time and hammering on an already stressed backlog queue. Signed-off-by: Paul Moore --- kernel/audit.c | 92 +++++++++++++++++++++++----------------------------------- 1 file changed, 36 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f4056bc331fc..e23ce6ce101f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -107,7 +107,6 @@ static u32 audit_rate_limit; * When set to zero, this means unlimited. 
*/ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) -static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; /* The identity of the user shutting down the audit system. */ @@ -345,7 +344,7 @@ static int audit_set_backlog_limit(u32 limit) static int audit_set_backlog_wait_time(u32 timeout) { return audit_do_config_change("audit_backlog_wait_time", - &audit_backlog_wait_time_master, timeout); + &audit_backlog_wait_time, timeout); } static int audit_set_enabled(u32 state) @@ -973,7 +972,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_queue); s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; - s.backlog_wait_time = audit_backlog_wait_time_master; + s.backlog_wait_time = audit_backlog_wait_time; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } @@ -1454,24 +1453,6 @@ static inline void audit_get_stamp(struct audit_context *ctx, } } -/* - * Wait for auditd to drain the queue a little - */ -static long wait_for_auditd(long sleep_time) -{ - DECLARE_WAITQUEUE(wait, current); - - if (audit_backlog_limit && - skb_queue_len(&audit_queue) > audit_backlog_limit) { - add_wait_queue_exclusive(&audit_backlog_wait, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); - sleep_time = schedule_timeout(sleep_time); - remove_wait_queue(&audit_backlog_wait, &wait); - } - - return sleep_time; -} - /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) @@ -1490,12 +1471,9 @@ static long wait_for_auditd(long sleep_time) struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { - struct audit_buffer *ab = NULL; - struct timespec t; - unsigned int uninitialized_var(serial); - int reserve = 5; /* Allow atomic callers to go up to five - entries over the normal backlog limit */ - unsigned long timeout_start = jiffies; + struct audit_buffer *ab; + struct timespec t; + unsigned int uninitialized_var(serial); if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1503,38 +1481,40 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; - if (gfp_mask & __GFP_DIRECT_RECLAIM) { - if (audit_pid && audit_pid == current->tgid) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; - else - reserve = 0; - } - - while (audit_backlog_limit - && skb_queue_len(&audit_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { - long sleep_time; - - sleep_time = timeout_start + audit_backlog_wait_time - jiffies; - if (sleep_time > 0) { - sleep_time = wait_for_auditd(sleep_time); - if (sleep_time > 0) - continue; + /* don't ever fail/sleep on auditd since we need auditd to drain the + * queue; also, when we are checking for auditd, compare PIDs using + * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() using + * a PID anchored in the caller's namespace */ + if (!(audit_pid && audit_pid == task_tgid_vnr(current))) { + long sleep_time = audit_backlog_wait_time; + + while (audit_backlog_limit && + (skb_queue_len(&audit_queue) > audit_backlog_limit)) { + /* wake kauditd to try and flush the queue */ + wake_up_interruptible(&kauditd_wait); + + /* sleep if we are allowed and we haven't exhausted our + * backlog wait limit */ + if ((gfp_mask & __GFP_DIRECT_RECLAIM) && + (sleep_time > 0)) { + DECLARE_WAITQUEUE(wait, current); + + 
add_wait_queue_exclusive(&audit_backlog_wait, + &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + sleep_time = schedule_timeout(sleep_time); + remove_wait_queue(&audit_backlog_wait, &wait); + } else { + if (audit_rate_check() && printk_ratelimit()) + pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", + skb_queue_len(&audit_queue), + audit_backlog_limit); + audit_log_lost("backlog limit exceeded"); + return NULL; } } - if (audit_rate_check() && printk_ratelimit()) - pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n", - skb_queue_len(&audit_queue), - audit_backlog_limit); - audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = 0; - wake_up(&audit_backlog_wait); - return NULL; } - if (!reserve && !audit_backlog_wait_time) - audit_backlog_wait_time = audit_backlog_wait_time_master; - ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); @@ -1542,9 +1522,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_log_format(ab, "audit(%lu.%03lu:%u): ", t.tv_sec, t.tv_nsec/1000000, serial); + return ab; } -- cgit v1.2.3 From e1d166212894d9d959a601c4802882b877bb420a Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:25 -0500 Subject: audit: wake up kauditd_thread after auditd registers This patch was suggested by Richard Briggs back in 2015; see the link to the mail archive below. Unfortunately, that patch is no longer even remotely valid due to other changes to the code. * https://www.redhat.com/archives/linux-audit/2015-October/msg00075.html Suggested-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e23ce6ce101f..0572e5dcfda7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1009,6 +1009,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = new_pid; audit_nlk_portid = NETLINK_CB(skb).portid; audit_sock = skb->sk; + wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); -- cgit v1.2.3 From 6c54e7899693dee3db67ea996e9be0e10f67920f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:26 -0500 Subject: audit: handle a clean auditd shutdown with grace When auditd stops cleanly it sets 'auditd_pid' to 0 with an AUDIT_SET message; in this case we should reset our backlog queues via the auditd_reset() function. This patch also adds an 'auditd_pid' check to the top of kauditd_send_unicast_skb() so we can fail quicker.
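For context, a rough userspace sketch of the deregistration message a cleanly exiting auditd sends; this is illustrative only, not part of the patch, and the netlink socket setup is assumed to exist elsewhere:

	/* Hypothetical helper: a clean auditd shutdown deregisters itself by
	 * sending AUDIT_SET with pid == 0, which now triggers auditd_reset(). */
	#include <linux/audit.h>
	#include <linux/netlink.h>
	#include <string.h>
	#include <sys/socket.h>

	static int auditd_deregister(int nlsock)
	{
		struct {
			struct nlmsghdr nlh;
			struct audit_status s;
		} req;

		memset(&req, 0, sizeof(req));
		req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.s));
		req.nlh.nlmsg_type = AUDIT_SET;
		req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		req.s.mask = AUDIT_STATUS_PID;
		req.s.pid = 0;	/* pid == 0 signals a clean shutdown */

		return send(nlsock, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
	}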
Signed-off-by: Paul Moore --- kernel/audit.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0572e5dcfda7..b447a6b1fdc8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -468,6 +468,10 @@ static int kauditd_send_unicast_skb(struct sk_buff *skb) { int rc; + /* if we know nothing is connected, don't even try the netlink call */ + if (!audit_pid) + return -ECONNREFUSED; + /* get an extra skb reference in case we fail to send */ skb_get(skb); rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); @@ -1009,6 +1013,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = new_pid; audit_nlk_portid = NETLINK_CB(skb).portid; audit_sock = skb->sk; + if (!new_pid) + auditd_reset(); wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { -- cgit v1.2.3 From a09cfa470817ac086cf68418da13a2b91c2744ef Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 29 Nov 2016 16:53:26 -0500 Subject: audit: don't ever sleep on a command record/message Sleeping on a command record/message in audit_log_start() could keep something, e.g. auditd, from doing something important, e.g. a clean shutdown, which could present problems on a heavily loaded system. This patch allows tasks to bypass any queue restrictions if they are logging a command record/message. Signed-off-by: Paul Moore --- kernel/audit.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index b447a6b1fdc8..f20eee0db7e6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1488,11 +1488,19 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE))) return NULL; - /* don't ever fail/sleep on auditd since we need auditd to drain the - * queue; also, when we are checking for auditd, compare PIDs using - * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() using - * a PID anchored in the caller's namespace */ + /* don't ever fail/sleep on these two conditions: + * 1. auditd generated record - since we need auditd to drain the + * queue; also, when we are checking for auditd, compare PIDs using + * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() + * using a PID anchored in the caller's namespace + * 2. audit command message - record types 1000 through 1099 inclusive + * are command messages/records used to manage the kernel subsystem + * and the audit userspace, blocking on these messages could cause + * problems under load so don't do it (note: not all of these + * command types are valid as record types, but it is quicker to + * just check two ints than a series of ints in a if/switch stmt) */ + if (!((audit_pid && audit_pid == task_tgid_vnr(current)) || + (type >= 1000 && type <= 1099))) { long sleep_time = audit_backlog_wait_time; while (audit_backlog_limit && -- cgit v1.2.3 From 533c7b69c764ad5febb3e716899f43a75564fcab Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 13 Dec 2016 10:03:01 -0500 Subject: audit: use proper refcount locking on audit_sock Resetting audit_sock appears to be racy. audit_sock was being copied and dereferenced without using a refcount on the source sock. Bump the refcount on the underlying sock when we store a reference in audit_sock and release it when we reset audit_sock. audit_sock modification needs the audit_cmd_mutex.
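Distilled, the handoff pattern looks like the sketch below; auditd_set() is an illustrative name, not a helper introduced by this patch:

	/* Sketch of the refcounted handoff; callers hold audit_cmd_mutex. */
	static void auditd_set(struct sock *sk)
	{
		if (audit_sock)
			sock_put(audit_sock);	/* drop the reference on the old sock */
		if (sk)
			sock_hold(sk);		/* take a reference before publishing */
		audit_sock = sk;
	}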
See: https://lkml.org/lkml/2016/11/26/232 Thanks to Eric Dumazet and Cong Wang for ideas on how to fix it. Signed-off-by: Richard Guy Briggs Reviewed-by: Cong Wang [PM: fixed the comment block text formatting for auditd_reset()] Signed-off-by: Paul Moore --- kernel/audit.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f20eee0db7e6..41017685f9f2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -445,15 +445,20 @@ static void kauditd_retry_skb(struct sk_buff *skb) * * Description: * Break the auditd/kauditd connection and move all the records in the retry - * queue into the hold queue in case auditd reconnects. + * queue into the hold queue in case auditd reconnects. The audit_cmd_mutex + * must be held when calling this function. */ static void auditd_reset(void) { struct sk_buff *skb; /* break the connection */ + if (audit_sock) { + sock_put(audit_sock); + audit_sock = NULL; + } audit_pid = 0; - audit_sock = NULL; + audit_nlk_portid = 0; /* flush all of the retry queue to the hold queue */ while ((skb = skb_dequeue(&audit_retry_queue))) @@ -579,7 +584,9 @@ static int kauditd_thread(void *dummy) auditd = 0; if (AUDITD_BAD(rc, reschedule)) { + mutex_lock(&audit_cmd_mutex); auditd_reset(); + mutex_unlock(&audit_cmd_mutex); reschedule = 0; } } else @@ -594,7 +601,9 @@ static int kauditd_thread(void *dummy) auditd = 0; if (AUDITD_BAD(rc, reschedule)) { kauditd_hold_skb(skb); + mutex_lock(&audit_cmd_mutex); auditd_reset(); + mutex_unlock(&audit_cmd_mutex); reschedule = 0; } else /* temporary problem (we hope), queue @@ -623,7 +632,9 @@ quick_loop: if (rc) { auditd = 0; if (AUDITD_BAD(rc, reschedule)) { + mutex_lock(&audit_cmd_mutex); auditd_reset(); + mutex_unlock(&audit_cmd_mutex); reschedule = 0; } @@ -1010,11 +1021,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); - audit_pid = new_pid; - audit_nlk_portid = NETLINK_CB(skb).portid; - audit_sock = skb->sk; - if (!new_pid) + if (new_pid) { + if (audit_sock) + sock_put(audit_sock); + audit_pid = new_pid; + audit_nlk_portid = NETLINK_CB(skb).portid; + sock_hold(skb->sk); + audit_sock = skb->sk; + } else { auditd_reset(); + } wake_up_interruptible(&kauditd_wait); } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { @@ -1283,8 +1299,10 @@ static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); struct sock *sock = aunet->nlsk; + mutex_lock(&audit_cmd_mutex); if (sock == audit_sock) auditd_reset(); + mutex_unlock(&audit_cmd_mutex); RCU_INIT_POINTER(aunet->nlsk, NULL); synchronize_net(); -- cgit v1.2.3 From 4d1f0fb096aedea7bb5489af93498a82e467c480 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 14 Dec 2016 15:04:04 -0800 Subject: kernel/watchdog: use nmi registers snapshot in hardlockup handler The NMI handler doesn't call set_irq_regs(); it's set only by normal IRQ. Thus get_irq_regs() returns NULL or a stale registers snapshot with IP/SP pointing to the code interrupted by the IRQ which was interrupted by the NMI. NULL isn't a problem: in this case the watchdog calls dump_stack() and prints the full stack trace including the NMI. But if we're stuck in an IRQ handler then the NMI watchdog will print a stack trace without the IRQ part at all. This patch uses the registers snapshot passed into the NMI handler as an argument: these registers point exactly to the instruction interrupted by the NMI.
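A minimal sketch of the fixed reporting path; report_hardlockup() is an illustrative extraction, not a function added by the patch:

	/* @regs comes straight from the perf NMI callback, so it points at
	 * the instruction the NMI interrupted - no get_irq_regs() needed. */
	static void report_hardlockup(struct pt_regs *regs, int this_cpu)
	{
		pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
		if (regs)
			show_regs(regs);	/* exact NMI-time context */
		else
			dump_stack();		/* fall back to the current stack */
	}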
Fixes: 55537871ef66 ("kernel/watchdog.c: perform all-CPU backtrace in case of hard lockup") Link: http://lkml.kernel.org/r/146771764784.86724.6006627197118544150.stgit@buzz Signed-off-by: Konstantin Khlebnikov Cc: Jiri Kosina Cc: Ulrich Obergfell Cc: Aaron Tomlin Cc: [4.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9acb29f280ec..6d1020c03d41 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -344,7 +344,6 @@ static void watchdog_overflow_callback(struct perf_event *event, */ if (is_hardlockup()) { int this_cpu = smp_processor_id(); - struct pt_regs *regs = get_irq_regs(); /* only print hardlockups once */ if (__this_cpu_read(hard_watchdog_warn) == true) -- cgit v1.2.3 From c7be96af89d4b53211862d8599b2430e8900ed92 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 14 Dec 2016 15:04:10 -0800 Subject: signals: avoid unnecessary taking of sighand->siglock When running a certain database workload on a high-end system with many CPUs, it was found that spinlock contention in the sigprocmask syscalls became a significant portion of the overall CPU cycles, as shown below.

  9.30%  9.30%  905387  dataserver  /proc/kcore  0x7fff8163f4d2  [k] _raw_spin_lock_irq
         |
         ---_raw_spin_lock_irq
            |
            |--99.34%-- __set_current_blocked
            |           sigprocmask
            |           sys_rt_sigprocmask
            |           system_call_fastpath
            |           |
            |           |--50.63%-- __swapcontext
            |           |           |
            |           |           |--99.91%-- upsleepgeneric
            |           |
            |           |--49.36%-- __setcontext
            |                       ktskRun

Looking further into the swapcontext function in glibc, it was found that the function always calls sigprocmask() without checking if there are changes in the signal mask. A check was added to the __set_current_blocked() function to avoid taking the sighand->siglock spinlock if there is no change in the signal mask. This will prevent unneeded spinlock contention when many threads are trying to call sigprocmask(). With this patch applied, the spinlock contention in sigprocmask() was gone. Link: http://lkml.kernel.org/r/1474979209-11867-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Waiman Long Acked-by: Oleg Nesterov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Stas Sergeev Cc: Scott J Norton Cc: Douglas Hatch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 17 +++++++++++++++++ kernel/signal.c | 7 +++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index b63f63eaa39c..5308304993be 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -97,6 +97,23 @@ static inline int sigisemptyset(sigset_t *set) } } +static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) +{ + switch (_NSIG_WORDS) { + case 4: + return (set1->sig[3] == set2->sig[3]) && + (set1->sig[2] == set2->sig[2]) && + (set1->sig[1] == set2->sig[1]) && + (set1->sig[0] == set2->sig[0]); + case 2: + return (set1->sig[1] == set2->sig[1]) && + (set1->sig[0] == set2->sig[0]); + case 1: + return set1->sig[0] == set2->sig[0]; + } + return 0; +} + #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS diff --git a/kernel/signal.c b/kernel/signal.c index 29a410780aa9..ae60996fedff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; + /* + * In case the signal mask hasn't changed, there is nothing we need + * to do.
The current->blocked shouldn't be modified by other task. */ + if (sigequalsets(&tsk->blocked, newset)) + return; + spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, newset); spin_unlock_irq(&tsk->sighand->siglock); -- cgit v1.2.3 From 760c6a9139c37e16502362b22656d0cc4e840e8f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 14 Dec 2016 15:04:14 -0800 Subject: coredump: clarify "unsafe core_pattern" warning I was amused to find the "unsafe core_pattern" warning while having these lines in /etc/sysctl.conf:

  fs.suid_dumpable=2
  kernel.core_pattern=/core/core-%e-%p-%E
  kernel.core_uses_pid=0

Turns out the kernel is formally right. Default core_pattern is just "core", which doesn't qualify as a secure path while setting fs.suid_dumpable. Hint admins about the solution, clarify the sysctl names, delete unnecessary '\' characters (string literals are concatenated regardless), and reformat for easier grepping. Link: http://lkml.kernel.org/r/20161029152124.GA1258@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 39b3368f6de6..1475d2545b7e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void) #ifdef CONFIG_COREDUMP if (suid_dumpable == SUID_DUMP_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING "Unsafe core_pattern used with "\ - "suid_dumpable=2. Pipe handler or fully qualified "\ - "core dump path required.\n"); + printk(KERN_WARNING +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); } #endif } -- cgit v1.2.3 From 401721ecd1dcb0a428aa5d6832ee05ffbdbffbbe Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 14 Dec 2016 15:04:20 -0800 Subject: kexec: export the value of phys_base instead of symbol address Currently in x86_64, the symbol address of phys_base is exported to vmcoreinfo. Dave Anderson complained this is really useless for his Crash implementation. In the user-space utilities Crash and Makedumpfile, which the exported vmcore information is mainly used for, the value of phys_base is needed to convert the virtual address of an exported kernel symbol to a physical address. Take init_level4_pgt especially: if we want to access and walk the page table to look up the PA corresponding to a VA, we first need to calculate page_dir = SYMBOL(init_level4_pgt) - __START_KERNEL_map + phys_base; Now in Crash and Makedumpfile, we have to analyze the vmcore ELF program headers to get the value of phys_base. As Dave said, it would be preferable if it were readily available in vmcoreinfo rather than depending upon the PT_LOAD semantics. Hence this patch changes to exporting the value of phys_base instead of its virtual address. And people also complained that the KERNEL_IMAGE_SIZE export is x86_64 only and should be moved into the arch-dependent function arch_crash_save_vmcoreinfo(). Do the moving in this patch. Link: http://lkml.kernel.org/r/1478568596-30060-2-git-send-email-bhe@redhat.com Signed-off-by: Baoquan He Cc: Thomas Garnier Cc: Baoquan He Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Eric Biederman Cc: Xunlei Pang Cc: HATAYAMA Daisuke Cc: Kees Cook Cc: Eugene Surovegin Cc: Dave Young Cc: AKASHI Takahiro Cc: Atsushi Kumagai Cc: Dave Anderson Cc: Pratyush Anand Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 3 ++- kernel/kexec_core.c | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 5a294e48b185..307b1f4543de 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { - VMCOREINFO_SYMBOL(phys_base); + VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA @@ -337,6 +337,7 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); + VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 561675589511..8ad3a29eb728 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1467,9 +1467,6 @@ static int __init crash_save_vmcoreinfo_init(void) #endif VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_X86 - VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); -#endif #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); #endif -- cgit v1.2.3 From 8e53c073a41795365eb0e47ae23441dc01c70511 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Wed, 14 Dec 2016 15:04:23 -0800 Subject: kexec: add cond_resched into kimage_alloc_crash_control_pages A soft lookup will occur when I run trinity in syscall kexec_load. the corresponding stack information is as follows. BUG: soft lockup - CPU#6 stuck for 22s! [trinity-c6:13859] Kernel panic - not syncing: softlockup: hung tasks CPU: 6 PID: 13859 Comm: trinity-c6 Tainted: G O L ----V------- 3.10.0-327.28.3.35.zhongjiang.x86_64 #1 Hardware name: Huawei Technologies Co., Ltd. Tecal BH622 V2/BC01SRSA0, BIOS RMIBV386 06/30/2014 Call Trace: dump_stack+0x19/0x1b panic+0xd8/0x214 watchdog_timer_fn+0x1cc/0x1e0 __hrtimer_run_queues+0xd2/0x260 hrtimer_interrupt+0xb0/0x1e0 ? call_softirq+0x1c/0x30 local_apic_timer_interrupt+0x37/0x60 smp_apic_timer_interrupt+0x3f/0x60 apic_timer_interrupt+0x6d/0x80 ? kimage_alloc_control_pages+0x80/0x270 ? kmem_cache_alloc_trace+0x1ce/0x1f0 ? do_kimage_alloc_init+0x1f/0x90 kimage_alloc_init+0x12a/0x180 SyS_kexec_load+0x20a/0x260 system_call_fastpath+0x16/0x1b the first time allocation of control pages may take too much time because crash_res.end can be set to a higher value. we need to add cond_resched to avoid the issue. The patch have been tested and above issue is not appear. Link: http://lkml.kernel.org/r/1481164674-42775-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Acked-by: "Eric W. 
Biederman" Cc: Xunlei Pang Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8ad3a29eb728..5617cc412444 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, while (hole_end <= crashk_res.end) { unsigned long i; + cond_resched(); + if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ -- cgit v1.2.3 From 7560ef39dc0bfebba8f43766d8bb4c1ec5eb66fc Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 14 Dec 2016 15:04:26 -0800 Subject: sysctl: add KERN_CONT to deprecated_sysctl_warning() Do not break lines while printk()ing values. kernel: warning: process `tomoyo_file_tes' used the deprecated sysctl system call with kernel: 3. kernel: 5. kernel: 56. kernel: Link: http://lkml.kernel.org/r/1480814833-4976-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_binary.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6eb99c17dbd8..ece4b177052b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen) "warning: process `%s' used the deprecated sysctl " "system call with ", current->comm); for (i = 0; i < nlen; i++) - printk("%d.", name[i]); - printk("\n"); + printk(KERN_CONT "%d.", name[i]); + printk(KERN_CONT "\n"); } return; } -- cgit v1.2.3 From 9a29d0fbc2d9ad99fb8a981ab72548cc360e9d4c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Dec 2016 15:05:38 -0800 Subject: relay: check array offset before using it Smatch complains that we started using the array offset before we checked that it was valid. Fixes: 017c59c042d0 ('relay: Use per CPU constructs for the relay channel buffer pointers') Link: http://lkml.kernel.org/r/20161013084947.GC16198@mwanda Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index da79a109dbeb..8f18d314a96a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan, { struct rchan_buf *buf; - if (!chan) + if (!chan || cpu >= NR_CPUS) return; buf = *per_cpu_ptr(chan->buf, cpu); - if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) + if (!buf || subbufs_consumed > chan->n_subbufs) return; if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) -- cgit v1.2.3 From db862358a4a96f52d3b0c713c703828f90d97de9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 14 Dec 2016 15:05:46 -0800 Subject: kcov: add more missing includes It is fragile that some definitions acquired via transitive dependencies, as shown in below: atomic_* () ENOMEM/EN* () EXPORT_SYMBOL () device_initcall () preempt_* () Include them to prevent possible issues. 
Link: http://lkml.kernel.org/r/1481163221-40170-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Suggested-by: Mark Rutland Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Mark Rutland Cc: James Morse Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 3cbb0c879705..cc2fa35ca480 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1,11 +1,16 @@ #define pr_fmt(fmt) "kcov: " fmt #define DISABLE_BRANCH_PROFILING +#include <linux/atomic.h> #include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/init.h> #include <linux/mm.h> +#include <linux/preempt.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/spinlock.h> -- cgit v1.2.3 From 2d13bb6494c807bcf3f78af0e96c0b8615a94385 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 14 Dec 2016 15:05:49 -0800 Subject: kernel/debug/debug_core.c: more properly delay for secondary CPUs We've got a delay loop waiting for secondary CPUs. That loop uses loops_per_jiffy. However, loops_per_jiffy doesn't actually mean how many tight loops make up a jiffy on all architectures. It is quite common to see things like this in the boot log: Calibrating delay loop (skipped), value calculated using timer frequency.. 48.00 BogoMIPS (lpj=24000) In my case I was seeing lots of cases where other CPUs timed out entering the debugger only to print their stack crawls shortly after the kdb> prompt was written. Elsewhere in kgdb we already use udelay(), so that should be safe enough to use to implement our timeout. We'll delay 1 ms for 1000 times, which should give us a full second of delay (just like the old code wanted) but allow us to notice that we're done every 1 ms. [akpm@linux-foundation.org: simplifications, per Daniel] Link: http://lkml.kernel.org/r/1477091361-2039-1-git-send-email-dianders@chromium.org Signed-off-by: Douglas Anderson Reviewed-by: Daniel Thompson Cc: Jason Wessel Cc: Brian Norris Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/debug/debug_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0874e2edd275..79517e5549f1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -598,11 +598,11 @@ return_normal: /* * Wait for the other CPUs to be notified and be waiting for us: */ - time_left = loops_per_jiffy * HZ; + time_left = MSEC_PER_SEC; while (kgdb_do_roundup && --time_left && (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != online_cpus) - cpu_relax(); + udelay(1000); if (!time_left) pr_crit("Timed out waiting for secondary CPUs.\n"); -- cgit v1.2.3 From d1bd8ead126668a2d6c42d97cc3664e95b3fa1dc Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Dec 2016 15:05:52 -0800 Subject: kdb: remove unused kdb_event handling The kdb_event state variable is only set but never checked in the kernel code. http://www.spinics.net/lists/kdb/msg01733.html suggests that this variable affected WARN_CONSOLE_UNLOCKED() in the original implementation. But this check never went upstream. The semantics are unclear and racy. The value is updated after the kdb_printf_lock is acquired and after it is released. It should be symmetric at minimum. The value should be manipulated either inside or outside the locked area. Fortunately, it seems that the original function is gone and we could simply remove the state variable.
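Schematically, the removed pattern (a restatement of the hunks below, not new code) shows the asymmetry:

	spin_lock_irqsave(&kdb_printf_lock, flags);
	atomic_inc(&kdb_event);		/* bumped inside the locked region */
	/* ... print ... */
	spin_unlock_irqrestore(&kdb_printf_lock, flags);
	atomic_dec(&kdb_event);		/* dropped after the lock is released */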
Link: http://lkml.kernel.org/r/1480412276-16690-2-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Suggested-by: Daniel Thompson Cc: Jason Wessel Cc: Peter Zijlstra Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdb.h | 1 - kernel/debug/kdb/kdb_io.c | 2 -- kernel/debug/kdb/kdb_main.c | 1 - 3 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 410decacff8f..eb706188dc23 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -77,7 +77,6 @@ extern int kdb_poll_idx; * number whenever the kernel debugger is entered. */ extern int kdb_initial_cpu; -extern atomic_t kdb_event; /* Types and messages used for dynamically added kdb shell commands */ diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 98c9011eac78..46f477bebe0c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -576,7 +576,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) KDB_STATE_SET(PRINTF_LOCK); spin_lock_irqsave(&kdb_printf_lock, flags); got_printf_lock = 1; - atomic_inc(&kdb_event); } else { __acquire(kdb_printf_lock); } @@ -851,7 +850,6 @@ kdb_print_out: got_printf_lock = 0; spin_unlock_irqrestore(&kdb_printf_lock, flags); KDB_STATE_CLEAR(PRINTF_LOCK); - atomic_dec(&kdb_event); } else { __release(kdb_printf_lock); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2a20c0dfdafc..ca183919d302 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -60,7 +60,6 @@ int kdb_grep_trailing; * Kernel debugger state flags */ int kdb_flags; -atomic_t kdb_event; /* * kdb_lock protects updates to kdb_initial_cpu. Used to -- cgit v1.2.3 From d5d8d3d0d4adcc3aec6e2e0fb656165014a712b7 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Dec 2016 15:05:55 -0800 Subject: kdb: properly synchronize vkdb_printf() calls with other CPUs kdb_printf_lock does not prevent other CPUs from entering the critical section because it is ignored when KDB_STATE_PRINTF_LOCK is set. The problematic situation might look like:

   CPU0                                        CPU1

   vkdb_printf()
     if (!KDB_STATE(PRINTF_LOCK))
       KDB_STATE_SET(PRINTF_LOCK);
       spin_lock_irqsave(&kdb_printf_lock, flags);

                                               vkdb_printf()
                                                 if (!KDB_STATE(PRINTF_LOCK))

BANG: The PRINTF_LOCK state is set and CPU1 is entering the critical section without spinning on the lock. The problem is that the code tries to implement locking using two state variables that are not handled atomically. Well, we need custom locking because we want to allow reentering the critical section on the very same CPU. Let's use the solution from Peter Zijlstra that was proposed for a similar scenario, see https://lkml.kernel.org/r/20161018171513.734367391@infradead.org This patch uses the same trick with cmpxchg(). The only difference is that we want to handle only recursion from the same context and therefore we disable interrupts. In addition, KDB_STATE_PRINTF_LOCK is removed. In fact, we are not able to set it in a non-racy way.
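The core of the scheme, pulled out of the hunks below for readability (a sketch, not an additional change):

	/* A CPU "owns" the printf path; the same CPU may re-enter,
	 * every other CPU spins until the owner releases it. */
	static int kdb_printf_cpu = -1;
	unsigned long flags;
	int this_cpu, old_cpu;

	local_irq_save(flags);
	this_cpu = smp_processor_id();
	for (;;) {
		old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu);
		if (old_cpu == -1 || old_cpu == this_cpu)
			break;	/* acquired, or recursing on this CPU */
		cpu_relax();
	}
	/* ... critical section ... */
	smp_store_release(&kdb_printf_cpu, old_cpu);	/* -1 unless recursive */
	local_irq_restore(flags);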
Link: http://lkml.kernel.org/r/1480412276-16690-3-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Daniel Thompson Cc: Jason Wessel Cc: Peter Zijlstra Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_io.c | 30 +++++++++++++----------------- kernel/debug/kdb/kdb_private.h | 1 - 2 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 46f477bebe0c..daa76154fb1b 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -555,16 +555,16 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int colcount; int logging, saved_loglevel = 0; int saved_trap_printk; - int got_printf_lock = 0; int retlen = 0; int fnd, len; + int this_cpu, old_cpu; + static int kdb_printf_cpu = -1; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; struct console *c = console_drivers; - static DEFINE_SPINLOCK(kdb_printf_lock); unsigned long uninitialized_var(flags); - preempt_disable(); + local_irq_save(flags); saved_trap_printk = kdb_trap_printk; kdb_trap_printk = 0; @@ -572,12 +572,13 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) * But if any cpu goes recursive in kdb, just print the output, * even if it is interleaved with any other text. */ - if (!KDB_STATE(PRINTF_LOCK)) { - KDB_STATE_SET(PRINTF_LOCK); - spin_lock_irqsave(&kdb_printf_lock, flags); - got_printf_lock = 1; - } else { - __acquire(kdb_printf_lock); + this_cpu = smp_processor_id(); + for (;;) { + old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu); + if (old_cpu == -1 || old_cpu == this_cpu) + break; + + cpu_relax(); } diag = kdbgetintenv("LINES", &linecount); @@ -846,15 +847,10 @@ kdb_print_out: suspend_grep = 0; /* end of what may have been a recursive call */ if (logging) console_loglevel = saved_loglevel; - if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { - got_printf_lock = 0; - spin_unlock_irqrestore(&kdb_printf_lock, flags); - KDB_STATE_CLEAR(PRINTF_LOCK); - } else { - __release(kdb_printf_lock); - } + /* kdb_printf_cpu locked the code above. */ + smp_store_release(&kdb_printf_cpu, old_cpu); kdb_trap_printk = saved_trap_printk; - preempt_enable(); + local_irq_restore(flags); return retlen; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 75014d7f4568..fc224fbcf954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -132,7 +132,6 @@ extern int kdb_state; #define KDB_STATE_PAGER 0x00000400 /* pager is available */ #define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching * back to initial cpu */ -#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ #define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ #define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ #define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been -- cgit v1.2.3 From 34aaff40b42148b23dcde40152480e25c7d2d759 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Dec 2016 15:05:58 -0800 Subject: kdb: call vkdb_printf() from vprintk_default() only when wanted kdb_trap_printk allows passing normal printk() messages to kdb via vkdb_printf(). For example, it is used to get a backtrace using the classic show_stack(), see kdb_show_stack(). vkdb_printf() tries to avoid a potential infinite loop by disabling the trap.
But this approach is racy, for example:

   CPU1                                        CPU2

   vkdb_printf()
     // assume that kdb_trap_printk == 0
     saved_trap_printk = kdb_trap_printk;
     kdb_trap_printk = 0;

                                               kdb_show_stack()
                                                 kdb_trap_printk++;

   Problem1: Now, a nested printk() on CPU1 calls vkdb_printf()
     even when it should have been disabled. It will not cause
     a deadlock but...

     // using the outdated saved value: 0
     kdb_trap_printk = saved_trap_printk;

                                                 kdb_trap_printk--;

   Problem2: Now, kdb_trap_printk == -1 and will stay like this.
     It means that all messages will get passed to kdb from now on.

This patch removes the racy saved_trap_printk handling. Instead, the recursion is prevented by a check for the locked CPU. The solution is still kind of racy. A non-related printk(), from another process, might get trapped by vkdb_printf(). And the wanted printk() might not get trapped because kdb_printf_cpu is assigned. But this problem existed even with the original code. A proper solution would be to get_cpu() before setting kdb_trap_printk and trap messages only from this CPU. I am not sure if it is worth the effort, though. In fact, the race is very theoretical. When kdb is running any of the commands that use kdb_trap_printk there is a single active CPU and the other CPUs should be in a holding pen inside kgdb_cpu_enter(). The only time this is violated is when there is a timeout waiting for the other CPUs to report to the holding pen. Finally, note that the situation is a bit schizophrenic. vkdb_printf() explicitly allows recursion but only from KDB code that calls kdb_printf() directly. On the other hand, the generic printk() recursion is not allowed because it might cause an infinite loop. This is why we could not hide the decision inside vkdb_printf() easily. Link: http://lkml.kernel.org/r/1480412276-16690-4-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Daniel Thompson Cc: Jason Wessel Cc: Peter Zijlstra Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdb.h | 1 + kernel/debug/kdb/kdb_io.c | 9 ++------- kernel/printk/printk.c | 3 ++- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index eb706188dc23..68bd88223417 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -161,6 +161,7 @@ enum kdb_msgsrc { }; extern int kdb_trap_printk; +extern int kdb_printf_cpu; extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list args); extern __printf(1, 2) int kdb_printf(const char *, ...); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index daa76154fb1b..e74be38245ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -30,6 +30,7 @@ char kdb_prompt_str[CMD_BUFLEN]; int kdb_trap_printk; +int kdb_printf_cpu = -1; static int kgdb_transition_check(char *buffer) { @@ -554,24 +555,19 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int linecount; int colcount; int logging, saved_loglevel = 0; - int saved_trap_printk; int retlen = 0; int fnd, len; int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; struct console *c = console_drivers; unsigned long uninitialized_var(flags); - local_irq_save(flags); - saved_trap_printk = kdb_trap_printk; - kdb_trap_printk = 0; - /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, * even if it is interleaved with any other text.
*/ + local_irq_save(flags); this_cpu = smp_processor_id(); for (;;) { old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu); @@ -849,7 +845,6 @@ kdb_print_out: console_loglevel = saved_loglevel; /* kdb_printf_cpu locked the code above. */ smp_store_release(&kdb_printf_cpu, old_cpu); - kdb_trap_printk = saved_trap_printk; local_irq_restore(flags); return retlen; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 577f2288d19f..a3ce35e0fa1e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1926,7 +1926,8 @@ int vprintk_default(const char *fmt, va_list args) int r; #ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); return r; } -- cgit v1.2.3 From b6f8a92c9ca835b4a079ecee8433d0d110398448 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 14 Dec 2016 15:06:13 -0800 Subject: posix-timers: give lazy compilers some help optimizing code away The OpenRISC compiler (so far) fails to optimize away a large portion of code containing a reference to posix_timer_event in alarmtimer.c when CONFIG_POSIX_TIMERS is unset. Let's give it a direct clue to let the build succeed. This fixes [linux-next:master 6682/7183] alarmtimer.c:undefined reference to `posix_timer_event' reported by kbuild test robot. Signed-off-by: Nicolas Pitre Cc: Thomas Gleixner Cc: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/alarmtimer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9b08ca391aed..3921cf7fea8e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { - if (posix_timer_event(ptr, 0) != 0) + if (IS_ENABLED(CONFIG_POSIX_TIMERS) && + posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; } -- cgit v1.2.3 From 249e52e35580fcfe5dad53a7dcd7c1252788749c Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 14 Dec 2016 15:06:21 -0800 Subject: kernel/watchdog.c: move shared definitions to nmi.h Patch series "Clean up watchdog handlers", v2. This is an attempt to clean up the watchdog handlers. Right now, kernel/watchdog.c implements both softlockup and hardlockup detectors. Softlockup code is generic. Hardlockup code is arch specific. Some architectures don't use hardlockup detectors. They use their own watchdog detectors. To make all these combinations work, we have numerous #ifdefs in kernel/watchdog.c. We are trying here to make these handlers independent of each other. Also provide an interface for architectures to implement their own handlers. watchdog_nmi_enable and watchdog_nmi_disable will be defined as weak such that architectures can override their definitions. Thanks to Don Zickus for his suggestions. Here are our previous discussions: http://www.spinics.net/lists/sparclinux/msg16543.html http://www.spinics.net/lists/sparclinux/msg16441.html This patch (of 3): Move shared macros and definitions to nmi.h so that watchdog.c, the new file watchdog_hld.c, or any other architecture-specific handler can use those definitions.
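To illustrate the weak-symbol arrangement described above (a sketch; the arch-side body is hypothetical):

	/* kernel/watchdog.c: weak no-op hooks used when an architecture
	 * provides nothing of its own. */
	int __weak watchdog_nmi_enable(unsigned int cpu)
	{
		return 0;
	}

	void __weak watchdog_nmi_disable(unsigned int cpu)
	{
	}

	/* arch code (a separate object file): a strong definition of the
	 * same symbol wins at link time, e.g. */
	int watchdog_nmi_enable(unsigned int cpu)
	{
		/* arm this architecture's own NMI/watchdog source for @cpu */
		return 0;
	}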
Link: http://lkml.kernel.org/r/1478034826-43888-2-git-send-email-babu.moger@oracle.com Signed-off-by: Babu Moger Acked-by: Don Zickus Cc: Ingo Molnar Cc: Jiri Kosina Cc: Andi Kleen Cc: Yaowei Bai Cc: Aaron Tomlin Cc: Ulrich Obergfell Cc: Tejun Heo Cc: Hidehiro Kawai Cc: Josh Hunt Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 24 ++++++++++++++++++++++++ kernel/watchdog.c | 28 ++++------------------------ 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a78c35cff1ae..aacca824a6ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -7,6 +7,23 @@ #include #include +/* + * The run state of the lockup detectors is controlled by the content of the + * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - + * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. + * + * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' + * are variables that are only used as an 'interface' between the parameters + * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The + * 'watchdog_thresh' variable is handled differently because its value is not + * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' + * is equal zero. + */ +#define NMI_WATCHDOG_ENABLED_BIT 0 +#define SOFT_WATCHDOG_ENABLED_BIT 1 +#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) +#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * @@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long watchdog_enabled; extern unsigned long *watchdog_cpumask_bits; +#ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 +#endif +extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6d1020c03d41..d21049496e26 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -27,29 +27,12 @@ #include #include -/* - * The run state of the lockup detectors is controlled by the content of the - * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. - * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. 
- */ -#define NMI_WATCHDOG_ENABLED_BIT 0 -#define SOFT_WATCHDOG_ENABLED_BIT 1 -#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) -#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) - static DEFINE_MUTEX(watchdog_proc_mutex); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; #else -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif int __read_mostly nmi_watchdog_enabled; int __read_mostly soft_watchdog_enabled; @@ -59,9 +42,6 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#else -#define sysctl_softlockup_all_cpu_backtrace 0 -#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -289,7 +269,7 @@ void touch_softlockup_watchdog_sync(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static bool is_hardlockup(void) +bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -- cgit v1.2.3 From 73ce0511c43686095efd2f65ef564aab952e07bc Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 14 Dec 2016 15:06:24 -0800 Subject: kernel/watchdog.c: move hardlockup detector to separate file Separate hardlockup code from watchdog.c and move it to watchdog_hld.c. It is mostly straightforward. Remove everything inside the CONFIG_HARDLOCKUP_DETECTOR blocks. This code will go to the file watchdog_hld.c. Also update the Makefile accordingly. Link: http://lkml.kernel.org/r/1478034826-43888-3-git-send-email-babu.moger@oracle.com Signed-off-by: Babu Moger Acked-by: Don Zickus Cc: Ingo Molnar Cc: Jiri Kosina Cc: Andi Kleen Cc: Yaowei Bai Cc: Aaron Tomlin Cc: Ulrich Obergfell Cc: Tejun Heo Cc: Hidehiro Kawai Cc: Josh Hunt Cc: "David S.
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/watchdog.c | 241 +++----------------------------------------------- kernel/watchdog_hld.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 239 insertions(+), 230 deletions(-) create mode 100644 kernel/watchdog_hld.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index eb26e12c6c2a..314e7d62f5f0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d21049496e26..d4b0fa01cae3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,7 +24,6 @@ #include #include -#include #include static DEFINE_MUTEX(watchdog_proc_mutex); @@ -80,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static DEFINE_PER_CPU(bool, hard_watchdog_warn); -static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); -static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); -#endif static unsigned long soft_lockup_nmi_warn; -/* boot commands */ -/* - * Should we panic when a soft-lockup or hard-lockup occurs: - */ -#ifdef CONFIG_HARDLOCKUP_DETECTOR -unsigned int __read_mostly hardlockup_panic = - CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; -static unsigned long hardlockup_allcpu_dumped; -/* - * We may not want to enable hard lockup detection by default in all cases, - * for example when running the kernel as a guest on a hypervisor. In these - * cases this function can be called to disable hard lockup detection. This - * function should only be executed once by the boot processor before the - * kernel command line parameters are parsed, because otherwise it is not - * possible to override this in hardlockup_panic_setup(). - */ -void hardlockup_detector_disable(void) -{ - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; -} - -static int __init hardlockup_panic_setup(char *str) -{ - if (!strncmp(str, "panic", 5)) - hardlockup_panic = 1; - else if (!strncmp(str, "nopanic", 7)) - hardlockup_panic = 0; - else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; - else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; - return 1; -} -__setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -244,30 +202,12 @@ void touch_all_softlockup_watchdogs(void) wq_watchdog_touch(-1); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -void touch_nmi_watchdog(void) -{ - /* - * Using __raw here because some code paths have - * preemption enabled. If preemption is enabled - * then interrupts should be enabled too, in which - * case we shouldn't have to worry about the watchdog - * going off. 
- */ - raw_cpu_write(watchdog_nmi_touch, true); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -#endif - void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(watchdog_touch_ts, 0); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ bool is_hardlockup(void) { @@ -279,7 +219,6 @@ bool is_hardlockup(void) __this_cpu_write(hrtimer_interrupts_saved, hrint); return false; } -#endif static int is_softlockup(unsigned long touch_ts) { @@ -293,77 +232,22 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -#ifdef CONFIG_HARDLOCKUP_DETECTOR - -static struct perf_event_attr wd_hw_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES, - .size = sizeof(struct perf_event_attr), - .pinned = 1, - .disabled = 1, -}; - -/* Callback function for perf event subsystem */ -static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - /* Ensure the watchdog never gets throttled */ - event->hw.interrupts = 0; - - if (__this_cpu_read(watchdog_nmi_touch) == true) { - __this_cpu_write(watchdog_nmi_touch, false); - return; - } - - /* check for a hardlockup - * This is done by making sure our timer interrupt - * is incrementing. The timer interrupt should have - * fired multiple times before we overflow'd. If it hasn't - * then this is a good indication the cpu is stuck - */ - if (is_hardlockup()) { - int this_cpu = smp_processor_id(); - - /* only print hardlockups once */ - if (__this_cpu_read(hard_watchdog_warn) == true) - return; - - pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - - /* - * Perform all-CPU dump only once to avoid multiple hardlockups - * generating interleaving traces - */ - if (sysctl_hardlockup_all_cpu_backtrace && - !test_and_set_bit(0, &hardlockup_allcpu_dumped)) - trigger_allbutself_cpu_backtrace(); - - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); - - __this_cpu_write(hard_watchdog_warn, true); - return; - } - - __this_cpu_write(hard_watchdog_warn, false); - return; -} -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static void watchdog_interrupt_count(void) { __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_nmi_enable(unsigned int cpu); -static void watchdog_nmi_disable(unsigned int cpu); +/* + * These two functions are mostly architecture specific + * defining them as weak here. + */ +int __weak watchdog_nmi_enable(unsigned int cpu) +{ + return 0; +} +void __weak watchdog_nmi_disable(unsigned int cpu) +{ +} static int watchdog_enable_all_cpus(void); static void watchdog_disable_all_cpus(void); @@ -556,109 +440,6 @@ static void watchdog(unsigned int cpu) watchdog_nmi_disable(cpu); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long cpu0_err; - -static int watchdog_nmi_enable(unsigned int cpu) -{ - struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* is it already setup and enabled? 
*/ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); - - /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); - - /* save cpu0 error for future comparision */ - if (cpu == 0 && IS_ERR(event)) - cpu0_err = PTR_ERR(event); - - if (!IS_ERR(event)) { - /* only print for cpu0 or different than cpu0 */ - if (cpu == 0 || cpu0_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } - - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - - /* skip displaying the same error again */ - if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Shutting down hard lockup detector on all cpus\n"); - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; -} - -static void watchdog_nmi_disable(unsigned int cpu) -{ - struct perf_event *event = per_cpu(watchdog_ev, cpu); - - if (event) { - perf_event_disable(event); - per_cpu(watchdog_ev, cpu) = NULL; - - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); - } - if (cpu == 0) { - /* watchdog_nmi_enable() expects this to be zero initially. */ - cpu0_err = 0; - } -} - -#else -static int watchdog_nmi_enable(unsigned int cpu) { return 0; } -static void watchdog_nmi_disable(unsigned int cpu) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ - static struct smp_hotplug_thread watchdog_threads = { .store = &softlockup_watchdog, .thread_should_run = watchdog_should_run, diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c new file mode 100644 index 000000000000..84016c8aee6b --- /dev/null +++ b/kernel/watchdog_hld.c @@ -0,0 +1,227 @@ +/* + * Detect hard lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks + * to those contributors as well. 
+ */ + +#define pr_fmt(fmt) "NMI watchdog: " fmt + +#include +#include +#include +#include + +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +unsigned int __read_mostly hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static unsigned long hardlockup_allcpu_dumped; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void hardlockup_detector_disable(void) +{ + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; +} + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + else if (!strncmp(str, "1", 1)) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); + +void touch_nmi_watchdog(void) +{ + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + raw_cpu_write(watchdog_nmi_touch, true); + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +static void watchdog_overflow_callback(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (__this_cpu_read(watchdog_nmi_touch) == true) { + __this_cpu_write(watchdog_nmi_touch, false); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__this_cpu_read(hard_watchdog_warn) == true) + return; + + pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + /* + * Perform all-CPU dump only once to avoid multiple hardlockups + * generating interleaving traces + */ + if (sysctl_hardlockup_all_cpu_backtrace && + !test_and_set_bit(0, &hardlockup_allcpu_dumped)) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + + __this_cpu_write(hard_watchdog_warn, true); + return; + } + + __this_cpu_write(hard_watchdog_warn, false); + return; +} + +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. 
+ */ +static unsigned long cpu0_err; + +int watchdog_nmi_enable(unsigned int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* nothing to do if the hard lockup detector is disabled */ + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + goto out; + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + + /* Try to register using hardware perf events */ + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + + if (!IS_ERR(event)) { + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); + goto out_save; + } + + /* + * Disable the hard lockup detector if _any_ CPU fails to set up + * set up the hardware perf event. The watchdog() function checks + * the NMI_WATCHDOG_ENABLED bit periodically. + * + * The barriers are for syncing up watchdog_enabled across all the + * cpus, as clear_bit() does not use barriers. + */ + smp_mb__before_atomic(); + clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); + smp_mb__after_atomic(); + + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + pr_warn("disabled (cpu%i): hardware events not enabled\n", + cpu); + else + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); + + pr_info("Shutting down hard lockup detector on all cpus\n"); + + return PTR_ERR(event); + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +void watchdog_nmi_disable(unsigned int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. */ + cpu0_err = 0; + } +} -- cgit v1.2.3 From 5b56d49fc31dbb0487e14ead790fc81ca9fb2c99 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Dec 2016 15:06:52 -0800 Subject: mm: add locked parameter to get_user_pages_remote() Patch series "mm: unexport __get_user_pages_unlocked()". This patch series continues the cleanup of get_user_pages*() functions taking advantage of the fact we can now pass gup_flags as we please. It firstly adds an additional 'locked' parameter to get_user_pages_remote() to allow for its callers to utilise VM_FAULT_RETRY functionality. This is necessary as the invocation of __get_user_pages_unlocked() in process_vm_rw_single_vec() makes use of this and no other existing higher level function would allow it to do so. 
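[ For illustration only -- a minimal sketch of how a caller can opt in to VM_FAULT_RETRY via the new 'locked' parameter, assuming the post-patch signature; tsk, mm, addr, gup_flags and page stand in for hypothetical caller state and are not taken from the patch:

	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages_remote(tsk, mm, addr, 1, gup_flags,
				    &page, NULL, &locked);
	/*
	 * If the fault path had to retry (VM_FAULT_RETRY), it dropped
	 * mmap_sem and cleared *locked, so we must not unlock it again.
	 */
	if (locked)
		up_read(&mm->mmap_sem);

Passing NULL instead keeps the old non-retrying behaviour, which is what the mechanical call-site conversions below do. ]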
Secondly, existing callers of __get_user_pages_unlocked() are replaced with the appropriate higher-level replacement - get_user_pages_unlocked() if the current task and memory descriptor are referenced, or get_user_pages_remote() if other task/memory descriptors are referenced (having acquired mmap_sem). This patch (of 2): Add an int *locked parameter to get_user_pages_remote() to allow VM_FAULT_RETRY faulting behaviour similar to get_user_pages_[un]locked(). Taking into account the previous adjustments to get_user_pages*() functions allowing for the passing of gup_flags, we are now in a position where __get_user_pages_unlocked() need only be exported for its ability to allow VM_FAULT_RETRY behaviour; this adjustment allows us to subsequently unexport __get_user_pages_unlocked() as well as allowing for future flexibility in the use of get_user_pages_remote(). [sfr@canb.auug.org.au: merge fix for get_user_pages_remote API change] Link: http://lkml.kernel.org/r/20161122210511.024ec341@canb.auug.org.au Link: http://lkml.kernel.org/r/20161027095141.2569-2-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Jan Kara Cc: Hugh Dickins Cc: Dave Hansen Cc: Rik van Riel Cc: Mel Gorman Cc: Paolo Bonzini Cc: Radim Krcmar Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/infiniband/core/umem_odp.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 2 +- fs/exec.c | 2 +- include/linux/mm.h | 2 +- kernel/events/uprobes.c | 4 ++-- mm/gup.c | 12 ++++++++---- mm/memory.c | 2 +- security/tomoyo/domain.c | 2 +- 10 files changed, 18 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 7d066a91d778..6f5dfabb3096 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -759,7 +759,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - flags, pvec + pinned, NULL); + flags, pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 107ddf51065e..d068af2ec3a3 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, flags, - pvec + pinned, NULL); + pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1f0fe3217f23..6b079a31dced 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - flags, local_page_list, NULL); + flags, local_page_list, NULL, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9815e45c23c4..f3726ba12aa6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); ret = get_user_pages_remote(NULL, mm, vaddr,
1, flags, page, - NULL); + NULL, NULL); up_read(&mm->mmap_sem); } diff --git a/fs/exec.c b/fs/exec.c index 923c57d96899..eac60886b0ab 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, - &page, NULL); + &page, NULL, NULL); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index a92c8d73aeaf..cc154454675a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1274,7 +1274,7 @@ extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + struct vm_area_struct **vmas, int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..215871bda3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma); + &vma, NULL); if (ret <= 0) return ret; @@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * essentially a kernel access to the memory. */ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, - NULL); + NULL, NULL); if (result < 0) return result; diff --git a/mm/gup.c b/mm/gup.c index e50178c58b97..b64c907aa4f0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, + locked, true, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's. We also - * obviously don't pass FOLL_REMOTE in here. + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. 
*/ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/memory.c b/mm/memory.c index c264f7cd3e47..3a6a1239c42b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3919,7 +3919,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma); + gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 682b73af7766..838ffa78cfda 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). */ if (get_user_pages_remote(current, bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL) <= 0) + FOLL_FORCE, &page, NULL, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; -- cgit v1.2.3 From 5aa068ea4082b39eafc356c27c9ecd155b0895f6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 25 Oct 2016 11:27:31 -0700 Subject: printk: remove games with previous record flags The record logging code looks at the previous record flags in various ways, and they are all wrong. You can't use the previous record flags to determine anything about the next record, because they may simply not be related. In particular, the reason the previous record was a continuation record may well be exactly _because_ the new record was printed by a different process, which is why the previous record was flushed. So all those games are simply wrong, and make the code hard to understand (because the code fundamentally does not make sense). So remove it. Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 109 +++++++++++-------------------------------------- 1 file changed, 24 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a3ce35e0fa1e..50b2703d6d6a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -356,7 +356,6 @@ DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; -static enum log_flags syslog_prev; static size_t syslog_partial; /* index and sequence number of the first record stored in the buffer */ @@ -370,7 +369,6 @@ static u32 log_next_idx; /* the next printk record to write to the console */ static u64 console_seq; static u32 console_idx; -static enum log_flags console_prev; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; @@ -639,27 +637,15 @@ static void append_char(char **pp, char *e, char c) } static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq, - enum log_flags prev_flags) + struct printk_log *msg, u64 seq) { u64 ts_usec = msg->ts_nsec; - char cont = '-'; do_div(ts_usec, 1000); - /* - * If we couldn't merge continuation line fragments during the print, - * export the stored flags to allow an optional external merge of the - * records. Merging the records isn't always neccessarily correct, like - * when we hit a race during printing. In most cases though, it produces - * better readable output. 'c' in the record flags mark the first - * fragment of a line, '+' the following. - */ - if (msg->flags & LOG_CONT) - cont = (prev_flags & LOG_CONT) ?
'+' : 'c'; - return scnprintf(buf, size, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, seq, ts_usec, cont); + (msg->facility << 3) | msg->level, seq, ts_usec, + msg->flags & LOG_CONT ? 'c' : '-'); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -714,7 +700,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, struct devkmsg_user { u64 seq; u32 idx; - enum log_flags prev; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; @@ -824,12 +809,11 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, msg = log_from_idx(user->idx); len = msg_print_ext_header(user->buf, sizeof(user->buf), - msg, user->seq, user->prev); + msg, user->seq); len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, log_dict(msg), msg->dict_len, log_text(msg), msg->text_len); - user->prev = msg->flags; user->idx = log_next(user->idx); user->seq++; raw_spin_unlock_irq(&logbuf_lock); @@ -1210,26 +1194,12 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf) return len; } -static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, - bool syslog, char *buf, size_t size) +static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { const char *text = log_text(msg); size_t text_size = msg->text_len; - bool prefix = true; - bool newline = true; size_t len = 0; - if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) - prefix = false; - - if (msg->flags & LOG_CONT) { - if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) - prefix = false; - - if (!(msg->flags & LOG_NEWLINE)) - newline = false; - } - do { const char *next = memchr(text, '\n', text_size); size_t text_len; @@ -1247,22 +1217,17 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, text_len + 1 >= size - len) break; - if (prefix) - len += print_prefix(msg, syslog, buf + len); + len += print_prefix(msg, syslog, buf + len); memcpy(buf + len, text, text_len); len += text_len; - if (next || newline) - buf[len++] = '\n'; + buf[len++] = '\n'; } else { /* SYSLOG_ACTION_* buffer size only calculation */ - if (prefix) - len += print_prefix(msg, syslog, NULL); + len += print_prefix(msg, syslog, NULL); len += text_len; - if (next || newline) - len++; + len++; } - prefix = true; text = next; } while (text); @@ -1288,7 +1253,6 @@ static int syslog_print(char __user *buf, int size) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; - syslog_prev = 0; syslog_partial = 0; } if (syslog_seq == log_next_seq) { @@ -1298,13 +1262,11 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, - LOG_LINE_MAX + PREFIX_MAX); + n = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); syslog_seq++; - syslog_prev = msg->flags; n -= syslog_partial; syslog_partial = 0; } else if (!len){ @@ -1347,7 +1309,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u64 next_seq; u64 seq; u32 idx; - enum log_flags prev; /* * Find first record that fits, including all following records, @@ -1355,12 +1316,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear) */ seq = clear_seq; idx = clear_idx; - prev = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len += msg_print_text(msg, prev, 
true, NULL, 0); - prev = msg->flags; + len += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; } @@ -1368,12 +1327,10 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; - prev = 0; while (len > size && seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - len -= msg_print_text(msg, prev, true, NULL, 0); - prev = msg->flags; + len -= msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; } @@ -1386,7 +1343,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct printk_log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, prev, true, text, + textlen = msg_print_text(msg, true, text, LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; @@ -1394,7 +1351,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) } idx = log_next(idx); seq++; - prev = msg->flags; raw_spin_unlock_irq(&logbuf_lock); if (copy_to_user(buf + len, text, textlen)) @@ -1407,7 +1363,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* messages are gone, move to next one */ seq = log_first_seq; idx = log_first_idx; - prev = 0; } } } @@ -1508,7 +1463,6 @@ int do_syslog(int type, char __user *buf, int len, int source) /* messages are gone, move to first one */ syslog_seq = log_first_seq; syslog_idx = log_first_idx; - syslog_prev = 0; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { @@ -1521,16 +1475,14 @@ int do_syslog(int type, char __user *buf, int len, int source) } else { u64 seq = syslog_seq; u32 idx = syslog_idx; - enum log_flags prev = syslog_prev; error = 0; while (seq < log_next_seq) { struct printk_log *msg = log_from_idx(idx); - error += msg_print_text(msg, prev, true, NULL, 0); + error += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } error -= syslog_partial; } @@ -1712,7 +1664,7 @@ static size_t cont_print_text(char *text, size_t size) size_t textlen = 0; size_t len; - if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { + if (cont.cons == 0) { textlen += print_time(cont.ts_nsec, text); size -= textlen; } @@ -1981,11 +1933,9 @@ static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; static u32 console_idx; -static enum log_flags syslog_prev; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; -static enum log_flags console_prev; static struct cont { size_t len; size_t cons; @@ -1997,15 +1947,15 @@ static char *log_dict(const struct printk_log *msg) { return NULL; } static struct printk_log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static ssize_t msg_print_ext_header(char *buf, size_t size, - struct printk_log *msg, u64 seq, - enum log_flags prev_flags) { return 0; } + struct printk_log *msg, + u64 seq) { return 0; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } static void call_console_drivers(int level, const char *ext_text, size_t ext_len, const char *text, size_t len) {} -static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } @@ -2382,7 +2332,6 @@ again: /* messages are gone, move to first 
one */ console_seq = log_first_seq; console_idx = log_first_idx; - console_prev = 0; } else { len = 0; } @@ -2407,16 +2356,14 @@ skip: * will properly dump everything later. */ msg->flags &= ~LOG_NOCONS; - console_prev = msg->flags; goto skip; } - len += msg_print_text(msg, console_prev, false, - text + len, sizeof(text) - len); + len += msg_print_text(msg, false, text + len, sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), - msg, console_seq, console_prev); + msg, console_seq); ext_len += msg_print_ext_body(ext_text + ext_len, sizeof(ext_text) - ext_len, log_dict(msg), msg->dict_len, @@ -2424,7 +2371,6 @@ skip: } console_idx = log_next(console_idx); console_seq++; - console_prev = msg->flags; raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ @@ -2719,7 +2665,6 @@ void register_console(struct console *newcon) raw_spin_lock_irqsave(&logbuf_lock, flags); console_seq = syslog_seq; console_idx = syslog_idx; - console_prev = syslog_prev; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the @@ -3075,7 +3020,7 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, goto out; msg = log_from_idx(dumper->cur_idx); - l = msg_print_text(msg, 0, syslog, line, size); + l = msg_print_text(msg, syslog, line, size); dumper->cur_idx = log_next(dumper->cur_idx); dumper->cur_seq++; @@ -3144,7 +3089,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, u32 idx; u64 next_seq; u32 next_idx; - enum log_flags prev; size_t l = 0; bool ret = false; @@ -3167,27 +3111,23 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, /* calculate length of entire buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, prev, true, NULL, 0); + l += msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } /* move first record forward until length fits into the buffer */ seq = dumper->cur_seq; idx = dumper->cur_idx; - prev = 0; while (l > size && seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l -= msg_print_text(msg, prev, true, NULL, 0); + l -= msg_print_text(msg, true, NULL, 0); idx = log_next(idx); seq++; - prev = msg->flags; } /* last message in next interation */ @@ -3198,10 +3138,9 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); - l += msg_print_text(msg, prev, syslog, buf + l, size - l); + l += msg_print_text(msg, syslog, buf + l, size - l); idx = log_next(idx); seq++; - prev = msg->flags; } dumper->next_seq = next_seq; -- cgit v1.2.3 From 5c2992ee7fd8a29d04125dc0aa3522784c5fa5eb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 25 Oct 2016 11:27:31 -0700 Subject: printk: remove console flushing special cases for partial buffered lines It actively hurts proper merging, and makes for a lot of special cases. There was a good(ish) reason for doing it originally, but it's getting too painful to maintain. And most of the original reasons for it are long gone. So instead of having special code to flush partial lines to the console (as opposed to the record buffers), do _all_ the console writing from the record buffer, and be done with it. 
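[ For illustration only -- a rough sketch of the resulting model, simplified from the console_unlock() loop in the diff below (the text[] buffer and logbuf locking are elided); with the special cases gone, the console is fed exclusively from committed records:

	for (;;) {
		struct printk_log *msg;
		size_t len;
		int level;

		if (console_seq == log_next_seq)
			break;			/* nothing committed yet */

		msg = log_from_idx(console_idx);
		level = msg->level;
		len = msg_print_text(msg, false, text, sizeof(text));
		console_idx = log_next(console_idx);
		console_seq++;
		call_console_drivers(level, NULL, 0, text, len);
	}

Partial pr_cont() fragments now sit in the cont buffer until cont_flush() commits them via log_store(); nothing is pushed to the console behind the record buffer's back. ]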
If an oops happens (or some other synchronous event), we will flush the partial lines due to the oops printing activity, so this does not affect that. It does mean that if you have a completely hung machine, a partial preceding line may not have been printed out. That was some of the original reason for this complexity, in fact, back when we used to test for the historical i386 "halt" instruction problem by doing pr_info("Checking 'hlt' instruction... "); if (!boot_cpu_data.hlt_works_ok) { pr_cont("disabled\n"); return; } halt(); halt(); halt(); halt(); pr_cont("OK\n"); and that model no longer works (if the 'hlt' instruction kills the machine, the partial line won't have been flushed, so you won't even see it). Of course, that was also back in the days when people actually had textual console output rather than a graphical splash-screen at bootup. How times change.. Cc: Sergey Senozhatsky Cc: Joe Perches Cc: Steven Rostedt Tested-by: Petr Mladek Tested-by: Geert Uytterhoeven Tested-by: Mark Rutland Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 108 +++---------------------------------------------- 1 file changed, 5 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 50b2703d6d6a..b3c454b733da 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1583,46 +1583,25 @@ static inline void printk_delay(void) static struct cont { char buf[LOG_LINE_MAX]; size_t len; /* length == 0 means unused buffer */ - size_t cons; /* bytes written to console */ struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ - bool flushed:1; /* buffer sealed and committed */ } cont; static void cont_flush(void) { - if (cont.flushed) - return; if (cont.len == 0) return; - if (cont.cons) { - /* - * If a fragment of this line was directly flushed to the - * console; wait for the console to pick up the rest of the - * line. LOG_NOCONS suppresses a duplicated output. - */ - log_store(cont.facility, cont.level, cont.flags | LOG_NOCONS, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - cont.flushed = true; - } else { - /* - * If no fragment of this line ever reached the console, - * just submit it to the store and free the buffer. - */ - log_store(cont.facility, cont.level, cont.flags, 0, - NULL, 0, cont.buf, cont.len); - cont.len = 0; - } + + log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, + NULL, 0, cont.buf, cont.len); + cont.len = 0; } static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) { - if (cont.len && cont.flushed) - return false; - /* * If ext consoles are present, flush and skip in-kernel * continuation. See nr_ext_console_drivers definition.
Also, if @@ -1639,8 +1618,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * cont.owner = current; cont.ts_nsec = local_clock(); cont.flags = flags; - cont.cons = 0; - cont.flushed = false; } memcpy(cont.buf + cont.len, text, len); @@ -1659,34 +1636,6 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * return true; } -static size_t cont_print_text(char *text, size_t size) -{ - size_t textlen = 0; - size_t len; - - if (cont.cons == 0) { - textlen += print_time(cont.ts_nsec, text); - size -= textlen; - } - - len = cont.len - cont.cons; - if (len > 0) { - if (len+1 > size) - len = size-1; - memcpy(text + textlen, cont.buf + cont.cons, len); - textlen += len; - cont.cons = cont.len; - } - - if (cont.flushed) { - if (cont.flags & LOG_NEWLINE) - text[textlen++] = '\n'; - /* got everything, release buffer */ - cont.len = 0; - } - return textlen; -} - static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { /* @@ -1957,7 +1906,6 @@ static void call_console_drivers(int level, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, bool syslog, char *buf, size_t size) { return 0; } -static size_t cont_print_text(char *text, size_t size) { return 0; } static bool suppress_message_printing(int level) { return false; } /* Still needs to be defined for users */ @@ -2221,42 +2169,6 @@ static inline int can_use_console(void) return cpu_online(raw_smp_processor_id()) || have_callable_console(); } -static void console_cont_flush(char *text, size_t size) -{ - unsigned long flags; - size_t len; - - raw_spin_lock_irqsave(&logbuf_lock, flags); - - if (!cont.len) - goto out; - - if (suppress_message_printing(cont.level)) { - cont.cons = cont.len; - if (cont.flushed) - cont.len = 0; - goto out; - } - - /* - * We still queue earlier records, likely because the console was - * busy. The earlier ones need to be printed before this one, we - * did not flush any fragment so far, so just let it queue up. - */ - if (console_seq < log_next_seq && !cont.cons) - goto out; - - len = cont_print_text(text, size); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, NULL, 0, text, len); - start_critical_timings(); - local_irq_restore(flags); - return; -out: - raw_spin_unlock_irqrestore(&logbuf_lock, flags); -} - /** * console_unlock - unlock the console system * @@ -2310,9 +2222,6 @@ again: return; } - /* flush buffered message fragment immediately to console */ - console_cont_flush(text, sizeof(text)); - for (;;) { struct printk_log *msg; size_t ext_len = 0; @@ -2341,8 +2250,7 @@ skip: msg = log_from_idx(console_idx); level = msg->level; - if ((msg->flags & LOG_NOCONS) || - suppress_message_printing(level)) { + if (suppress_message_printing(level)) { /* * Skip record we have buffered and already printed * directly to the console when we received it, and */ console_idx = log_next(console_idx); console_seq++; - /* - * We will get here again when we register a new - * CON_PRINTBUFFER console. Clear the flag so we - * will properly dump everything later. - */ - msg->flags &= ~LOG_NOCONS; goto skip; } -- cgit v1.2.3 From c1a9eeb938b5433947e5ea22f89baff3182e7075 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Dec 2016 12:10:37 +0100 Subject: tick/broadcast: Prevent NULL pointer dereference When a dysfunctional timer, e.g.
dummy timer, is installed, the tick core tries to set up the broadcast timer. If no broadcast device is installed, the kernel crashes with a NULL pointer dereference in tick_broadcast_setup_oneshot() because the function has no sanity check. Reported-by: Mason Signed-off-by: Thomas Gleixner Cc: Mark Rutland Cc: Anna-Maria Gleixner Cc: Richard Cochran Cc: Sebastian Andrzej Siewior Cc: Daniel Lezcano Cc: Peter Zijlstra Cc: Sebastian Frias Cc: Thibaud Cornic Cc: Robin Murphy Link: http://lkml.kernel.org/r/1147ef90-7877-e4d2-bb2b-5c4fa8d3144b@free.fr --- kernel/time/tick-broadcast.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f6aae7977824..d2a20e83ebae 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -871,6 +871,9 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { int cpu = smp_processor_id(); + if (!bc) + return; + /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = clockevent_state_periodic(bc); -- cgit v1.2.3 From c0af52437254fda8b0cdbaae5a9b6d9327f1fcd5 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Wed, 14 Dec 2016 16:01:12 -0200 Subject: genirq/affinity: Fix node generation from cpumask Commit 34c3d9819fda ("genirq/affinity: Provide smarter irq spreading infrastructure") introduced a better IRQ spreading mechanism, taking account of the available NUMA nodes in the machine. The problem is that the algorithm for retrieving the nodemask iterates "linearly" based on the number of online nodes - some architectures present a non-linear node distribution among the nodemask, like PowerPC. If this is the case, the algorithm leads to a wrong node count and therefore to a bad/incomplete IRQ affinity distribution. For example, this problem was found in a machine with 128 CPUs and two nodes, namely nodes 0 and 8 (instead of 0 and 1, if it was linearly distributed). This led to a wrong affinity distribution which then led to a bad mq allocation for the nvme driver. Finally, we take the opportunity to fix a comment regarding the affinity distribution when we have _more_ nodes than vectors. Fixes: 34c3d9819fda ("genirq/affinity: Provide smarter irq spreading infrastructure") Reported-by: Gabriel Krisman Bertazi Signed-off-by: Guilherme G.
Piccoli Reviewed-by: Christoph Hellwig Reviewed-by: Gabriel Krisman Bertazi Reviewed-by: Gavin Shan Cc: linux-pci@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: hch@lst.de Link: http://lkml.kernel.org/r/1481738472-2671-1-git-send-email-gpiccoli@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 9be9bda7c1f9..4544b115f5eb 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -37,10 +37,10 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk) { - int n, nodes; + int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ - for (n = 0, nodes = 0; n < num_online_nodes(); n++) { + for_each_online_node(n) { if (cpumask_intersects(mask, cpumask_of_node(n))) { node_set(n, *nodemsk); nodes++; @@ -82,7 +82,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) nodes = get_nodes_in_cpumask(cpu_online_mask, &nodemsk); /* - * If the number of nodes in the mask is less than or equal the + * If the number of nodes in the mask is greater than or equal the * number of vectors we just spread the vectors across the nodes. */ if (affv <= nodes) { -- cgit v1.2.3 From 512f09801b356c54baef62543e51169f03b2e642 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 15 Dec 2016 10:00:57 -0500 Subject: cpu/hotplug: Clarify description of __cpuhp_setup_state() return value When invoked with the CPUHP_AP_ONLINE_DYN state, __cpuhp_setup_state() is expected to return a positive value, which is the hotplug state that the routine assigns. Signed-off-by: Boris Ostrovsky Cc: linux-pm@vger.kernel.org Cc: viresh.kumar@linaro.org Cc: bigeasy@linutronix.de Cc: rjw@rjwysocki.net Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1481814058-4799-2-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Thomas Gleixner --- kernel/cpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 217fd2e7f435..5339aca811d2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1586,7 +1586,11 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance); * @startup: startup callback function * @teardown: teardown callback function * - * Returns 0 if successful, otherwise a proper error code + * Returns: + * On success: + * Positive state number if @state is CPUHP_AP_ONLINE_DYN + * 0 for all other states + * On failure: proper (negative) error code */ int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, -- cgit v1.2.3 From 8fa9a697ab083af0f4a2807869752685ed8a7a6a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 15 Dec 2016 13:53:58 +0100 Subject: printk: Remove no longer used second struct cont MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_PRINTK=n: kernel/printk/printk.c:1893: warning: ‘cont’ defined but not used Note that there are actually two different struct cont definitions and objects: the first one is used if CONFIG_PRINTK=y, the second one became unused by removing console_cont_flush().
Fixes: 5c2992ee7fd8 ("printk: remove console flushing special cases for partial buffered lines") Signed-off-by: Geert Uytterhoeven Acked-by: Petr Mladek [ I do the occasional "allnoconfig" builds, but apparently not often enough - Linus ] Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b3c454b733da..bc2e220ed2b0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1885,12 +1885,6 @@ static u32 console_idx; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; -static struct cont { - size_t len; - size_t cons; - u8 level; - bool flushed:1; -} cont; static char *log_text(const struct printk_log *msg) { return NULL; } static char *log_dict(const struct printk_log *msg) { return NULL; } static struct printk_log *log_from_idx(u32 idx) { return NULL; } -- cgit v1.2.3 From a08dd0da5307ba01295c8383923e51e7997c3576 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Dec 2016 01:30:06 +0100 Subject: bpf: fix regression on verifier pruning wrt map lookups Commit 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") introduced a regression where existing programs stopped loading due to reaching the verifier's maximum complexity limit, whereas prior to this commit they were loading just fine; the affected program has roughly 2k instructions. What was found is that state pruning couldn't be performed effectively anymore due to mismatches of the verifier's register state, in particular in the id tracking. It doesn't mean that 57a09bf0a416 is incorrect per se, but rather that verifier needs to perform a lot more work for the same program with regards to involved map lookups. Since commit 57a09bf0a416 is only about tracking registers with type PTR_TO_MAP_VALUE_OR_NULL, the id is only needed to follow registers until they are promoted through pattern matching with a NULL check to either PTR_TO_MAP_VALUE or UNKNOWN_VALUE type. After that point, the id becomes irrelevant for the transitioned types. For UNKNOWN_VALUE, id is already reset to 0 via mark_reg_unknown_value(), but not so for PTR_TO_MAP_VALUE where id is becoming stale. It's even transferred further into other types that don't make use of it. Among others, one example is where UNKNOWN_VALUE is set on function call return with RET_INTEGER return type. states_equal() will then fall through the memcmp() on register state; note that the second memcmp() uses offsetofend(), so the id is part of that since d2a4dd37f6b4 ("bpf: fix state equivalence"). But the bisect pointed already to 57a09bf0a416, where we really reach beyond complexity limit. What I found was that states_equal() often failed in this case due to id mismatches in spilled regs with registers in type PTR_TO_MAP_VALUE. Unlike non-spilled regs, spilled regs just perform a memcmp() on their reg state and don't have any other optimizations in place, therefore also id was relevant in this case for making a pruning decision. We can safely reset id to 0 as well when converting to PTR_TO_MAP_VALUE. For the affected program, it resulted in a ~17 fold reduction of complexity and let the program load fine again. Selftest suite also runs fine. The only other place where env->id_gen is used currently is through direct packet access, but for these cases id is long living, thus a different scenario. 
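[ For illustration only -- a hedged C-level sketch of the pattern the id tracking serves; 'map' and 'key' are hypothetical, and the register comments reflect the verifier view described above:

	value = bpf_map_lookup_elem(map, &key);	/* R0: PTR_TO_MAP_VALUE_OR_NULL, id=N */
	copy = value;				/* copy shares id=N with R0 */
	if (!value)				/* NULL check: every register carrying */
		return 0;			/* id=N is promoted on each branch */
	*value = 42;				/* now PTR_TO_MAP_VALUE; id=N is stale,
						   so resetting it to 0 lets otherwise
						   identical states prune */
]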
Also, the current logic in mark_map_regs() is not fully correct when marking the NULL branch with UNKNOWN_VALUE. We need to cache the destination reg's id in any case. Otherwise, once we marked that reg as UNKNOWN_VALUE, its id is reset and any subsequent registers that hold the original id and are of type PTR_TO_MAP_VALUE_OR_NULL won't be marked UNKNOWN_VALUE anymore, since mark_map_reg() reuses the uncached regs[regno].id that was just overridden. Note, we don't need to cache it outside of mark_map_regs(), since it's called once on this_branch and the other time on other_branch, which are two independent verifier states. A test case for this is added here, too. Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") Signed-off-by: Daniel Borkmann Acked-by: Thomas Graf Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 11 ++++++++--- tools/testing/selftests/bpf/test_verifier.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d28f9a3380a9..81e267bc4640 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1970,6 +1970,11 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { reg->type = type; + /* We don't need id from this point onwards anymore, thus we + * should better reset it, so that state pruning has chances + * to take effect. + */ + reg->id = 0; if (type == UNKNOWN_VALUE) mark_reg_unknown_value(regs, regno); } @@ -1982,16 +1987,16 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs; + u32 id = regs[regno].id; int i; for (i = 0; i < MAX_BPF_REG; i++) - mark_map_reg(regs, i, regs[regno].id, type); + mark_map_reg(regs, i, id, type); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, - regs[regno].id, type); + mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type); } } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 0103bf2e0c0d..072dc63dc957 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2660,6 +2660,34 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, + { + "multiple registers share map_lookup_elem bad reg type", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_5, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 0), + BPF_MOV64_IMM(BPF_REG_1, 3), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .result = REJECT, + .errstr = "R3 invalid mem access 'inv'", + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, { "invalid map access from else condition", .insns = { -- cgit v1.2.3 From
aafe6ae9cee32df85eb5e8bb6dd1d918e6807b09 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 18 Dec 2016 01:52:57 +0100 Subject: bpf: dynamically allocate digest scratch buffer Geert rightfully complained that 7bd509e311f4 ("bpf: add prog_digest and expose it via fdinfo/netlink") added a too large allocation of the variable 'raw' from the bss section, and that it should instead be done dynamically: # ./scripts/bloat-o-meter kernel/bpf/core.o.1 kernel/bpf/core.o.2 add/remove: 3/0 grow/shrink: 0/0 up/down: 33291/0 (33291) function old new delta raw - 32832 +32832 [...] Since this is only relevant during the program creation path, which can be considered a slow path anyway, let's allocate it dynamically and not be implicitly dependent on the verifier mutex. Move bpf_prog_calc_digest() to the beginning of replace_map_fd_with_map_ptr(); error handling also stays straightforward. Reported-by: Geert Uytterhoeven Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 +- include/linux/filter.h | 14 +++++++++++--- kernel/bpf/core.c | 27 ++++++++++++++++----------- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 6 ++++-- 5 files changed, 33 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8796ff03f472..201eb483c46f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,7 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); -void bpf_prog_calc_digest(struct bpf_prog *fp); +int bpf_prog_calc_digest(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index af8a1804cac6..702314253797 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -57,9 +57,6 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 -/* Maximum BPF program size in bytes. */ -#define MAX_BPF_SIZE (BPF_MAXINSNS * sizeof(struct bpf_insn)) - /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ @@ -517,6 +514,17 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, return BPF_PROG_RUN(prog, xdp); } +static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) +{ + return prog->len * sizeof(struct bpf_insn); +} + +static inline u32 bpf_prog_digest_scratch_size(const struct bpf_prog *prog) +{ + return round_up(bpf_prog_insn_size(prog) + + sizeof(__be64) + 1, SHA_MESSAGE_BYTES); +} + static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 83e0d153b0b4..75c08b8068d6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -136,28 +136,29 @@ void __bpf_prog_free(struct bpf_prog *fp) vfree(fp); } -#define SHA_BPF_RAW_SIZE \ - round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES) - -/* Called under verifier mutex.
*/ -void bpf_prog_calc_digest(struct bpf_prog *fp) +int bpf_prog_calc_digest(struct bpf_prog *fp) { const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); - static u32 ws[SHA_WORKSPACE_WORDS]; - static u8 raw[SHA_BPF_RAW_SIZE]; - struct bpf_insn *dst = (void *)raw; + u32 raw_size = bpf_prog_digest_scratch_size(fp); + u32 ws[SHA_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; + struct bpf_insn *dst; bool was_ld_map; - u8 *todo = raw; + u8 *raw, *todo; __be32 *result; __be64 *bits; + raw = vmalloc(raw_size); + if (!raw) + return -ENOMEM; + sha_init(fp->digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ + dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -177,12 +178,13 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) } } - psize = fp->len * sizeof(struct bpf_insn); - memset(&raw[psize], 0, sizeof(raw) - psize); + psize = bpf_prog_insn_size(fp); + memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; bsize = round_up(psize, SHA_MESSAGE_BYTES); blocks = bsize / SHA_MESSAGE_BYTES; + todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); } else { @@ -199,6 +201,9 @@ void bpf_prog_calc_digest(struct bpf_prog *fp) result = (__force __be32 *)fp->digest; for (i = 0; i < SHA_DIGEST_WORDS; i++) result[i] = cpu_to_be32(fp->digest[i]); + + vfree(raw); + return 0; } static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4819ec9d95f6..35d674c1f12e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -811,7 +811,7 @@ static int bpf_prog_load(union bpf_attr *attr) err = -EFAULT; if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), - prog->len * sizeof(struct bpf_insn)) != 0) + bpf_prog_insn_size(prog)) != 0) goto free_prog; prog->orig_prog = NULL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 81e267bc4640..64b7b1abe087 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2931,6 +2931,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i, j, err; + err = bpf_prog_calc_digest(env->prog); + if (err) + return err; + for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { @@ -3178,8 +3182,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log_level = 0; } - bpf_prog_calc_digest(env->prog); - ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From 5ccb071e97fbd9ffe623a0d3977cc6d013bee93c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 18 Dec 2016 01:52:58 +0100 Subject: bpf: fix overflow in prog accounting Commit aaac3ba95e4c ("bpf: charge user for creation of BPF maps and programs") made a wrong assumption by charging against prog->pages. Unlike map->pages, prog->pages are still subject to change when we need to expand the program through bpf_prog_realloc(). This can happen, for example, during the verification stage, when we need to expand and rewrite parts of the program. Should the required space cross a page boundary, then prog->pages is no longer the same as the original value that we passed to bpf_prog_charge_memlock(). Thus, we'll hit a wrap-around during bpf_prog_uncharge_memlock() when prog is freed eventually.
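To make the arithmetic concrete, here is a hedged userspace sketch (not part of the patch; the variable names are made up) of how the uncharge underflows once prog->pages has grown after the initial charge:

	#include <stdio.h>

	int main(void)
	{
		unsigned long locked_vm = 0;		/* stand-in for user->locked_vm */
		unsigned long pages_at_charge = 1;	/* prog->pages at load time */
		unsigned long pages_at_free = 2;	/* prog->pages after bpf_prog_realloc() */

		locked_vm += pages_at_charge;		/* bpf_prog_charge_memlock() */
		locked_vm -= pages_at_free;		/* bpf_prog_uncharge_memlock() */
		printf("locked_vm = %lu\n", locked_vm);	/* wraps around to ULONG_MAX */
		return 0;
	}

Any later charge then compares against this huge wrapped value and fails with -EPERM.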
I noticed this when, despite having unlimited memlock, programs suddenly refused to load with an EPERM error due to insufficient memlock. There are two ways to fix this issue. One would be to add a cached variable to struct bpf_prog that takes a snapshot of prog->pages at the time of charging. The other approach is to also account for resizes. I chose to go with the latter for a couple of reasons: i) we want accounting to be more accurate rather than further fooling the limits, and ii) adding yet another page counter on struct bpf_prog would also be a waste just for this purpose. We also want to charge as early as possible to avoid going into the verifier just to find out later on that we crossed limits. The only place that needs to be fixed is bpf_prog_realloc(), since that is the only place where we expand the program; we try to account for the needed delta there, and should we fail, the call sites check the outcome anyway. On cBPF to eBPF migrations, we don't grab a reference to the user as they are charged differently. With that in place, my test case worked fine. Fixes: aaac3ba95e4c ("bpf: charge user for creation of BPF maps and programs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 11 +++++++++++ kernel/bpf/core.c | 16 +++++++++++++--- kernel/bpf/syscall.c | 36 ++++++++++++++++++++++++++++-------- 3 files changed, 52 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 201eb483c46f..f74ae68086dc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -238,6 +238,8 @@ struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); +int __bpf_prog_charge(struct user_struct *user, u32 pages); +void __bpf_prog_uncharge(struct user_struct *user, u32 pages); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); @@ -318,6 +320,15 @@ static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog) { return ERR_PTR(-EOPNOTSUPP); } + +static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) +{ + return 0; +} + +static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +{ +} #endif /* CONFIG_BPF_SYSCALL */ /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 75c08b8068d6..1eb4f1303756 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -105,19 +105,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; + u32 pages, delta; + int ret; BUG_ON(fp_old == NULL); size = round_up(size, PAGE_SIZE); - if (size <= fp_old->pages * PAGE_SIZE) + pages = size / PAGE_SIZE; + if (pages <= fp_old->pages) return fp_old; + delta = pages - fp_old->pages; + ret = __bpf_prog_charge(fp_old->aux->user, delta); + if (ret) + return NULL; + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); - if (fp != NULL) { + if (fp == NULL) { + __bpf_prog_uncharge(fp_old->aux->user, delta); + } else { kmemcheck_annotate_bitfield(fp, meta); memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); - fp->pages = size / PAGE_SIZE; + fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index
35d674c1f12e..e89acea22ecf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -615,19 +615,39 @@ static void free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +int __bpf_prog_charge(struct user_struct *user, u32 pages) +{ + unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + unsigned long user_bufs; + + if (user) { + user_bufs = atomic_long_add_return(pages, &user->locked_vm); + if (user_bufs > memlock_limit) { + atomic_long_sub(pages, &user->locked_vm); + return -EPERM; + } + } + + return 0; +} + +void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +{ + if (user) + atomic_long_sub(pages, &user->locked_vm); +} + static int bpf_prog_charge_memlock(struct bpf_prog *prog) { struct user_struct *user = get_current_user(); - unsigned long memlock_limit; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + int ret; - atomic_long_add(prog->pages, &user->locked_vm); - if (atomic_long_read(&user->locked_vm) > memlock_limit) { - atomic_long_sub(prog->pages, &user->locked_vm); + ret = __bpf_prog_charge(user, prog->pages); + if (ret) { free_uid(user); - return -EPERM; + return ret; } + prog->aux->user = user; return 0; } @@ -636,7 +656,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) { struct user_struct *user = prog->aux->user; - atomic_long_sub(prog->pages, &user->locked_vm); + __bpf_prog_uncharge(user, prog->pages); free_uid(user); } -- cgit v1.2.3 From 6760bf2ddde8ad64f8205a651223a93de3a35494 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 18 Dec 2016 01:52:59 +0100 Subject: bpf: fix mark_reg_unknown_value for spilled regs on map value marking Martin reported a verifier issue that hit the BUG_ON() for his test case in the mark_reg_unknown_value() function: [ 202.861380] kernel BUG at kernel/bpf/verifier.c:467! [...] [ 203.291109] Call Trace: [ 203.296501] [] mark_map_reg+0x45/0x50 [ 203.308225] [] mark_map_regs+0x78/0x90 [ 203.320140] [] do_check+0x226d/0x2c90 [ 203.331865] [] bpf_check+0x48b/0x780 [ 203.343403] [] bpf_prog_load+0x27e/0x440 [ 203.355705] [] ? handle_mm_fault+0x11af/0x1230 [ 203.369158] [] ? security_capable+0x48/0x60 [ 203.382035] [] SyS_bpf+0x124/0x960 [ 203.393185] [] ? __do_page_fault+0x276/0x490 [ 203.406258] [] entry_SYSCALL_64_fastpath+0x13/0x94 This issue got uncovered after the fix in a08dd0da5307 ("bpf: fix regression on verifier pruning wrt map lookups"). The reason it wasn't noticed before is that, as mentioned in a08dd0da5307, mark_map_regs() was doing the id matching incorrectly based on the uncached regs[regno].id. So, in the first loop, we walked all regs, and as soon as we found regno == i, this reg's id was cleared by the call to mark_reg_unknown_value(), so that every subsequent register was probed against an id of 0 (which, in combination with the PTR_TO_MAP_VALUE_OR_NULL type, is an invalid condition that no other register state can hold), and therefore wasn't type transitioned as in the spilled register case for the second loop. Now since that got fixed, it turned out that 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") used mark_reg_unknown_value() incorrectly for the spilled regs, thus hitting the BUG_ON() in some cases due to regno >= MAX_BPF_REG. Although spilled regs have the same representation as the non-spilled regs in the verifier state, that is, struct bpf_reg_state, they are semantically different.
In other words, there can be up to 64 (MAX_BPF_STACK / BPF_REG_SIZE) spilled regs on the stack; for example, register R could have been spilled by the program to stack locations X, Y, and Z, and in mark_map_regs() we need to scan these stack slots of type STACK_SPILL for potential registers that we have to transition from PTR_TO_MAP_VALUE_OR_NULL. Therefore, depending on the location, the spilled_regs regno can be a lot higher than MAX_BPF_REG's value, since we operate on the stack instead. The reset in mark_reg_unknown_value() itself is just fine; only the BUG_ON() was inappropriate here. Fix it by making a __mark_reg_unknown_value() version that can be called from mark_map_reg() generically; we know for the non-spilled case that the regno is always < MAX_BPF_REG anyway. Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") Reported-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 64b7b1abe087..83ed2f8f6f22 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -462,14 +462,19 @@ static void init_reg_state(struct bpf_reg_state *regs) regs[BPF_REG_1].type = PTR_TO_CTX; } -static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) +static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) { - BUG_ON(regno >= MAX_BPF_REG); regs[regno].type = UNKNOWN_VALUE; regs[regno].id = 0; regs[regno].imm = 0; } +static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + __mark_reg_unknown_value(regs, regno); +} + static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; @@ -1976,7 +1981,7 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, */ reg->id = 0; if (type == UNKNOWN_VALUE) - mark_reg_unknown_value(regs, regno); + __mark_reg_unknown_value(regs, regno); } } -- cgit v1.2.3 From 297e765e390a2ac996000b5f7228cbd84d995174 Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Tue, 13 Dec 2016 11:40:57 +0100 Subject: uprobes: Fix uprobes on MIPS, allow for a cache flush after ixol breakpoint creation Commit 72e6ae285a1d ('ARM: 8043/1: uprobes need icache flush after xol write') introduced an arch-specific method to ensure all caches are flushed appropriately after an instruction is written to an XOL page. However, when the XOL area is created and the out-of-line breakpoint instruction is copied, caches are not flushed at all and stale data may be found in the icache. Replace the simple copy_to_page() with arch_uprobe_copy_ixol() to allow the arch to ensure all caches are updated accordingly. This change fixes uprobes on MIPS InterAptiv (tested on Creator Ci40).
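For context, the generic weak fallback in kernel/events/uprobes.c that architectures such as ARM and MIPS override looks roughly like this (quoted from memory of kernels of that era; treat it as a sketch rather than the exact source):

	void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
					  void *src, unsigned long len)
	{
		/* Initialize the slot */
		copy_to_page(page, vaddr, src, len);

		/*
		 * We probably need flush_icache_user_range() but it needs vma.
		 * This should work on most of architectures by default. If
		 * architecture has to touch physical memory, it can define
		 * its own version of the function.
		 */
		flush_dcache_page(page);
	}

With this patch, XOL-area creation goes through the same hook, so an architecture override (e.g. on MIPS) can flush its icache as needed.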
Signed-off-by: Marcin Nowakowski Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Victor Kamensky Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/1481625657-22850-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..b5916b4969a5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1194,7 +1194,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); + arch_uprobe_copy_ixol(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; -- cgit v1.2.3 From 7b8589cc29e7c35dcfd2d5138979f17b48f90110 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Mon, 19 Dec 2016 16:22:48 -0800 Subject: ima: on soft reboot, save the measurement list The TPM PCRs are only reset on a hard reboot. In order to validate a TPM's quote after a soft reboot (e.g. kexec -e), the IMA measurement list of the running kernel must be saved and restored on boot. This patch uses the kexec buffer passing mechanism to pass the serialized IMA binary_runtime_measurements to the next kernel. Link: http://lkml.kernel.org/r/1480554346-29071-7-git-send-email-zohar@linux.vnet.ibm.com Signed-off-by: Thiago Jung Bauermann Signed-off-by: Mimi Zohar Acked-by: "Eric W. Biederman" Acked-by: Dmitry Kasatkin Cc: Andreas Steffen Cc: Josh Sklar Cc: Dave Young Cc: Vivek Goyal Cc: Baoquan He Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Stewart Smith Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ima.h | 12 ++++ kernel/kexec_file.c | 4 ++ security/integrity/ima/ima.h | 1 + security/integrity/ima/ima_fs.c | 2 +- security/integrity/ima/ima_kexec.c | 117 +++++++++++++++++++++++++++++++++++++ 5 files changed, 135 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ima.h b/include/linux/ima.h index 0eb7c2e7f0d6..7f6952f8d6aa 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -11,6 +11,7 @@ #define _LINUX_IMA_H #include +#include struct linux_binprm; #ifdef CONFIG_IMA @@ -23,6 +24,10 @@ extern int ima_post_read_file(struct file *file, void *buf, loff_t size, enum kernel_read_file_id id); extern void ima_post_path_mknod(struct dentry *dentry); +#ifdef CONFIG_IMA_KEXEC +extern void ima_add_kexec_buffer(struct kimage *image); +#endif + #else static inline int ima_bprm_check(struct linux_binprm *bprm) { @@ -62,6 +67,13 @@ static inline void ima_post_path_mknod(struct dentry *dentry) #endif /* CONFIG_IMA */ +#ifndef CONFIG_IMA_KEXEC +struct kimage; + +static inline void ima_add_kexec_buffer(struct kimage *image) +{} +#endif + #ifdef CONFIG_IMA_APPRAISE extern void ima_inode_post_setattr(struct dentry *dentry); extern int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name, diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 0c2df7f73792..b56a558e406d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -132,6 +133,9 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int
initrd_fd, return ret; image->kernel_buf_len = size; + /* IMA needs to pass the measurement list to the next kernel. */ + ima_add_kexec_buffer(image); + /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index ea1dcc452911..139dec67dcbf 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -143,6 +143,7 @@ void ima_print_digest(struct seq_file *m, u8 *digest, u32 size); struct ima_template_desc *ima_template_desc_current(void); int ima_restore_measurement_entry(struct ima_template_entry *entry); int ima_restore_measurement_list(loff_t bufsize, void *buf); +int ima_measurements_show(struct seq_file *m, void *v); unsigned long ima_get_binary_runtime_size(void); int ima_init_template(void); diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 3df46906492d..10bea0125fa1 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -116,7 +116,7 @@ void ima_putc(struct seq_file *m, void *data, int datalen) * [eventdata length] * eventdata[n]=template specific data */ -static int ima_measurements_show(struct seq_file *m, void *v) +int ima_measurements_show(struct seq_file *m, void *v) { /* the list never shrinks, so we don't need a lock here */ struct ima_queue_entry *qe = v; diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index 36afd0fe9747..2c4824ac1ce1 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -10,8 +10,125 @@ * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ +#include +#include +#include #include "ima.h" +#ifdef CONFIG_IMA_KEXEC +static int ima_dump_measurement_list(unsigned long *buffer_size, void **buffer, + unsigned long segment_size) +{ + struct ima_queue_entry *qe; + struct seq_file file; + struct ima_kexec_hdr khdr = { + .version = 1, .buffer_size = 0, .count = 0}; + int ret = 0; + + /* segment size can't change between kexec load and execute */ + file.buf = vmalloc(segment_size); + if (!file.buf) { + ret = -ENOMEM; + goto out; + } + + file.size = segment_size; + file.read_pos = 0; + file.count = sizeof(khdr); /* reserved space */ + + list_for_each_entry_rcu(qe, &ima_measurements, later) { + if (file.count < file.size) { + khdr.count++; + ima_measurements_show(&file, qe); + } else { + ret = -EINVAL; + break; + } + } + + if (ret < 0) + goto out; + + /* + * fill in reserved space with some buffer details + * (eg. version, buffer size, number of measurements) + */ + khdr.buffer_size = file.count; + memcpy(file.buf, &khdr, sizeof(khdr)); + print_hex_dump(KERN_DEBUG, "ima dump: ", DUMP_PREFIX_NONE, + 16, 1, file.buf, + file.count < 100 ? file.count : 100, true); + + *buffer_size = file.count; + *buffer = file.buf; +out: + if (ret == -EINVAL) + vfree(file.buf); + return ret; +} + +/* + * Called during kexec_file_load so that IMA can add a segment to the kexec + * image for the measurement list for the next kernel. + * + * This function assumes that kexec_mutex is held. 
+ */ +void ima_add_kexec_buffer(struct kimage *image) +{ + struct kexec_buf kbuf = { .image = image, .buf_align = PAGE_SIZE, + .buf_min = 0, .buf_max = ULONG_MAX, + .top_down = true }; + unsigned long binary_runtime_size; + + /* use more understandable variable names than defined in kbuf */ + void *kexec_buffer = NULL; + size_t kexec_buffer_size; + size_t kexec_segment_size; + int ret; + + /* + * Reserve an extra half page of memory for additional measurements + * added during the kexec load. + */ + binary_runtime_size = ima_get_binary_runtime_size(); + if (binary_runtime_size >= ULONG_MAX - PAGE_SIZE) + kexec_segment_size = ULONG_MAX; + else + kexec_segment_size = ALIGN(ima_get_binary_runtime_size() + + PAGE_SIZE / 2, PAGE_SIZE); + if ((kexec_segment_size == ULONG_MAX) || + ((kexec_segment_size >> PAGE_SHIFT) > totalram_pages / 2)) { + pr_err("Binary measurement list too large.\n"); + return; + } + + ima_dump_measurement_list(&kexec_buffer_size, &kexec_buffer, + kexec_segment_size); + if (!kexec_buffer) { + pr_err("Not enough memory for the kexec measurement buffer.\n"); + return; + } + + kbuf.buffer = kexec_buffer; + kbuf.bufsz = kexec_buffer_size; + kbuf.memsz = kexec_segment_size; + ret = kexec_add_buffer(&kbuf); + if (ret) { + pr_err("Error passing over kexec measurement buffer.\n"); + return; + } + + ret = arch_ima_add_kexec_buffer(image, kbuf.mem, kexec_segment_size); + if (ret) { + pr_err("Error passing over kexec measurement buffer.\n"); + return; + } + + pr_debug("kexec measurement buffer for the loaded kernel at 0x%lx.\n", + kbuf.mem); +} +#endif /* IMA_KEXEC */ + /* * Restore the measurement list from the previous kernel. */ -- cgit v1.2.3 From 4983f0ab7ffaad1e534b21975367429736475205 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Mon, 19 Dec 2016 16:23:09 -0800 Subject: kcov: make kcov work properly with KASLR enabled Subtract KASLR offset from the kernel addresses reported by kcov. Tested on x86_64 and AArch64 (Hikey LeMaker). Link: http://lkml.kernel.org/r/1481417456-28826-3-git-send-email-alex.popov@linux.com Signed-off-by: Alexander Popov Cc: Catalin Marinas Cc: Will Deacon Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Rob Herring Cc: Kefeng Wang Cc: AKASHI Takahiro Cc: Jon Masters Cc: David Daney Cc: Ganapatrao Kulkarni Cc: Dmitry Vyukov Cc: Nicolai Stange Cc: James Morse Cc: Andrey Ryabinin Cc: Andrey Konovalov Cc: Alexander Popov Cc: syzkaller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index cc2fa35ca480..85e5546cd791 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -19,6 +19,7 @@ #include #include #include +#include /* * kcov descriptor (one per opened debugfs file). @@ -73,6 +74,11 @@ void notrace __sanitizer_cov_trace_pc(void) if (mode == KCOV_MODE_TRACE) { unsigned long *area; unsigned long pos; + unsigned long ip = _RET_IP_; + +#ifdef CONFIG_RANDOMIZE_BASE + ip -= kaslr_offset(); +#endif /* * There is some code that runs in interrupts but for which @@ -86,7 +92,7 @@ void notrace __sanitizer_cov_trace_pc(void) /* The first word is number of subsequent PCs. */ pos = READ_ONCE(area[0]) + 1; if (likely(pos < t->kcov_size)) { - area[pos] = _RET_IP_; + area[pos] = ip; WRITE_ONCE(area[0], pos); } } -- cgit v1.2.3 From c00d2c7e89880036f288a764599b2b8b87c0a364 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 20 Dec 2016 07:04:57 -0500 Subject: move aio compat to fs/aio.c ... 
and fix the minor buglet in compat io_setup() - the native one kills the ioctx as cleanup when put_user() fails. Get rid of the bogus compat_... in the !CONFIG_AIO case, while we are at it - they should simply fail with ENOSYS, same as the native counterparts. Signed-off-by: Al Viro --- fs/aio.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/compat.c | 75 ----------------------------------------- include/linux/aio.h | 5 --- kernel/sys_ni.c | 3 ++ 4 files changed, 98 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/fs/aio.c b/fs/aio.c index 8edf253484af..8c79e1a53af9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1367,6 +1367,39 @@ out: return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctx32p); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || nr_events == 0)) { + pr_debug("EINVAL: ctx %lu nr_events %u\n", + ctx, nr_events); + goto out; + } + + ioctx = ioctx_alloc(nr_events); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + /* truncating is ok because it's a user address */ + ret = put_user((u32)ioctx->user_id, ctx32p); + if (ret) + kill_ioctx(current->mm, ioctx, NULL); + percpu_ref_put(&ioctx->users); + } + +out: + return ret; +} +#endif + /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not @@ -1591,8 +1624,8 @@ out_put_req: return ret; } -long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat) +static long do_io_submit(aio_context_t ctx_id, long nr, + struct iocb __user *__user *iocbpp, bool compat) { struct kioctx *ctx; long ret = 0; @@ -1662,6 +1695,44 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, return do_io_submit(ctx_id, nr, iocbpp, 0); } +#ifdef CONFIG_COMPAT +static inline long +copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +{ + compat_uptr_t uptr; + int i; + + for (i = 0; i < nr; ++i) { + if (get_user(uptr, ptr32 + i)) + return -EFAULT; + if (put_user(compat_ptr(uptr), ptr64 + i)) + return -EFAULT; + } + return 0; +} + +#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, u32 __user *, iocb) +{ + struct iocb __user * __user *iocb64; + long ret; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (nr > MAX_AIO_SUBMITS) + nr = MAX_AIO_SUBMITS; + + iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); + ret = copy_iocb(nr, iocb, iocb64); + if (!ret) + ret = do_io_submit(ctx_id, nr, iocb64, 1); + return ret; +} +#endif + /* lookup_kiocb * Finds a given iocb for cancellation.
*/ @@ -1761,3 +1832,25 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, } return ret; } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout) +{ + struct timespec t; + struct timespec __user *ut = NULL; + + if (timeout) { + if (compat_get_timespec(&t, timeout)) + return -EFAULT; + + ut = compat_alloc_user_space(sizeof(*ut)); + if (copy_to_user(ut, &t, sizeof(t))) + return -EFAULT; + } + return sys_io_getevents(ctx_id, min_nr, nr, events, ut); +} +#endif diff --git a/fs/compat.c b/fs/compat.c index 543b48c29ac3..3f4908c28698 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -487,45 +487,6 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, return compat_sys_fcntl64(fd, cmd, arg); } -COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p) -{ - long ret; - aio_context_t ctx64; - - mm_segment_t oldfs = get_fs(); - if (unlikely(get_user(ctx64, ctx32p))) - return -EFAULT; - - set_fs(KERNEL_DS); - /* The __user pointer cast is valid because of the set_fs() */ - ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64); - set_fs(oldfs); - /* truncating is ok because it's a user address */ - if (!ret) - ret = put_user((u32) ctx64, ctx32p); - return ret; -} - -COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, - compat_long_t, min_nr, - compat_long_t, nr, - struct io_event __user *, events, - struct compat_timespec __user *, timeout) -{ - struct timespec t; - struct timespec __user *ut = NULL; - - if (timeout) { - if (compat_get_timespec(&t, timeout)) - return -EFAULT; - - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t)) ) - return -EFAULT; - } - return sys_io_getevents(ctx_id, min_nr, nr, events, ut); -} - /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? 
VERIFY_WRITE : VERIFY_READ) @@ -602,42 +563,6 @@ out: return ret; } -static inline long -copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) -{ - compat_uptr_t uptr; - int i; - - for (i = 0; i < nr; ++i) { - if (get_user(uptr, ptr32 + i)) - return -EFAULT; - if (put_user(compat_ptr(uptr), ptr64 + i)) - return -EFAULT; - } - return 0; -} - -#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) - -COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, - int, nr, u32 __user *, iocb) -{ - struct iocb __user * __user *iocb64; - long ret; - - if (unlikely(nr < 0)) - return -EINVAL; - - if (nr > MAX_AIO_SUBMITS) - nr = MAX_AIO_SUBMITS; - - iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); - ret = copy_iocb(nr, iocb, iocb64); - if (!ret) - ret = do_io_submit(ctx_id, nr, iocb64, 1); - return ret; -} - struct compat_ncp_mount_data { compat_int_t version; compat_uint_t ncp_fd; diff --git a/include/linux/aio.h b/include/linux/aio.h index 9eb42dbc5582..fdd0a343f455 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -14,14 +14,9 @@ typedef int (kiocb_cancel_fn)(struct kiocb *); /* prototypes */ #ifdef CONFIG_AIO extern void exit_aio(struct mm_struct *mm); -extern long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else static inline void exit_aio(struct mm_struct *mm) { } -static inline long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user * __user *iocbpp, - bool compat) { return 0; } static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 635482e60ca3..8acef8576ce9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -150,6 +150,9 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(compat_sys_io_setup); +cond_syscall(compat_sys_io_submit); +cond_syscall(compat_sys_io_getevents); cond_syscall(sys_sysfs); cond_syscall(sys_syslog); cond_syscall(sys_process_vm_readv); -- cgit v1.2.3 From e3ba730702af370563f66cb610b71aa0ca67955e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Dec 2016 10:15:20 +0100 Subject: fsnotify: Remove fsnotify_duplicate_mark() There are only two call sites of fsnotify_duplicate_mark(). Those are in kernel/audit_tree.c and both are bogus. The vfsmount pointer is unused for the audit tree, the inode pointer and group get set in fsnotify_add_mark_locked() later anyway, and mask and free_mark are already set in alloc_chunk(). In fact, calling fsnotify_duplicate_mark() is actively harmful because the following fsnotify_add_mark_locked() will leak a group reference by overwriting the group pointer. So just remove the two calls to fsnotify_duplicate_mark() and the function.
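A hedged reconstruction of the leaking sequence in the pre-patch audit_tree code (the comments are editorial, not from the commit):

	fsnotify_duplicate_mark(&new->mark, entry);
		/* takes a reference via fsnotify_get_group(old->group)
		 * and sets new->mark.group = entry->group */
	fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1);
		/* fsnotify_add_mark_locked() assigns mark->group itself,
		 * overwriting the pointer, so the reference taken above
		 * is never dropped */

Passing entry->group and entry->inode directly to fsnotify_add_mark(), as the patch does, makes the duplication step unnecessary.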
Signed-off-by: Jan Kara [PM: line wrapping to fit in 80 chars] Signed-off-by: Paul Moore --- fs/notify/mark.c | 12 ------------ include/linux/fsnotify_backend.h | 2 -- kernel/audit_tree.c | 8 ++++---- 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d3fea0bd89e2..6043306e8e21 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -510,18 +510,6 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) } } -void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) -{ - assert_spin_locked(&old->lock); - new->inode = old->inode; - new->mnt = old->mnt; - if (old->group) - fsnotify_get_group(old->group); - new->group = old->group; - new->mask = old->mask; - new->free_mark = old->free_mark; -} - /* * Nothing fancy, just initialize lists and locks and counters. */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7268ed076be8..ce77caa2bb10 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -324,8 +324,6 @@ extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(str extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode); /* find (and take a reference) to a mark associated with group and vfsmount */ extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt); -/* copy the values from old into new */ -extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old); /* set the ignored_mask of a mark */ extern void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask); /* set the mask of a mark (might pin the object into memory */ diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 055f11b0a50f..b4b58400531f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -258,8 +258,8 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - fsnotify_duplicate_mark(&new->mark, entry); - if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { + if (fsnotify_add_mark(&new->mark, + entry->group, entry->inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -395,8 +395,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return -ENOENT; } - fsnotify_duplicate_mark(chunk_entry, old_entry); - if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { + if (fsnotify_add_mark(chunk_entry, + old_entry->group, old_entry->inode, NULL, 1)) { spin_unlock(&old_entry->lock); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); -- cgit v1.2.3 From 7c0f6ba682b9c7632072ffbedf8d328c8f3c42ba Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Dec 2016 11:46:01 -0800 Subject: Replace <asm/uaccess.h> with <linux/uaccess.h> globally This was entirely automated, using the script by Al: PATT='^[[:blank:]]*#[[:blank:]]*include[[:blank:]]*<asm/uaccess.h>' sed -i -e "s!$PATT!#include <linux/uaccess.h>!" \ $(git grep -l "$PATT"|grep -v ^include/linux/uaccess.h) to do the replacement at the end of the merge window.
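The net effect on each affected file is a single include-line change, e.g. (illustrative hunk, same for every file in the list below):

	-#include <asm/uaccess.h>
	+#include <linux/uaccess.h>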
Requested-by: Al Viro Signed-off-by: Linus Torvalds --- arch/alpha/boot/misc.c | 2 +- arch/alpha/kernel/irq.c | 2 +- arch/alpha/kernel/osf_sys.c | 2 +- arch/alpha/kernel/process.c | 2 +- arch/alpha/kernel/ptrace.c | 2 +- arch/alpha/kernel/setup.c | 2 +- arch/alpha/kernel/signal.c | 2 +- arch/alpha/kernel/srm_env.c | 2 +- arch/alpha/kernel/srmcons.c | 2 +- arch/alpha/kernel/time.c | 2 +- arch/alpha/kernel/traps.c | 2 +- arch/alpha/lib/csum_partial_copy.c | 2 +- arch/alpha/math-emu/math.c | 2 +- arch/alpha/mm/init.c | 2 +- arch/arm/common/bL_switcher_dummy_if.c | 2 +- arch/arm/kernel/swp_emulate.c | 2 +- arch/arm/kvm/arm.c | 2 +- arch/arm/kvm/guest.c | 2 +- arch/arm/mach-iop13xx/irq.c | 2 +- arch/arm/mach-ixp4xx/common.c | 2 +- arch/arm/mach-rpc/dma.c | 2 +- arch/arm/plat-iop/time.c | 2 +- arch/arm64/include/asm/word-at-a-time.h | 2 +- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/entry.S | 2 +- arch/arm64/kernel/probes/kprobes.c | 2 +- arch/arm64/kernel/signal32.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/lib/clear_user.S | 2 +- arch/arm64/lib/copy_from_user.S | 2 +- arch/arm64/lib/copy_in_user.S | 2 +- arch/arm64/lib/copy_to_user.S | 2 +- arch/arm64/mm/cache.S | 2 +- arch/arm64/xen/hypercall.S | 2 +- arch/avr32/kernel/avr32_ksyms.c | 2 +- arch/avr32/kernel/ptrace.c | 2 +- arch/avr32/kernel/signal.c | 2 +- arch/avr32/mm/cache.c | 2 +- arch/blackfin/kernel/bfin_dma.c | 2 +- arch/blackfin/kernel/kgdb_test.c | 2 +- arch/blackfin/kernel/module.c | 2 +- arch/c6x/mm/init.c | 2 +- arch/cris/arch-v10/drivers/eeprom.c | 2 +- arch/cris/arch-v10/drivers/sync_serial.c | 2 +- arch/cris/arch-v10/kernel/ptrace.c | 2 +- arch/cris/arch-v10/kernel/signal.c | 2 +- arch/cris/arch-v10/kernel/traps.c | 2 +- arch/cris/arch-v10/lib/usercopy.c | 2 +- arch/cris/arch-v10/mm/fault.c | 2 +- arch/cris/arch-v32/drivers/cryptocop.c | 2 +- arch/cris/arch-v32/kernel/ptrace.c | 2 +- arch/cris/arch-v32/kernel/signal.c | 2 +- arch/cris/arch-v32/kernel/traps.c | 2 +- arch/cris/arch-v32/lib/usercopy.c | 2 +- arch/cris/kernel/crisksyms.c | 2 +- arch/cris/kernel/process.c | 2 +- arch/cris/kernel/profile.c | 2 +- arch/cris/kernel/ptrace.c | 2 +- arch/cris/kernel/sys_cris.c | 2 +- arch/cris/kernel/traps.c | 2 +- arch/frv/include/asm/futex.h | 2 +- arch/frv/kernel/irq.c | 2 +- arch/frv/kernel/pm-mb93093.c | 2 +- arch/frv/kernel/pm.c | 2 +- arch/frv/kernel/process.c | 2 +- arch/frv/kernel/ptrace.c | 2 +- arch/frv/kernel/signal.c | 2 +- arch/frv/kernel/sys_frv.c | 2 +- arch/frv/kernel/sysctl.c | 2 +- arch/frv/kernel/traps.c | 2 +- arch/frv/kernel/uaccess.c | 2 +- arch/frv/mm/dma-alloc.c | 2 +- arch/frv/mm/extable.c | 2 +- arch/h8300/boot/compressed/misc.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/h8300/kernel/signal.c | 2 +- arch/hexagon/kernel/hexagon_ksyms.c | 2 +- arch/hexagon/kernel/signal.c | 2 +- arch/hexagon/mm/uaccess.c | 2 +- arch/hexagon/mm/vm_fault.c | 2 +- arch/ia64/kernel/brl_emu.c | 2 +- arch/ia64/kernel/crash_dump.c | 2 +- arch/ia64/kernel/init_task.c | 2 +- arch/ia64/kernel/irq.c | 2 +- arch/ia64/kernel/kprobes.c | 2 +- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/ia64/kernel/ptrace.c | 2 +- arch/ia64/kernel/salinfo.c | 2 +- arch/ia64/kernel/signal.c | 2 +- arch/ia64/kernel/sys_ia64.c | 2 +- arch/ia64/kernel/traps.c | 2 +- arch/ia64/kernel/unaligned.c | 2 +- arch/ia64/kernel/unwind.c | 2 +- arch/ia64/lib/csum_partial_copy.c | 2 +- arch/ia64/mm/extable.c | 2 +- arch/ia64/mm/init.c | 2 +- arch/ia64/sn/kernel/sn2/sn_hwperf.c | 2 +- arch/ia64/sn/kernel/sn2/sn_proc_fs.c 
| 2 +- arch/ia64/sn/kernel/tiocx.c | 2 +- arch/m32r/kernel/align.c | 2 +- arch/m32r/kernel/irq.c | 2 +- arch/m32r/kernel/m32r_ksyms.c | 2 +- arch/m32r/kernel/process.c | 2 +- arch/m32r/kernel/ptrace.c | 2 +- arch/m32r/kernel/signal.c | 2 +- arch/m32r/kernel/sys_m32r.c | 2 +- arch/m32r/kernel/traps.c | 2 +- arch/m32r/lib/csum_partial_copy.c | 2 +- arch/m32r/lib/usercopy.c | 2 +- arch/m32r/mm/extable.c | 2 +- arch/m32r/mm/fault-nommu.c | 2 +- arch/m68k/bvme6000/rtc.c | 2 +- arch/m68k/kernel/process.c | 2 +- arch/m68k/kernel/ptrace.c | 2 +- arch/m68k/kernel/signal.c | 2 +- arch/m68k/kernel/sys_m68k.c | 2 +- arch/m68k/kernel/traps.c | 2 +- arch/m68k/lib/uaccess.c | 2 +- arch/m68k/mac/misc.c | 2 +- arch/m68k/mm/init.c | 2 +- arch/m68k/mm/motorola.c | 2 +- arch/m68k/mm/sun3mmu.c | 2 +- arch/m68k/mvme16x/rtc.c | 2 +- arch/m68k/sun3/mmu_emu.c | 2 +- arch/metag/kernel/irq.c | 2 +- arch/mips/alchemy/common/power.c | 2 +- arch/mips/dec/kn01-berr.c | 2 +- arch/mips/include/asm/checksum.h | 2 +- arch/mips/include/asm/compat-signal.h | 2 +- arch/mips/include/asm/r4kcache.h | 2 +- arch/mips/include/asm/termios.h | 2 +- arch/mips/jazz/jazzdma.c | 2 +- arch/mips/kernel/branch.c | 2 +- arch/mips/kernel/cpu-probe.c | 2 +- arch/mips/kernel/crash_dump.c | 2 +- arch/mips/kernel/irq.c | 2 +- arch/mips/kernel/kgdb.c | 2 +- arch/mips/kernel/linux32.c | 2 +- arch/mips/kernel/mips-mt-fpaff.c | 2 +- arch/mips/kernel/mips-r2-to-r6-emul.c | 2 +- arch/mips/kernel/mips_ksyms.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/mips/kernel/ptrace.c | 2 +- arch/mips/kernel/ptrace32.c | 2 +- arch/mips/kernel/signal32.c | 2 +- arch/mips/kernel/signal_n32.c | 2 +- arch/mips/kernel/syscall.c | 2 +- arch/mips/kernel/traps.c | 2 +- arch/mips/kernel/unaligned.c | 2 +- arch/mips/math-emu/cp1emu.c | 2 +- arch/mips/math-emu/dsemul.c | 2 +- arch/mips/mm/extable.c | 2 +- arch/mips/mm/sc-debugfs.c | 2 +- arch/mips/oprofile/op_model_loongson3.c | 2 +- arch/mips/sgi-ip22/ip28-berr.c | 2 +- arch/mips/sgi-ip27/ip27-berr.c | 2 +- arch/mips/sgi-ip32/ip32-berr.c | 2 +- arch/mips/sibyte/common/sb_tbprof.c | 2 +- arch/mn10300/kernel/fpu.c | 2 +- arch/mn10300/kernel/mn10300_ksyms.c | 2 +- arch/mn10300/kernel/process.c | 2 +- arch/mn10300/kernel/ptrace.c | 2 +- arch/mn10300/kernel/setup.c | 2 +- arch/mn10300/kernel/signal.c | 2 +- arch/mn10300/kernel/sys_mn10300.c | 2 +- arch/mn10300/lib/checksum.c | 2 +- arch/mn10300/mm/cache-smp.c | 2 +- arch/mn10300/mm/cache.c | 2 +- arch/mn10300/mm/extable.c | 2 +- arch/mn10300/mm/init.c | 2 +- arch/mn10300/mm/misalignment.c | 2 +- arch/mn10300/proc-mn2ws0050/proc-init.c | 2 +- arch/nios2/kernel/traps.c | 2 +- arch/openrisc/kernel/or32_ksyms.c | 2 +- arch/openrisc/kernel/process.c | 2 +- arch/openrisc/kernel/signal.c | 2 +- arch/openrisc/kernel/traps.c | 2 +- arch/openrisc/mm/fault.c | 2 +- arch/parisc/kernel/asm-offsets.c | 2 +- arch/parisc/kernel/parisc_ksyms.c | 2 +- arch/parisc/kernel/pci-dma.c | 2 +- arch/parisc/kernel/perf.c | 2 +- arch/parisc/kernel/ptrace.c | 2 +- arch/parisc/kernel/signal.c | 2 +- arch/parisc/kernel/signal32.c | 2 +- arch/parisc/kernel/sys_parisc.c | 2 +- arch/parisc/kernel/time.c | 2 +- arch/parisc/kernel/unaligned.c | 2 +- arch/parisc/kernel/unwind.c | 2 +- arch/parisc/lib/checksum.c | 2 +- arch/powerpc/include/asm/asm-prototypes.h | 2 +- arch/powerpc/kernel/align.c | 2 +- arch/powerpc/kernel/crash_dump.c | 2 +- arch/powerpc/kernel/hw_breakpoint.c | 2 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/kprobes.c | 2 +- arch/powerpc/kernel/module.c | 2 +- 
arch/powerpc/kernel/nvram_64.c | 2 +- arch/powerpc/kernel/pci_32.c | 2 +- arch/powerpc/kernel/proc_powerpc.c | 2 +- arch/powerpc/kernel/ptrace.c | 2 +- arch/powerpc/kernel/ptrace32.c | 2 +- arch/powerpc/kernel/rtas-proc.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/kernel/rtas_flash.c | 2 +- arch/powerpc/kernel/rtasd.c | 2 +- arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/signal.c | 2 +- arch/powerpc/kernel/signal_32.c | 2 +- arch/powerpc/kernel/signal_64.c | 2 +- arch/powerpc/kernel/sys_ppc32.c | 2 +- arch/powerpc/kernel/syscalls.c | 2 +- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/kernel/traps.c | 2 +- arch/powerpc/kernel/vecemu.c | 2 +- arch/powerpc/kvm/book3s.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/book3s_pr_papr.c | 2 +- arch/powerpc/kvm/book3s_rtas.c | 2 +- arch/powerpc/kvm/book3s_xics.c | 2 +- arch/powerpc/kvm/booke.c | 2 +- arch/powerpc/kvm/mpic.c | 2 +- arch/powerpc/kvm/powerpc.c | 2 +- arch/powerpc/lib/checksum_wrappers.c | 2 +- arch/powerpc/lib/code-patching.c | 2 +- arch/powerpc/lib/sstep.c | 2 +- arch/powerpc/lib/usercopy_64.c | 2 +- arch/powerpc/math-emu/fabs.c | 2 +- arch/powerpc/math-emu/fadd.c | 2 +- arch/powerpc/math-emu/fadds.c | 2 +- arch/powerpc/math-emu/fcmpo.c | 2 +- arch/powerpc/math-emu/fcmpu.c | 2 +- arch/powerpc/math-emu/fctiw.c | 2 +- arch/powerpc/math-emu/fctiwz.c | 2 +- arch/powerpc/math-emu/fdiv.c | 2 +- arch/powerpc/math-emu/fdivs.c | 2 +- arch/powerpc/math-emu/fmadd.c | 2 +- arch/powerpc/math-emu/fmadds.c | 2 +- arch/powerpc/math-emu/fmr.c | 2 +- arch/powerpc/math-emu/fmsub.c | 2 +- arch/powerpc/math-emu/fmsubs.c | 2 +- arch/powerpc/math-emu/fmul.c | 2 +- arch/powerpc/math-emu/fmuls.c | 2 +- arch/powerpc/math-emu/fnabs.c | 2 +- arch/powerpc/math-emu/fneg.c | 2 +- arch/powerpc/math-emu/fnmadd.c | 2 +- arch/powerpc/math-emu/fnmadds.c | 2 +- arch/powerpc/math-emu/fnmsub.c | 2 +- arch/powerpc/math-emu/fnmsubs.c | 2 +- arch/powerpc/math-emu/fre.c | 2 +- arch/powerpc/math-emu/fres.c | 2 +- arch/powerpc/math-emu/frsp.c | 2 +- arch/powerpc/math-emu/frsqrte.c | 2 +- arch/powerpc/math-emu/frsqrtes.c | 2 +- arch/powerpc/math-emu/fsel.c | 2 +- arch/powerpc/math-emu/fsqrt.c | 2 +- arch/powerpc/math-emu/fsqrts.c | 2 +- arch/powerpc/math-emu/fsub.c | 2 +- arch/powerpc/math-emu/fsubs.c | 2 +- arch/powerpc/math-emu/lfd.c | 2 +- arch/powerpc/math-emu/lfs.c | 2 +- arch/powerpc/math-emu/math.c | 2 +- arch/powerpc/math-emu/math_efp.c | 2 +- arch/powerpc/math-emu/mcrfs.c | 2 +- arch/powerpc/math-emu/mffs.c | 2 +- arch/powerpc/math-emu/mtfsb0.c | 2 +- arch/powerpc/math-emu/mtfsb1.c | 2 +- arch/powerpc/math-emu/mtfsf.c | 2 +- arch/powerpc/math-emu/mtfsfi.c | 2 +- arch/powerpc/math-emu/stfd.c | 2 +- arch/powerpc/math-emu/stfiwx.c | 2 +- arch/powerpc/math-emu/stfs.c | 2 +- arch/powerpc/mm/40x_mmu.c | 2 +- arch/powerpc/mm/fsl_booke_mmu.c | 2 +- arch/powerpc/mm/hash_utils_64.c | 2 +- arch/powerpc/mm/init_64.c | 2 +- arch/powerpc/mm/subpage-prot.c | 2 +- arch/powerpc/platforms/cell/spufs/coredump.c | 2 +- arch/powerpc/platforms/cell/spufs/file.c | 2 +- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- arch/powerpc/platforms/cell/spufs/syscalls.c | 2 +- arch/powerpc/platforms/chrp/nvram.c | 2 +- arch/powerpc/platforms/powernv/opal-elog.c | 2 +- arch/powerpc/platforms/powernv/opal-lpc.c | 2 +- arch/powerpc/platforms/powernv/opal-prd.c | 2 +- arch/powerpc/platforms/pseries/cmm.c | 2 +- arch/powerpc/platforms/pseries/dlpar.c | 2 +- arch/powerpc/platforms/pseries/dtl.c | 2 +- 
arch/powerpc/platforms/pseries/lparcfg.c | 2 +- arch/powerpc/platforms/pseries/nvram.c | 2 +- arch/powerpc/platforms/pseries/reconfig.c | 2 +- arch/powerpc/platforms/pseries/scanlog.c | 2 +- arch/powerpc/sysdev/scom.c | 2 +- arch/powerpc/sysdev/tsi108_pci.c | 2 +- arch/s390/appldata/appldata_base.c | 2 +- arch/s390/boot/compressed/misc.c | 2 +- arch/s390/crypto/prng.c | 2 +- arch/s390/include/asm/checksum.h | 2 +- arch/s390/include/asm/idals.h | 2 +- arch/s390/include/asm/mmu_context.h | 2 +- arch/s390/kernel/compat_linux.c | 2 +- arch/s390/kernel/compat_signal.c | 2 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/dis.c | 2 +- arch/s390/kernel/kprobes.c | 2 +- arch/s390/kernel/ptrace.c | 2 +- arch/s390/kernel/signal.c | 2 +- arch/s390/kernel/sys_s390.c | 2 +- arch/s390/kernel/time.c | 2 +- arch/s390/kernel/traps.c | 2 +- arch/s390/kvm/interrupt.c | 2 +- arch/s390/mm/init.c | 2 +- arch/score/include/asm/checksum.h | 2 +- arch/score/kernel/ptrace.c | 2 +- arch/score/kernel/traps.c | 2 +- arch/score/lib/checksum_copy.c | 2 +- arch/sh/boards/mach-landisk/gio.c | 2 +- arch/sh/boot/compressed/misc.c | 2 +- arch/sh/include/asm/mmu_context.h | 2 +- arch/sh/kernel/cpu/init.c | 2 +- arch/sh/kernel/cpu/shmobile/cpuidle.c | 2 +- arch/sh/kernel/cpu/shmobile/pm.c | 2 +- arch/sh/kernel/crash_dump.c | 2 +- arch/sh/kernel/io_trapped.c | 2 +- arch/sh/kernel/irq.c | 2 +- arch/sh/kernel/kprobes.c | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sh/kernel/process_64.c | 2 +- arch/sh/kernel/ptrace_32.c | 2 +- arch/sh/kernel/ptrace_64.c | 2 +- arch/sh/kernel/setup.c | 2 +- arch/sh/kernel/sh_ksyms_64.c | 2 +- arch/sh/kernel/signal_32.c | 2 +- arch/sh/kernel/signal_64.c | 2 +- arch/sh/kernel/sys_sh.c | 2 +- arch/sh/kernel/sys_sh32.c | 2 +- arch/sh/kernel/traps_64.c | 2 +- arch/sh/math-emu/math.c | 2 +- arch/sh/mm/cache-debugfs.c | 2 +- arch/sh/mm/cache-sh3.c | 2 +- arch/sh/mm/cache-sh5.c | 2 +- arch/sh/mm/cache-sh7705.c | 2 +- arch/sh/mm/extable_32.c | 2 +- arch/sh/mm/extable_64.c | 2 +- arch/sh/mm/nommu.c | 2 +- arch/sh/mm/pmb.c | 2 +- arch/sh/mm/tlb-sh3.c | 2 +- arch/sh/mm/tlbex_64.c | 2 +- arch/sh/mm/tlbflush_64.c | 2 +- arch/sh/oprofile/backtrace.c | 2 +- arch/sparc/include/asm/checksum_32.h | 2 +- arch/sparc/include/asm/checksum_64.h | 2 +- arch/sparc/kernel/apc.c | 2 +- arch/sparc/kernel/irq_64.c | 2 +- arch/sparc/kernel/kprobes.c | 2 +- arch/sparc/kernel/mdesc.c | 2 +- arch/sparc/kernel/pci.c | 2 +- arch/sparc/kernel/pcic.c | 2 +- arch/sparc/kernel/pmc.c | 2 +- arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- arch/sparc/kernel/ptrace_32.c | 2 +- arch/sparc/kernel/ptrace_64.c | 2 +- arch/sparc/kernel/signal32.c | 2 +- arch/sparc/kernel/signal_32.c | 2 +- arch/sparc/kernel/signal_64.c | 2 +- arch/sparc/kernel/smp_64.c | 2 +- arch/sparc/kernel/sys_sparc32.c | 2 +- arch/sparc/kernel/sys_sparc_32.c | 2 +- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/sparc/kernel/time_64.c | 2 +- arch/sparc/kernel/traps_64.c | 2 +- arch/sparc/kernel/unaligned_32.c | 2 +- arch/sparc/kernel/unaligned_64.c | 2 +- arch/sparc/kernel/uprobes.c | 2 +- arch/sparc/kernel/visemul.c | 2 +- arch/sparc/kernel/windows.c | 2 +- arch/sparc/math-emu/math_32.c | 2 +- arch/sparc/math-emu/math_64.c | 2 +- arch/sparc/mm/extable.c | 2 +- arch/sparc/mm/init_64.c | 2 +- arch/tile/kernel/process.c | 2 +- arch/tile/kernel/single_step.c | 2 +- arch/tile/kernel/unaligned.c | 2 +- arch/um/drivers/harddog_kern.c | 2 +- arch/um/drivers/hostaudio_kern.c | 2 +- arch/um/drivers/mconsole_kern.c | 2 +- arch/um/drivers/mmapper_kern.c | 2 
+- arch/um/drivers/random.c | 2 +- arch/um/kernel/exec.c | 2 +- arch/um/kernel/exitcode.c | 2 +- arch/um/kernel/process.c | 2 +- arch/um/kernel/ptrace.c | 2 +- arch/um/kernel/syscall.c | 2 +- arch/x86/entry/common.c | 2 +- arch/x86/ia32/ia32_aout.c | 2 +- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/include/asm/asm-prototypes.h | 2 +- arch/x86/include/asm/checksum_64.h | 2 +- arch/x86/include/asm/xen/page.h | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/mcheck/mce-severity.c | 2 +- arch/x86/kernel/crash_dump_32.c | 2 +- arch/x86/kernel/doublefault.c | 2 +- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/kernel/kprobes/opt.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/test_nx.c | 2 +- arch/x86/kernel/tls.c | 2 +- arch/x86/kernel/vm86_32.c | 2 +- arch/x86/lib/usercopy_32.c | 2 +- arch/x86/math-emu/errors.c | 2 +- arch/x86/math-emu/fpu_entry.c | 2 +- arch/x86/math-emu/get_address.c | 2 +- arch/x86/math-emu/load_store.c | 2 +- arch/x86/math-emu/reg_ld_str.c | 2 +- arch/x86/mm/extable.c | 2 +- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pageattr.c | 2 +- arch/x86/um/ptrace_32.c | 2 +- arch/x86/um/ptrace_64.c | 2 +- arch/x86/um/signal.c | 2 +- arch/x86/um/tls_32.c | 2 +- arch/x86/xen/p2m.c | 2 +- arch/xtensa/include/asm/checksum.h | 2 +- arch/xtensa/include/asm/segment.h | 2 +- arch/xtensa/kernel/asm-offsets.c | 2 +- arch/xtensa/kernel/irq.c | 2 +- arch/xtensa/kernel/process.c | 2 +- arch/xtensa/kernel/ptrace.c | 2 +- arch/xtensa/kernel/signal.c | 2 +- arch/xtensa/kernel/stacktrace.c | 2 +- arch/xtensa/kernel/syscall.c | 2 +- arch/xtensa/kernel/traps.c | 2 +- arch/xtensa/kernel/xtensa_ksyms.c | 2 +- arch/xtensa/platforms/iss/console.c | 2 +- arch/xtensa/platforms/iss/simdisk.c | 2 +- block/ioctl.c | 2 +- block/partitions/ibm.c | 2 +- block/scsi_ioctl.c | 2 +- drivers/acpi/acpi_video.c | 2 +- drivers/acpi/battery.c | 2 +- drivers/acpi/osl.c | 2 +- drivers/acpi/proc.c | 2 +- drivers/acpi/processor_thermal.c | 2 +- drivers/acpi/processor_throttling.c | 2 +- drivers/acpi/thermal.c | 2 +- drivers/atm/adummy.c | 2 +- drivers/atm/atmtcp.c | 2 +- drivers/atm/eni.c | 2 +- drivers/atm/firestream.c | 2 +- drivers/atm/fore200e.c | 2 +- drivers/atm/he.c | 2 +- drivers/atm/horizon.c | 2 +- drivers/atm/idt77105.c | 2 +- drivers/atm/idt77252.c | 2 +- drivers/atm/iphase.c | 2 +- drivers/atm/nicstar.c | 2 +- drivers/atm/suni.c | 2 +- drivers/atm/uPD98402.c | 2 +- drivers/atm/zatm.c | 2 +- drivers/base/memory.c | 2 +- drivers/block/DAC960.c | 2 +- drivers/block/amiflop.c | 2 +- drivers/block/brd.c | 2 +- drivers/block/cciss.c | 2 +- drivers/block/cryptoloop.c | 2 +- drivers/block/hd.c | 2 +- drivers/block/loop.c | 2 +- drivers/block/nbd.c | 2 +- drivers/block/paride/pcd.c | 2 +- drivers/block/paride/pd.c | 2 +- drivers/block/paride/pf.c | 2 +- drivers/block/paride/pg.c | 2 +- drivers/block/paride/pt.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/swim3.c | 2 +- drivers/block/sx8.c | 2 +- drivers/block/umem.c | 2 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/agp/compat_ioctl.c | 2 +- drivers/char/agp/frontend.c | 2 +- drivers/char/applicom.c | 2 +- drivers/char/bfin-otp.c | 2 +- drivers/char/ds1620.c | 2 +- drivers/char/dtlk.c | 2 +- drivers/char/generic_nvram.c | 2 +- drivers/char/hangcheck-timer.c | 2 +- drivers/char/hw_random/core.c | 2 +- drivers/char/ipmi/ipmi_watchdog.c | 2 +- drivers/char/lp.c | 2 +- drivers/char/mbcs.c | 2 +- drivers/char/mmtimer.c | 2 +- drivers/char/mwave/3780i.c | 2 +- 
drivers/char/mwave/mwavedd.h | 2 +- drivers/char/nsc_gpio.c | 2 +- drivers/char/nwbutton.c | 2 +- drivers/char/nwflash.c | 2 +- drivers/char/pc8736x_gpio.c | 2 +- drivers/char/pcmcia/cm4040_cs.c | 2 +- drivers/char/pcmcia/synclink_cs.c | 2 +- drivers/char/random.c | 2 +- drivers/char/raw.c | 2 +- drivers/char/scx200_gpio.c | 2 +- drivers/char/sonypi.c | 2 +- drivers/char/tlclk.c | 2 +- drivers/char/toshiba.c | 2 +- drivers/char/xilinx_hwicap/xilinx_hwicap.c | 2 +- drivers/cpufreq/ia64-acpi-cpufreq.c | 2 +- drivers/cpuidle/governors/ladder.c | 2 +- drivers/dio/dio.c | 2 +- drivers/edac/edac_device.c | 2 +- drivers/edac/edac_mc.c | 2 +- drivers/edac/edac_pci.c | 2 +- drivers/i2c/i2c-core.c | 2 +- drivers/ide/hpt366.c | 2 +- drivers/ide/ide-disk.c | 2 +- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-iops.c | 2 +- drivers/ide/ide-probe.c | 2 +- drivers/ide/ide-proc.c | 2 +- drivers/infiniband/core/ucm.c | 2 +- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/core/uverbs_main.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 2 +- drivers/infiniband/ulp/iser/iscsi_iser.c | 2 +- drivers/input/input-compat.c | 2 +- drivers/input/misc/atlas_btns.c | 2 +- drivers/input/mouse/amimouse.c | 2 +- drivers/input/mouse/atarimouse.c | 2 +- drivers/input/mouse/trackpoint.c | 2 +- drivers/input/serio/hp_sdc.c | 2 +- drivers/input/serio/q40kbd.c | 2 +- drivers/input/serio/serport.c | 2 +- drivers/input/tablet/aiptek.c | 2 +- drivers/input/tablet/gtco.c | 2 +- drivers/isdn/capi/kcapi.c | 2 +- drivers/isdn/hardware/avm/b1.c | 2 +- drivers/isdn/hardware/avm/b1dma.c | 2 +- drivers/isdn/hardware/avm/c4.c | 2 +- drivers/isdn/hardware/eicon/capimain.c | 2 +- drivers/isdn/hardware/eicon/divamnt.c | 2 +- drivers/isdn/hardware/eicon/divasi.c | 2 +- drivers/isdn/hardware/eicon/divasmain.c | 2 +- drivers/isdn/hardware/eicon/divasproc.c | 2 +- drivers/isdn/hysdn/hysdn_boot.c | 2 +- drivers/lguest/core.c | 2 +- drivers/lguest/page_tables.c | 2 +- drivers/lguest/x86/core.c | 2 +- drivers/macintosh/ans-lcd.c | 2 +- drivers/macintosh/smu.c | 2 +- drivers/macintosh/via-pmu.c | 2 +- drivers/macintosh/via-pmu68k.c | 2 +- drivers/md/dm-ioctl.c | 2 +- drivers/media/dvb-core/dmxdev.c | 2 +- drivers/media/dvb-core/dvb_demux.c | 2 +- drivers/media/dvb-core/dvb_net.c | 2 +- drivers/media/dvb-core/dvb_ringbuffer.c | 2 +- drivers/media/i2c/adv7170.c | 2 +- drivers/media/i2c/adv7175.c | 2 +- drivers/media/i2c/bt856.c | 2 +- drivers/media/i2c/bt866.c | 2 +- drivers/media/i2c/cs53l32a.c | 2 +- drivers/media/i2c/m52790.c | 2 +- drivers/media/i2c/saa6588.c | 2 +- drivers/media/i2c/saa7110.c | 2 +- drivers/media/i2c/saa7185.c | 2 +- drivers/media/i2c/tlv320aic23b.c | 2 +- drivers/media/i2c/vp27smpx.c | 2 +- drivers/media/i2c/vpx3220.c | 2 +- drivers/media/i2c/wm8739.c | 2 +- drivers/media/i2c/wm8775.c | 2 +- drivers/media/pci/ivtv/ivtv-driver.h | 2 +- drivers/media/pci/meye/meye.c | 2 +- drivers/media/pci/zoran/videocodec.c | 2 +- drivers/media/pci/zoran/zoran_driver.c | 2 +- drivers/media/platform/arv.c | 2 +- drivers/media/usb/pvrusb2/pvrusb2-ioread.c | 2 +- drivers/media/usb/pwc/pwc-ctrl.c | 2 +- drivers/media/v4l2-core/v4l2-common.c | 2 +- drivers/media/v4l2-core/v4l2-dev.c | 2 +- drivers/message/fusion/mptctl.c | 2 +- drivers/message/fusion/mptlan.h | 2 +- drivers/misc/ibmasm/ibmasmfs.c | 2 +- drivers/mmc/core/block.c | 2 +- drivers/mmc/host/android-goldfish.c | 2 +- drivers/mtd/devices/pmc551.c | 2 +- drivers/mtd/devices/slram.c | 2 +- drivers/mtd/ftl.c | 2 +- 
drivers/mtd/inftlcore.c | 2 +- drivers/mtd/inftlmount.c | 2 +- drivers/mtd/maps/sun_uflash.c | 2 +- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/nftlcore.c | 2 +- drivers/net/appletalk/ipddp.c | 2 +- drivers/net/eql.c | 2 +- drivers/net/ethernet/3com/3c509.c | 2 +- drivers/net/ethernet/3com/3c515.c | 2 +- drivers/net/ethernet/3com/3c574_cs.c | 2 +- drivers/net/ethernet/3com/3c59x.c | 2 +- drivers/net/ethernet/3com/typhoon.c | 2 +- drivers/net/ethernet/8390/axnet_cs.c | 2 +- drivers/net/ethernet/8390/ne2k-pci.c | 2 +- drivers/net/ethernet/8390/pcnet_cs.c | 2 +- drivers/net/ethernet/adaptec/starfire.c | 2 +- drivers/net/ethernet/alteon/acenic.c | 2 +- drivers/net/ethernet/amd/amd8111e.c | 2 +- drivers/net/ethernet/amd/nmclan_cs.c | 2 +- drivers/net/ethernet/broadcom/b44.c | 2 +- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 2 +- drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- drivers/net/ethernet/dec/tulip/de2104x.c | 2 +- drivers/net/ethernet/dec/tulip/de4x5.c | 2 +- drivers/net/ethernet/dec/tulip/dmfe.c | 2 +- drivers/net/ethernet/dec/tulip/tulip_core.c | 2 +- drivers/net/ethernet/dec/tulip/uli526x.c | 2 +- drivers/net/ethernet/dec/tulip/winbond-840.c | 2 +- drivers/net/ethernet/dec/tulip/xircom_cb.c | 2 +- drivers/net/ethernet/dlink/dl2k.h | 2 +- drivers/net/ethernet/dlink/sundance.c | 2 +- drivers/net/ethernet/fealnx.c | 2 +- drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c | 2 +- drivers/net/ethernet/freescale/fs_enet/mac-fcc.c | 2 +- drivers/net/ethernet/freescale/fs_enet/mac-fec.c | 2 +- drivers/net/ethernet/freescale/fs_enet/mac-scc.c | 2 +- drivers/net/ethernet/freescale/fs_enet/mii-fec.c | 2 +- drivers/net/ethernet/freescale/gianfar.c | 2 +- drivers/net/ethernet/freescale/gianfar.h | 2 +- drivers/net/ethernet/freescale/gianfar_ethtool.c | 2 +- drivers/net/ethernet/freescale/ucc_geth.c | 2 +- drivers/net/ethernet/freescale/ucc_geth_ethtool.c | 2 +- drivers/net/ethernet/fujitsu/fmvj18x_cs.c | 2 +- drivers/net/ethernet/ibm/emac/core.c | 2 +- drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c | 2 +- drivers/net/ethernet/natsemi/natsemi.c | 2 +- drivers/net/ethernet/natsemi/ns83820.c | 2 +- drivers/net/ethernet/packetengines/hamachi.c | 2 +- drivers/net/ethernet/packetengines/yellowfin.c | 2 +- drivers/net/ethernet/realtek/8139cp.c | 2 +- drivers/net/ethernet/sgi/ioc3-eth.c | 2 +- drivers/net/ethernet/sis/sis900.c | 2 +- drivers/net/ethernet/smsc/epic100.c | 2 +- drivers/net/ethernet/smsc/smc91c92_cs.c | 2 +- drivers/net/ethernet/sun/cassini.c | 2 +- drivers/net/ethernet/sun/sungem.c | 2 +- drivers/net/ethernet/sun/sunhme.c | 2 +- drivers/net/ethernet/via/via-rhine.c | 2 +- drivers/net/ethernet/xircom/xirc2ps_cs.c | 2 +- drivers/net/fddi/skfp/skfddi.c | 2 +- drivers/net/hamradio/6pack.c | 2 +- drivers/net/hamradio/baycom_epp.c | 2 +- drivers/net/hamradio/baycom_par.c | 2 +- drivers/net/hamradio/baycom_ser_fdx.c | 2 +- drivers/net/hamradio/baycom_ser_hdx.c | 2 +- drivers/net/hamradio/bpqether.c | 2 +- drivers/net/hamradio/dmascc.c | 2 +- drivers/net/hamradio/hdlcdrv.c | 2 +- drivers/net/hamradio/mkiss.c | 2 +- drivers/net/hamradio/scc.c | 2 +- drivers/net/hamradio/yam.c | 2 +- drivers/net/hippi/rrunner.c | 2 +- drivers/net/irda/irtty-sir.c | 2 +- drivers/net/irda/kingsun-sir.c | 2 +- drivers/net/irda/ks959-sir.c | 2 +- drivers/net/irda/ksdazzle-sir.c | 2 +- drivers/net/irda/mcs7780.c | 2 +- drivers/net/irda/vlsi_ir.c | 2 +- drivers/net/loopback.c | 2 +- drivers/net/phy/davicom.c | 2 +- 
drivers/net/phy/icplus.c | 2 +- drivers/net/phy/lxt.c | 2 +- drivers/net/phy/qsemi.c | 2 +- drivers/net/ppp/ppp_async.c | 2 +- drivers/net/ppp/ppp_synctty.c | 2 +- drivers/net/ppp/pppoe.c | 2 +- drivers/net/ppp/pppox.c | 2 +- drivers/net/sb1000.c | 2 +- drivers/net/slip/slhc.c | 2 +- drivers/net/slip/slip.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/usb/catc.c | 2 +- drivers/net/usb/kaweth.c | 2 +- drivers/net/usb/pegasus.c | 2 +- drivers/net/usb/rtl8150.c | 2 +- drivers/net/wan/dlci.c | 2 +- drivers/net/wan/dscc4.c | 2 +- drivers/net/wan/farsync.c | 2 +- drivers/net/wan/hd64570.c | 2 +- drivers/net/wan/hd64572.c | 2 +- drivers/net/wan/lapbether.c | 2 +- drivers/net/wan/lmc/lmc_main.c | 2 +- drivers/net/wan/lmc/lmc_media.c | 2 +- drivers/net/wan/sbni.c | 2 +- drivers/net/wan/sdla.c | 2 +- drivers/net/wireless/atmel/atmel.c | 2 +- drivers/net/wireless/intel/ipw2x00/ipw2100.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_geo.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_module.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_rx.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_tx.c | 2 +- drivers/net/wireless/intersil/hostap/hostap_hw.c | 2 +- drivers/net/wireless/intersil/hostap/hostap_main.c | 2 +- drivers/net/wireless/intersil/prism54/isl_38xx.c | 2 +- drivers/net/wireless/intersil/prism54/isl_ioctl.c | 2 +- drivers/net/wireless/ray_cs.c | 2 +- drivers/net/wireless/wl3501_cs.c | 2 +- drivers/nubus/proc.c | 2 +- drivers/oprofile/event_buffer.c | 2 +- drivers/oprofile/oprofilefs.c | 2 +- drivers/parisc/ccio-dma.c | 2 +- drivers/parisc/ccio-rm-dma.c | 2 +- drivers/parisc/eisa_eeprom.c | 2 +- drivers/parisc/eisa_enumerator.c | 2 +- drivers/parisc/led.c | 2 +- drivers/parisc/pdc_stable.c | 2 +- drivers/parport/daisy.c | 2 +- drivers/parport/ieee1284_ops.c | 2 +- drivers/parport/parport_gsc.c | 2 +- drivers/parport/probe.c | 2 +- drivers/parport/procfs.c | 2 +- drivers/pci/hotplug/acpiphp_ibm.c | 2 +- drivers/pci/hotplug/cpqphp_core.c | 2 +- drivers/pci/hotplug/cpqphp_nvram.c | 2 +- drivers/pci/hotplug/pci_hotplug_core.c | 2 +- drivers/pci/proc.c | 2 +- drivers/pci/syscall.c | 2 +- drivers/platform/x86/sony-laptop.c | 2 +- drivers/platform/x86/thinkpad_acpi.c | 2 +- drivers/pnp/interface.c | 2 +- drivers/pnp/pnpbios/proc.c | 2 +- drivers/s390/block/dasd_devmap.c | 2 +- drivers/s390/block/dasd_eckd.c | 2 +- drivers/s390/block/dasd_eer.c | 2 +- drivers/s390/block/dasd_erp.c | 2 +- drivers/s390/block/dasd_genhd.c | 2 +- drivers/s390/block/dasd_ioctl.c | 2 +- drivers/s390/block/dasd_proc.c | 2 +- drivers/s390/block/xpram.c | 2 +- drivers/s390/char/con3215.c | 2 +- drivers/s390/char/keyboard.c | 2 +- drivers/s390/char/monreader.c | 2 +- drivers/s390/char/monwriter.c | 2 +- drivers/s390/char/sclp_rw.c | 2 +- drivers/s390/char/sclp_tty.c | 2 +- drivers/s390/char/sclp_vt220.c | 2 +- drivers/s390/char/tape_char.c | 2 +- drivers/s390/char/tty3270.c | 2 +- drivers/s390/char/vmcp.c | 2 +- drivers/s390/char/vmlogrdr.c | 2 +- drivers/s390/char/vmur.c | 2 +- drivers/s390/char/zcore.c | 2 +- drivers/s390/cio/blacklist.c | 2 +- drivers/s390/crypto/zcrypt_api.c | 2 +- drivers/s390/crypto/zcrypt_cex2a.c | 2 +- drivers/s390/crypto/zcrypt_pcixcc.c | 2 +- drivers/s390/net/netiucv.c | 2 +- drivers/sbus/char/display7seg.c | 2 +- drivers/sbus/char/envctrl.c | 2 +- drivers/sbus/char/flash.c | 2 +- drivers/sbus/char/jsflash.c | 2 +- drivers/sbus/char/openprom.c | 2 +- drivers/scsi/3w-9xxx.c | 2 +- drivers/scsi/3w-sas.c | 2 +- drivers/scsi/3w-xxxx.c | 2 +- drivers/scsi/aacraid/aachba.c | 2 +- 
drivers/scsi/aacraid/commctrl.c | 2 +- drivers/scsi/arcmsr/arcmsr_hba.c | 2 +- drivers/scsi/bfa/bfad.c | 2 +- drivers/scsi/dpt_i2o.c | 2 +- drivers/scsi/gdth.c | 2 +- drivers/scsi/hptiop.c | 2 +- drivers/scsi/ips.h | 2 +- drivers/scsi/megaraid.c | 2 +- drivers/scsi/megaraid/megaraid_mm.h | 2 +- drivers/scsi/megaraid/megaraid_sas_base.c | 2 +- drivers/scsi/osst.c | 2 +- drivers/scsi/qla2xxx/qla_sup.c | 2 +- drivers/scsi/scsi_ioctl.c | 2 +- drivers/scsi/scsi_proc.c | 2 +- drivers/scsi/sd.c | 2 +- drivers/scsi/sr.c | 2 +- drivers/scsi/sr_ioctl.c | 2 +- drivers/scsi/st.c | 2 +- drivers/tty/amiserial.c | 2 +- drivers/tty/hvc/hvc_console.c | 2 +- drivers/tty/hvc/hvcs.c | 2 +- drivers/tty/hvc/hvsi.c | 2 +- drivers/tty/moxa.c | 2 +- drivers/tty/mxser.c | 2 +- drivers/tty/n_hdlc.c | 2 +- drivers/tty/n_r3964.c | 2 +- drivers/tty/serial/icom.c | 2 +- drivers/tty/serial/serial_core.c | 2 +- drivers/tty/synclink.c | 2 +- drivers/tty/synclink_gt.c | 2 +- drivers/tty/synclinkmp.c | 2 +- drivers/tty/tty_ioctl.c | 2 +- drivers/tty/vt/consolemap.c | 2 +- drivers/tty/vt/selection.c | 2 +- drivers/tty/vt/vc_screen.c | 2 +- drivers/tty/vt/vt_ioctl.c | 2 +- drivers/usb/atm/usbatm.c | 2 +- drivers/usb/core/hub.c | 2 +- drivers/usb/gadget/legacy/inode.c | 2 +- drivers/usb/host/uhci-hcd.c | 2 +- drivers/usb/misc/ftdi-elan.c | 2 +- drivers/usb/misc/idmouse.c | 2 +- drivers/usb/misc/ldusb.c | 2 +- drivers/usb/misc/legousbtower.c | 2 +- drivers/usb/mon/mon_bin.c | 2 +- drivers/usb/mon/mon_stat.c | 2 +- drivers/usb/mon/mon_text.c | 2 +- drivers/usb/musb/musb_debugfs.c | 2 +- drivers/video/console/newport_con.c | 2 +- drivers/video/fbdev/68328fb.c | 2 +- drivers/video/fbdev/hitfb.c | 2 +- drivers/video/fbdev/hpfb.c | 2 +- drivers/video/fbdev/mx3fb.c | 2 +- drivers/video/fbdev/q40fb.c | 2 +- drivers/video/fbdev/sm501fb.c | 2 +- drivers/video/fbdev/stifb.c | 2 +- drivers/video/fbdev/w100fb.c | 2 +- drivers/zorro/proc.c | 2 +- fs/9p/vfs_file.c | 2 +- fs/afs/proc.c | 2 +- fs/aio.c | 2 +- fs/anon_inodes.c | 2 +- fs/bfs/inode.c | 2 +- fs/binfmt_aout.c | 2 +- fs/binfmt_elf.c | 2 +- fs/binfmt_elf_fdpic.c | 2 +- fs/block_dev.c | 2 +- fs/cifs/cifs_debug.c | 2 +- fs/cifs/cifssmb.c | 2 +- fs/cifs/connect.c | 2 +- fs/cifs/transport.c | 2 +- fs/compat.c | 2 +- fs/compat_ioctl.c | 2 +- fs/configfs/file.c | 2 +- fs/coredump.c | 2 +- fs/dcache.c | 2 +- fs/dcookies.c | 2 +- fs/dlm/dlm_internal.h | 2 +- fs/efs/efs.h | 2 +- fs/eventpoll.c | 2 +- fs/exec.c | 2 +- fs/ext2/ioctl.c | 2 +- fs/ext2/super.c | 2 +- fs/ext4/extents.c | 2 +- fs/ext4/ioctl.c | 2 +- fs/ext4/super.c | 2 +- fs/fcntl.c | 2 +- fs/fhandle.c | 2 +- fs/filesystems.c | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/glock.c | 2 +- fs/gfs2/inode.c | 2 +- fs/gfs2/sys.c | 2 +- fs/gfs2/util.c | 2 +- fs/gfs2/xattr.c | 2 +- fs/hfs/hfs_fs.h | 2 +- fs/hfsplus/ioctl.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/jbd2/journal.c | 2 +- fs/jfs/ioctl.c | 2 +- fs/jfs/jfs_debug.c | 2 +- fs/jfs/super.c | 2 +- fs/libfs.c | 2 +- fs/locks.c | 2 +- fs/namei.c | 2 +- fs/ncpfs/dir.c | 2 +- fs/ncpfs/file.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/ncpfs/ioctl.c | 2 +- fs/ncpfs/mmap.c | 2 +- fs/ncpfs/ncplib_kernel.h | 2 +- fs/ncpfs/sock.c | 2 +- fs/ncpfs/symlink.c | 2 +- fs/nfs/direct.c | 2 +- fs/nfs/file.c | 2 +- fs/nfs/getroot.c | 2 +- fs/nfs/inode.c | 2 +- fs/nfs/super.c | 2 +- fs/nfs/write.c | 2 +- fs/nfsd/fault_inject.c | 2 +- fs/nfsd/vfs.c | 2 +- fs/ntfs/file.c | 2 +- fs/ocfs2/cluster/masklog.c | 2 +- fs/ocfs2/cluster/tcp.c | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 2 +- fs/ocfs2/stack_user.c | 2 +- fs/open.c | 2 +- 
fs/openpromfs/inode.c | 2 +- fs/pipe.c | 2 +- fs/proc/base.c | 2 +- fs/proc/generic.c | 2 +- fs/proc/inode.c | 2 +- fs/proc/kcore.c | 2 +- fs/proc/kmsg.c | 2 +- fs/proc/nommu.c | 2 +- fs/proc/page.c | 2 +- fs/proc/proc_net.c | 2 +- fs/proc/proc_tty.c | 2 +- fs/proc/root.c | 2 +- fs/proc/task_mmu.c | 2 +- fs/proc/vmcore.c | 2 +- fs/ramfs/file-nommu.c | 2 +- fs/ramfs/inode.c | 2 +- fs/read_write.c | 2 +- fs/readdir.c | 2 +- fs/select.c | 2 +- fs/seq_file.c | 2 +- fs/stat.c | 2 +- fs/ufs/inode.c | 2 +- fs/ufs/super.c | 2 +- fs/utimes.c | 2 +- fs/xattr.c | 2 +- fs/xfs/xfs_ioctl32.c | 2 +- fs/xfs/xfs_linux.h | 2 +- include/asm-generic/termios-base.h | 2 +- include/asm-generic/termios.h | 2 +- include/drm/drmP.h | 2 +- include/linux/isdnif.h | 2 +- include/linux/pagemap.h | 2 +- include/linux/poll.h | 2 +- include/net/checksum.h | 2 +- include/net/sctp/sctp.h | 2 +- include/rdma/ib_verbs.h | 2 +- init/init_task.c | 2 +- kernel/capability.c | 2 +- kernel/compat.c | 2 +- kernel/configs.c | 2 +- kernel/cpuset.c | 2 +- kernel/exit.c | 2 +- kernel/extable.c | 2 +- kernel/fork.c | 2 +- kernel/futex_compat.c | 2 +- kernel/groups.c | 2 +- kernel/kmod.c | 2 +- kernel/kprobes.c | 2 +- kernel/locking/lockdep_proc.c | 2 +- kernel/module.c | 2 +- kernel/power/snapshot.c | 2 +- kernel/power/user.c | 2 +- kernel/printk/printk.c | 2 +- kernel/profile.c | 2 +- kernel/signal.c | 2 +- kernel/sys.c | 2 +- kernel/sysctl.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/itimer.c | 2 +- kernel/time/posix-cpu-timers.c | 2 +- kernel/time/posix-timers.c | 2 +- kernel/time/time.c | 2 +- kernel/time/timer.c | 2 +- kernel/time/timer_list.c | 2 +- kernel/time/timer_stats.c | 2 +- kernel/uid16.c | 2 +- lib/extable.c | 2 +- lib/kstrtox.c | 2 +- mm/memcontrol.c | 2 +- mm/memory.c | 2 +- mm/mempolicy.c | 2 +- mm/mincore.c | 2 +- mm/mmap.c | 2 +- mm/mprotect.c | 2 +- mm/nommu.c | 2 +- mm/shmem.c | 2 +- mm/util.c | 2 +- mm/vmalloc.c | 2 +- net/802/fc.c | 2 +- net/802/hippi.c | 2 +- net/8021q/vlan.c | 2 +- net/ax25/af_ax25.c | 2 +- net/ax25/ax25_addr.c | 2 +- net/ax25/ax25_dev.c | 2 +- net/ax25/ax25_ds_in.c | 2 +- net/ax25/ax25_ds_subr.c | 2 +- net/ax25/ax25_ds_timer.c | 2 +- net/ax25/ax25_iface.c | 2 +- net/ax25/ax25_in.c | 2 +- net/ax25/ax25_ip.c | 2 +- net/ax25/ax25_out.c | 2 +- net/ax25/ax25_route.c | 2 +- net/ax25/ax25_std_in.c | 2 +- net/ax25/ax25_std_subr.c | 2 +- net/ax25/ax25_std_timer.c | 2 +- net/ax25/ax25_subr.c | 2 +- net/ax25/ax25_timer.c | 2 +- net/ax25/ax25_uid.c | 2 +- net/bridge/br_device.c | 2 +- net/bridge/br_ioctl.c | 2 +- net/bridge/br_netfilter_hooks.c | 2 +- net/bridge/br_netfilter_ipv6.c | 2 +- net/bridge/netfilter/ebtables.c | 2 +- net/compat.c | 2 +- net/core/datagram.c | 2 +- net/core/dev.c | 2 +- net/core/filter.c | 2 +- net/core/gen_estimator.c | 2 +- net/core/rtnetlink.c | 2 +- net/core/scm.c | 2 +- net/core/skbuff.c | 2 +- net/core/sock.c | 2 +- net/core/utils.c | 2 +- net/decnet/dn_dev.c | 2 +- net/decnet/dn_fib.c | 2 +- net/decnet/dn_table.c | 2 +- net/decnet/sysctl_net_decnet.c | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/devinet.c | 2 +- net/ipv4/fib_frontend.c | 2 +- net/ipv4/fib_semantics.c | 2 +- net/ipv4/fib_trie.c | 2 +- net/ipv4/icmp.c | 2 +- net/ipv4/igmp.c | 2 +- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_options.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/ip_sockglue.c | 2 +- net/ipv4/ipconfig.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv4/netfilter/arp_tables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/tcp.c | 2 +- 
net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/datagram.c | 2 +- net/ipv6/icmp.c | 2 +- net/ipv6/ip6_flowlabel.c | 2 +- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6mr.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- net/ipv6/route.c | 2 +- net/ipv6/sit.c | 2 +- net/ipv6/udp.c | 2 +- net/ipx/af_ipx.c | 2 +- net/irda/af_irda.c | 2 +- net/irda/ircomm/ircomm_tty.c | 2 +- net/irda/ircomm/ircomm_tty_ioctl.c | 2 +- net/irda/irda_device.c | 2 +- net/irda/irnet/irnet.h | 2 +- net/lapb/lapb_iface.c | 2 +- net/lapb/lapb_in.c | 2 +- net/lapb/lapb_out.c | 2 +- net/lapb/lapb_subr.c | 2 +- net/lapb/lapb_timer.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- net/netfilter/nfnetlink.c | 2 +- net/netlink/af_netlink.c | 2 +- net/packet/af_packet.c | 2 +- net/rose/af_rose.c | 2 +- net/rose/rose_route.c | 2 +- net/sctp/ipv6.c | 2 +- net/socket.c | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/cache.c | 2 +- net/sunrpc/svcsock.c | 2 +- net/sunrpc/sysctl.c | 2 +- net/unix/af_unix.c | 2 +- net/x25/af_x25.c | 2 +- net/x25/x25_link.c | 2 +- net/xfrm/xfrm_state.c | 2 +- net/xfrm/xfrm_user.c | 2 +- security/keys/keyctl.c | 2 +- security/keys/process_keys.c | 2 +- security/keys/request_key_auth.c | 2 +- security/keys/user_defined.c | 2 +- sound/oss/dmasound/dmasound_atari.c | 2 +- sound/oss/dmasound/dmasound_core.c | 2 +- sound/oss/dmasound/dmasound_paula.c | 2 +- sound/oss/dmasound/dmasound_q40.c | 2 +- sound/oss/msnd.c | 2 +- sound/oss/os.h | 2 +- sound/oss/swarm_cs4297a.c | 2 +-
1088 files changed, 1088 insertions(+), 1088 deletions(-) (limited to 'kernel')

diff --git a/arch/alpha/boot/misc.c b/arch/alpha/boot/misc.c
index 3ff9a957a25c..1b568ed74f95 100644
--- a/arch/alpha/boot/misc.c
+++ b/arch/alpha/boot/misc.c
@@ -21,7 +21,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #define memzero(s,n) memset ((s),0,(n))
 #define puts srm_printk
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index 2d6efcff3bf3..2f26ae74b61a 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -26,7 +26,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 volatile unsigned long irq_err_count;
 DEFINE_PER_CPU(unsigned long, irq_pmi_count);
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 56e427c7aa3c..54d8616644e2 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -39,7 +39,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index b483156698d5..bca963a4aa48 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -31,7 +31,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index 04abdec7f496..bc4d2cdcf21d 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -16,7 +16,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 4811e54069fc..491e6a604e82 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -53,7 +53,7 @@ static struct notifier_block alpha_panic_block = {
 	INT_MAX /* try to do it first */
 };
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 8dbfb15f1745..17308f925306 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -22,7 +22,7 @@
 #include
 #include
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include
 #include
#include diff --git a/arch/alpha/kernel/srm_env.c b/arch/alpha/kernel/srm_env.c index ffe996a54fad..705ae12acd15 100644 --- a/arch/alpha/kernel/srm_env.c +++ b/arch/alpha/kernel/srm_env.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #define BASE_DIR "srm_environment" /* Subdir in /proc/ */ diff --git a/arch/alpha/kernel/srmcons.c b/arch/alpha/kernel/srmcons.c index 72b59511e59a..e9c45b65a905 100644 --- a/arch/alpha/kernel/srmcons.c +++ b/arch/alpha/kernel/srmcons.c @@ -18,7 +18,7 @@ #include #include -#include +#include static DEFINE_SPINLOCK(srmcons_callback_lock); diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 5b6202a825ff..992000e3d9e4 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -34,7 +34,7 @@ #include #include -#include +#include #include #include diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 74aceead06e9..3328af7c2776 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index b4ff3b683bcd..5dfb7975895f 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -11,7 +11,7 @@ #include #include -#include +#include #define ldq_u(x,y) \ diff --git a/arch/alpha/math-emu/math.c b/arch/alpha/math-emu/math.c index 58c2669a1dd4..fa5ae0ad8983 100644 --- a/arch/alpha/math-emu/math.c +++ b/arch/alpha/math-emu/math.c @@ -3,7 +3,7 @@ #include #include -#include +#include #include "sfp-util.h" #include diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index a1bea91df56a..0542e973c73d 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/common/bL_switcher_dummy_if.c b/arch/arm/common/bL_switcher_dummy_if.c index 6053f64c3752..4c10c6452678 100644 --- a/arch/arm/common/bL_switcher_dummy_if.c +++ b/arch/arm/common/bL_switcher_dummy_if.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include static ssize_t bL_switcher_write(struct file *file, const char __user *buf, diff --git a/arch/arm/kernel/swp_emulate.c b/arch/arm/kernel/swp_emulate.c index c3fe769d7558..853221f81104 100644 --- a/arch/arm/kernel/swp_emulate.c +++ b/arch/arm/kernel/swp_emulate.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include /* * Error-checking SWP macros implemented using ldrex{b}/strex{b} diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 8f92efa8460e..11676787ad49 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -33,7 +33,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" -#include +#include #include #include #include diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 9aca92074f85..fa6182a40941 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-iop13xx/irq.c b/arch/arm/mach-iop13xx/irq.c index c702cc4092de..bd9b43c8004e 100644 --- a/arch/arm/mach-iop13xx/irq.c +++ b/arch/arm/mach-iop13xx/irq.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c index 26874f608ca9..0f08f611c1a6 100644 --- a/arch/arm/mach-ixp4xx/common.c +++ b/arch/arm/mach-ixp4xx/common.c @@ -34,7 +34,7 @@ #include 
#include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-rpc/dma.c b/arch/arm/mach-rpc/dma.c index 6d3517dc4772..fb48f3141fb4 100644 --- a/arch/arm/mach-rpc/dma.c +++ b/arch/arm/mach-rpc/dma.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/plat-iop/time.c b/arch/arm/plat-iop/time.c index 101e8f2c7abe..ed8d129d4bea 100644 --- a/arch/arm/plat-iop/time.c +++ b/arch/arm/plat-iop/time.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h index 2b79b8a89457..b0d708ff7f4e 100644 --- a/arch/arm64/include/asm/word-at-a-time.h +++ b/arch/arm64/include/asm/word-at-a-time.h @@ -16,7 +16,7 @@ #ifndef __ASM_WORD_AT_A_TIME_H #define __ASM_WORD_AT_A_TIME_H -#include +#include #ifndef __AARCH64EB__ diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 04de188a36c9..fde04f029ec3 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #define CREATE_TRACE_POINTS diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 4f0d76339414..a7504f40d7ee 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 1decd2b2c730..f0593c92279b 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index b7063de792f7..c747a0fc5d7d 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include struct compat_sigcontext { diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 3f9e15722473..b37446a8ffdb 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index d7150e30438a..add4a1334085 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,7 +17,7 @@ */ #include -#include +#include .text diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index cfe13396085b..fd6cd05593f9 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -17,7 +17,7 @@ #include #include -#include +#include /* * Copy from user space to a kernel buffer (alignment handled by the hardware) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 718b1c4e2f85..d828540ded6f 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -19,7 +19,7 @@ #include #include -#include +#include /* * Copy from user space to user space (alignment handled by the hardware) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index e99e31c9acac..3e6ae2663b82 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -17,7 +17,7 @@ #include #include -#include +#include /* * Copy to user space from a kernel buffer (alignment handled by the hardware) diff --git a/arch/arm64/mm/cache.S 
b/arch/arm64/mm/cache.S index da9576932322..17f422a4dc55 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -23,7 +23,7 @@ #include #include #include -#include +#include /* * flush_icache_range(start,end) diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index b41aff25426d..47cf3f9d89ff 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,7 +49,7 @@ #include #include -#include +#include #include diff --git a/arch/avr32/kernel/avr32_ksyms.c b/arch/avr32/kernel/avr32_ksyms.c index 7c6cf14f0985..0d05fd095468 100644 --- a/arch/avr32/kernel/avr32_ksyms.c +++ b/arch/avr32/kernel/avr32_ksyms.c @@ -12,7 +12,7 @@ #include #include -#include +#include /* * GCC functions diff --git a/arch/avr32/kernel/ptrace.c b/arch/avr32/kernel/ptrace.c index 4aedcab7cd4b..a89b893279bb 100644 --- a/arch/avr32/kernel/ptrace.c +++ b/arch/avr32/kernel/ptrace.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 8f1c63b9b983..b5fcc4914fe4 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include diff --git a/arch/avr32/mm/cache.c b/arch/avr32/mm/cache.c index 85d635cd7b28..d9476825fc43 100644 --- a/arch/avr32/mm/cache.c +++ b/arch/avr32/mm/cache.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/blackfin/kernel/bfin_dma.c b/arch/blackfin/kernel/bfin_dma.c index 4a32f2dd5ddc..9d3eb0cf8ccc 100644 --- a/arch/blackfin/kernel/bfin_dma.c +++ b/arch/blackfin/kernel/bfin_dma.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/blackfin/kernel/kgdb_test.c b/arch/blackfin/kernel/kgdb_test.c index 18ab004aea1c..b8b785dc4e3b 100644 --- a/arch/blackfin/kernel/kgdb_test.c +++ b/arch/blackfin/kernel/kgdb_test.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c index 4489efc52883..0188c933b155 100644 --- a/arch/blackfin/kernel/module.c +++ b/arch/blackfin/kernel/module.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include /* Transfer the section to the L1 memory */ int diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 63f5560d6eb2..4cc72b0d1c1d 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -18,7 +18,7 @@ #include #include -#include +#include /* * ZERO_PAGE is a special page that is used for zero-initialized diff --git a/arch/cris/arch-v10/drivers/eeprom.c b/arch/cris/arch-v10/drivers/eeprom.c index c903a9e53a47..33558d270a53 100644 --- a/arch/cris/arch-v10/drivers/eeprom.c +++ b/arch/cris/arch-v10/drivers/eeprom.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include "i2c.h" #define D(x) diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c index 0f3983241e60..9ac75d68f184 100644 --- a/arch/cris/arch-v10/drivers/sync_serial.c +++ b/arch/cris/arch-v10/drivers/sync_serial.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/cris/arch-v10/kernel/ptrace.c b/arch/cris/arch-v10/kernel/ptrace.c index bfddfb99401f..eca94c7d56e7 100644 --- a/arch/cris/arch-v10/kernel/ptrace.c +++ b/arch/cris/arch-v10/kernel/ptrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v10/kernel/signal.c 
b/arch/cris/arch-v10/kernel/signal.c index 7122d9773b13..db30c98e4926 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #define DEBUG_SIG 0 diff --git a/arch/cris/arch-v10/kernel/traps.c b/arch/cris/arch-v10/kernel/traps.c index 7001beda716c..96d004fe9740 100644 --- a/arch/cris/arch-v10/kernel/traps.c +++ b/arch/cris/arch-v10/kernel/traps.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include diff --git a/arch/cris/arch-v10/lib/usercopy.c b/arch/cris/arch-v10/lib/usercopy.c index b964c667aced..1ba7cc000dfc 100644 --- a/arch/cris/arch-v10/lib/usercopy.c +++ b/arch/cris/arch-v10/lib/usercopy.c @@ -8,7 +8,7 @@ * Pieces used from memcpy, originally by Kenny Ranerup long time ago. */ -#include +#include /* Asm:s have been tweaked (within the domain of correctness) to give satisfactory results for "gcc version 2.96 20000427 (experimental)". diff --git a/arch/cris/arch-v10/mm/fault.c b/arch/cris/arch-v10/mm/fault.c index ed60588f8467..75210cbe61ce 100644 --- a/arch/cris/arch-v10/mm/fault.c +++ b/arch/cris/arch-v10/mm/fault.c @@ -11,7 +11,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c index 0068fd411a84..ae6903d7fdbe 100644 --- a/arch/cris/arch-v32/drivers/cryptocop.c +++ b/arch/cris/arch-v32/drivers/cryptocop.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c index fe1f9cf7b391..c366bc05466a 100644 --- a/arch/cris/arch-v32/kernel/ptrace.c +++ b/arch/cris/arch-v32/kernel/ptrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 150d1d76c29d..816bf2ca93ef 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include extern unsigned long cris_signal_return_page; diff --git a/arch/cris/arch-v32/kernel/traps.c b/arch/cris/arch-v32/kernel/traps.c index 8bbe09c93132..d79666aefd71 100644 --- a/arch/cris/arch-v32/kernel/traps.c +++ b/arch/cris/arch-v32/kernel/traps.c @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/arch-v32/lib/usercopy.c b/arch/cris/arch-v32/lib/usercopy.c index f0f335d8aa79..05e58dab800d 100644 --- a/arch/cris/arch-v32/lib/usercopy.c +++ b/arch/cris/arch-v32/lib/usercopy.c @@ -8,7 +8,7 @@ * Pieces used from memcpy, originally by Kenny Ranerup long time ago. */ -#include +#include /* Asm:s have been tweaked (within the domain of correctness) to give satisfactory results for "gcc version 3.2.1 Axis release R53/1.53-v32". 
diff --git a/arch/cris/kernel/crisksyms.c b/arch/cris/kernel/crisksyms.c index 31b4bd288cad..3166d1cf2f84 100644 --- a/arch/cris/kernel/crisksyms.c +++ b/arch/cris/kernel/crisksyms.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index b78498eb079b..50a7dd451456 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/kernel/profile.c b/arch/cris/kernel/profile.c index cd9f15b92f8f..ad56b37f8e11 100644 --- a/arch/cris/kernel/profile.c +++ b/arch/cris/kernel/profile.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #define SAMPLE_BUFFER_SIZE 8192 diff --git a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c index fd3427e563c5..806b764059d5 100644 --- a/arch/cris/kernel/ptrace.c +++ b/arch/cris/kernel/ptrace.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/cris/kernel/sys_cris.c b/arch/cris/kernel/sys_cris.c index 7aa036ec78ff..8febb032fdd7 100644 --- a/arch/cris/kernel/sys_cris.c +++ b/arch/cris/kernel/sys_cris.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include asmlinkage long diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c index da4c72401e27..b2a312a7afc6 100644 --- a/arch/cris/kernel/traps.c +++ b/arch/cris/kernel/traps.c @@ -20,7 +20,7 @@ #endif #include -#include +#include #include extern void arch_enable_nmi(void); diff --git a/arch/frv/include/asm/futex.h b/arch/frv/include/asm/futex.h index 4bea27f50a7a..2e1da71e27a4 100644 --- a/arch/frv/include/asm/futex.h +++ b/arch/frv/include/asm/futex.h @@ -5,7 +5,7 @@ #include #include -#include +#include extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); diff --git a/arch/frv/kernel/irq.c b/arch/frv/kernel/irq.c index 2239346fa3db..93513e4ccd2b 100644 --- a/arch/frv/kernel/irq.c +++ b/arch/frv/kernel/irq.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/frv/kernel/pm-mb93093.c b/arch/frv/kernel/pm-mb93093.c index eaa7b582ef52..8358e34a3fad 100644 --- a/arch/frv/kernel/pm-mb93093.c +++ b/arch/frv/kernel/pm-mb93093.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c index ac767d94a880..051ccecbf7f1 100644 --- a/arch/frv/kernel/pm.c +++ b/arch/frv/kernel/pm.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 5d40aeb7712e..b306241c4ef2 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c index 3987ff88dab0..49768401ce0f 100644 --- a/arch/frv/kernel/ptrace.c +++ b/arch/frv/kernel/ptrace.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 82d5e914dc15..bf6e07a7a1b1 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #define DEBUG_SIG 0 diff --git a/arch/frv/kernel/sys_frv.c b/arch/frv/kernel/sys_frv.c index 9c4980825bbb..f80cc8b9bd45 100644 --- a/arch/frv/kernel/sys_frv.c +++ b/arch/frv/kernel/sys_frv.c @@ -25,7 
+25,7 @@ #include #include -#include +#include asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, diff --git a/arch/frv/kernel/sysctl.c b/arch/frv/kernel/sysctl.c index f4dfae2c75ad..b54a64971cf1 100644 --- a/arch/frv/kernel/sysctl.c +++ b/arch/frv/kernel/sysctl.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include static const char frv_cache_wback[] = "wback"; static const char frv_cache_wthru[] = "wthru"; diff --git a/arch/frv/kernel/traps.c b/arch/frv/kernel/traps.c index a6d105d61b26..31221fb4348e 100644 --- a/arch/frv/kernel/traps.c +++ b/arch/frv/kernel/traps.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/frv/kernel/uaccess.c b/arch/frv/kernel/uaccess.c index 374f88d6cc00..8b360b4222a5 100644 --- a/arch/frv/kernel/uaccess.c +++ b/arch/frv/kernel/uaccess.c @@ -11,7 +11,7 @@ #include #include -#include +#include /*****************************************************************************/ /* diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c index 7a73aaeae3ac..e701aa9e6a14 100644 --- a/arch/frv/mm/dma-alloc.c +++ b/arch/frv/mm/dma-alloc.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include static int map_page(unsigned long va, unsigned long pa, pgprot_t prot) diff --git a/arch/frv/mm/extable.c b/arch/frv/mm/extable.c index 8863d6c1df6e..9a641c1b085a 100644 --- a/arch/frv/mm/extable.c +++ b/arch/frv/mm/extable.c @@ -4,7 +4,7 @@ #include #include -#include +#include extern const void __memset_end, __memset_user_error_lr, __memset_user_error_handler; extern const void __memcpy_end, __memcpy_user_error_lr, __memcpy_user_error_handler; diff --git a/arch/h8300/boot/compressed/misc.c b/arch/h8300/boot/compressed/misc.c index 9f64fe8f29ff..a947dbb4fd91 100644 --- a/arch/h8300/boot/compressed/misc.c +++ b/arch/h8300/boot/compressed/misc.c @@ -9,7 +9,7 @@ * Adapted for h8300 by Yoshinori Sato 2006 */ -#include +#include /* * gzip declarations diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index dee41256922c..891974a11704 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 7138303cbbf2..d784f7117f9a 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -41,7 +41,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/hexagon/kernel/hexagon_ksyms.c b/arch/hexagon/kernel/hexagon_ksyms.c index c041d8ecb1e2..af9dec4c28eb 100644 --- a/arch/hexagon/kernel/hexagon_ksyms.c +++ b/arch/hexagon/kernel/hexagon_ksyms.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include /* Additional functions */ EXPORT_SYMBOL(__clear_user_hexagon); diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index b039a624c170..c6b22b9945a7 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/hexagon/mm/uaccess.c b/arch/hexagon/mm/uaccess.c index 34127261c2b7..ec90afdb3ad0 100644 --- a/arch/hexagon/mm/uaccess.c +++ b/arch/hexagon/mm/uaccess.c @@ -23,7 +23,7 @@ * we implement here as subroutines. 
*/ #include -#include +#include #include /* diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index bd7c251e2bce..de863d6d802b 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/brl_emu.c b/arch/ia64/kernel/brl_emu.c index 0b286ca164f9..8682df6263d6 100644 --- a/arch/ia64/kernel/brl_emu.c +++ b/arch/ia64/kernel/brl_emu.c @@ -9,7 +9,7 @@ #include #include -#include +#include #include extern char ia64_set_b1, ia64_set_b2, ia64_set_b3, ia64_set_b4, ia64_set_b5; diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c index c8c9298666fb..9c12b794e774 100644 --- a/arch/ia64/kernel/crash_dump.c +++ b/arch/ia64/kernel/crash_dump.c @@ -11,7 +11,7 @@ #include #include -#include +#include /** * copy_oldmem_page - copy one page from "oldmem" diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index 0eaa89f3defd..fa8ee64adac2 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include static struct signal_struct init_signals = INIT_SIGNALS(init_signals); diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index de4fc00dea98..2ff1df7b14ea 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -17,7 +17,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index c7c51445c3be..45ff27e9edbb 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -33,7 +33,7 @@ #include #include -#include +#include extern void jprobe_inst_return(void); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 2436ad5f92c1..677a86826771 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_PERFMON diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index aae6c4dc7ae7..52deab683ba1 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 36f660da8124..0b1153e610ea 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_PERFMON #include diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index aaf74f36cfa1..d194d5c83d32 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -48,7 +48,7 @@ #include #include -#include +#include MODULE_AUTHOR("Jesse Barnes "); MODULE_DESCRIPTION("/proc interface to IA-64 SAL features"); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index b3a124da71e5..5db52c6813c4 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 41e33f84c185..a09c12230bc5 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -18,7 +18,7 @@ #include #include -#include +#include unsigned long arch_get_unmapped_area (struct file *filp, unsigned long addr, unsigned long len, diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 77edd68c5161..095bfaff82d0 100644 --- 
a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include fpswa_interface_t *fpswa_interface; diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index 7f0d31656b4d..9cd01c2200ee 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include extern int die_if_kernel(char *str, struct pt_regs *regs, long err); diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c index 8f66195999e7..9704e2cd9878 100644 --- a/arch/ia64/kernel/unwind.c +++ b/arch/ia64/kernel/unwind.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include "entry.h" #include "unwind_i.h" diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c index 118daf5a0632..42f7678ef6ad 100644 --- a/arch/ia64/lib/csum_partial_copy.c +++ b/arch/ia64/lib/csum_partial_copy.c @@ -11,7 +11,7 @@ #include #include -#include +#include /* * XXX Fixme: those 2 inlines are meant for debugging and will go away diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c index 8f70bb2d0c37..4edb816aba9a 100644 --- a/arch/ia64/mm/extable.c +++ b/arch/ia64/mm/extable.c @@ -5,7 +5,7 @@ * David Mosberger-Tang */ -#include +#include void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 1841ef69183d..bb4610faca84 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index b9992571c036..4c3b84d8406a 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c index 7aab87f48060..29cf8f8c08e9 100644 --- a/arch/ia64/sn/kernel/sn2/sn_proc_fs.c +++ b/arch/ia64/sn/kernel/sn2/sn_proc_fs.c @@ -9,7 +9,7 @@ #ifdef CONFIG_PROC_FS #include #include -#include +#include #include static int partition_id_show(struct seq_file *s, void *p) diff --git a/arch/ia64/sn/kernel/tiocx.c b/arch/ia64/sn/kernel/tiocx.c index e35f6485c1fd..32d0380eb72e 100644 --- a/arch/ia64/sn/kernel/tiocx.c +++ b/arch/ia64/sn/kernel/tiocx.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/align.c b/arch/m32r/kernel/align.c index ab871ccd33f8..ec51e5b34860 100644 --- a/arch/m32r/kernel/align.c +++ b/arch/m32r/kernel/align.c @@ -5,7 +5,7 @@ */ #include -#include +#include static int get_reg(struct pt_regs *regs, int nr) { diff --git a/arch/m32r/kernel/irq.c b/arch/m32r/kernel/irq.c index c7272b894283..5537f7397297 100644 --- a/arch/m32r/kernel/irq.c +++ b/arch/m32r/kernel/irq.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include /* * do_IRQ handles all normal device IRQs (the special diff --git a/arch/m32r/kernel/m32r_ksyms.c b/arch/m32r/kernel/m32r_ksyms.c index 23f26f4adfff..d763f0bd2106 100644 --- a/arch/m32r/kernel/m32r_ksyms.c +++ b/arch/m32r/kernel/m32r_ksyms.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index a88b1f01e91f..e0568bee60c0 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c 
@@ -29,7 +29,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index c145605a981f..a68acb9fa515 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 1c81e24fd006..1ed597041fba 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #define DEBUG_SIG 0 diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c index c3fdd632fba7..f34957032504 100644 --- a/arch/m32r/kernel/sys_m32r.c +++ b/arch/m32r/kernel/sys_m32r.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index a7a424f852e4..c3c5fdfae920 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/arch/m32r/lib/csum_partial_copy.c b/arch/m32r/lib/csum_partial_copy.c index 5596f3df833f..b3cd59c12b8e 100644 --- a/arch/m32r/lib/csum_partial_copy.c +++ b/arch/m32r/lib/csum_partial_copy.c @@ -22,7 +22,7 @@ #include #include -#include +#include /* * Copy while checksumming, otherwise like csum_partial diff --git a/arch/m32r/lib/usercopy.c b/arch/m32r/lib/usercopy.c index 82abd159dbef..fd03f2731f20 100644 --- a/arch/m32r/lib/usercopy.c +++ b/arch/m32r/lib/usercopy.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include unsigned long __generic_copy_to_user(void __user *to, const void *from, unsigned long n) diff --git a/arch/m32r/mm/extable.c b/arch/m32r/mm/extable.c index 1743f23d49a3..40ccf80d29cf 100644 --- a/arch/m32r/mm/extable.c +++ b/arch/m32r/mm/extable.c @@ -3,7 +3,7 @@ */ #include -#include +#include int fixup_exception(struct pt_regs *regs) { diff --git a/arch/m32r/mm/fault-nommu.c b/arch/m32r/mm/fault-nommu.c index 80f18cc6f547..e22d5ddae5cb 100644 --- a/arch/m32r/mm/fault-nommu.c +++ b/arch/m32r/mm/fault-nommu.c @@ -22,7 +22,7 @@ #include /* For unblank_screen() */ #include -#include +#include #include #include #include diff --git a/arch/m68k/bvme6000/rtc.c b/arch/m68k/bvme6000/rtc.c index f7984f44ff0f..d53c9b301f84 100644 --- a/arch/m68k/bvme6000/rtc.c +++ b/arch/m68k/bvme6000/rtc.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include /* diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 4ba1ae7345c3..aaf28f8e342d 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c index 1bc10e62b9af..9cd86d7343a6 100644 --- a/arch/m68k/kernel/ptrace.c +++ b/arch/m68k/kernel/ptrace.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 58507edbdf1d..8ead291a902a 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -46,7 +46,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 9aa01adb407f..98a2daaae30c 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include diff --git 
a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 6c9ca24830e9..558f38402737 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/lib/uaccess.c b/arch/m68k/lib/uaccess.c index 35d1442dee89..a76b73abaf64 100644 --- a/arch/m68k/lib/uaccess.c +++ b/arch/m68k/lib/uaccess.c @@ -5,7 +5,7 @@ */ #include -#include +#include unsigned long __generic_copy_from_user(void *to, const void __user *from, unsigned long n) diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 0fb54a90eac2..c6d351f5bd79 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index b09a3cb29b68..9c1e656b1f8f 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 8f37fdd80be9..7cb72dbc2eaa 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index 269f81158a33..b5b7d53f7283 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/m68k/mvme16x/rtc.c b/arch/m68k/mvme16x/rtc.c index 1cdc73268188..8f00847a0e4b 100644 --- a/arch/m68k/mvme16x/rtc.c +++ b/arch/m68k/mvme16x/rtc.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include /* diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c index 3f258e230ba5..0f95134e9b85 100644 --- a/arch/m68k/sun3/mmu_emu.c +++ b/arch/m68k/sun3/mmu_emu.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/metag/kernel/irq.c b/arch/metag/kernel/irq.c index 3074b64793e6..c9939604a38f 100644 --- a/arch/metag/kernel/irq.c +++ b/arch/metag/kernel/irq.c @@ -13,7 +13,7 @@ #include #include -#include +#include #ifdef CONFIG_4KSTACKS union irq_ctx { diff --git a/arch/mips/alchemy/common/power.c b/arch/mips/alchemy/common/power.c index 921ed30b440c..303257b697c2 100644 --- a/arch/mips/alchemy/common/power.c +++ b/arch/mips/alchemy/common/power.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include /* diff --git a/arch/mips/dec/kn01-berr.c b/arch/mips/dec/kn01-berr.c index 44d8a87a8a68..e9d2db480aeb 100644 --- a/arch/mips/dec/kn01-berr.c +++ b/arch/mips/dec/kn01-berr.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h index bce1ce53149a..7749daf2a465 100644 --- a/arch/mips/include/asm/checksum.h +++ b/arch/mips/include/asm/checksum.h @@ -18,7 +18,7 @@ #include -#include +#include /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/mips/include/asm/compat-signal.h b/arch/mips/include/asm/compat-signal.h index 64e0b9343b8c..4c6176467146 100644 --- a/arch/mips/include/asm/compat-signal.h +++ b/arch/mips/include/asm/compat-signal.h @@ -8,7 +8,7 @@ #include #include -#include +#include static inline int __copy_conv_sigset_to_user(compat_sigset_t __user *d, const sigset_t *s) diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h index 667ca3c467b7..b42b513007a2 100644 --- 
a/arch/mips/include/asm/r4kcache.h +++ b/arch/mips/include/asm/r4kcache.h @@ -20,7 +20,7 @@ #include #include #include -#include /* for segment_eq() */ +#include /* for segment_eq() */ extern void (*r4k_blast_dcache)(void); extern void (*r4k_blast_icache)(void); diff --git a/arch/mips/include/asm/termios.h b/arch/mips/include/asm/termios.h index 6245b68a69a8..ce2d72e34274 100644 --- a/arch/mips/include/asm/termios.h +++ b/arch/mips/include/asm/termios.h @@ -9,7 +9,7 @@ #ifndef _ASM_TERMIOS_H #define _ASM_TERMIOS_H -#include +#include #include /* diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c index db6f5afff4ff..1900f39588ae 100644 --- a/arch/mips/jazz/jazzdma.c +++ b/arch/mips/jazz/jazzdma.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c index 12c718181e5e..ae037a304ee4 100644 --- a/arch/mips/kernel/branch.c +++ b/arch/mips/kernel/branch.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* * Calculate and return exception PC in case of branch delay slot diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index dd3175442c9e..07718bb5fc9d 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include /* Hardware capabilities */ unsigned int elf_hwcap __read_mostly; diff --git a/arch/mips/kernel/crash_dump.c b/arch/mips/kernel/crash_dump.c index 6fe7790e5868..77ee99a2d0aa 100644 --- a/arch/mips/kernel/crash_dump.c +++ b/arch/mips/kernel/crash_dump.c @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include static void *kdump_buf_page; diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c index f25f7eab7307..f8f5836eb3c1 100644 --- a/arch/mips/kernel/irq.c +++ b/arch/mips/kernel/irq.c @@ -23,7 +23,7 @@ #include #include -#include +#include /* * 'what should we do if we get a hw irq event on an illegal vector'. 
diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c index de63d36af895..1f4bd222ba76 100644 --- a/arch/mips/kernel/kgdb.c +++ b/arch/mips/kernel/kgdb.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include static struct hard_trap_info { unsigned char tt; /* Trap type code for MIPS R3xxx and R4xxx */ diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 50fb62544df7..0352f742d077 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index 789d7bf4fef3..a12904ea9f65 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include /* * CPU mask used to set process affinity for MT VPEs/TCs with FPUs diff --git a/arch/mips/kernel/mips-r2-to-r6-emul.c b/arch/mips/kernel/mips-r2-to-r6-emul.c index bd09853aecdf..ef2ca28a028b 100644 --- a/arch/mips/kernel/mips-r2-to-r6-emul.c +++ b/arch/mips/kernel/mips-r2-to-r6-emul.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #ifdef CONFIG_64BIT #define ADDIU "daddiu " diff --git a/arch/mips/kernel/mips_ksyms.c b/arch/mips/kernel/mips_ksyms.c index e2b6ab74643d..93aeec705a6e 100644 --- a/arch/mips/kernel/mips_ksyms.c +++ b/arch/mips/kernel/mips_ksyms.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 9514e5f2209f..5142b1dfe8a7 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index a92994d60e91..c8ba26072132 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 5fcbdcd7abd0..4f0998525626 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 97b7c51b8251..84165f2b31ff 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include "signal-common.h" diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index a7bc38430500..b672cebb4a1a 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 53a7ef9a8f32..833f82210528 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 3905003dfe2b..6c7f9d7e92b3 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -61,7 +61,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index f1c308dbbc4a..7ed98354fe9d 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -89,7 +89,7 @@ 
#include #include #include -#include +#include #define STR(x) __STR(x) #define __STR(x) #x diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index f8b7bf836437..a298ac93edcc 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/math-emu/dsemul.c b/arch/mips/math-emu/dsemul.c index 4a094f7acb3d..c4469ff4a996 100644 --- a/arch/mips/math-emu/dsemul.c +++ b/arch/mips/math-emu/dsemul.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include /** * struct emuframe - The 'emulation' frame structure diff --git a/arch/mips/mm/extable.c b/arch/mips/mm/extable.c index e474fa2efed4..81bc8a34a83f 100644 --- a/arch/mips/mm/extable.c +++ b/arch/mips/mm/extable.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include int fixup_exception(struct pt_regs *regs) { diff --git a/arch/mips/mm/sc-debugfs.c b/arch/mips/mm/sc-debugfs.c index 01f1154cdb0c..7e945e310b44 100644 --- a/arch/mips/mm/sc-debugfs.c +++ b/arch/mips/mm/sc-debugfs.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include diff --git a/arch/mips/oprofile/op_model_loongson3.c b/arch/mips/oprofile/op_model_loongson3.c index 85f3ee4ab456..40660392006f 100644 --- a/arch/mips/oprofile/op_model_loongson3.c +++ b/arch/mips/oprofile/op_model_loongson3.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include "op_impl.h" diff --git a/arch/mips/sgi-ip22/ip28-berr.c b/arch/mips/sgi-ip22/ip28-berr.c index 712cc0f6a58d..9960a8302eac 100644 --- a/arch/mips/sgi-ip22/ip28-berr.c +++ b/arch/mips/sgi-ip22/ip28-berr.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include static unsigned int count_be_is_fixup; diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c index 692778da9e76..2e0edb385656 100644 --- a/arch/mips/sgi-ip27/ip27-berr.c +++ b/arch/mips/sgi-ip27/ip27-berr.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include static void dump_hub_information(unsigned long errst0, unsigned long errst1) { diff --git a/arch/mips/sgi-ip32/ip32-berr.c b/arch/mips/sgi-ip32/ip32-berr.c index afc1cadbba37..ba8f46d80ab8 100644 --- a/arch/mips/sgi-ip32/ip32-berr.c +++ b/arch/mips/sgi-ip32/ip32-berr.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/mips/sibyte/common/sb_tbprof.c b/arch/mips/sibyte/common/sb_tbprof.c index 059e28c8fd97..99c720be72d2 100644 --- a/arch/mips/sibyte/common/sb_tbprof.c +++ b/arch/mips/sibyte/common/sb_tbprof.c @@ -54,7 +54,7 @@ #define K_INT_PERF_CNT K_BCM1480_INT_PERF_CNT #endif -#include +#include #define SBPROF_TB_MAJOR 240 diff --git a/arch/mn10300/kernel/fpu.c b/arch/mn10300/kernel/fpu.c index 064fa194de2b..2578b7ae7dd5 100644 --- a/arch/mn10300/kernel/fpu.c +++ b/arch/mn10300/kernel/fpu.c @@ -8,7 +8,7 @@ * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ -#include +#include #include #include #include diff --git a/arch/mn10300/kernel/mn10300_ksyms.c b/arch/mn10300/kernel/mn10300_ksyms.c index f9eb9753a404..ec6c4f8f93a6 100644 --- a/arch/mn10300/kernel/mn10300_ksyms.c +++ b/arch/mn10300/kernel/mn10300_ksyms.c @@ -9,7 +9,7 @@ * 2 of the Licence, or (at your option) any later version. 
*/ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index cbede4e88dee..e5def2217f72 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/kernel/ptrace.c b/arch/mn10300/kernel/ptrace.c index 5bd58514e739..976020f469c1 100644 --- a/arch/mn10300/kernel/ptrace.c +++ b/arch/mn10300/kernel/ptrace.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/kernel/setup.c b/arch/mn10300/kernel/setup.c index 2ad7f32fa122..1b3d80d8a171 100644 --- a/arch/mn10300/kernel/setup.c +++ b/arch/mn10300/kernel/setup.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index cd8cb1d1176b..2f3cb5734235 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "sigframe.h" diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c index 815f1355fad4..f999981e55c0 100644 --- a/arch/mn10300/kernel/sys_mn10300.c +++ b/arch/mn10300/kernel/sys_mn10300.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> asmlinkage long old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, diff --git a/arch/mn10300/lib/checksum.c b/arch/mn10300/lib/checksum.c index b6580f5d89ee..0f569151ef11 100644 --- a/arch/mn10300/lib/checksum.c +++ b/arch/mn10300/lib/checksum.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "internal.h" diff --git a/arch/mn10300/mm/cache-smp.c b/arch/mn10300/mm/cache-smp.c index 2d23b9eeee62..e80996064d3d 100644 --- a/arch/mn10300/mm/cache-smp.c +++ b/arch/mn10300/mm/cache-smp.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "cache-smp.h" diff --git a/arch/mn10300/mm/cache.c b/arch/mn10300/mm/cache.c index 0a1f0aa92ebc..0b925cce2b83 100644 --- a/arch/mn10300/mm/cache.c +++ b/arch/mn10300/mm/cache.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "cache-smp.h" diff --git a/arch/mn10300/mm/extable.c b/arch/mn10300/mm/extable.c index 25e5485ab87d..305de461cb8f 100644 --- a/arch/mn10300/mm/extable.c +++ b/arch/mn10300/mm/extable.c @@ -10,7 +10,7 @@ */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fixup_exception(struct pt_regs *regs) { diff --git a/arch/mn10300/mm/init.c b/arch/mn10300/mm/init.c index 97a1ec0beeec..8ce677d5575e 100644 --- a/arch/mn10300/mm/init.c +++ b/arch/mn10300/mm/init.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/mm/misalignment.c b/arch/mn10300/mm/misalignment.c index b9920b1edd5a..31d04da85743 100644 --- a/arch/mn10300/mm/misalignment.c +++ b/arch/mn10300/mm/misalignment.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/mn10300/proc-mn2ws0050/proc-init.c b/arch/mn10300/proc-mn2ws0050/proc-init.c index 950cc8dbb284..25b1b453c515 100644 --- a/arch/mn10300/proc-mn2ws0050/proc-init.c +++ b/arch/mn10300/proc-mn2ws0050/proc-init.c @@ -16,7 +16,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 81f7da7b1d55..72ed30a93c85 100644 ---
a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_SPINLOCK(die_lock); diff --git a/arch/openrisc/kernel/or32_ksyms.c b/arch/openrisc/kernel/or32_ksyms.c index 83ccf7c0c58d..86e31cf1de1d 100644 --- a/arch/openrisc/kernel/or32_ksyms.c +++ b/arch/openrisc/kernel/or32_ksyms.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 277123bb4bf8..d7990df9025a 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index c82be69b43c6..265f10fb3930 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DEBUG_SIG 0 diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 3d3f6062f49c..a4574cb4b0fb 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index e94cd225e816..b1a7435e786a 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/parisc/kernel/asm-offsets.c b/arch/parisc/kernel/asm-offsets.c index 78d30d2ea2d8..1c4fe61a592b 100644 --- a/arch/parisc/kernel/asm-offsets.c +++ b/arch/parisc/kernel/asm-offsets.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_64BIT #define FRAME_SIZE 128 diff --git a/arch/parisc/kernel/parisc_ksyms.c b/arch/parisc/kernel/parisc_ksyms.c index 3cad8aadc69e..7484b3d11e0d 100644 --- a/arch/parisc/kernel/parisc_ksyms.c +++ b/arch/parisc/kernel/parisc_ksyms.c @@ -43,7 +43,7 @@ EXPORT_SYMBOL(__xchg64); EXPORT_SYMBOL(__cmpxchg_u64); #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> EXPORT_SYMBOL(lclear_user); EXPORT_SYMBOL(lstrnlen_user); diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index b6298a85e8ae..697c53543a4d 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -33,7 +33,7 @@ #include #include /* get_order */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* for purge_tlb_*() macros */ static struct proc_dir_entry * proc_gsc_root __read_mostly = NULL; diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index 6eabce62463b..e282a5131d77 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -48,7 +48,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index e02d7b4d2b69..f8b6959d2d97 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 2264f68f3c2f..e58925ac64d1 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c index c342b2e17492..70aaabb8b3cb 100644 --- a/arch/parisc/kernel/signal32.c +++ b/arch/parisc/kernel/signal32.c @@ -31,7 +31,7 @@ #include #include -#include <asm/uaccess.h>
+#include <linux/uaccess.h> #include "signal32.h" diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index a81e177cac7b..bf3294171230 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -23,7 +23,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 4215f5596c8b..037d81f00520 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -28,7 +28,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index 2b65c0177778..0a21067ac0a3 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index e278a87f43cc..1b73690477c5 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/parisc/lib/checksum.c b/arch/parisc/lib/checksum.c index ae66d31f9ecf..ba6384da6ade 100644 --- a/arch/parisc/lib/checksum.c +++ b/arch/parisc/lib/checksum.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define addc(_t,_r) \ __asm__ __volatile__ ( \ diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 81592562e0f8..ba47c70712f9 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index 033f3385fa49..8d58c61908f7 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index cfa0f81a5bb0..d10ad258d41a 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef DEBUG diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 03d089b3ed72..4d3aa05e28be 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * Stores the breakpoints currently in use on each breakpoint address diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 3c05c311e35e..a018f5cae899 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -55,7 +55,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index ad108b842669..735ff3d3f77d 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 30b89d5cbb03..3f7ba0f5bf29 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -22,7 +22,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include
#include diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 34d2c595de23..d5e2b8309939 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 678f87a63645..41c86c6b6e4d 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #undef DEBUG diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c index c30612aad68e..56548bf6231f 100644 --- a/arch/powerpc/kernel/proc_powerpc.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index b1ec62f2cc31..e4744ff38a17 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -34,7 +34,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index 1e887f3a61a6..f37eb53de1a1 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c index c82eed97bd22..df56dfc4b681 100644 --- a/arch/powerpc/kernel/rtas-proc.c +++ b/arch/powerpc/kernel/rtas-proc.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 6a3e5de544ce..112cc3b2ee1a 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index db2b482af658..f6f6a8a5103a 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define MODULE_VERS "1.0" diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index a26a02006576..2bf1f9b5b34b 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -22,7 +22,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 5fe79182f0fa..7fcf1f7f01c1 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index bbe77aed198d..3a3671172436 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 27aa913ac91d..97bb1385e771 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -37,7 +37,7 @@ #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 96698fdf93b4..c83c115858c1 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -27,7
@@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 8a285876aef8..15f216d022e2 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -44,7 +44,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 644cce3d8dce..de04c9fbb5cd 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index be9751f1cb2a..19397e2a8bf5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -64,7 +64,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 4239aaf74886..e6cc56b61d01 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c index c4bfadb2606b..2d8f6d8ccafc 100644 --- a/arch/powerpc/kernel/vecemu.c +++ b/arch/powerpc/kernel/vecemu.c @@ -7,7 +7,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Functions in vector.S */ extern void vaddfp(vector128 *dst, vector128 *a, vector128 *b); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index b6952dd23152..019f008775b9 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8dcbe37a4dac..66b2a35be424 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -39,7 +39,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 826c541a12af..1482961ceb4d 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index 02176fd52f84..f102616febc7 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -17,7 +17,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index ef27fbd5d9c5..20528701835b 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -11,7 +11,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 3bdc639157c1..20dff102a06f 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -14,7 +14,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index df3f2706d3e5..0514cbd4e533 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -30,7 +30,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index ed38f8114118..fe312c160d97 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -29,7 +29,7 @@
#include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index efd1183a6b16..cd892dec7cb6 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/lib/checksum_wrappers.c b/arch/powerpc/lib/checksum_wrappers.c index 08e3a3356c40..a0cb63fb76a1 100644 --- a/arch/powerpc/lib/checksum_wrappers.c +++ b/arch/powerpc/lib/checksum_wrappers.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *err_ptr) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index d5edbeb8eb82..c1746df0f88e 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -13,7 +13,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int patch_instruction(unsigned int *addr, unsigned int instr) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 9c78a9c102c3..06c7e9b88408 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/lib/usercopy_64.c b/arch/powerpc/lib/usercopy_64.c index 5eea6f3c1e03..9bd3a3dad78d 100644 --- a/arch/powerpc/lib/usercopy_64.c +++ b/arch/powerpc/lib/usercopy_64.c @@ -7,7 +7,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) { diff --git a/arch/powerpc/math-emu/fabs.c b/arch/powerpc/math-emu/fabs.c index 549baba5948f..a5e7ad1384ee 100644 --- a/arch/powerpc/math-emu/fabs.c +++ b/arch/powerpc/math-emu/fabs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fabs(u32 *frD, u32 *frB) diff --git a/arch/powerpc/math-emu/fadd.c b/arch/powerpc/math-emu/fadd.c index 0158a16e2b82..29de37e0e0da 100644 --- a/arch/powerpc/math-emu/fadd.c +++ b/arch/powerpc/math-emu/fadd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fadds.c b/arch/powerpc/math-emu/fadds.c index 5930f40a8687..7093c5b58002 100644 --- a/arch/powerpc/math-emu/fadds.c +++ b/arch/powerpc/math-emu/fadds.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fcmpo.c b/arch/powerpc/math-emu/fcmpo.c index 5bce011c2aec..5d644467221c 100644 --- a/arch/powerpc/math-emu/fcmpo.c +++ b/arch/powerpc/math-emu/fcmpo.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fcmpu.c b/arch/powerpc/math-emu/fcmpu.c index d4fb1babc6ad..0f9bf4864832 100644 --- a/arch/powerpc/math-emu/fcmpu.c +++ b/arch/powerpc/math-emu/fcmpu.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fctiw.c b/arch/powerpc/math-emu/fctiw.c index f694440ddc00..716d6da7f204 100644 --- a/arch/powerpc/math-emu/fctiw.c +++ b/arch/powerpc/math-emu/fctiw.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fctiwz.c b/arch/powerpc/math-emu/fctiwz.c index 71e782fd4fe3..7212fa7cfd36 100644 --- a/arch/powerpc/math-emu/fctiwz.c +++ b/arch/powerpc/math-emu/fctiwz.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fdiv.c
b/arch/powerpc/math-emu/fdiv.c index a29239c05e3e..e1e452069e49 100644 --- a/arch/powerpc/math-emu/fdiv.c +++ b/arch/powerpc/math-emu/fdiv.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fdivs.c b/arch/powerpc/math-emu/fdivs.c index 526bc261275f..5511e2d1c3ad 100644 --- a/arch/powerpc/math-emu/fdivs.c +++ b/arch/powerpc/math-emu/fdivs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmadd.c b/arch/powerpc/math-emu/fmadd.c index 8c3f20aa5a95..2b6fae0bc8c2 100644 --- a/arch/powerpc/math-emu/fmadd.c +++ b/arch/powerpc/math-emu/fmadd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmadds.c b/arch/powerpc/math-emu/fmadds.c index 794fb31e59d1..aff35f24a236 100644 --- a/arch/powerpc/math-emu/fmadds.c +++ b/arch/powerpc/math-emu/fmadds.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmr.c b/arch/powerpc/math-emu/fmr.c index bd55384b8196..f6347911f6a3 100644 --- a/arch/powerpc/math-emu/fmr.c +++ b/arch/powerpc/math-emu/fmr.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fmr(u32 *frD, u32 *frB) diff --git a/arch/powerpc/math-emu/fmsub.c b/arch/powerpc/math-emu/fmsub.c index 626f6fed84ac..1fb26cebe04e 100644 --- a/arch/powerpc/math-emu/fmsub.c +++ b/arch/powerpc/math-emu/fmsub.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmsubs.c b/arch/powerpc/math-emu/fmsubs.c index 3425bc899760..f73965453e05 100644 --- a/arch/powerpc/math-emu/fmsubs.c +++ b/arch/powerpc/math-emu/fmsubs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmul.c b/arch/powerpc/math-emu/fmul.c index 2c1929779892..ffd31b549290 100644 --- a/arch/powerpc/math-emu/fmul.c +++ b/arch/powerpc/math-emu/fmul.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fmuls.c b/arch/powerpc/math-emu/fmuls.c index f5ad5c9c77d0..21aee431ca9d 100644 --- a/arch/powerpc/math-emu/fmuls.c +++ b/arch/powerpc/math-emu/fmuls.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fnabs.c b/arch/powerpc/math-emu/fnabs.c index a7d34f3d9499..af877a53d264 100644 --- a/arch/powerpc/math-emu/fnabs.c +++ b/arch/powerpc/math-emu/fnabs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fnabs(u32 *frD, u32 *frB) diff --git a/arch/powerpc/math-emu/fneg.c b/arch/powerpc/math-emu/fneg.c index 1e988cd9c6cc..8417d174758c 100644 --- a/arch/powerpc/math-emu/fneg.c +++ b/arch/powerpc/math-emu/fneg.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fneg(u32 *frD, u32 *frB) diff --git a/arch/powerpc/math-emu/fnmadd.c b/arch/powerpc/math-emu/fnmadd.c index e817bc5453ef..6316ef0e0874 100644 --- a/arch/powerpc/math-emu/fnmadd.c +++ b/arch/powerpc/math-emu/fnmadd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fnmadds.c b/arch/powerpc/math-emu/fnmadds.c index 4db4b7d9ba8d..9ffe037df2b9 100644 --- a/arch/powerpc/math-emu/fnmadds.c +++ b/arch/powerpc/math-emu/fnmadds.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fnmsub.c b/arch/powerpc/math-emu/fnmsub.c index f65979fa770e..f97a9cfb54ea 100644 --- a/arch/powerpc/math-emu/fnmsub.c +++ b/arch/powerpc/math-emu/fnmsub.c @@
-1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fnmsubs.c b/arch/powerpc/math-emu/fnmsubs.c index 9021dacc03b8..7fa1217bd930 100644 --- a/arch/powerpc/math-emu/fnmsubs.c +++ b/arch/powerpc/math-emu/fnmsubs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fre.c b/arch/powerpc/math-emu/fre.c index 49ccf2cc6a5a..b621a790aa67 100644 --- a/arch/powerpc/math-emu/fre.c +++ b/arch/powerpc/math-emu/fre.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fre(void *frD, void *frB) { diff --git a/arch/powerpc/math-emu/fres.c b/arch/powerpc/math-emu/fres.c index 10ecbd08b79e..211c30d0145f 100644 --- a/arch/powerpc/math-emu/fres.c +++ b/arch/powerpc/math-emu/fres.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fres(void *frD, void *frB) diff --git a/arch/powerpc/math-emu/frsp.c b/arch/powerpc/math-emu/frsp.c index ddcc14664b1a..3e3bc73e27ae 100644 --- a/arch/powerpc/math-emu/frsp.c +++ b/arch/powerpc/math-emu/frsp.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/frsqrte.c b/arch/powerpc/math-emu/frsqrte.c index 1d0a3a0fd0e6..7c2ce43750dc 100644 --- a/arch/powerpc/math-emu/frsqrte.c +++ b/arch/powerpc/math-emu/frsqrte.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int frsqrte(void *frD, void *frB) diff --git a/arch/powerpc/math-emu/frsqrtes.c b/arch/powerpc/math-emu/frsqrtes.c index 7e838e380314..269951a8c650 100644 --- a/arch/powerpc/math-emu/frsqrtes.c +++ b/arch/powerpc/math-emu/frsqrtes.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int frsqrtes(void *frD, void *frB) { diff --git a/arch/powerpc/math-emu/fsel.c b/arch/powerpc/math-emu/fsel.c index 1b0c14498032..32b62c6c7f48 100644 --- a/arch/powerpc/math-emu/fsel.c +++ b/arch/powerpc/math-emu/fsel.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fsqrt.c b/arch/powerpc/math-emu/fsqrt.c index a55fc7d49983..0e2a34b616dc 100644 --- a/arch/powerpc/math-emu/fsqrt.c +++ b/arch/powerpc/math-emu/fsqrt.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fsqrts.c b/arch/powerpc/math-emu/fsqrts.c index 31dccbfc39ff..420cf19b5fd4 100644 --- a/arch/powerpc/math-emu/fsqrts.c +++ b/arch/powerpc/math-emu/fsqrts.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fsub.c b/arch/powerpc/math-emu/fsub.c index 02c5dff458ba..feedd705cf62 100644 --- a/arch/powerpc/math-emu/fsub.c +++ b/arch/powerpc/math-emu/fsub.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/fsubs.c b/arch/powerpc/math-emu/fsubs.c index 5d9b18c35e07..74190514063e 100644 --- a/arch/powerpc/math-emu/fsubs.c +++ b/arch/powerpc/math-emu/fsubs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/lfd.c b/arch/powerpc/math-emu/lfd.c index 79ac76d596c3..d998a50740a0 100644 --- a/arch/powerpc/math-emu/lfd.c +++ b/arch/powerpc/math-emu/lfd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/lfs.c b/arch/powerpc/math-emu/lfs.c index 434ed27be8db..1ee10b83d7e3 100644 --- a/arch/powerpc/math-emu/lfs.c +++ b/arch/powerpc/math-emu/lfs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/math.c
b/arch/powerpc/math-emu/math.c index ab151f040502..76ee2e5dba65 100644 --- a/arch/powerpc/math-emu/math.c +++ b/arch/powerpc/math-emu/math.c @@ -5,7 +5,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/math_efp.c b/arch/powerpc/math-emu/math_efp.c index 28337c9709ae..581f404caa1d 100644 --- a/arch/powerpc/math-emu/math_efp.c +++ b/arch/powerpc/math-emu/math_efp.c @@ -22,7 +22,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define FP_EX_BOOKE_E500_SPE diff --git a/arch/powerpc/math-emu/mcrfs.c b/arch/powerpc/math-emu/mcrfs.c index e948d5708e2b..8e8e72397ebc 100644 --- a/arch/powerpc/math-emu/mcrfs.c +++ b/arch/powerpc/math-emu/mcrfs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mffs.c b/arch/powerpc/math-emu/mffs.c index 5526cf96ede5..e00fdc22a0bc 100644 --- a/arch/powerpc/math-emu/mffs.c +++ b/arch/powerpc/math-emu/mffs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mtfsb0.c b/arch/powerpc/math-emu/mtfsb0.c index bc985585bca8..5ed3e7d5063e 100644 --- a/arch/powerpc/math-emu/mtfsb0.c +++ b/arch/powerpc/math-emu/mtfsb0.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mtfsb1.c b/arch/powerpc/math-emu/mtfsb1.c index fe6ed5ac85b3..602aa16eda81 100644 --- a/arch/powerpc/math-emu/mtfsb1.c +++ b/arch/powerpc/math-emu/mtfsb1.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mtfsf.c b/arch/powerpc/math-emu/mtfsf.c index 44b0fc8214f4..b0d5593ad357 100644 --- a/arch/powerpc/math-emu/mtfsf.c +++ b/arch/powerpc/math-emu/mtfsf.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/mtfsfi.c b/arch/powerpc/math-emu/mtfsfi.c index fd2acc26813b..5df30541a784 100644 --- a/arch/powerpc/math-emu/mtfsfi.c +++ b/arch/powerpc/math-emu/mtfsfi.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/math-emu/stfd.c b/arch/powerpc/math-emu/stfd.c index 33a165c8df0f..6baeaec134a2 100644 --- a/arch/powerpc/math-emu/stfd.c +++ b/arch/powerpc/math-emu/stfd.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int stfd(void *frS, void *ea) diff --git a/arch/powerpc/math-emu/stfiwx.c b/arch/powerpc/math-emu/stfiwx.c index f15a35f67e2c..9da7c5d1a872 100644 --- a/arch/powerpc/math-emu/stfiwx.c +++ b/arch/powerpc/math-emu/stfiwx.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int stfiwx(u32 *frS, void *ea) diff --git a/arch/powerpc/math-emu/stfs.c b/arch/powerpc/math-emu/stfs.c index 6122147356d1..62bd25264fb5 100644 --- a/arch/powerpc/math-emu/stfs.c +++ b/arch/powerpc/math-emu/stfs.c @@ -1,6 +1,6 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index 31a5d42df8c9..61ac468c87c6 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index 139dec421e57..080d49b26c3a 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -48,7 +48,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 8410b4bb36ed..80334937e14f 100644 ---
a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a000c3585390..93abf8a9813d 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -51,7 +51,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index d5543514c1df..5c096c01e8bd 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index 85c85eb3e245..e5a891ae80ee 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -30,7 +30,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "spufs.h" diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 3a147122bc98..a35e2c29d7ee 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "spufs.h" #include "sputrace.h" diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 5364d4a54249..d8af9bc0489f 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "spufs.h" diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index a87200a535fa..0d290ea83dc1 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -5,7 +5,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "spufs.h" diff --git a/arch/powerpc/platforms/chrp/nvram.c b/arch/powerpc/platforms/chrp/nvram.c index 9ef8cc3378d0..c3ede2c365c3 100644 --- a/arch/powerpc/platforms/chrp/nvram.c +++ b/arch/powerpc/platforms/chrp/nvram.c @@ -13,7 +13,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c index f2344cbd2f46..ecd6d9177d13 100644 --- a/arch/powerpc/platforms/powernv/opal-elog.c +++ b/arch/powerpc/platforms/powernv/opal-elog.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include struct elog_obj { diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index e4169d68cb32..4886eb8b6381 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include static int opal_lpc_chip_id = -1; diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index e315e704cca7..2d6ee1c5ad85 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /** diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 972328829387..4839db385bb0 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -37,7 +37,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include
#include diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 76caa4a45ccd..5cb2e4beffc5 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include static struct workqueue_struct *pseries_hp_wq; diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 39049e4884fb..6b04e3f0f982 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index e6397976060e..779fc2a1c8f7 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 79aef8c1c5b3..69cedc1b3b8a 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index cc66c49f07aa..e5bf1e84047f 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "of_helpers.h" diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index 7d28cabf1206..c47585a78b69 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c index 6f5a8d177c42..d0e9f178a324 100644 --- a/arch/powerpc/sysdev/scom.c +++ b/arch/powerpc/sysdev/scom.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> const struct scom_controller *scom_controller; EXPORT_SYMBOL_GPL(scom_controller); diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index 53a16aa4d384..5692dd569b9b 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index f587c4811faf..5a8dfa22da7c 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/s390/boot/compressed/misc.c b/arch/s390/boot/compressed/misc.c index 4da604ebf6fd..8515dd5a5663 100644 --- a/arch/s390/boot/compressed/misc.c +++ b/arch/s390/boot/compressed/misc.c @@ -6,7 +6,7 @@ * Author(s): Martin Schwidefsky */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 1113389d0a39..daf9bb063aaa 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h index d7f100c53f07..12bf4fef2a68 100644 ---
b/arch/s390/include/asm/checksum.h @@ -11,7 +11,7 @@ #ifndef _S390_CHECKSUM_H #define _S390_CHECKSUM_H -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/s390/include/asm/idals.h b/arch/s390/include/asm/idals.h index a7b2d7504049..280b60a0bcd4 100644 --- a/arch/s390/include/asm/idals.h +++ b/arch/s390/include/asm/idals.h @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define IDA_SIZE_LOG 12 /* 11 for 2k , 12 for 4k */ #define IDA_BLOCK_SIZE (1L<<IDA_SIZE_LOG) -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 0f9cd90c11af..96df4547377a 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -51,7 +51,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 6f2a6ab13cb5..362350cc485c 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "compat_linux.h" diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index aa12de72fd47..79f8ae933520 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index c74c59236f44..9f017cf417f6 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -22,7 +22,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index fdb40424acfe..84e0557b16fe 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include DEFINE_PER_CPU(struct kprobe *, current_kprobe); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index b81ab8882e2e..7447ba509c30 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "entry.h" diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 9f241d1efeda..62a4c263e887 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "entry.h" diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index f145490cce54..b7af452978ca 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "entry.h" /* diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 867d0a057046..ec76315c9ee5 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index d0539f76fd24..283ad7840335 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "entry.h" diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index af13f1a135b6..6843dd5a1cba 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git
a/arch/s390/mm/init.c b/arch/s390/mm/init.c index b3e9d18f2ec6..b67454ad8408 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/score/include/asm/checksum.h b/arch/score/include/asm/checksum.h index 539d9fd45d21..0338927f4826 100644 --- a/arch/score/include/asm/checksum.h +++ b/arch/score/include/asm/checksum.h @@ -2,7 +2,7 @@ #define _ASM_SCORE_CHECKSUM_H #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c index 4f7314d5f334..8b75e54816c1 100644 --- a/arch/score/kernel/ptrace.c +++ b/arch/score/kernel/ptrace.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * retrieve the contents of SCORE userspace general registers diff --git a/arch/score/kernel/traps.c b/arch/score/kernel/traps.c index 5cea1e750cec..d948a6818961 100644 --- a/arch/score/kernel/traps.c +++ b/arch/score/kernel/traps.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> unsigned long exception_handlers[32]; diff --git a/arch/score/lib/checksum_copy.c b/arch/score/lib/checksum_copy.c index 9b770b30e8a5..39b99ef61804 100644 --- a/arch/score/lib/checksum_copy.c +++ b/arch/score/lib/checksum_copy.c @@ -25,7 +25,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> unsigned int csum_partial_copy(const char *src, char *dst, int len, unsigned int sum) diff --git a/arch/sh/boards/mach-landisk/gio.c b/arch/sh/boards/mach-landisk/gio.c index 8132dff078fb..32c317f5d991 100644 --- a/arch/sh/boards/mach-landisk/gio.c +++ b/arch/sh/boards/mach-landisk/gio.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index 208a9753ab38..ae1dfdb0013b 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -11,7 +11,7 @@ * Modified to use standard LinuxSH BIOS by Greg Banks 7Jul2000 */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/include/asm/mmu_context.h b/arch/sh/include/asm/mmu_context.h index 9f417feaf6e8..35ffdd081d26 100644 --- a/arch/sh/include/asm/mmu_context.h +++ b/arch/sh/include/asm/mmu_context.h @@ -10,7 +10,7 @@ #ifdef __KERNEL__ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/kernel/cpu/init.c b/arch/sh/kernel/cpu/init.c index c8b3be1b54e6..c4f01c5c8736 100644 --- a/arch/sh/kernel/cpu/init.c +++ b/arch/sh/kernel/cpu/init.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/kernel/cpu/shmobile/cpuidle.c b/arch/sh/kernel/cpu/shmobile/cpuidle.c index 53b8eeb1db20..c32e66079f7c 100644 --- a/arch/sh/kernel/cpu/shmobile/cpuidle.c +++ b/arch/sh/kernel/cpu/shmobile/cpuidle.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static unsigned long cpuidle_mode[] = { SUSP_SH_SLEEP, /* regular sleep mode */ diff --git a/arch/sh/kernel/cpu/shmobile/pm.c b/arch/sh/kernel/cpu/shmobile/pm.c index ac37b7234f85..fba2be5d72e9 100644 --- a/arch/sh/kernel/cpu/shmobile/pm.c +++ b/arch/sh/kernel/cpu/shmobile/pm.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c index 569e7b171c01..b33be505361e 100644 --- a/arch/sh/kernel/crash_dump.c +++ b/arch/sh/kernel/crash_dump.c @@ -7,7 +7,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /** * copy_oldmem_page - copy one page from
"oldmem" diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c index f8ce36286cea..4d4e7a2a774b 100644 --- a/arch/sh/kernel/io_trapped.c +++ b/arch/sh/kernel/io_trapped.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 6c0378c0b8b5..bc3591125df7 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index 83acbf3f6de8..1653ff64b103 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index ee12e9451874..51741850a715 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index 9d3e9916555d..e0b271bffd6a 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index c1a6b89bfe70..1aabfd356b35 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/ptrace_64.c b/arch/sh/kernel/ptrace_64.c index 5cea973a65b2..c49d0d05a215 100644 --- a/arch/sh/kernel/ptrace_64.c +++ b/arch/sh/kernel/ptrace_64.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index e7b49d81053e..3a44c753b642 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/sh_ksyms_64.c b/arch/sh/kernel/sh_ksyms_64.c index 26a0774f5272..6ee3740e009e 100644 --- a/arch/sh/kernel/sh_ksyms_64.c +++ b/arch/sh/kernel/sh_ksyms_64.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index f7c3d5c25caf..5128d3001ee5 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index d8a3f0d22809..7b77f1812434 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c index 8c6a350df751..6576e5ee1fc3 100644 --- a/arch/sh/kernel/sys_sh.c +++ b/arch/sh/kernel/sys_sh.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index b66d1c62eb19..d5287d76809c 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/kernel/traps_64.c 
b/arch/sh/kernel/traps_64.c index d208c27ccc67..00835edb6e20 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/math-emu/math.c b/arch/sh/math-emu/math.c index 04aa55fa8c75..5078cb809750 100644 --- a/arch/sh/math-emu/math.c +++ b/arch/sh/math-emu/math.c @@ -14,7 +14,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/mm/cache-debugfs.c b/arch/sh/mm/cache-debugfs.c index 777e50f33c00..4eb9d43578b4 100644 --- a/arch/sh/mm/cache-debugfs.c +++ b/arch/sh/mm/cache-debugfs.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/mm/cache-sh3.c b/arch/sh/mm/cache-sh3.c index e37523f65195..031634f273fa 100644 --- a/arch/sh/mm/cache-sh3.c +++ b/arch/sh/mm/cache-sh3.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c index d1bffbcd9d52..d94dadedf74f 100644 --- a/arch/sh/mm/cache-sh5.c +++ b/arch/sh/mm/cache-sh5.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include extern void __weak sh4__flush_region_init(void); diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c index 7729cca727eb..6cd2aa395817 100644 --- a/arch/sh/mm/cache-sh7705.c +++ b/arch/sh/mm/cache-sh7705.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/mm/extable_32.c b/arch/sh/mm/extable_32.c index c1cf4463d09d..9cfcbb5848e4 100644 --- a/arch/sh/mm/extable_32.c +++ b/arch/sh/mm/extable_32.c @@ -5,7 +5,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int fixup_exception(struct pt_regs *regs) { diff --git a/arch/sh/mm/extable_64.c b/arch/sh/mm/extable_64.c index f05499688d88..96edaff8c983 100644 --- a/arch/sh/mm/extable_64.c +++ b/arch/sh/mm/extable_64.c @@ -12,7 +12,7 @@ */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> extern unsigned long copy_user_memcpy, copy_user_memcpy_end; extern void __copy_user_fixup(void); diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c index 36312d254faf..82f8197b93f6 100644 --- a/arch/sh/mm/nommu.c +++ b/arch/sh/mm/nommu.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * Nothing too terribly exciting here ..
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index 7160c9fd6fe3..7b2cc490ebb7 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/mm/tlb-sh3.c b/arch/sh/mm/tlb-sh3.c index 6554fb439f0e..5c66665bff8b 100644 --- a/arch/sh/mm/tlb-sh3.c +++ b/arch/sh/mm/tlb-sh3.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sh/mm/tlbex_64.c b/arch/sh/mm/tlbex_64.c index 8557548fc53e..8ff966dd0c74 100644 --- a/arch/sh/mm/tlbex_64.c +++ b/arch/sh/mm/tlbex_64.c @@ -36,7 +36,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index f33fdd2558e8..bd0715d5dca4 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sh/oprofile/backtrace.c b/arch/sh/oprofile/backtrace.c index 9c88dcd56e86..c7695f99c8c3 100644 --- a/arch/sh/oprofile/backtrace.c +++ b/arch/sh/oprofile/backtrace.c @@ -19,7 +19,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/include/asm/checksum_32.h b/arch/sparc/include/asm/checksum_32.h index eff748c871ec..e25af5fc99fd 100644 --- a/arch/sparc/include/asm/checksum_32.h +++ b/arch/sparc/include/asm/checksum_32.h @@ -16,7 +16,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) diff --git a/arch/sparc/include/asm/checksum_64.h b/arch/sparc/include/asm/checksum_64.h index 0395d75322e9..96a5ed58cea6 100644 --- a/arch/sparc/include/asm/checksum_64.h +++ b/arch/sparc/include/asm/checksum_64.h @@ -16,7 +16,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) diff --git a/arch/sparc/kernel/apc.c b/arch/sparc/kernel/apc.c index 742f6c4436bf..9ebc37e7d64c 100644 --- a/arch/sparc/kernel/apc.c +++ b/arch/sparc/kernel/apc.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 34a7930b76ef..3bebf395252c 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/kprobes.c b/arch/sparc/kernel/kprobes.c index b0377db12d83..2d13a4fc0384 100644 --- a/arch/sparc/kernel/kprobes.c +++ b/arch/sparc/kernel/kprobes.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* We do not have hardware single-stepping on sparc64.
* So we implement software single-stepping with breakpoint diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c index 8a6982dfd733..c0765bbf60ea 100644 --- a/arch/sparc/kernel/mdesc.c +++ b/arch/sparc/kernel/mdesc.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 9c1878f4fa9f..015e55a7495d 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c index 24384e1dc33d..a38787b84322 100644 --- a/arch/sparc/kernel/pcic.c +++ b/arch/sparc/kernel/pcic.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "kernel.h" diff --git a/arch/sparc/kernel/pmc.c b/arch/sparc/kernel/pmc.c index 97d123107ecb..f12b23f7b515 100644 --- a/arch/sparc/kernel/pmc.c +++ b/arch/sparc/kernel/pmc.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index b7780a5bef11..48ffc3e7d1dd 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -28,7 +28,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 47ff5588e521..d249ca10b203 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -33,7 +33,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c index a331fdc11a2c..eca3dc76793c 100644 --- a/arch/sparc/kernel/ptrace_32.c +++ b/arch/sparc/kernel/ptrace_32.c @@ -23,7 +23,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "kernel.h" diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 96494b2ef41f..901063c1cf7e 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -31,7 +31,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 91cc2f4ae4d9..b4096bb665b2 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 9c0c8fd0b292..62c3e255ae7c 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index c782c9b716db..965d50e833e7 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -25,7 +25,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 8182f7caf5b1..0ce347f8e4cc 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 022c30c72ebd..bca44f3e6b86 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -44,7 +44,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/sys_sparc_32.c
b/arch/sparc/kernel/sys_sparc_32.c index 646988d4c1a3..fb7b185ee941 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "systbls.h" diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index fe8b8ee8e660..884c70331345 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c index c69b21e51efc..807f7e2ce014 100644 --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@ -45,7 +45,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "entry.h" diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 496fa926e1e0..4bc10e44d1ca 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c index 32b61d1b6379..d20d4e3fd129 100644 --- a/arch/sparc/kernel/unaligned_32.c +++ b/arch/sparc/kernel/unaligned_32.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/sparc/kernel/unaligned_64.c b/arch/sparc/kernel/unaligned_64.c index 52c00d90d4b4..cda7fd367c4f 100644 --- a/arch/sparc/kernel/unaligned_64.c +++ b/arch/sparc/kernel/unaligned_64.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/sparc/kernel/uprobes.c b/arch/sparc/kernel/uprobes.c index b68314050602..d852ae56ddc1 100644 --- a/arch/sparc/kernel/uprobes.c +++ b/arch/sparc/kernel/uprobes.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Compute the address of the breakpoint instruction and return it. * diff --git a/arch/sparc/kernel/visemul.c b/arch/sparc/kernel/visemul.c index c096c624ac4d..c4ac58e483a4 100644 --- a/arch/sparc/kernel/visemul.c +++ b/arch/sparc/kernel/visemul.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* OPF field of various VIS instructions.
*/ diff --git a/arch/sparc/kernel/windows.c b/arch/sparc/kernel/windows.c index 87bab0a3857a..435a467b0595 100644 --- a/arch/sparc/kernel/windows.c +++ b/arch/sparc/kernel/windows.c @@ -11,7 +11,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "kernel.h" diff --git a/arch/sparc/math-emu/math_32.c b/arch/sparc/math-emu/math_32.c index 5ce8f2f64604..4d7e0fff054f 100644 --- a/arch/sparc/math-emu/math_32.c +++ b/arch/sparc/math-emu/math_32.c @@ -68,7 +68,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sfp-util_32.h" #include diff --git a/arch/sparc/math-emu/math_64.c b/arch/sparc/math-emu/math_64.c index 034aadbff036..9647051853d3 100644 --- a/arch/sparc/math-emu/math_64.c +++ b/arch/sparc/math-emu/math_64.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "sfp-util_64.h" diff --git a/arch/sparc/mm/extable.c b/arch/sparc/mm/extable.c index a61c349448e1..768a11e6bd4f 100644 --- a/arch/sparc/mm/extable.c +++ b/arch/sparc/mm/extable.c @@ -3,7 +3,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> void sort_extable(struct exception_table_entry *start, struct exception_table_entry *finish) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 37aa537b3ad8..5d2f91511c60 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 9f37106ef93a..c84c54a1ac55 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_HARDWALL #include #endif diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c index 862973074bf9..de3eae813e52 100644 --- a/arch/tile/kernel/single_step.c +++ b/arch/tile/kernel/single_step.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c index 4fe78c5b8394..f229e979584e 100644 --- a/arch/tile/kernel/unaligned.c +++ b/arch/tile/kernel/unaligned.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/um/drivers/harddog_kern.c b/arch/um/drivers/harddog_kern.c index 3282787bbcfb..6d381279b362 100644 --- a/arch/um/drivers/harddog_kern.c +++ b/arch/um/drivers/harddog_kern.c @@ -45,7 +45,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mconsole.h" MODULE_LICENSE("GPL"); diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index 3a4b58730f5f..12bdb5996bf5 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -9,7 +9,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 8a6b57108ac2..8a4c72af3bc0 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c index 62145c276167..3645fcb2a787 100644 --- a/arch/um/drivers/mmapper_kern.c +++ b/arch/um/drivers/mmapper_kern.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These are set in mmapper_init, which is called at boot time */ diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index dd16c902ff70..05523f14d7b2 100644 --- a/arch/um/drivers/random.c +++
b/arch/um/drivers/random.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 0d7103c9eff3..770ec07b6a6a 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/um/kernel/exitcode.c b/arch/um/kernel/exitcode.c index 41ebbfebb333..546302e3b7fb 100644 --- a/arch/um/kernel/exitcode.c +++ b/arch/um/kernel/exitcode.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * If read and write race, the read will still atomically read a valid diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 034b42c7ab40..078630d6448c 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c index 6a826cbb15c4..bc2a516c190f 100644 --- a/arch/um/kernel/ptrace.c +++ b/arch/um/kernel/ptrace.c @@ -7,7 +7,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include void user_enable_single_step(struct task_struct *child) diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index c1d0ae069b53..6258676bed85 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include long old_mmap(unsigned long addr, unsigned long len, diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index bdd9cc59d20f..b83c61cfd154 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define CREATE_TRACE_POINTS diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index cb26f18d43af..7c0a711989d2 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -27,7 +27,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index cb13c0564ea7..95c0b4ae09b0 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 719cd702b0a4..47956c6a4fd8 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 44b8762fa0c7..830b19dbfa31 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -1,5 +1,5 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index c020ee75dce7..08e7efb2c140 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -8,7 +8,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /** diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index f5fb840b43e8..33cbd3db97b9 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -7,7 +7,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 643818a7688b..45d44c173cf9 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@
-234,7 +234,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c7efbcfbeda6..87cc9ab7a13c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mce-internal.h" diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 11891ca7b716..538fedea9b3f 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -10,7 +10,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static void *kdump_buf_page; diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index f6dfd9334b67..b2f7207ba86c 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -3,7 +3,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d9d8d16b69db..eb3509338ae0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -56,7 +56,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 3bb4c5f021f6..3d1bee9d6a72 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 37363e46b1f0..b615a1113f58 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0e63c0267f99..9cc7d5a330ef 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 27538f183c3b..a3b875c9e6af 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -13,7 +13,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include extern int rodata_test_data; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9692a5e9fdab..6c8934406dc9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -5,7 +5,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 01f30e56f99e..ec5d7545e6dc 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -47,7 +47,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 0b281217c890..1f65ff6540f0 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index 9e6545f269e5..2ccc424a57d9 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -19,7 +19,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_emu.h" #include "fpu_system.h" diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index e945fedf1de2..0203baefb5c0 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -27,7 +27,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h>
#include #include #include diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 8db26591d91a..b8ef9f9d2ffc 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -19,7 +19,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "fpu_system.h" diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index 95228ff042c0..1643054766eb 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -18,7 +18,7 @@ | other processes using the emulator while swapping is in progress. | +---------------------------------------------------------------------------*/ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index d597fe7423c9..2c98965a60ba 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -19,7 +19,7 @@ #include "fpu_emu.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index fcd06f7526de..61a7e9ea9aa1 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,5 +1,5 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cf8059016ec8..928d657de829 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 963895f9af7f..af85b686a7b0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e3353c97d086..5a287e523eab 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -20,7 +20,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 60a5a5a85505..2497bac56066 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -5,7 +5,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index e30202b1716e..a5c9910d234f 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -10,7 +10,7 @@ #include #define __FRAME_OFFSETS #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 49e503697022..727ed442e0a5 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -9,7 +9,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index 48e38584d5c1..5bd949da7a4a 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -6,7 +6,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 37129db76d33..276da636dd39 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -71,7 +71,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/xtensa/include/asm/checksum.h b/arch/xtensa/include/asm/checksum.h index ec35074fcb03..3ae74d7e074b 100644 --- a/arch/xtensa/include/asm/checksum.h +++ b/arch/xtensa/include/asm/checksum.h @@ -12,7 +12,7 @@ #define _XTENSA_CHECKSUM_H #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/arch/xtensa/include/asm/segment.h b/arch/xtensa/include/asm/segment.h index
a2eb547a1a75..98964ad15ca2 100644 --- a/arch/xtensa/include/asm/segment.h +++ b/arch/xtensa/include/asm/segment.h @@ -11,6 +11,6 @@ #ifndef _XTENSA_SEGMENT_H #define _XTENSA_SEGMENT_H -#include <asm/uaccess.h> +#include <linux/uaccess.h> #endif /* _XTENSA_SEGEMENT_H */ diff --git a/arch/xtensa/kernel/asm-offsets.c b/arch/xtensa/kernel/asm-offsets.c index 8e10e357ee32..bcb5beb81177 100644 --- a/arch/xtensa/kernel/asm-offsets.c +++ b/arch/xtensa/kernel/asm-offsets.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> int main(void) { diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c index 4ac3d23161cf..a265edd6ac37 100644 --- a/arch/xtensa/kernel/irq.c +++ b/arch/xtensa/kernel/irq.c @@ -25,7 +25,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include DECLARE_PER_CPU(unsigned long, nmi_count); diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index e0ded48561db..826d25104846 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index a651f3a628ee..32519b71d914 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> void user_enable_single_step(struct task_struct *child) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index e87adaa07ff3..c41294745731 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -22,7 +22,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/kernel/stacktrace.c b/arch/xtensa/kernel/stacktrace.c index 7538d802b65a..e7d30ee0fd48 100644 --- a/arch/xtensa/kernel/stacktrace.c +++ b/arch/xtensa/kernel/stacktrace.c @@ -14,7 +14,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #if IS_ENABLED(CONFIG_OPROFILE) || IS_ENABLED(CONFIG_PERF_EVENTS) diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index 83cf49685373..d3fd100dffc9 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -15,7 +15,7 @@ * Kevin Chea * */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index ce37d5b899fe..282bf721a4d6 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c index 4d2872fd9bb5..d159e9b9c018 100644 --- a/arch/xtensa/kernel/xtensa_ksyms.c +++ b/arch/xtensa/kernel/xtensa_ksyms.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/arch/xtensa/platforms/iss/console.c b/arch/xtensa/platforms/iss/console.c index c68f1e6158aa..0140a22551c8 100644 --- a/arch/xtensa/platforms/iss/console.c +++ b/arch/xtensa/platforms/iss/console.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index ede04cca30dd..02e94bb3ad3e 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define SIMDISK_MAJOR 240 diff --git a/block/ioctl.c b/block/ioctl.c index 656c8c6ed206..be7f4de3eb3c 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -8,7 +8,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int
blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) { diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 47a61474e795..14b081af8d61 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "check.h" diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index c6fee7437be4..c2b64923ab66 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index 201292e67ee8..d00bc0ef87a6 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -37,7 +37,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PREFIX "ACPI: " diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 05fe9ebfb9b5..4ef1e4624b2b 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -36,7 +36,7 @@ #ifdef CONFIG_ACPI_PROCFS_POWER #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #endif #include diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index a404ff4d7151..57fb5f468ac2 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -42,7 +42,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "internal.h" diff --git a/drivers/acpi/proc.c b/drivers/acpi/proc.c index 2a358154b770..a34669cc823b 100644 --- a/drivers/acpi/proc.c +++ b/drivers/acpi/proc.c @@ -4,7 +4,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sleep.h" #include "internal.h" diff --git a/drivers/acpi/processor_thermal.c b/drivers/acpi/processor_thermal.c index 1fed84a092c2..59c3a5d1e600 100644 --- a/drivers/acpi/processor_thermal.c +++ b/drivers/acpi/processor_thermal.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PREFIX "ACPI: " diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index d51ca1c05619..a12f96cc93ff 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PREFIX "ACPI: " diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 35e8fbca10ad..1d0417b87cb7 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -40,7 +40,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PREFIX "ACPI: " diff --git a/drivers/atm/adummy.c b/drivers/atm/adummy.c index f9b983ae6877..1fd25e872ece 100644 --- a/drivers/atm/adummy.c +++ b/drivers/atm/adummy.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index 480fa6ffbc09..3ef6253e1cce 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 40c2d561417b..c53a9dd1353f 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 85aaf2222587..80c2ddcfa92c 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -52,7 +52,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "firestream.h" diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index 81aaa505862c..637c3e6b0f9e 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h>
+#include <linux/uaccess.h> #include #ifdef CONFIG_SBUS diff --git a/drivers/atm/he.c b/drivers/atm/he.c index 31b513a23ae0..3617659b9184 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -71,7 +71,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index 5fc81e240c24..584aa881882b 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -45,7 +45,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c index feb023d7eebd..082aa02abc57 100644 --- a/drivers/atm/idt77105.c +++ b/drivers/atm/idt77105.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "idt77105.h" diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 074616b39f4d..471ddfd93ea8 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -45,7 +45,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index b2756765950e..8640bafeb471 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -58,7 +58,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index c7296b583787..cb28579e8a94 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -50,7 +50,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "nicstar.h" diff --git a/drivers/atm/suni.c b/drivers/atm/suni.c index 02159345566c..b0363149b2fd 100644 --- a/drivers/atm/suni.c +++ b/drivers/atm/suni.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "suni.h" diff --git a/drivers/atm/uPD98402.c b/drivers/atm/uPD98402.c index 5120a96b3a89..4fa13a807873 100644 --- a/drivers/atm/uPD98402.c +++ b/drivers/atm/uPD98402.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "uPD98402.h" diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index d3dc95484161..292dec18ffb8 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "uPD98401.h" #include "uPD98402.h" diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bb69e58c29f3..8ab8ea1253e6 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -23,7 +23,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_MUTEX(mem_sysfs_mutex); diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 0809cda93cc0..26a51be77227 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -48,7 +48,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "DAC960.h" #define DAC960_GAM_MINOR 252 diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 5fd50a284168..a328f673adfe 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -70,7 +70,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/block/brd.c b/drivers/block/brd.c index ad793f35632c..3adc32a3153b 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -23,7 +23,7 @@ #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define SECTOR_SHIFT 9 #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index db9d6bb6352d..e5c5b8eb14a9 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/block/cryptoloop.c
b/drivers/block/cryptoloop.c index 3d31761c0ed0..74e03aa537ad 100644 --- a/drivers/block/cryptoloop.c +++ b/drivers/block/cryptoloop.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "loop.h" MODULE_LICENSE("GPL"); diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 3abb121825bc..a9b48ed7a3cd 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -45,7 +45,7 @@ #define REALLY_SLOW_IO #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef __arm__ #undef HD_IRQ diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 4af818766797..f347285c67ec 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -78,7 +78,7 @@ #include #include "loop.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_index_mutex); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 99c84468f154..38c576f76d36 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 93362362aa55..5fd2d0e25567 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -139,7 +139,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_MUTEX(pcd_mutex); static DEFINE_SPINLOCK(pcd_lock); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 78a39f736c64..c3ed2fc72daa 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -155,7 +155,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV}; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include static DEFINE_MUTEX(pd_mutex); diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 7a7d977a76c5..ed93e8badf56 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -155,7 +155,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY}; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_MUTEX(pf_mutex); static DEFINE_SPINLOCK(pf_spin_lock); diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c index bfbd4c852dd9..5db955fe3a94 100644 --- a/drivers/block/paride/pg.c +++ b/drivers/block/paride/pg.c @@ -166,7 +166,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> module_param(verbose, int, 0644); module_param(major, int, 0); diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c index 216a94fed5b4..61fc6824299a 100644 --- a/drivers/block/paride/pt.c +++ b/drivers/block/paride/pt.c @@ -150,7 +150,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3}; #include /* current, TASK_*, schedule_timeout() */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> module_param(verbose, int, 0); module_param(major, int, 0); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 95c98de92971..1b94c1ca5c5f 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -68,7 +68,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRIVER_NAME "pktcdvd" diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c264f2d284a7..aabd8e9d3035 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index ba4bfe933276..0e93ad7b8511 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #if 0 #define CARM_DEBUG diff --git
a/drivers/block/umem.c b/drivers/block/umem.c index 46f4c719fed9..c141cc3be22b 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -54,7 +54,7 @@ #include "umem.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define MM_MAXCARDS 4 diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 5d475b3a0b2e..59cca72647a6 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -282,7 +282,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* used to tell the module to turn on full debugging messages */ static bool debug; diff --git a/drivers/char/agp/compat_ioctl.c b/drivers/char/agp/compat_ioctl.c index a48e05b31593..2053f70ef66b 100644 --- a/drivers/char/agp/compat_ioctl.c +++ b/drivers/char/agp/compat_ioctl.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "agp.h" #include "compat_ioctl.h" diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c index 0f64d149c98d..f6955888e676 100644 --- a/drivers/char/agp/frontend.c +++ b/drivers/char/agp/frontend.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "agp.h" diff --git a/drivers/char/applicom.c b/drivers/char/applicom.c index 14790304b84b..e5c62dcf2c11 100644 --- a/drivers/char/applicom.c +++ b/drivers/char/applicom.c @@ -34,7 +34,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "applicom.h" diff --git a/drivers/char/bfin-otp.c b/drivers/char/bfin-otp.c index 35d46da754d7..0584025bb0c2 100644 --- a/drivers/char/bfin-otp.c +++ b/drivers/char/bfin-otp.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define stamp(fmt, args...) pr_debug("%s:%i: " fmt "\n", __func__, __LINE__, ## args) #define stampit() stamp("here i am") diff --git a/drivers/char/ds1620.c b/drivers/char/ds1620.c index 0fae5296e311..eb53cbadb68f 100644 --- a/drivers/char/ds1620.c +++ b/drivers/char/ds1620.c @@ -13,7 +13,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_PROC_FS diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c index 65a8d96c0e93..58471394beb9 100644 --- a/drivers/char/dtlk.c +++ b/drivers/char/dtlk.c @@ -59,7 +59,7 @@ #include #include #include /* for inb_p, outb_p, inb, outb, etc. */ -#include <asm/uaccess.h> /* for get_user, etc. */ +#include <linux/uaccess.h> /* for get_user, etc. */ #include /* for wait_queue */ #include /* for __init, module_{init,exit} */ #include /* for POLLIN, etc.
*/ diff --git a/drivers/char/generic_nvram.c b/drivers/char/generic_nvram.c index 073db9558379..14e728fbb8a0 100644 --- a/drivers/char/generic_nvram.c +++ b/drivers/char/generic_nvram.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_PPC_PMAC #include diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index a7c5c59675f0..4f337375252e 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -46,7 +46,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index f9766415ff10..6ce5ce8be2f2 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define RNG_MODULE_NAME "hw_random" diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 4facc7517a6a..4035495f3a86 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -43,7 +43,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/lp.c b/drivers/char/lp.c index c4094c4e22c1..5b6742770656 100644 --- a/drivers/char/lp.c +++ b/drivers/char/lp.c @@ -134,7 +134,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* if you have more than 8 printers, remember to increase LP_NO */ #define LP_NO 8 diff --git a/drivers/char/mbcs.c b/drivers/char/mbcs.c index 67d426470e53..8c9216a0f62e 100644 --- a/drivers/char/mbcs.c +++ b/drivers/char/mbcs.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index 3d6c0671e996..f786b18ac500 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/mwave/3780i.c b/drivers/char/mwave/3780i.c index 972c40a19629..4a8937f80570 100644 --- a/drivers/char/mwave/3780i.c +++ b/drivers/char/mwave/3780i.c @@ -54,7 +54,7 @@ #include /* cond_resched() */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "smapi.h" #include "mwavedd.h" diff --git a/drivers/char/mwave/mwavedd.h b/drivers/char/mwave/mwavedd.h index 37e0a4926080..21cb09c7bed7 100644 --- a/drivers/char/mwave/mwavedd.h +++ b/drivers/char/mwave/mwavedd.h @@ -53,7 +53,7 @@ #include "smapi.h" #include "mwavepub.h" #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include extern int mwave_debug; diff --git a/drivers/char/nsc_gpio.c b/drivers/char/nsc_gpio.c index b07b119ae57f..2a91bf048804 100644 --- a/drivers/char/nsc_gpio.c +++ b/drivers/char/nsc_gpio.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define NAME "nsc_gpio" diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c index 0e184426db98..a5b1eb276c0b 100644 --- a/drivers/char/nwbutton.c +++ b/drivers/char/nwbutton.c @@ -16,7 +16,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/char/nwflash.c b/drivers/char/nwflash.c index dbe598de9b74..a284ae25e69a 100644 --- a/drivers/char/nwflash.c +++ b/drivers/char/nwflash.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /*****************************************************************************/ #include diff --git a/drivers/char/pc8736x_gpio.c b/drivers/char/pc8736x_gpio.c index 3f79a9fb6b1b..5f4be88c0dfc 100644 --- a/drivers/char/pc8736x_gpio.c +++ b/drivers/char/pc8736x_gpio.c @@ -20,7 +20,7 @@ #include #include
#include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DEVNAME "pc8736x_gpio" diff --git a/drivers/char/pcmcia/cm4040_cs.c b/drivers/char/pcmcia/cm4040_cs.c index fc061f7c2bd1..d7123259143e 100644 --- a/drivers/char/pcmcia/cm4040_cs.c +++ b/drivers/char/pcmcia/cm4040_cs.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index a7dd5f4f2c5a..d136db1a10f0 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -84,7 +84,7 @@ #define PUT_USER(error,value,addr) error = put_user(value,addr) #define COPY_TO_USER(error,dest,src,size) error = copy_to_user(dest,src,size) ? -EFAULT : 0 -#include <asm/uaccess.h> +#include <linux/uaccess.h> static MGSL_PARAMS default_params = { MGSL_MODE_HDLC, /* unsigned long mode */ diff --git a/drivers/char/random.c b/drivers/char/random.c index d6876d506220..1ef26403bcc8 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -265,7 +265,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/raw.c b/drivers/char/raw.c index e83b2adc014a..293167c6e254 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> struct raw_device_data { struct block_device *binding; diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c index 0bc135b9b16f..903761bc41c9 100644 --- a/drivers/char/scx200_gpio.c +++ b/drivers/char/scx200_gpio.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 719c5b4eed39..4fa7fcd8af36 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -52,7 +52,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/char/tlclk.c b/drivers/char/tlclk.c index 100cd1de9939..572a51704e67 100644 --- a/drivers/char/tlclk.c +++ b/drivers/char/tlclk.c @@ -44,7 +44,7 @@ #include #include #include /* inb/outb */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> MODULE_AUTHOR("Sebastien Bouchard "); MODULE_LICENSE("GPL"); diff --git a/drivers/char/toshiba.c b/drivers/char/toshiba.c index f5a45d887a37..5488516da8ea 100644 --- a/drivers/char/toshiba.c +++ b/drivers/char/toshiba.c @@ -63,7 +63,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c index c07dfe5c4da3..3e6b23c3453c 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c @@ -88,7 +88,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_OF /* For open firmware.
*/ diff --git a/drivers/cpufreq/ia64-acpi-cpufreq.c b/drivers/cpufreq/ia64-acpi-cpufreq.c index 759612da4fdc..e28a31a40829 100644 --- a/drivers/cpufreq/ia64-acpi-cpufreq.c +++ b/drivers/cpufreq/ia64-acpi-cpufreq.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index fe8f08948fcb..ac321f09e717 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PROMOTION_COUNT 4 #define DEMOTION_COUNT 1 diff --git a/drivers/dio/dio.c b/drivers/dio/dio.c index 55dd88d82d6d..830184529109 100644 --- a/drivers/dio/dio.c +++ b/drivers/dio/dio.c @@ -31,7 +31,7 @@ #include #include #include /* kmalloc() */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* readb() */ struct dio_bus dio_bus = { diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index de4d5d08af9e..65cf2b9355c4 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -13,7 +13,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 5f2c717f8053..750891ea07de 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "edac_mc.h" #include "edac_module.h" diff --git a/drivers/edac/edac_pci.c b/drivers/edac/edac_pci.c index 4e9d5632041a..48c844a72a27 100644 --- a/drivers/edac/edac_pci.c +++ b/drivers/edac/edac_pci.c @@ -10,7 +10,7 @@ * */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 3de95a29024c..cf9e396d7702 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -30,7 +30,7 @@ #define pr_fmt(fmt) "i2c-core: " fmt #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index 0ceae5cbd89a..4b5dc0162e67 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -130,7 +130,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define DRV_NAME "hpt366" diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 83679da0c3f0..5ceace542b77 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -31,7 +31,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 6360bbd37efe..201e43fcbc94 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -51,7 +51,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include int ide_end_rq(ide_drive_t *drive, struct request *rq, int error, diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 376f2dc410c5..210a0887dd29 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -24,7 +24,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include void SELECT_MASK(ide_drive_t *drive, int mask) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 0b63facd1d87..330e319419e6 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /** diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index 97c070077774..863db44c7916 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -16,7 +16,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 579f9a7f6283..e0a995b85a2d 100644 ---
a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -46,7 +46,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 415a3185cde7..249b403b43a4 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -50,7 +50,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 09b649159e6c..700782203483 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -38,7 +38,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "uverbs.h" #include "core_priv.h" diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 813593550c4b..b3f95d453fba 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -46,7 +46,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index a2f9f29c6ab5..fd811115af49 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ipoib.h" diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 64b3d11dcf1e..9104e6b8cac9 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -62,7 +62,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/input/input-compat.c b/drivers/input/input-compat.c index d84d20b9cec0..2186f71c9fe5 100644 --- a/drivers/input/input-compat.c +++ b/drivers/input/input-compat.c @@ -9,7 +9,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "input-compat.h" #ifdef CONFIG_COMPAT diff --git a/drivers/input/misc/atlas_btns.c b/drivers/input/misc/atlas_btns.c index 638165c78e75..6423aaccc763 100644 --- a/drivers/input/misc/atlas_btns.c +++ b/drivers/input/misc/atlas_btns.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define ACPI_ATLAS_NAME "Atlas ACPI" #define ACPI_ATLAS_CLASS "Atlas" diff --git a/drivers/input/mouse/amimouse.c b/drivers/input/mouse/amimouse.c index a7fd8f22ba56..a33437c480e3 100644 --- a/drivers/input/mouse/amimouse.c +++ b/drivers/input/mouse/amimouse.c @@ -25,7 +25,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/input/mouse/atarimouse.c b/drivers/input/mouse/atarimouse.c index d1c43236b125..96f2f51604bd 100644 --- a/drivers/input/mouse/atarimouse.c +++ b/drivers/input/mouse/atarimouse.c @@ -47,7 +47,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index 354d47ecd66a..7331084973e1 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "psmouse.h" #include "trackpoint.h" diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c index 852858e5d8d0..559c99ca6592 100644 --- a/drivers/input/serio/hp_sdc.c +++ b/drivers/input/serio/hp_sdc.c @@ -79,7 +79,7 @@ # define sdc_readb(p) gsc_readb(p) # define sdc_writeb(v,p) gsc_writeb((v),(p)) #elif defined(__mc68000__) -# include <asm/uaccess.h> +#include <linux/uaccess.h> # define sdc_readb(p) in_8(p) # define sdc_writeb(v,p) out_8((p),(v)) #else diff --git a/drivers/input/serio/q40kbd.c
b/drivers/input/serio/q40kbd.c index 5a9d521510bf..d0fccc8ec259 100644 --- a/drivers/input/serio/q40kbd.c +++ b/drivers/input/serio/q40kbd.c @@ -38,7 +38,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c index d189843f3727..f8ead9f9c77e 100644 --- a/drivers/input/serio/serport.c +++ b/drivers/input/serio/serport.c @@ -13,7 +13,7 @@ * the Free Software Foundation. */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/input/tablet/aiptek.c b/drivers/input/tablet/aiptek.c index 4613f0aefd08..d67547bded3e 100644 --- a/drivers/input/tablet/aiptek.c +++ b/drivers/input/tablet/aiptek.c @@ -75,7 +75,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/drivers/input/tablet/gtco.c b/drivers/input/tablet/gtco.c index abf09ac42ce4..b796e891e2ee 100644 --- a/drivers/input/tablet/gtco.c +++ b/drivers/input/tablet/gtco.c @@ -56,7 +56,7 @@ Scott Hill shill@gtcocalcomp.com #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 823f6985b260..49d0f70c2bae 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #ifdef AVMB1_COMPAT diff --git a/drivers/isdn/hardware/avm/b1.c b/drivers/isdn/hardware/avm/b1.c index 4d9b195547c5..9fdbd99c7547 100644 --- a/drivers/isdn/hardware/avm/b1.c +++ b/drivers/isdn/hardware/avm/b1.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "avmcard.h" diff --git a/drivers/isdn/hardware/avm/b1dma.c b/drivers/isdn/hardware/avm/b1dma.c index 19b113faeb7b..818bd8f231db 100644 --- a/drivers/isdn/hardware/avm/b1dma.c +++ b/drivers/isdn/hardware/avm/b1dma.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "avmcard.h" diff --git a/drivers/isdn/hardware/avm/c4.c b/drivers/isdn/hardware/avm/c4.c index 5d00d72fe482..17beb2869dc1 100644 --- a/drivers/isdn/hardware/avm/c4.c +++ b/drivers/isdn/hardware/avm/c4.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/isdn/hardware/eicon/capimain.c b/drivers/isdn/hardware/eicon/capimain.c index 997d46abf5b2..be36d82004d6 100644 --- a/drivers/isdn/hardware/eicon/capimain.c +++ b/drivers/isdn/hardware/eicon/capimain.c @@ -13,7 +13,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/isdn/hardware/eicon/divamnt.c b/drivers/isdn/hardware/eicon/divamnt.c index 0de29b7b712f..72e58bf07577 100644 --- a/drivers/isdn/hardware/eicon/divamnt.c +++ b/drivers/isdn/hardware/eicon/divamnt.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "platform.h" #include "di_defs.h" diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c index 4103a8c178d7..cb88090f9cea 100644 --- a/drivers/isdn/hardware/eicon/divasi.c +++ b/drivers/isdn/hardware/eicon/divasi.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "platform.h" #include "di_defs.h" diff --git a/drivers/isdn/hardware/eicon/divasmain.c b/drivers/isdn/hardware/eicon/divasmain.c index 32f34511c416..8b7ad4f1ab01 100644 --- a/drivers/isdn/hardware/eicon/divasmain.c +++ b/drivers/isdn/hardware/eicon/divasmain.c @@ -12,7 +12,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git
a/drivers/isdn/hardware/eicon/divasproc.c b/drivers/isdn/hardware/eicon/divasproc.c index 56ce98a4e248..b57efd6ad916 100644 --- a/drivers/isdn/hardware/eicon/divasproc.c +++ b/drivers/isdn/hardware/eicon/divasproc.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "platform.h" #include "debuglib.h" diff --git a/drivers/isdn/hysdn/hysdn_boot.c b/drivers/isdn/hysdn/hysdn_boot.c index eda4741e3f2f..4a0425378f37 100644 --- a/drivers/isdn/hysdn/hysdn_boot.c +++ b/drivers/isdn/hysdn/hysdn_boot.c @@ -13,7 +13,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hysdn_defs.h" #include "hysdn_pof.h" diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 9e385b38debf..ac219045daf7 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "lg.h" diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index e3abebc912c0..0bc127e9f16a 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "lg.h" /*M:008 diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 743253fc638f..d71f6323ac00 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -45,7 +45,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include "../lg.h" diff --git a/drivers/macintosh/ans-lcd.c b/drivers/macintosh/ans-lcd.c index cd35079c8c98..281fa9e6fc1f 100644 --- a/drivers/macintosh/ans-lcd.c +++ b/drivers/macintosh/ans-lcd.c @@ -11,7 +11,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index 08edb2c25b60..227869159ac0 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -47,7 +47,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define VERSION "0.7" #define AUTHOR "(c) 2005 Benjamin Herrenschmidt, IBM Corp."
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 91081dcdc272..43b8db2b5445 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -57,7 +57,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/macintosh/via-pmu68k.c b/drivers/macintosh/via-pmu68k.c index a00ee41f0573..a411c5cb77a1 100644 --- a/drivers/macintosh/via-pmu68k.c +++ b/drivers/macintosh/via-pmu68k.c @@ -38,7 +38,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Misc minor number allocated for /dev/pmu */ #define PMU_MINOR 154 diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index c72a77048b73..a5a9b17f0f7f 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DM_MSG_PREFIX "ioctl" #define DM_DRIVER_EMAIL "dm-devel@redhat.com" diff --git a/drivers/media/dvb-core/dmxdev.c b/drivers/media/dvb-core/dmxdev.c index efe55a3e80d0..0c44479b556e 100644 --- a/drivers/media/dvb-core/dmxdev.c +++ b/drivers/media/dvb-core/dmxdev.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "dmxdev.h" static int debug; diff --git a/drivers/media/dvb-core/dvb_demux.c b/drivers/media/dvb-core/dvb_demux.c index 3ad0b2cd26b1..bbbff72bbb2a 100644 --- a/drivers/media/dvb-core/dvb_demux.c +++ b/drivers/media/dvb-core/dvb_demux.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "dvb_demux.h" diff --git a/drivers/media/dvb-core/dvb_net.c b/drivers/media/dvb-core/dvb_net.c index dfc03a95df71..bc5e8cfe7ca2 100644 --- a/drivers/media/dvb-core/dvb_net.c +++ b/drivers/media/dvb-core/dvb_net.c @@ -62,7 +62,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/dvb-core/dvb_ringbuffer.c b/drivers/media/dvb-core/dvb_ringbuffer.c index 7df7fb3738a0..5c4b5a1f604f 100644 --- a/drivers/media/dvb-core/dvb_ringbuffer.c +++ b/drivers/media/dvb-core/dvb_ringbuffer.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "dvb_ringbuffer.h" diff --git a/drivers/media/i2c/adv7170.c b/drivers/media/i2c/adv7170.c index 05f1dc6c72af..fc9ec0f3679c 100644 --- a/drivers/media/i2c/adv7170.c +++ b/drivers/media/i2c/adv7170.c @@ -32,7 +32,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/adv7175.c b/drivers/media/i2c/adv7175.c index f554809a51e7..72139bdae1ca 100644 --- a/drivers/media/i2c/adv7175.c +++ b/drivers/media/i2c/adv7175.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/bt856.c b/drivers/media/i2c/bt856.c index 48176591a80d..54c627859c8e 100644 --- a/drivers/media/i2c/bt856.c +++ b/drivers/media/i2c/bt856.c @@ -32,7 +32,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/bt866.c b/drivers/media/i2c/bt866.c index bbec70c882a3..0d3f46af2545 100644 --- a/drivers/media/i2c/bt866.c +++ b/drivers/media/i2c/bt866.c @@ -32,7 +32,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/cs53l32a.c b/drivers/media/i2c/cs53l32a.c index e4b3cf49dd38..59c1a98c5a90 100644 --- a/drivers/media/i2c/cs53l32a.c +++ b/drivers/media/i2c/cs53l32a.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/m52790.c b/drivers/media/i2c/m52790.c index 81171d8e1c2c..89c28c36c5bf 100644 ---
a/drivers/media/i2c/m52790.c +++ b/drivers/media/i2c/m52790.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/saa6588.c b/drivers/media/i2c/saa6588.c index 89e458c23983..00640233a5e3 100644 --- a/drivers/media/i2c/saa6588.c +++ b/drivers/media/i2c/saa6588.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/media/i2c/saa7110.c b/drivers/media/i2c/saa7110.c index 6f49886806ee..ad456ce051f9 100644 --- a/drivers/media/i2c/saa7110.c +++ b/drivers/media/i2c/saa7110.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/saa7185.c b/drivers/media/i2c/saa7185.c index eecad2d1edce..119050e1197a 100644 --- a/drivers/media/i2c/saa7185.c +++ b/drivers/media/i2c/saa7185.c @@ -28,7 +28,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/tlv320aic23b.c b/drivers/media/i2c/tlv320aic23b.c index 2e06c06cac9b..cc6104da34ef 100644 --- a/drivers/media/i2c/tlv320aic23b.c +++ b/drivers/media/i2c/tlv320aic23b.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/vp27smpx.c b/drivers/media/i2c/vp27smpx.c index d6c23bdbcd4a..ef0d8b8e3df7 100644 --- a/drivers/media/i2c/vp27smpx.c +++ b/drivers/media/i2c/vp27smpx.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/vpx3220.c b/drivers/media/i2c/vpx3220.c index 90b693f4e2ab..ce9f09370e22 100644 --- a/drivers/media/i2c/vpx3220.c +++ b/drivers/media/i2c/vpx3220.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/wm8739.c b/drivers/media/i2c/wm8739.c index f086e5e6e844..c885def54b15 100644 --- a/drivers/media/i2c/wm8739.c +++ b/drivers/media/i2c/wm8739.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/i2c/wm8775.c b/drivers/media/i2c/wm8775.c index 5581f4db02af..45039d756753 100644 --- a/drivers/media/i2c/wm8775.c +++ b/drivers/media/i2c/wm8775.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/pci/ivtv/ivtv-driver.h b/drivers/media/pci/ivtv/ivtv-driver.h index 10cba305dbd2..6b09a9514d64 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.h +++ b/drivers/media/pci/ivtv/ivtv-driver.h @@ -53,7 +53,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/media/pci/meye/meye.c b/drivers/media/pci/meye/meye.c index e825bc93ea7a..24fba633c217 100644 --- a/drivers/media/pci/meye/meye.c +++ b/drivers/media/pci/meye/meye.c @@ -37,7 +37,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/pci/zoran/videocodec.c b/drivers/media/pci/zoran/videocodec.c index 13a3c07cd259..3c3cbce0f9cc 100644 --- a/drivers/media/pci/zoran/videocodec.c +++ b/drivers/media/pci/zoran/videocodec.c @@ -40,7 +40,7 @@ #ifdef CONFIG_PROC_FS #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #endif #include "videocodec.h" diff --git a/drivers/media/pci/zoran/zoran_driver.c b/drivers/media/pci/zoran/zoran_driver.c index 2170e174c335..94b9b616df98 100644 --- a/drivers/media/pci/zoran/zoran_driver.c +++ b/drivers/media/pci/zoran/zoran_driver.c @@ -66,7 +66,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git
a/drivers/media/platform/arv.c b/drivers/media/platform/arv.c index 03c5098499c4..8fe59bf6cd3f 100644 --- a/drivers/media/platform/arv.c +++ b/drivers/media/platform/arv.c @@ -34,7 +34,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/usb/pvrusb2/pvrusb2-ioread.c b/drivers/media/usb/pvrusb2/pvrusb2-ioread.c index 70b8a052eb5b..3c7ca2c2c108 100644 --- a/drivers/media/usb/pvrusb2/pvrusb2-ioread.c +++ b/drivers/media/usb/pvrusb2/pvrusb2-ioread.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define BUFFER_COUNT 32 #define BUFFER_SIZE PAGE_ALIGN(0x4000) diff --git a/drivers/media/usb/pwc/pwc-ctrl.c b/drivers/media/usb/pwc/pwc-ctrl.c index 3a1618580ed6..655cef39eb3d 100644 --- a/drivers/media/usb/pwc/pwc-ctrl.c +++ b/drivers/media/usb/pwc/pwc-ctrl.c @@ -39,7 +39,7 @@ /* Control functions for the cam; brightness, contrast, video mode, etc. */ #ifdef __KERNEL__ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #endif #include diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index 57cfe26a393f..a5ea1f517291 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -54,7 +54,7 @@ #if defined(CONFIG_SPI) #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c index 8be561ab2615..fa2124cb31bd 100644 --- a/drivers/media/v4l2-core/v4l2-dev.c +++ b/drivers/media/v4l2-core/v4l2-dev.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index 02b5f69e1a42..7b3b41368931 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -58,7 +58,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/message/fusion/mptlan.h b/drivers/message/fusion/mptlan.h index 69e9d5463564..8946e19dbfc8 100644 --- a/drivers/message/fusion/mptlan.h +++ b/drivers/message/fusion/mptlan.h @@ -70,7 +70,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Override mptbase.h by pre-defining these!
*/ diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index 520f58439080..e05c3245930a 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -76,7 +76,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "ibmasm.h" #include "remote.h" diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index bab3f07b1117..cb1698f268f1 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -43,7 +43,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "queue.h" #include "block.h" diff --git a/drivers/mmc/host/android-goldfish.c b/drivers/mmc/host/android-goldfish.c index dca5518b0139..590a8a4522be 100644 --- a/drivers/mmc/host/android-goldfish.c +++ b/drivers/mmc/host/android-goldfish.c @@ -49,7 +49,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRIVER_NAME "goldfish_mmc" diff --git a/drivers/mtd/devices/pmc551.c b/drivers/mtd/devices/pmc551.c index 220f9200fa52..cadea0620cd0 100644 --- a/drivers/mtd/devices/pmc551.c +++ b/drivers/mtd/devices/pmc551.c @@ -82,7 +82,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/mtd/devices/slram.c b/drivers/mtd/devices/slram.c index a70eb83e68f1..8087c36dc693 100644 --- a/drivers/mtd/devices/slram.c +++ b/drivers/mtd/devices/slram.c @@ -30,7 +30,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index 9fb3b0dcdac2..664d206a4cbe 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -70,7 +70,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index b66b541877f0..8db740d6eb08 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index 1388c8d7f309..8d6bb189ea8e 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/mtd/maps/sun_uflash.c b/drivers/mtd/maps/sun_uflash.c index d459aca07881..414956eca0c9 100644 --- a/drivers/mtd/maps/sun_uflash.c +++ b/drivers/mtd/maps/sun_uflash.c @@ -16,7 +16,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 8d58acf33021..df8a5ef334c0 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mtdcore.h" diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 2a47a3f0e730..ce5ccc573a9c 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -37,7 +37,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "mtdcore.h" diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index 46f27de018c3..e21161353e76 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 31f89f1c6123..b8c293373ecc 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -33,7 +33,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ipddp.h" /* Our stuff */ diff --git a/drivers/net/eql.c b/drivers/net/eql.c index a10ad74cc8d2..fe13bfea30ac 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -127,7
+127,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int eql_open(struct net_device *dev); static int eql_close(struct net_device *dev); diff --git a/drivers/net/ethernet/3com/3c509.c b/drivers/net/ethernet/3com/3c509.c index a7533780dddc..c7f9f2c77da7 100644 --- a/drivers/net/ethernet/3com/3c509.c +++ b/drivers/net/ethernet/3com/3c509.c @@ -88,7 +88,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/3com/3c515.c b/drivers/net/ethernet/3com/3c515.c index be5b80103bec..e7b1fa56b290 100644 --- a/drivers/net/ethernet/3com/3c515.c +++ b/drivers/net/ethernet/3com/3c515.c @@ -72,7 +72,7 @@ static int max_interrupt_work = 20; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/3com/3c574_cs.c b/drivers/net/ethernet/3com/3c574_cs.c index 9359a37fedc0..47c844cc9d27 100644 --- a/drivers/net/ethernet/3com/3c574_cs.c +++ b/drivers/net/ethernet/3com/3c574_cs.c @@ -92,7 +92,7 @@ earlier 3Com products. #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /*====================================================================*/ diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index b3560a364e53..40196f41768a 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -92,7 +92,7 @@ static int vortex_debug = 1; #include #include /* For nr_irqs only. */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Kernel compatibility defines, some common to David Hinds' PCMCIA package. This is only in the support-all-kernels source code. */ diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index a0cacbe846ba..9fe3990319ec 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -119,7 +119,7 @@ static const int multicast_filter_limit = 32; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/8390/axnet_cs.c b/drivers/net/ethernet/8390/axnet_cs.c index 1d84a0544ace..3da1fc539ef9 100644 --- a/drivers/net/ethernet/8390/axnet_cs.c +++ b/drivers/net/ethernet/8390/axnet_cs.c @@ -46,7 +46,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define AXNET_CMD 0x00 #define AXNET_DATAPORT 0x10 /* NatSemi-defined port window offset. */ diff --git a/drivers/net/ethernet/8390/ne2k-pci.c b/drivers/net/ethernet/8390/ne2k-pci.c index 07355302443d..1bdea746926c 100644 --- a/drivers/net/ethernet/8390/ne2k-pci.c +++ b/drivers/net/ethernet/8390/ne2k-pci.c @@ -54,7 +54,7 @@ static int options[MAX_UNITS]; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "8390.h" diff --git a/drivers/net/ethernet/8390/pcnet_cs.c b/drivers/net/ethernet/8390/pcnet_cs.c index 63079a6e20d9..bd0a2a14b649 100644 --- a/drivers/net/ethernet/8390/pcnet_cs.c +++ b/drivers/net/ethernet/8390/pcnet_cs.c @@ -49,7 +49,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PCNET_CMD 0x00 #define PCNET_DATAPORT 0x10 /* NatSemi-defined port window offset. */ diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 3aaad33cdbc6..c12d2618eebf 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -45,7 +45,7 @@ #include #include #include /* Processor type for cache alignment.
*/ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c index 16f0c70266bc..a1a52eb53b14 100644 --- a/drivers/net/ethernet/alteon/acenic.c +++ b/drivers/net/ethernet/alteon/acenic.c @@ -80,7 +80,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRV_NAME "acenic" diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index 11cf1e3e0295..9595f1bc535b 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -87,7 +87,7 @@ Revision History: #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #if IS_ENABLED(CONFIG_VLAN_8021Q) #define AMD8111E_VLAN_TAG_USED 1 diff --git a/drivers/net/ethernet/amd/nmclan_cs.c b/drivers/net/ethernet/amd/nmclan_cs.c index 113a3b3cc50c..b556c926557a 100644 --- a/drivers/net/ethernet/amd/nmclan_cs.c +++ b/drivers/net/ethernet/amd/nmclan_cs.c @@ -151,7 +151,7 @@ Include Files #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* ---------------------------------------------------------------------------- diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index 1df3048a3cdb..48707ed76ffc 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 3a05f9098e75..d8aff7a4b3c7 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -44,7 +44,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cpl5_cmd.h" #include "regs.h" diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 7b2224ae72f2..d76491676b51 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -50,7 +50,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "common.h" #include "cxgb3_ioctl.h" diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 66c37fac59b2..6f951877430b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -63,7 +63,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "cxgb4.h" diff --git a/drivers/net/ethernet/dec/tulip/de2104x.c b/drivers/net/ethernet/dec/tulip/de2104x.c index 90c573b8ccaf..57c17e797ae3 100644 --- a/drivers/net/ethernet/dec/tulip/de2104x.c +++ b/drivers/net/ethernet/dec/tulip/de2104x.c @@ -49,7 +49,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These identify the driver base version and may not be removed.
*/ diff --git a/drivers/net/ethernet/dec/tulip/de4x5.c b/drivers/net/ethernet/dec/tulip/de4x5.c index 51fda3a6b13f..df4a871df633 100644 --- a/drivers/net/ethernet/dec/tulip/de4x5.c +++ b/drivers/net/ethernet/dec/tulip/de4x5.c @@ -472,7 +472,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_PPC_PMAC #include #endif /* CONFIG_PPC_PMAC */ diff --git a/drivers/net/ethernet/dec/tulip/dmfe.c b/drivers/net/ethernet/dec/tulip/dmfe.c index df4994919456..07e10a45beaa 100644 --- a/drivers/net/ethernet/dec/tulip/dmfe.c +++ b/drivers/net/ethernet/dec/tulip/dmfe.c @@ -90,7 +90,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_TULIP_DM910X diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index 5f1377449b8f..17e566a8b345 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_SPARC #include diff --git a/drivers/net/ethernet/dec/tulip/uli526x.c b/drivers/net/ethernet/dec/tulip/uli526x.c index e1c4133b8787..f82ebe5d89ee 100644 --- a/drivers/net/ethernet/dec/tulip/uli526x.c +++ b/drivers/net/ethernet/dec/tulip/uli526x.c @@ -40,7 +40,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define uw32(reg, val) iowrite32(val, ioaddr + (reg)) #define ur32(reg) ioread32(ioaddr + (reg)) diff --git a/drivers/net/ethernet/dec/tulip/winbond-840.c b/drivers/net/ethernet/dec/tulip/winbond-840.c index feda96d585e7..bc9bf88e5831 100644 --- a/drivers/net/ethernet/dec/tulip/winbond-840.c +++ b/drivers/net/ethernet/dec/tulip/winbond-840.c @@ -129,7 +129,7 @@ static int full_duplex[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Processor type for cache alignment. */ #include #include diff --git a/drivers/net/ethernet/dec/tulip/xircom_cb.c b/drivers/net/ethernet/dec/tulip/xircom_cb.c index 19e4ea15b504..a8de79355578 100644 --- a/drivers/net/ethernet/dec/tulip/xircom_cb.c +++ b/drivers/net/ethernet/dec/tulip/xircom_cb.c @@ -30,7 +30,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_NET_POLL_CONTROLLER #include diff --git a/drivers/net/ethernet/dlink/dl2k.h b/drivers/net/ethernet/dlink/dl2k.h index 8f4f61262d5c..5d8ae5320242 100644 --- a/drivers/net/ethernet/dlink/dl2k.h +++ b/drivers/net/ethernet/dlink/dl2k.h @@ -31,7 +31,7 @@ #include #include /* Processor type for cache alignment. */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c index eab36acfc0d1..2e5b66762e15 100644 --- a/drivers/net/ethernet/dlink/sundance.c +++ b/drivers/net/ethernet/dlink/sundance.c @@ -91,7 +91,7 @@ static char *media[MAX_UNITS]; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Processor type for cache alignment. */ #include #include diff --git a/drivers/net/ethernet/fealnx.c b/drivers/net/ethernet/fealnx.c index 6967b287b6e7..9cb436cb3745 100644 --- a/drivers/net/ethernet/fealnx.c +++ b/drivers/net/ethernet/fealnx.c @@ -88,7 +88,7 @@ static int full_duplex[MAX_UNITS] = { -1, -1, -1, -1, -1, -1, -1, -1 }; #include /* Processor type for cache alignment. */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These identify the driver base version and may not be removed.
*/ diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index d9f3a480ca1b..1f98838f32b7 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -44,7 +44,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fs_enet.h" diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c b/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c index 120c758f5d01..6e64989f8478 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-fcc.c @@ -42,7 +42,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "fs_enet.h" diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-fec.c b/drivers/net/ethernet/freescale/fs_enet/mac-fec.c index 777beffa1e1e..db9c0bcf54cd 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-fec.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-fec.c @@ -36,7 +36,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_8xx #include diff --git a/drivers/net/ethernet/freescale/fs_enet/mac-scc.c b/drivers/net/ethernet/freescale/fs_enet/mac-scc.c index 15abd37d70e3..96d44cf44fe0 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mac-scc.c +++ b/drivers/net/ethernet/freescale/fs_enet/mac-scc.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef CONFIG_8xx #include diff --git a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c index a89267b94352..1582d82483ec 100644 --- a/drivers/net/ethernet/freescale/fs_enet/mii-fec.c +++ b/drivers/net/ethernet/freescale/fs_enet/mii-fec.c @@ -35,7 +35,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "fs_enet.h" diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 756f7e763d5f..a6e7afa878be 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -93,7 +93,7 @@ #include #endif #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/freescale/gianfar.h b/drivers/net/ethernet/freescale/gianfar.h index 6e8a9c8467b9..5aa814799d70 100644 --- a/drivers/net/ethernet/freescale/gianfar.h +++ b/drivers/net/ethernet/freescale/gianfar.h @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index 56588f2e1d91..a93e0199c369 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index 53c5fcf1436c..9d660888510f 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -37,7 +37,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c index 8ba636f61b50..b642990b549c 100644 --- a/drivers/net/ethernet/freescale/ucc_geth_ethtool.c +++ b/drivers/net/ethernet/freescale/ucc_geth_ethtool.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "ucc_geth.h" diff --git a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c index 51c4abc51bf4..a69cd19a55ae 100644 ---
a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c +++ b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c @@ -54,7 +54,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /*====================================================================*/ diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 52a69c925965..5909615c27f7 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -47,7 +47,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c index d2b29b490ae0..e5d72559cca9 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_ethtool.c @@ -30,7 +30,7 @@ #include "ixgb.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define IXGB_ALL_RAR_ENTRIES 16 diff --git a/drivers/net/ethernet/natsemi/natsemi.c b/drivers/net/ethernet/natsemi/natsemi.c index 22b0821c1da0..90eac63f9606 100644 --- a/drivers/net/ethernet/natsemi/natsemi.c +++ b/drivers/net/ethernet/natsemi/natsemi.c @@ -51,7 +51,7 @@ #include /* Processor type for cache alignment. */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRV_NAME "natsemi" #define DRV_VERSION "2.1" diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 93c4bdc0cdca..f9d2eb9a920a 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -119,7 +119,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define DRV_NAME "ns83820" diff --git a/drivers/net/ethernet/packetengines/hamachi.c b/drivers/net/ethernet/packetengines/hamachi.c index 2d04679a923a..baff744b560e 100644 --- a/drivers/net/ethernet/packetengines/hamachi.c +++ b/drivers/net/ethernet/packetengines/hamachi.c @@ -160,7 +160,7 @@ static int tx_params[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Processor type for cache alignment. */ #include #include diff --git a/drivers/net/ethernet/packetengines/yellowfin.c b/drivers/net/ethernet/packetengines/yellowfin.c index 2a2ca5fa0c69..fa7770da6ef8 100644 --- a/drivers/net/ethernet/packetengines/yellowfin.c +++ b/drivers/net/ethernet/packetengines/yellowfin.c @@ -100,7 +100,7 @@ static int gx_fix; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* Processor type for cache alignment. */ #include #include diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index b7c89ebcf4a2..0b3cd58093d5 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -76,7 +76,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* These identify the driver base version and may not be removed. */ static char version[] = diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index 42051ab98cf0..d390b9663dc3 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -60,7 +60,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c index 39fca6c0b68d..19a458716f1a 100644 --- a/drivers/net/ethernet/sis/sis900.c +++ b/drivers/net/ethernet/sis/sis900.c @@ -74,7 +74,7 @@ #include /* Processor type for cache alignment.
*/ #include #include -#include <asm/uaccess.h> /* User space memory access functions */ +#include <linux/uaccess.h> /* User space memory access functions */ #include "sis900.h" diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c index fe9760ffab51..55a95e1d69d6 100644 --- a/drivers/net/ethernet/smsc/epic100.c +++ b/drivers/net/ethernet/smsc/epic100.c @@ -86,7 +86,7 @@ static int rx_copybreak; #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These identify the driver base version and may not be removed. */ diff --git a/drivers/net/ethernet/smsc/smc91c92_cs.c b/drivers/net/ethernet/smsc/smc91c92_cs.c index f1c75e291e55..67154621abcf 100644 --- a/drivers/net/ethernet/smsc/smc91c92_cs.c +++ b/drivers/net/ethernet/smsc/smc91c92_cs.c @@ -52,7 +52,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /*====================================================================*/ diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index e9e5ef241c6f..0e8e89f17dbb 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -99,7 +99,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define cas_page_map(x) kmap_atomic((x)) #define cas_page_unmap(x) kunmap_atomic((x)) diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index 66ecf0fcc330..d277e4107976 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -42,7 +42,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_SPARC diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index ca96408058b0..72ff05cd3ed8 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -49,7 +49,7 @@ #include #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index ba5c54249055..0a6c4e804eed 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -114,7 +114,7 @@ static const int multicast_filter_limit = 32; #include /* Processor type for cache alignment. */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* These identify the driver base version and may not be removed.
*/ diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c index 3b08ec766076..f71883264cc0 100644 --- a/drivers/net/ethernet/xircom/xirc2ps_cs.c +++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c @@ -88,7 +88,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifndef MANFID_COMPAQ #define MANFID_COMPAQ 0x0138 diff --git a/drivers/net/fddi/skfp/skfddi.c b/drivers/net/fddi/skfp/skfddi.c index 3a639180e4a0..2414f1dc8ddd 100644 --- a/drivers/net/fddi/skfp/skfddi.c +++ b/drivers/net/fddi/skfp/skfddi.c @@ -88,7 +88,7 @@ static const char * const boot_msg = #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "h/types.h" #undef ADDR // undo Linux definition diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 470b3dcd54e5..922bf440e9f1 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -13,7 +13,7 @@ */ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 78dbc44540f6..7d054697b199 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -55,7 +55,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_par.c b/drivers/net/hamradio/baycom_par.c index 072cddce9264..809dc25909d1 100644 --- a/drivers/net/hamradio/baycom_par.c +++ b/drivers/net/hamradio/baycom_par.c @@ -86,7 +86,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c index 7b916d5b14b9..ebc06822fd4d 100644 --- a/drivers/net/hamradio/baycom_ser_fdx.c +++ b/drivers/net/hamradio/baycom_ser_fdx.c @@ -82,7 +82,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/hamradio/baycom_ser_hdx.c b/drivers/net/hamradio/baycom_ser_hdx.c index f9a8976195ba..60fcf512c208 100644 --- a/drivers/net/hamradio/baycom_ser_hdx.c +++ b/drivers/net/hamradio/baycom_ser_hdx.c @@ -67,7 +67,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index 622ab3ab9e93..f62e7f325cf9 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -69,7 +69,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c index e4137c1b3df9..2479072981a1 100644 --- a/drivers/net/hamradio/dmascc.c +++ b/drivers/net/hamradio/dmascc.c @@ -40,7 +40,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "z8530.h" diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index 4bad0b894e9c..8c3633c1d078 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -58,7 +58,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index 1dfe2304daa7..ece59c54a653 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -17,7 +17,7 @@ */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c index b8083161ef46..6754cd01c605 100644 --- a/drivers/net/hamradio/scc.c +++ b/drivers/net/hamradio/scc.c @@ -178,7 +178,7 @@ #include #include -#include <asm/uaccess.h>
+#include <linux/uaccess.h> #include "z8530.h" diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index aaff07c10058..b6891ada1d7b 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -68,7 +68,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c index f5a9728b89f3..dd7fc6659ad4 100644 --- a/drivers/net/hippi/rrunner.c +++ b/drivers/net/hippi/rrunner.c @@ -46,7 +46,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define rr_if_busy(dev) netif_queue_stopped(dev) #define rr_if_running(dev) netif_running(dev) diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c index 7a3f990c1935..7a20a9a4663a 100644 --- a/drivers/net/irda/irtty-sir.c +++ b/drivers/net/irda/irtty-sir.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/kingsun-sir.c b/drivers/net/irda/kingsun-sir.c index fb5d162ec7d2..24c0f169a7b1 100644 --- a/drivers/net/irda/kingsun-sir.c +++ b/drivers/net/irda/kingsun-sir.c @@ -71,7 +71,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/ks959-sir.c b/drivers/net/irda/ks959-sir.c index 8e6e0edf2440..3affded3e30d 100644 --- a/drivers/net/irda/ks959-sir.c +++ b/drivers/net/irda/ks959-sir.c @@ -123,7 +123,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/ksdazzle-sir.c b/drivers/net/irda/ksdazzle-sir.c index 37f23a189b35..741452c7ce35 100644 --- a/drivers/net/irda/ksdazzle-sir.c +++ b/drivers/net/irda/ksdazzle-sir.c @@ -87,7 +87,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/mcs7780.c b/drivers/net/irda/mcs7780.c index bca6a1e72d1d..6f6ed75b63c9 100644 --- a/drivers/net/irda/mcs7780.c +++ b/drivers/net/irda/mcs7780.c @@ -55,7 +55,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c index a0849f49bbec..ffedad2a360a 100644 --- a/drivers/net/irda/vlsi_ir.c +++ b/drivers/net/irda/vlsi_ir.c @@ -45,7 +45,7 @@ MODULE_LICENSE("GPL"); #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 6255973e3dda..1e05b7c2d157 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/phy/davicom.c b/drivers/net/phy/davicom.c index 36e3e2033eca..e28913d9ea7e 100644 --- a/drivers/net/phy/davicom.c +++ b/drivers/net/phy/davicom.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define MII_DM9161_SCR 0x10 #define MII_DM9161_SCR_INIT 0x0610 diff --git a/drivers/net/phy/icplus.c b/drivers/net/phy/icplus.c index 22b51f01a94a..567280a72241 100644 --- a/drivers/net/phy/icplus.c +++ b/drivers/net/phy/icplus.c @@ -28,7 +28,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> MODULE_DESCRIPTION("ICPlus IP175C/IP101A/IP101G/IC1001 PHY drivers"); MODULE_AUTHOR("Michael Barkowski"); diff --git a/drivers/net/phy/lxt.c b/drivers/net/phy/lxt.c index b9fde1bcf0f0..8d198a1f0031 100644 --- a/drivers/net/phy/lxt.c +++ b/drivers/net/phy/lxt.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* The Level one LXT970 is used by many boards */ diff --git a/drivers/net/phy/qsemi.c b/drivers/net/phy/qsemi.c index d470db89e8dd..dbef8002bc28 100644 --- a/drivers/net/phy/qsemi.c +++ b/drivers/net/phy/qsemi.c @@ -32,7 +32,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /*
------------------------------------------------------------------------- */ /* The Quality Semiconductor QS6612 is used on the RPX CLLF */ diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c index 9c889e0303dd..feb9569e3345 100644 --- a/drivers/net/ppp/ppp_async.c +++ b/drivers/net/ppp/ppp_async.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define PPP_VERSION "2.4.2" diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index 925d3e295bac..9ae53986cb4a 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -47,7 +47,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PPP_VERSION "2.4.2" diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index f017c72bb7fd..d7e405268983 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -83,7 +83,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define PPPOE_HASH_BITS 4 #define PPPOE_HASH_SIZE (1 << PPPOE_HASH_BITS) diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c index b9c8be6283d3..c0599b3b23c0 100644 --- a/drivers/net/ppp/pppox.c +++ b/drivers/net/ppp/pppox.c @@ -34,7 +34,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const struct pppox_proto *pppox_protos[PX_MAX_PROTO + 1]; diff --git a/drivers/net/sb1000.c b/drivers/net/sb1000.c index 8b8b53259783..7820fced33f6 100644 --- a/drivers/net/sb1000.c +++ b/drivers/net/sb1000.c @@ -55,7 +55,7 @@ static char version[] = "sb1000.c:v1.1.2 6/01/98 (fventuri@mediaone.net)\n"; #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifdef SB1000_DEBUG static int sb1000_debug = SB1000_DEBUG; diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index 27ed25252aac..5782733959f0 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -75,7 +75,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index 7e933d8ff811..9841f3dc0682 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -64,7 +64,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 57e88b814700..cd8e02c94be0 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -73,7 +73,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Uncomment to enable debugging */ /* #define TUN_DEBUG 1 */ diff --git a/drivers/net/usb/catc.c b/drivers/net/usb/catc.c index a1f2f6f1e614..3daa41bdd4ea 100644 --- a/drivers/net/usb/catc.c +++ b/drivers/net/usb/catc.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #undef DEBUG diff --git a/drivers/net/usb/kaweth.c b/drivers/net/usb/kaweth.c index 338aed5da14d..876f02f4945e 100644 --- a/drivers/net/usb/kaweth.c +++ b/drivers/net/usb/kaweth.c @@ -54,7 +54,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #undef DEBUG diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c index 399f7ee57aea..24e803fe9a53 100644 --- a/drivers/net/usb/pegasus.c +++ b/drivers/net/usb/pegasus.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "pegasus.h" /* diff --git a/drivers/net/usb/rtl8150.c b/drivers/net/usb/rtl8150.c index 93a1bda1c1e5..95b7bd0d7abc 100644 --- a/drivers/net/usb/rtl8150.c +++ b/drivers/net/usb/rtl8150.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Version Information */ #define DRIVER_VERSION "v0.6.2 (2004/08/27)" diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index ae6ecf401189..65ee2a6f248c 100644 --- a/drivers/net/wan/dlci.c +++
b/drivers/net/wan/dlci.c @@ -52,7 +52,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const char version[] = "DLCI driver v0.35, 4 Jan 1997, mike.mclagan@linux.org"; diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c index 7351e5440ed7..799830ffcae2 100644 --- a/drivers/net/wan/dscc4.c +++ b/drivers/net/wan/dscc4.c @@ -95,7 +95,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c index 03696d35ee9c..33265eb50420 100644 --- a/drivers/net/wan/farsync.c +++ b/drivers/net/wan/farsync.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "farsync.h" diff --git a/drivers/net/wan/hd64570.c b/drivers/net/wan/hd64570.c index dc334c85d966..166696d2c496 100644 --- a/drivers/net/wan/hd64570.c +++ b/drivers/net/wan/hd64570.c @@ -39,7 +39,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hd64570.h" #define get_msci(port) (phy_node(port) ? MSCI1_OFFSET : MSCI0_OFFSET) diff --git a/drivers/net/wan/hd64572.c b/drivers/net/wan/hd64572.c index e92ecf1d3314..7ef49dab6855 100644 --- a/drivers/net/wan/hd64572.c +++ b/drivers/net/wan/hd64572.c @@ -39,7 +39,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hd64572.h" #define NAPI_WEIGHT 16 diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index 6676607164d6..9df9ed62beff 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/wan/lmc/lmc_main.c b/drivers/net/wan/lmc/lmc_main.c index 001b7796740d..4698450c77d1 100644 --- a/drivers/net/wan/lmc/lmc_main.c +++ b/drivers/net/wan/lmc/lmc_main.c @@ -59,7 +59,7 @@ #include /* Processor type for cache alignment. */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> //#include #define DRIVER_MAJOR_VERSION 1 diff --git a/drivers/net/wan/lmc/lmc_media.c b/drivers/net/wan/lmc/lmc_media.c index ff2e4a5654c7..cffe23bd16e1 100644 --- a/drivers/net/wan/lmc/lmc_media.c +++ b/drivers/net/wan/lmc/lmc_media.c @@ -19,7 +19,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "lmc.h" #include "lmc_var.h" diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c index 3f83be98d469..3ca3419c54a0 100644 --- a/drivers/net/wan/sbni.c +++ b/drivers/net/wan/sbni.c @@ -63,7 +63,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sbni.h" diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index 421ac5f85699..236c62538036 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -56,7 +56,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const char* version = "SDLA driver v0.30, 12 Sep 1996, mike.mclagan@linux.org"; diff --git a/drivers/net/wireless/atmel/atmel.c b/drivers/net/wireless/atmel/atmel.c index eb92d5ab7a27..e12f62356fd1 100644 --- a/drivers/net/wireless/atmel/atmel.c +++ b/drivers/net/wireless/atmel/atmel.c @@ -48,7 +48,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c index 64176090b196..356aba9d3d53 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c @@ -148,7 +148,7 @@ that only one external action is invoked at a time.
#include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_geo.c b/drivers/net/wireless/intel/ipw2x00/libipw_geo.c index 218f2a32de21..ce7eda20a68f 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_geo.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_geo.c @@ -38,7 +38,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "libipw.h" diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_module.c b/drivers/net/wireless/intel/ipw2x00/libipw_module.c index 2332075565f2..c58c5b2dcce5 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_module.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_module.c @@ -46,7 +46,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c index 1c1ec7bb9302..6df19f03355a 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c @@ -29,7 +29,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_tx.c b/drivers/net/wireless/intel/ipw2x00/libipw_tx.c index e8c039879b05..048f1e3ada11 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_tx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_tx.c @@ -39,7 +39,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "libipw.h" diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index a8a9bd8e176a..544ef7adde7d 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -32,7 +32,7 @@ #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/net/wireless/intersil/hostap/hostap_main.c b/drivers/net/wireless/intersil/hostap/hostap_main.c index 1a16b8cb366e..544fc09dcb62 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_main.c +++ b/drivers/net/wireless/intersil/hostap/hostap_main.c @@ -27,7 +27,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hostap_wlan.h" #include "hostap_80211.h" diff --git a/drivers/net/wireless/intersil/prism54/isl_38xx.c b/drivers/net/wireless/intersil/prism54/isl_38xx.c index 6700387ef9ab..ce9d4db0d9ca 100644 --- a/drivers/net/wireless/intersil/prism54/isl_38xx.c +++ b/drivers/net/wireless/intersil/prism54/isl_38xx.c @@ -21,7 +21,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "prismcompat.h" diff --git a/drivers/net/wireless/intersil/prism54/isl_ioctl.c b/drivers/net/wireless/intersil/prism54/isl_ioctl.c index 48e8a978a832..334717b0a2be 100644 --- a/drivers/net/wireless/intersil/prism54/isl_ioctl.c +++ b/drivers/net/wireless/intersil/prism54/isl_ioctl.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "prismcompat.h" #include "isl_ioctl.h" diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index 4fdc7223c894..b94479441b0c 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -53,7 +53,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* Warning : these stuff will slow down the driver...
*/ #define WIRELESS_SPY /* Enable spying addresses */ diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c index d9d29ab88184..acec0d9ec422 100644 --- a/drivers/net/wireless/wl3501_cs.c +++ b/drivers/net/wireless/wl3501_cs.c @@ -51,7 +51,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "wl3501.h" diff --git a/drivers/nubus/proc.c b/drivers/nubus/proc.c index 5371b374f1fe..e8f68f5732f1 100644 --- a/drivers/nubus/proc.c +++ b/drivers/nubus/proc.c @@ -25,7 +25,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include static int diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index c0cc4e7ff023..67935fbbbcab 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "oprof.h" #include "event_buffer.h" diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 134398e0231b..d77ebbfc67c9 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "oprof.h" diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 3ed6238f8f6e..553ef8a5d588 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -48,7 +48,7 @@ #include #include /* for L1_CACHE_BYTES */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/parisc/ccio-rm-dma.c b/drivers/parisc/ccio-rm-dma.c index f78f6f1aef47..1bf988010855 100644 --- a/drivers/parisc/ccio-rm-dma.c +++ b/drivers/parisc/ccio-rm-dma.c @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c index 783906fe659a..4dd9b1308128 100644 --- a/drivers/parisc/eisa_eeprom.c +++ b/drivers/parisc/eisa_eeprom.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define EISA_EEPROM_MINOR 241 diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c index 21905fef2cbf..d9bffe8d29b9 100644 --- a/drivers/parisc/eisa_enumerator.c +++ b/drivers/parisc/eisa_enumerator.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index b48243131993..ff1a332d76e4 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -49,7 +49,7 @@ #include /* HZ */ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* The control of the LEDs and LCDs on PARISC-machines have to be done completely in software.
The necessary calculations are done in a work queue diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index 3651c3871d5b..055f83fddc18 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -68,7 +68,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define PDCS_VERSION "0.30" diff --git a/drivers/parport/daisy.c b/drivers/parport/daisy.c index 5bed17f68ef4..d998d0ed2bec 100644 --- a/drivers/parport/daisy.c +++ b/drivers/parport/daisy.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #undef DEBUG diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c index 2e21af43d91e..c0e7d21c88c2 100644 --- a/drivers/parport/ieee1284_ops.c +++ b/drivers/parport/ieee1284_ops.c @@ -18,7 +18,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #undef DEBUG /* undef me for production */ diff --git a/drivers/parport/parport_gsc.c b/drivers/parport/parport_gsc.c index 6e3a60c78873..dd6d4ccb41e4 100644 --- a/drivers/parport/parport_gsc.c +++ b/drivers/parport/parport_gsc.c @@ -34,7 +34,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/parport/probe.c b/drivers/parport/probe.c index d763bc9e44c1..4d1d6eaf333d 100644 --- a/drivers/parport/probe.c +++ b/drivers/parport/probe.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const struct { const char *token; diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 74ed3e459a3e..8ee44a104ac4 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -23,7 +23,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) diff --git a/drivers/pci/hotplug/acpiphp_ibm.c b/drivers/pci/hotplug/acpiphp_ibm.c index f6221d739f59..68d105aaf4e2 100644 --- a/drivers/pci/hotplug/acpiphp_ibm.c +++ b/drivers/pci/hotplug/acpiphp_ibm.c @@ -35,7 +35,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "acpiphp.h" #include "../pci.h" diff --git a/drivers/pci/hotplug/cpqphp_core.c b/drivers/pci/hotplug/cpqphp_core.c index ec009a7dba20..33d300d12411 100644 --- a/drivers/pci/hotplug/cpqphp_core.c +++ b/drivers/pci/hotplug/cpqphp_core.c @@ -40,7 +40,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cpqphp.h" #include "cpqphp_nvram.h" diff --git a/drivers/pci/hotplug/cpqphp_nvram.c b/drivers/pci/hotplug/cpqphp_nvram.c index c25fc9061059..daae8071a156 100644 --- a/drivers/pci/hotplug/cpqphp_nvram.c +++ b/drivers/pci/hotplug/cpqphp_nvram.c @@ -34,7 +34,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cpqphp.h" #include "cpqphp_nvram.h" diff --git a/drivers/pci/hotplug/pci_hotplug_core.c b/drivers/pci/hotplug/pci_hotplug_core.c index 56013d0daf7f..7b0e97be9063 100644 --- a/drivers/pci/hotplug/pci_hotplug_core.c +++ b/drivers/pci/hotplug/pci_hotplug_core.c @@ -42,7 +42,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "../pci.h" #include "cpci_hotplug.h" diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 2408abe4ee8c..f82710a8694d 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -11,7 +11,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "pci.h" diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index b91c4da68365..9bf993e1f71e 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -10,7 +10,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "pci.h" SYSCALL_DEFINE5(pciconfig_read, unsigned long, bus, unsigned long, dfn, diff --git a/drivers/platform/x86/sony-laptop.c
b/drivers/platform/x86/sony-laptop.c index c890a49587e4..aa2ee51d3547 100644 --- a/drivers/platform/x86/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -68,7 +68,7 @@ #include #include #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #define dprintk(fmt, ...) \ diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index aa65a857a6b1..cacb43fb1df7 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -82,7 +82,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* ThinkPad CMOS commands */ diff --git a/drivers/pnp/interface.c b/drivers/pnp/interface.c index 4b6808ff0e5d..5c5b3d47b5f6 100644 --- a/drivers/pnp/interface.c +++ b/drivers/pnp/interface.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "base.h" diff --git a/drivers/pnp/pnpbios/proc.c b/drivers/pnp/pnpbios/proc.c index c212db0fc65d..5ee6b2a5f8d5 100644 --- a/drivers/pnp/pnpbios/proc.c +++ b/drivers/pnp/pnpbios/proc.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "pnpbios.h" diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 84ca314c87e3..dd46e96a3034 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include /* This is ugly... */ diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 67bf50c9946f..ade04216c970 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -26,7 +26,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index 6c5d671304b4..8713fefd794b 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/s390/block/dasd_erp.c b/drivers/s390/block/dasd_erp.c index 113c1c1fa1af..9e3419124264 100644 --- a/drivers/s390/block/dasd_erp.c +++ b/drivers/s390/block/dasd_erp.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* This is ugly... */ #define PRINTK_HEADER "dasd_erp:" diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index e2fa759bf2ad..8b1341fb2e0d 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -16,7 +16,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* This is ugly... */ #define PRINTK_HEADER "dasd_gendisk:" diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 9dfbd972f844..ec65c1e51c2a 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* This is ugly... */ #define PRINTK_HEADER "dasd_ioctl:" diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c index bad7a196bf84..70dc2c4cd3f7 100644 --- a/drivers/s390/block/dasd_proc.c +++ b/drivers/s390/block/dasd_proc.c @@ -20,7 +20,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* This is ugly...
*/ #define PRINTK_HEADER "dasd_proc:" diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 288f59a4147b..b9d7e755c8a3 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -41,7 +41,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define XPRAM_NAME "xpram" #define XPRAM_DEVS 1 /* one partition */ diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c index 1b8d825623bd..9ec4ae056158 100644 --- a/drivers/s390/char/con3215.c +++ b/drivers/s390/char/con3215.c @@ -25,7 +25,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c index 7b9c50aa4cc9..82c913318b73 100644 --- a/drivers/s390/char/keyboard.c +++ b/drivers/s390/char/keyboard.c @@ -14,7 +14,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "keyboard.h" diff --git a/drivers/s390/char/monreader.c b/drivers/s390/char/monreader.c index ebdeaa53182d..027ac6ae5eea 100644 --- a/drivers/s390/char/monreader.c +++ b/drivers/s390/char/monreader.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/s390/char/monwriter.c b/drivers/s390/char/monwriter.c index 9b5d1138b2e2..571a7e352755 100644 --- a/drivers/s390/char/monwriter.c +++ b/drivers/s390/char/monwriter.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/char/sclp_rw.c b/drivers/s390/char/sclp_rw.c index 6010cd347a08..91b26df5227d 100644 --- a/drivers/s390/char/sclp_rw.c +++ b/drivers/s390/char/sclp_rw.c @@ -13,7 +13,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sclp.h" #include "sclp_rw.h" diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c index 9259017a1295..236b736ae136 100644 --- a/drivers/s390/char/sclp_tty.c +++ b/drivers/s390/char/sclp_tty.c @@ -15,7 +15,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ctrlchar.h" #include "sclp.h" diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c index 68d6ee7ae504..095481d32236 100644 --- a/drivers/s390/char/sclp_vt220.c +++ b/drivers/s390/char/sclp_vt220.c @@ -26,7 +26,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "sclp.h" #include "ctrlchar.h" diff --git a/drivers/s390/char/tape_char.c b/drivers/s390/char/tape_char.c index 77f9b9c2f701..46ac1164f242 100644 --- a/drivers/s390/char/tape_char.c +++ b/drivers/s390/char/tape_char.c @@ -18,7 +18,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define TAPE_DBF_AREA tape_core_dbf diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c index 272cb6cd1b2a..e5ebe2fbee23 100644 --- a/drivers/s390/char/tty3270.c +++ b/drivers/s390/char/tty3270.c @@ -24,7 +24,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "raw3270.h" #include "tty3270.h" diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 2a67b496a9e2..65f5a794f26d 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "vmcp.h" static debug_info_t *vmcp_debug; diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c index 3167e8581994..57974a1e0e03 100644 --- a/drivers/s390/char/vmlogrdr.c +++ b/drivers/s390/char/vmlogrdr.c @@ -21,7 +21,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index ff18f373af9a..04aceb694d51 100644 --- a/drivers/s390/char/vmur.c
+++ b/drivers/s390/char/vmur.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index f771e5e9e26b..d3b51edb056e 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -23,7 +23,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c index 9082476b51db..bf7f5d4c50e1 100644 --- a/drivers/s390/cio/blacklist.c +++ b/drivers/s390/cio/blacklist.c @@ -17,7 +17,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 854a6e58dfea..51eece9af577 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -36,7 +36,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/s390/crypto/zcrypt_cex2a.c b/drivers/s390/crypto/zcrypt_cex2a.c index c7d48a18199e..b97c5d5ee5a4 100644 --- a/drivers/s390/crypto/zcrypt_cex2a.c +++ b/drivers/s390/crypto/zcrypt_cex2a.c @@ -30,7 +30,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "ap_bus.h" diff --git a/drivers/s390/crypto/zcrypt_pcixcc.c b/drivers/s390/crypto/zcrypt_pcixcc.c index 26ceaa696765..600604782b65 100644 --- a/drivers/s390/crypto/zcrypt_pcixcc.c +++ b/drivers/s390/crypto/zcrypt_pcixcc.c @@ -31,7 +31,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include "ap_bus.h" diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c index 2981024a2438..3f85b97ab8d2 100644 --- a/drivers/s390/net/netiucv.c +++ b/drivers/s390/net/netiucv.c @@ -62,7 +62,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c index 33fbe8249fd5..04efed171c88 100644 --- a/drivers/sbus/char/display7seg.c +++ b/drivers/sbus/char/display7seg.c @@ -17,7 +17,7 @@ #include #include #include -#include <asm/uaccess.h> /* put_/get_user */ +#include <linux/uaccess.h> /* put_/get_user */ #include #include diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c index 5609b602c54d..56e962a01493 100644 --- a/drivers/sbus/char/envctrl.c +++ b/drivers/sbus/char/envctrl.c @@ -29,7 +29,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include diff --git a/drivers/sbus/char/flash.c b/drivers/sbus/char/flash.c index 206ef4232adf..216f923161d1 100644 --- a/drivers/sbus/char/flash.c +++ b/drivers/sbus/char/flash.c @@ -15,7 +15,7 @@ #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index a40ee1e37486..6ff61dad5e21 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -37,7 +37,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/sbus/char/openprom.c b/drivers/sbus/char/openprom.c index 4612691c6619..2c2e6a3b4c7e 100644 --- a/drivers/sbus/char/openprom.c +++ b/drivers/sbus/char/openprom.c @@ -40,7 +40,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #ifdef CONFIG_PCI #include diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 316f87fe3299..00e7968a1d70 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -92,7 +92,7 @@ #include #include #include -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include #include #include diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index 970d8fa6bd53..b150e131b2e7 100644 --- a/drivers/scsi/3w-sas.c +++
b/drivers/scsi/3w-sas.c @@ -64,7 +64,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index aa412ab02765..33261b690774 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -210,7 +210,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index 6678d1fd897b..1ee7c654f7b8 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include /* For flush_kernel_dcache_page */ #include diff --git a/drivers/scsi/aacraid/commctrl.c b/drivers/scsi/aacraid/commctrl.c index 5648b715fed9..e1daff230c7d 100644 --- a/drivers/scsi/aacraid/commctrl.c +++ b/drivers/scsi/aacraid/commctrl.c @@ -41,7 +41,7 @@ #include /* ssleep prototype */ #include #include -#include +#include #include #include "aacraid.h" diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 9e45749d55ed..af032c46ec0e 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -61,7 +61,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c index 9d253cb83ee7..d9e15210b110 100644 --- a/drivers/scsi/bfa/bfad.c +++ b/drivers/scsi/bfa/bfad.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include "bfad_drv.h" diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c index 27c0dce22e72..5f75e638ec95 100644 --- a/drivers/scsi/dpt_i2o.c +++ b/drivers/scsi/dpt_i2o.c @@ -37,7 +37,7 @@ MODULE_DESCRIPTION("Adaptec I2O RAID Driver"); //////////////////////////////////////////////////////////////// #include /* For SCSI-Passthrough */ -#include +#include #include #include /* for kmalloc() */ diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c index 0a767740bf02..d020a13646ae 100644 --- a/drivers/scsi/gdth.c +++ b/drivers/scsi/gdth.c @@ -130,7 +130,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c index a83f705ed8a5..db17ad15b0c1 100644 --- a/drivers/scsi/hptiop.c +++ b/drivers/scsi/hptiop.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index 45b9566b928e..b782bb60baf0 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -51,7 +51,7 @@ #define _IPS_H_ #include - #include +#include #include /* diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 9d05302a3bcd..3c63c292cb92 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/megaraid/megaraid_mm.h b/drivers/scsi/megaraid/megaraid_mm.h index 55b425c0a654..a30e725f2d5c 100644 --- a/drivers/scsi/megaraid/megaraid_mm.h +++ b/drivers/scsi/megaraid/megaraid_mm.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 6484c382f670..d5cf15eb8c5e 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -42,7 +42,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 
a2960f5d98ec..e8196c55b633 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -52,7 +52,7 @@ static const char * osst_version = "0.99.4"; #include #include #include -#include +#include #include /* The driver prints some debugging information on the console if DEBUG diff --git a/drivers/scsi/qla2xxx/qla_sup.c b/drivers/scsi/qla2xxx/qla_sup.c index 9f6012b78e56..b4336e0cd85f 100644 --- a/drivers/scsi/qla2xxx/qla_sup.c +++ b/drivers/scsi/qla2xxx/qla_sup.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include /* * NVRAM support routines diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index c4f7b56fa6f6..8b8c814df5c7 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c index 7a74b82e8973..480a597b3877 100644 --- a/drivers/scsi/scsi_proc.c +++ b/drivers/scsi/scsi_proc.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 1622e23138e0..b1933041da39 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index bed2bbd6b923..94352e4df831 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/sr_ioctl.c b/drivers/scsi/sr_ioctl.c index 03054c0e7689..dfffdf63e44c 100644 --- a/drivers/scsi/sr_ioctl.c +++ b/drivers/scsi/sr_ioctl.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 605887d5ee57..5f35b863e1a7 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -41,7 +41,7 @@ static const char *verstr = "20160209"; #include #include -#include +#include #include #include diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index dfbb974927f2..dea16bb8c46a 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -127,7 +127,7 @@ static struct serial_state rs_table[1]; #define NR_PORTS ARRAY_SIZE(rs_table) -#include +#include #define serial_isroot() (capable(CAP_SYS_ADMIN)) diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c index ce864875330e..9b5c0fb216b5 100644 --- a/drivers/tty/hvc/hvc_console.c +++ b/drivers/tty/hvc/hvc_console.c @@ -42,7 +42,7 @@ #include #include -#include +#include #include "hvc_console.h" diff --git a/drivers/tty/hvc/hvcs.c b/drivers/tty/hvc/hvcs.c index 3c4d7c2b4ade..7823d6d998cf 100644 --- a/drivers/tty/hvc/hvcs.c +++ b/drivers/tty/hvc/hvcs.c @@ -81,7 +81,7 @@ #include #include #include -#include +#include #include /* diff --git a/drivers/tty/hvc/hvsi.c b/drivers/tty/hvc/hvsi.c index 96ce6bd1cc6f..2e578d6433af 100644 --- a/drivers/tty/hvc/hvsi.c +++ b/drivers/tty/hvc/hvsi.c @@ -46,7 +46,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c index 60d37b225589..4caf0c3b1f99 100644 --- a/drivers/tty/moxa.c +++ b/drivers/tty/moxa.c @@ -47,7 +47,7 @@ #include #include -#include +#include #include "moxa.h" diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index 69294ae154be..7b8f383fb090 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -43,7 +43,7 @@ #include #include -#include +#include #include "mxser.h" diff --git 
a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c index a7fa016f31eb..eb278832f5ce 100644 --- a/drivers/tty/n_hdlc.c +++ b/drivers/tty/n_hdlc.c @@ -103,7 +103,7 @@ #include #include -#include +#include /* * Buffers for individual HDLC frames diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c index 345111467b85..305b6490d405 100644 --- a/drivers/tty/n_r3964.c +++ b/drivers/tty/n_r3964.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include /*#define DEBUG_QUEUE*/ diff --git a/drivers/tty/serial/icom.c b/drivers/tty/serial/icom.c index c60a8d5e4020..d83783cfbade 100644 --- a/drivers/tty/serial/icom.c +++ b/drivers/tty/serial/icom.c @@ -53,7 +53,7 @@ #include #include -#include +#include #include "icom.h" diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index d0847375ea64..9939c3d9912b 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -36,7 +36,7 @@ #include #include -#include +#include /* * This is used to lock changes in serial line configuration. diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index 415885c56435..657eed82eeb3 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -107,7 +107,7 @@ #define PUT_USER(error,value,addr) error = put_user(value,addr) #define COPY_TO_USER(error,dest,src,size) error = copy_to_user(dest,src,size) ? -EFAULT : 0 -#include +#include #define RCLRVALUE 0xffff diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 8267bcf2405e..31885f20fc15 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -77,7 +77,7 @@ #include #include #include -#include +#include #if defined(CONFIG_HDLC) || (defined(CONFIG_HDLC_MODULE) && defined(CONFIG_SYNCLINK_GT_MODULE)) #define SYNCLINK_GENERIC_HDLC 1 diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index d66620f7eaa3..51e8846cd68f 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -79,7 +79,7 @@ #define PUT_USER(error,value,addr) error = put_user(value,addr) #define COPY_TO_USER(error,dest,src,size) error = copy_to_user(dest,src,size) ? 
-EFAULT : 0 -#include +#include static MGSL_PARAMS default_params = { MGSL_MODE_HDLC, /* unsigned long mode */ diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index bf36ac9aee41..f27fc0f14c11 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -22,7 +22,7 @@ #include #include -#include +#include #undef TTY_DEBUG_WAIT_UNTIL_SENT diff --git a/drivers/tty/vt/consolemap.c b/drivers/tty/vt/consolemap.c index 71e81406ef71..1f6e17fc3fb0 100644 --- a/drivers/tty/vt/consolemap.c +++ b/drivers/tty/vt/consolemap.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c index 368ce1803e8f..36e1b8c7680f 100644 --- a/drivers/tty/vt/selection.c +++ b/drivers/tty/vt/selection.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c index 14a2b5f11bca..56dcff6059d3 100644 --- a/drivers/tty/vt/vc_screen.c +++ b/drivers/tty/vt/vc_screen.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index f62c598810ff..a56edf2d58eb 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c index 4dec9df8764b..5a59da0dc98a 100644 --- a/drivers/usb/atm/usbatm.c +++ b/drivers/usb/atm/usbatm.c @@ -64,7 +64,7 @@ #include "usbatm.h" -#include +#include #include #include #include diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 143454ea385b..1fa5c0f29c64 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include "hub.h" diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 10b2576f8b6a..e8f4102d19df 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c index 5d3d914ab4fb..683098afa93e 100644 --- a/drivers/usb/host/uhci-hcd.c +++ b/drivers/usb/host/uhci-hcd.c @@ -42,7 +42,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/usb/misc/ftdi-elan.c b/drivers/usb/misc/ftdi-elan.c index 9a82f8308ad7..01a9373b7e18 100644 --- a/drivers/usb/misc/ftdi-elan.c +++ b/drivers/usb/misc/ftdi-elan.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/usb/misc/idmouse.c b/drivers/usb/misc/idmouse.c index 2975e80b7a56..debc1fd74b0d 100644 --- a/drivers/usb/misc/idmouse.c +++ b/drivers/usb/misc/idmouse.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include /* image constants */ diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c index 9ca595632f17..3bc5356832db 100644 --- a/drivers/usb/misc/ldusb.c +++ b/drivers/usb/misc/ldusb.c @@ -28,7 +28,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c index c8fbe7b739a0..b10e26c74a90 100644 --- a/drivers/usb/misc/legousbtower.c +++ b/drivers/usb/misc/legousbtower.c @@ -83,7 +83,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/usb/mon/mon_bin.c 
b/drivers/usb/mon/mon_bin.c index 1a874a1f3890..91c22276c03b 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -20,7 +20,7 @@ #include #include -#include +#include #include "usb_mon.h" diff --git a/drivers/usb/mon/mon_stat.c b/drivers/usb/mon/mon_stat.c index 5388a339cfb8..5bdf73a57498 100644 --- a/drivers/usb/mon/mon_stat.c +++ b/drivers/usb/mon/mon_stat.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "usb_mon.h" diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c index e59334b09c41..db1a4abf2806 100644 --- a/drivers/usb/mon/mon_text.c +++ b/drivers/usb/mon/mon_text.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "usb_mon.h" diff --git a/drivers/usb/musb/musb_debugfs.c b/drivers/usb/musb/musb_debugfs.c index 9b22d946c089..4fef50e5c8c1 100644 --- a/drivers/usb/musb/musb_debugfs.c +++ b/drivers/usb/musb/musb_debugfs.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include "musb_core.h" #include "musb_debug.h" diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 1e11614322fe..42d02a206059 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/video/fbdev/68328fb.c b/drivers/video/fbdev/68328fb.c index 17f21cedff9b..c0c6b88d3839 100644 --- a/drivers/video/fbdev/68328fb.c +++ b/drivers/video/fbdev/68328fb.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/video/fbdev/hitfb.c b/drivers/video/fbdev/hitfb.c index 9d68dc9ee7bf..abe3e54d4506 100644 --- a/drivers/video/fbdev/hitfb.c +++ b/drivers/video/fbdev/hitfb.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/video/fbdev/hpfb.c b/drivers/video/fbdev/hpfb.c index 9476d196f510..16f16f5e1a4b 100644 --- a/drivers/video/fbdev/hpfb.c +++ b/drivers/video/fbdev/hpfb.c @@ -16,7 +16,7 @@ #include #include -#include +#include static struct fb_info fb_info = { .fix = { diff --git a/drivers/video/fbdev/mx3fb.c b/drivers/video/fbdev/mx3fb.c index 8778e01cebac..1c3c7ab26a95 100644 --- a/drivers/video/fbdev/mx3fb.c +++ b/drivers/video/fbdev/mx3fb.c @@ -33,7 +33,7 @@ #include #include -#include +#include #define MX3FB_NAME "mx3_sdc_fb" diff --git a/drivers/video/fbdev/q40fb.c b/drivers/video/fbdev/q40fb.c index 7487f76f6275..04ea330ccf5d 100644 --- a/drivers/video/fbdev/q40fb.c +++ b/drivers/video/fbdev/q40fb.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include diff --git a/drivers/video/fbdev/sm501fb.c b/drivers/video/fbdev/sm501fb.c index d0a4e2f79a57..d80bc8a3200f 100644 --- a/drivers/video/fbdev/sm501fb.c +++ b/drivers/video/fbdev/sm501fb.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #ifdef CONFIG_PM diff --git a/drivers/video/fbdev/stifb.c b/drivers/video/fbdev/stifb.c index 7df4228e25f0..accfef71e984 100644 --- a/drivers/video/fbdev/stifb.c +++ b/drivers/video/fbdev/stifb.c @@ -67,7 +67,7 @@ #include #include /* for HP-UX compatibility */ -#include +#include #include "sticore.h" diff --git a/drivers/video/fbdev/w100fb.c b/drivers/video/fbdev/w100fb.c index 10951c82f6ed..d570e19a2864 100644 --- a/drivers/video/fbdev/w100fb.c +++ b/drivers/video/fbdev/w100fb.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include