Diffstat (limited to 'kernel'): 60 files changed, 1590 insertions(+), 879 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8bd9cfdc70d7..e0839bcd48c8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -125,38 +125,6 @@ struct cfent { }; /* - * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when - * cgroup_subsys->use_id != 0. - */ -#define CSS_ID_MAX (65535) -struct css_id { - /* - * The css to which this ID points. This pointer is set to valid value - * after cgroup is populated. If cgroup is removed, this will be NULL. - * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_tryget() - * should be used for avoiding race. - */ - struct cgroup_subsys_state __rcu *css; - /* - * ID of this css. - */ - unsigned short id; - /* - * Depth in hierarchy which this ID belongs to. - */ - unsigned short depth; - /* - * ID is freed by RCU. (and lookup routine is RCU safe.) - */ - struct rcu_head rcu_head; - /* - * Hierarchy of CSS ID belongs to. - */ - unsigned short stack[0]; /* Array of Length (depth+1) */ -}; - -/* * cgroup_event represents events which userspace want to receive. */ struct cgroup_event { @@ -387,9 +355,6 @@ struct cgrp_cset_link { static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; -static int cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *css); - /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to @@ -841,8 +806,6 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys_state *child_css); - static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -4240,21 +4203,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) goto err; } } - - /* This cgroup is ready now */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* - * Update id->css pointer and make this css visible from - * CSS ID functions. This pointer will be dereferened - * from RCU-read-side without locks. - */ - if (id) - rcu_assign_pointer(id->css, css); - } - return 0; err: cgroup_clear_dir(cgrp, subsys_mask); @@ -4323,7 +4271,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->cgroup = cgrp; css->ss = ss; css->flags = 0; - css->id = NULL; if (cgrp->parent) css->parent = cgroup_css(cgrp->parent, ss); @@ -4455,12 +4402,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; init_css(css, ss, cgrp); - - if (ss->use_id) { - err = alloc_css_id(css); - if (err) - goto err_free_all; - } } /* @@ -4925,12 +4866,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* our new subsystem will be attached to the dummy hierarchy. */ init_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_css() because it sets css->id. */ - if (ss->use_id) { - ret = cgroup_init_idr(ss, css); - if (ret) - goto err_unload; - } /* * Now we need to entangle the css into the existing css_sets. 
unlike @@ -4996,9 +4931,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) offline_css(cgroup_css(cgroup_dummy_top, ss)); - if (ss->use_id) - idr_destroy(&ss->idr); - /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; @@ -5025,8 +4957,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) /* * remove subsystem's css from the cgroup_dummy_top and free it - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. note that this - * also takes care of freeing the css_id. + * the cgrp->subsys pointer to find their state. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); @@ -5097,8 +5028,6 @@ int __init cgroup_init(void) for_each_builtin_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); - if (ss->use_id) - cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* allocate id for the dummy hierarchy */ @@ -5518,181 +5447,6 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); -/* - * Functons for CSS ID. - */ - -/* to get ID other than 0, this should be called when !cgroup_is_dead() */ -unsigned short css_id(struct cgroup_subsys_state *css) -{ - struct css_id *cssid; - - /* - * This css_id() can return correct value when somone has refcnt - * on this or this is under rcu_read_lock(). Once css->id is allocated, - * it's unchanged until freed. - */ - cssid = rcu_dereference_raw(css->id); - - if (cssid) - return cssid->id; - return 0; -} -EXPORT_SYMBOL_GPL(css_id); - -/** - * css_is_ancestor - test "root" css is an ancestor of "child" - * @child: the css to be tested. - * @root: the css supporsed to be an ancestor of the child. - * - * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, the caller must hold rcu_read_lock(). - * But, considering usual usage, the csses should be valid objects after test. - * Assuming that the caller will do some action to the child if this returns - * returns true, the caller must take "child";s reference count. - * If "child" is valid object and this returns true, "root" is valid, too. - */ - -bool css_is_ancestor(struct cgroup_subsys_state *child, - const struct cgroup_subsys_state *root) -{ - struct css_id *child_id; - struct css_id *root_id; - - child_id = rcu_dereference(child->id); - if (!child_id) - return false; - root_id = rcu_dereference(root->id); - if (!root_id) - return false; - if (child_id->depth < root_id->depth) - return false; - if (child_id->stack[root_id->depth] != root_id->id) - return false; - return true; -} - -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) -{ - struct css_id *id = rcu_dereference_protected(css->id, true); - - /* When this is called before css_id initialization, id can be NULL */ - if (!id) - return; - - BUG_ON(!ss->use_id); - - rcu_assign_pointer(id->css, NULL); - rcu_assign_pointer(css->id, NULL); - spin_lock(&ss->id_lock); - idr_remove(&ss->idr, id->id); - spin_unlock(&ss->id_lock); - kfree_rcu(id, rcu_head); -} -EXPORT_SYMBOL_GPL(free_css_id); - -/* - * This is called by init or create(). Then, calls to this function are - * always serialized (By cgroup_mutex() at create()). 
- */ - -static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) -{ - struct css_id *newid; - int ret, size; - - BUG_ON(!ss->use_id); - - size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); - newid = kzalloc(size, GFP_KERNEL); - if (!newid) - return ERR_PTR(-ENOMEM); - - idr_preload(GFP_KERNEL); - spin_lock(&ss->id_lock); - /* Don't use 0. allocates an ID of 1-65535 */ - ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); - spin_unlock(&ss->id_lock); - idr_preload_end(); - - /* Returns error when there are no free spaces for new ID.*/ - if (ret < 0) - goto err_out; - - newid->id = ret; - newid->depth = depth; - return newid; -err_out: - kfree(newid); - return ERR_PTR(ret); - -} - -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, - struct cgroup_subsys_state *rootcss) -{ - struct css_id *newid; - - spin_lock_init(&ss->id_lock); - idr_init(&ss->idr); - - newid = get_new_cssid(ss, 0); - if (IS_ERR(newid)) - return PTR_ERR(newid); - - newid->stack[0] = newid->id; - RCU_INIT_POINTER(newid->css, rootcss); - RCU_INIT_POINTER(rootcss->id, newid); - return 0; -} - -static int alloc_css_id(struct cgroup_subsys_state *child_css) -{ - struct cgroup_subsys_state *parent_css = css_parent(child_css); - struct css_id *child_id, *parent_id; - int i, depth; - - parent_id = rcu_dereference_protected(parent_css->id, true); - depth = parent_id->depth + 1; - - child_id = get_new_cssid(child_css->ss, depth); - if (IS_ERR(child_id)) - return PTR_ERR(child_id); - - for (i = 0; i < depth; i++) - child_id->stack[i] = parent_id->stack[i]; - child_id->stack[depth] = child_id->id; - /* - * child_id->css pointer will be set after this cgroup is available - * see cgroup_populate_dir() - */ - rcu_assign_pointer(child_css->id, child_id); - - return 0; -} - -/** - * css_lookup - lookup css by id - * @ss: cgroup subsys to be looked into. - * @id: the id - * - * Returns pointer to cgroup_subsys_state if there is valid one with id. - * NULL if not. 
Should be called under rcu_read_lock() - */ -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) -{ - struct css_id *cssid = NULL; - - BUG_ON(!ss->use_id); - cssid = idr_find(&ss->idr, id); - - if (unlikely(!cssid)) - return NULL; - - return rcu_dereference(cssid->css); -} -EXPORT_SYMBOL_GPL(css_lookup); - /** * css_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest diff --git a/kernel/cpu.c b/kernel/cpu.c index 63aa50d7ce1e..973d034acf84 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -437,11 +437,6 @@ int cpu_up(unsigned int cpu) { int err = 0; -#ifdef CONFIG_MEMORY_HOTPLUG - int nid; - pg_data_t *pgdat; -#endif - if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); @@ -452,27 +447,9 @@ int cpu_up(unsigned int cpu) return -EINVAL; } -#ifdef CONFIG_MEMORY_HOTPLUG - nid = cpu_to_node(cpu); - if (!node_online(nid)) { - err = mem_online_node(nid); - if (err) - return err; - } - - pgdat = NODE_DATA(nid); - if (!pgdat) { - printk(KERN_ERR - "Can't online cpu %d due to NULL pgdat\n", cpu); - return -ENOMEM; - } - - if (pgdat->node_zonelists->_zonerefs->zone == NULL) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - } -#endif + err = try_online_node(cpu_to_node(cpu)); + if (err) + return err; cpu_maps_update_begin(); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0506d447aed2..7d2f35e5df2f 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -575,8 +575,12 @@ return_normal: raw_spin_lock(&dbg_slave_lock); #ifdef CONFIG_SMP + /* If send_ready set, slaves are already waiting */ + if (ks->send_ready) + atomic_set(ks->send_ready, 1); + /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step) && kgdb_do_roundup) + else if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(flags); #endif @@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) if (arch_kgdb_ops.enable_nmi) arch_kgdb_ops.enable_nmi(0); + memset(ks, 0, sizeof(struct kgdb_state)); ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; ks->signo = signo; ks->err_code = ecode; - ks->kgdb_usethreadid = 0; ks->linux_regs = regs; if (kgdb_reenter_check(ks)) @@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } +int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) +{ +#ifdef CONFIG_SMP + if (!kgdb_io_ready(0) || !send_ready) + return 1; + + if (kgdb_info[cpu].enter_kgdb == 0) { + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + + memset(ks, 0, sizeof(struct kgdb_state)); + ks->cpu = cpu; + ks->ex_vector = trapnr; + ks->signo = SIGTRAP; + ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI; + ks->linux_regs = regs; + ks->send_ready = send_ready; + kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); + return 0; + } +#endif + return 1; +} + static void kgdb_console_write(struct console *co, const char *s, unsigned count) { diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 2235967e78b0..572aa4f5677c 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -26,6 +26,7 @@ struct kgdb_state { unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; + atomic_t *send_ready; }; /* Exception state values */ @@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); extern int 
kdb_common_init_state(struct kgdb_state *ks); extern int kdb_common_deinit_state(void); +#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { return DBG_PASS_EVENT; } +#define KGDB_KDB_REASON_SYSTEM_NMI 0 #endif /* CONFIG_KGDB_KDB */ #endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 328d18ef31e4..8859ca34dcfe 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks) if (atomic_read(&kgdb_setting_breakpoint)) reason = KDB_REASON_KEYBOARD; - if (in_nmi()) + if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP) + reason = KDB_REASON_SYSTEM_NMI; + + else if (in_nmi()) reason = KDB_REASON_NMI; for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 00eb8f7fbf41..0b097c8a1e50 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, instruction_pointer(regs)); kdb_dumpregs(regs); break; + case KDB_REASON_SYSTEM_NMI: + kdb_printf("due to System NonMaskable Interrupt\n"); + break; case KDB_REASON_NMI: kdb_printf("due to NonMaskable Interrupt @ " kdb_machreg_fmt "\n", diff --git a/kernel/delayacct.c b/kernel/delayacct.c index d473988c1d0b..54996b71e66d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -108,12 +108,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) struct timespec ts; cputime_t utime, stime, stimescaled, utimescaled; - /* Though tsk->delays accessed later, early exit avoids - * unnecessary returning of other data - */ - if (!tsk->delays) - goto done; - tmp = (s64)d->cpu_run_real_total; task_cputime(tsk, &utime, &stime); cputime_to_timespec(utime + stime, &ts); @@ -158,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->freepages_count += tsk->delays->freepages_count; spin_unlock_irqrestore(&tsk->delays->lock, flags); -done: return 0; } diff --git a/kernel/elfcore.c b/kernel/elfcore.c index ff915efef66d..e556751d15d9 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -1,23 +1,19 @@ #include <linux/elf.h> #include <linux/fs.h> #include <linux/mm.h> - -#include <asm/elf.h> - +#include <linux/binfmts.h> Elf_Half __weak elf_core_extra_phdrs(void) { return 0; } -int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, - unsigned long limit) +int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) { return 1; } -int __weak elf_core_write_extra_data(struct file *file, size_t *size, - unsigned long limit) +int __weak elf_core_write_extra_data(struct coredump_params *cprm) { return 1; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 953c14348375..d724e7757cd1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; -static atomic_t perf_sample_allowed_ns __read_mostly = - ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); +static int perf_sample_allowed_ns __read_mostly = + DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 
100; void update_perf_cpu_limits(void) { @@ -184,7 +184,7 @@ void update_perf_cpu_limits(void) tmp *= sysctl_perf_cpu_time_max_percent; do_div(tmp, 100); - atomic_set(&perf_sample_allowed_ns, tmp); + ACCESS_ONCE(perf_sample_allowed_ns) = tmp; } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(table, write, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; @@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 -DEFINE_PER_CPU(u64, running_sample_length); +static DEFINE_PER_CPU(u64, running_sample_length); void perf_sample_event_took(u64 sample_len_ns) { u64 avg_local_sample_len; u64 local_samples_len; + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - if (atomic_read(&perf_sample_allowed_ns) == 0) + if (allowed_ns == 0) return; /* decay the counter by 1 average sample */ @@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns) */ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) + if (avg_local_sample_len <= allowed_ns) return; if (max_samples_per_tick <= 1) @@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns) perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %d), lowering " + "perf samples too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, - atomic_read(&perf_sample_allowed_ns), + avg_local_sample_len, allowed_ns, sysctl_perf_event_sample_rate); update_perf_cpu_limits(); @@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); ctx->parent_ctx = NULL; } + ctx->generation++; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; + + ctx->generation++; } /* @@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); + if (sample_type & PERF_SAMPLE_TRANSACTION) + size += sizeof(data->txn); + event->header_size = size; } @@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) */ if (event->state > PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_OFF; + + ctx->generation++; } static void perf_group_detach(struct perf_event *event) @@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx, } /* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled events. - * If the number of enabled events is the same, then the set - * of enabled events should be the same, because these are both - * inherited contexts, therefore we can't access individual events - * in them directly with an fd; we can only enable/disable all - * events via prctl, or enable/disable all events in a family - * via ioctl, which will have the same effect on both contexts. 
+ * Test whether two contexts are equivalent, i.e. whether they have both been + * cloned from the same version of the same context. + * + * Equivalence is measured using a generation number in the context that is + * incremented on each modification to it; see unclone_ctx(), list_add_event() + * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; + /* Pinning disables the swap optimization */ + if (ctx1->pin_count || ctx2->pin_count) + return 0; + + /* If ctx1 is the parent of ctx2 */ + if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) + return 1; + + /* If ctx2 is the parent of ctx1 */ + if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) + return 1; + + /* + * If ctx1 and ctx2 have the same parent; we flatten the parent + * hierarchy, see perf_event_init_context(). + */ + if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && + ctx1->parent_gen == ctx2->parent_gen) + return 1; + + /* Unmatched */ + return 0; } static void __perf_event_sync_stat(struct perf_event *event, @@ -2210,9 +2234,6 @@ static void __perf_event_sync_stat(struct perf_event *event, perf_event_update_userpage(next_event); } -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { @@ -2244,7 +2265,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; - struct perf_event_context *parent; + struct perf_event_context *parent, *next_parent; struct perf_cpu_context *cpuctx; int do_switch = 1; @@ -2256,10 +2277,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, return; rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_event_ctxp[ctxn]; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { + if (!next_ctx) + goto unlock; + + parent = rcu_dereference(ctx->parent_ctx); + next_parent = rcu_dereference(next_ctx->parent_ctx); + + /* If neither context have a parent context; they cannot be clones. */ + if (!parent && !next_parent) + goto unlock; + + if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. 
We lock both @@ -2287,6 +2316,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } +unlock: rcu_read_unlock(); if (do_switch) { @@ -4572,6 +4602,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); + if (sample_type & PERF_SAMPLE_TRANSACTION) + perf_output_put(handle, data->txn); + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -5100,27 +5133,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) unsigned int size; char tmp[16]; char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); + char *name; if (file) { struct inode *inode; dev_t dev; + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + name = "//enomem"; + goto cpy_name; + } /* - * d_path works from the end of the rb backwards, so we + * d_path() works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); + name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64)); if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; + name = "//toolong"; + goto cpy_name; } inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; @@ -5128,34 +5160,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) gen = inode->i_generation; maj = MAJOR(dev); min = MINOR(dev); - + goto got_name; } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp) - 1); - tmp[sizeof(tmp) - 1] = '\0'; - goto got_name; - } + name = (char *)arch_vma_name(vma); + if (name) + goto cpy_name; - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_brk && + if (vma->vm_start <= vma->vm_mm->start_brk && vma->vm_end >= vma->vm_mm->brk) { - name = strncpy(tmp, "[heap]", sizeof(tmp)); - goto got_name; - } else if (vma->vm_start <= vma->vm_mm->start_stack && + name = "[heap]"; + goto cpy_name; + } + if (vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack) { - name = strncpy(tmp, "[stack]", sizeof(tmp)); - goto got_name; + name = "[stack]"; + goto cpy_name; } - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; + name = "//anon"; + goto cpy_name; } +cpy_name: + strlcpy(tmp, name, sizeof(tmp)); + name = tmp; got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); + /* + * Since our buffer works in 8 byte units we need to align our string + * size to a multiple of 8. However, we must guarantee the tail end is + * zero'd out to avoid leaking random bits to userspace. 
+ */ + size = strlen(name)+1; + while (!IS_ALIGNED(size, sizeof(u64))) + name[size++] = '\0'; mmap_event->file_name = name; mmap_event->file_size = size; @@ -6292,6 +6329,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); } +static DEVICE_ATTR_RO(type); static ssize_t perf_event_mux_interval_ms_show(struct device *dev, @@ -6336,17 +6374,19 @@ perf_event_mux_interval_ms_store(struct device *dev, return count; } +static DEVICE_ATTR_RW(perf_event_mux_interval_ms); -static struct device_attribute pmu_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_RW(perf_event_mux_interval_ms), - __ATTR_NULL, +static struct attribute *pmu_dev_attrs[] = { + &dev_attr_type.attr, + &dev_attr_perf_event_mux_interval_ms.attr, + NULL, }; +ATTRIBUTE_GROUPS(pmu_dev); static int pmu_bus_running; static struct bus_type pmu_bus = { .name = "event_source", - .dev_attrs = pmu_dev_attrs, + .dev_groups = pmu_dev_groups, }; static void pmu_dev_release(struct device *dev) @@ -7126,7 +7166,6 @@ SYSCALL_DEFINE5(perf_event_open, } perf_install_in_context(ctx, event, event->cpu); - ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); @@ -7209,7 +7248,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); - ++ctx->generation; perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ca6599723be5..569b218782ad 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) } #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned int \ +static inline unsigned long \ func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned int len) \ + const void *buf, unsigned long len) \ { \ unsigned long size, written; \ \ do { \ - size = min_t(unsigned long, handle->size, len); \ - \ + size = min(handle->size, len); \ written = memcpy_func(handle->addr, buf, size); \ + written = size - written; \ \ len -= written; \ handle->addr += written; \ @@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \ return len; \ } -static inline int memcpy_common(void *dst, const void *src, size_t n) +static inline unsigned long +memcpy_common(void *dst, const void *src, unsigned long n) { memcpy(dst, src, n); - return n; + return 0; } DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) -#define MEMCPY_SKIP(dst, src, n) (n) +static inline unsigned long +memcpy_skip(void *dst, const void *src, unsigned long n) +{ + return 0; +} -DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) +DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip) #ifndef arch_perf_out_copy_user -#define arch_perf_out_copy_user __copy_from_user_inatomic +#define arch_perf_out_copy_user arch_perf_out_copy_user + +static inline unsigned long +arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) +{ + unsigned long ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, n); + pagefault_enable(); + + return ret; +} #endif DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 9c2ddfbf4525..e8b168af135b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -12,40 +12,10 @@ #include <linux/perf_event.h> #include <linux/vmalloc.h> #include <linux/slab.h> 
+#include <linux/circ_buf.h> #include "internal.h" -static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long sz = perf_data_size(rb); - unsigned long mask = sz - 1; - - /* - * check if user-writable - * overwrite : over-write its own tail - * !overwrite: buffer possibly drops events. - */ - if (rb->overwrite) - return true; - - /* - * verify that payload is not bigger than buffer - * otherwise masking logic may fail to detect - * the "not enough space" condition - */ - if ((head - offset) > sz) - return false; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->rb->poll, POLL_IN); @@ -115,8 +85,8 @@ again: rb->user_page->data_head = head; /* - * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read rb->head. + * Now check if we missed an update -- rely on previous implied + * compiler barriers to force a re-read. */ if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); @@ -135,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle, { struct ring_buffer *rb; unsigned long tail, offset, head; - int have_lost; - struct perf_sample_data sample_data; + int have_lost, page_shift; struct { struct perf_event_header header; u64 id; @@ -151,57 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle, event = event->parent; rb = rcu_dereference(event->rb); - if (!rb) + if (unlikely(!rb)) goto out; - handle->rb = rb; - handle->event = event; - - if (!rb->nr_pages) + if (unlikely(!rb->nr_pages)) goto out; + handle->rb = rb; + handle->event = event; + have_lost = local_read(&rb->lost); - if (have_lost) { - lost_event.header.size = sizeof(lost_event); - perf_event_header__init_id(&lost_event.header, &sample_data, - event); - size += lost_event.header.size; + if (unlikely(have_lost)) { + size += sizeof(lost_event); + if (event->attr.sample_id_all) + size += event->id_header_size; } perf_output_get_handle(handle); do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. - * - * See perf_output_put_handle(). - */ tail = ACCESS_ONCE(rb->user_page->data_tail); - smp_mb(); offset = head = local_read(&rb->head); - head += size; - if (unlikely(!perf_output_space(rb, tail, offset, head))) + if (!rb->overwrite && + unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) goto fail; + head += size; } while (local_cmpxchg(&rb->head, offset, head) != offset); - if (head - local_read(&rb->wakeup) > rb->watermark) + /* + * Separate the userpage->tail read from the data stores below. + * Matches the MB userspace SHOULD issue after reading the data + * and before storing the new tail position. + * + * See perf_output_put_handle(). 
+ */ + smp_mb(); + + if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); - handle->page = offset >> (PAGE_SHIFT + page_order(rb)); - handle->page &= rb->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); - handle->addr = rb->data_pages[handle->page]; - handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; + page_shift = PAGE_SHIFT + page_order(rb); - if (have_lost) { + handle->page = (offset >> page_shift) & (rb->nr_pages - 1); + offset &= (1UL << page_shift) - 1; + handle->addr = rb->data_pages[handle->page] + offset; + handle->size = (1UL << page_shift) - offset; + + if (unlikely(have_lost)) { + struct perf_sample_data sample_data; + + lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); + perf_event_header__init_id(&lost_event.header, + &sample_data, event); perf_output_put(handle, lost_event); perf_event__output_id_sample(event, handle, &sample_data); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ad8e1bdca70e..24b7d6ca871b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -35,6 +35,7 @@ #include <linux/kdebug.h> /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ #include <linux/percpu-rwsem.h> +#include <linux/task_work.h> #include <linux/uprobes.h> @@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and - * write_opcode accordingly. This would never be a problem for archs that - * have fixed length instructions. + * uprobe_write_opcode accordingly. This would never be a problem for archs + * that have fixed length instructions. */ /* - * write_opcode - write the opcode at a given virtual address. + * uprobe_write_opcode - write the opcode at a given virtual address. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. @@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * For mm @mm, write the opcode at @vaddr. * Return 0 (success) or a negative errno. 
*/ -static int write_opcode(struct mm_struct *mm, unsigned long vaddr, +int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; @@ -314,7 +315,7 @@ put_old: */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); + return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN); } /** @@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) return ret; } -static int -__copy_insn(struct address_space *mapping, struct file *filp, char *insn, - unsigned long nbytes, loff_t offset) +static int __copy_insn(struct address_space *mapping, struct file *filp, + void *insn, int nbytes, loff_t offset) { struct page *page; @@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, static int copy_insn(struct uprobe *uprobe, struct file *filp) { - struct address_space *mapping; - unsigned long nbytes; - int bytes; - - nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); - mapping = uprobe->inode->i_mapping; + struct address_space *mapping = uprobe->inode->i_mapping; + loff_t offs = uprobe->offset; + void *insn = uprobe->arch.insn; + int size = MAX_UINSN_BYTES; + int len, err = -EIO; - /* Instruction at end of binary; copy only available bytes */ - if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) - bytes = uprobe->inode->i_size - uprobe->offset; - else - bytes = MAX_UINSN_BYTES; + /* Copy only available bytes, -EIO if nothing was read */ + do { + if (offs >= i_size_read(uprobe->inode)) + break; - /* Instruction at the page-boundary; copy bytes in second page */ - if (nbytes < bytes) { - int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, - bytes - nbytes, uprobe->offset + nbytes); + len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); + err = __copy_insn(mapping, filp, insn, len, offs); if (err) - return err; - bytes = nbytes; - } - return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); + break; + + insn += len; + offs += len; + size -= len; + } while (size); + + return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, @@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, if (ret) goto out; - /* write_opcode() assumes we don't cross page boundary */ + /* uprobe_write_opcode() assumes we don't cross page boundary */ BUG_ON((uprobe->offset & ~PAGE_MASK) + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); @@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon } /* Slot allocation for XOL */ -static int xol_add_vma(struct xol_area *area) +static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - struct mm_struct *mm = current->mm; int ret = -EALREADY; down_write(&mm->mmap_sem); if (mm->uprobes_state.xol_area) goto fail; - ret = -ENOMEM; - /* Try to map as high as possible, this is only a hint. 
*/ - area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { - ret = area->vaddr; - goto fail; + if (!area->vaddr) { + /* Try to map as high as possible, this is only a hint. */ + area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, + PAGE_SIZE, 0, 0); + if (area->vaddr & ~PAGE_MASK) { + ret = area->vaddr; + goto fail; + } } ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, @@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area) smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; - ret = 0; fail: up_write(&mm->mmap_sem); return ret; } -/* - * get_xol_area - Allocate process's xol_area if necessary. - * This area will be used for storing instructions for execution out of line. - * - * Returns the allocated area or NULL. - */ -static struct xol_area *get_xol_area(void) +static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; - struct xol_area *area; uprobe_opcode_t insn = UPROBE_SWBP_INSN; + struct xol_area *area; - area = mm->uprobes_state.xol_area; - if (area) - goto ret; - - area = kzalloc(sizeof(*area), GFP_KERNEL); + area = kmalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void) if (!area->page) goto free_bitmap; - /* allocate first slot of task's xol_area for the return probes */ + area->vaddr = vaddr; + init_waitqueue_head(&area->wq); + /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); atomic_set(&area->slot_count, 1); - init_waitqueue_head(&area->wq); + copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); - if (!xol_add_vma(area)) + if (!xol_add_vma(mm, area)) return area; __free_page(area->page); @@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void) free_area: kfree(area); out: + return NULL; +} + +/* + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. + * + * Returns the allocated area or NULL. + */ +static struct xol_area *get_xol_area(void) +{ + struct mm_struct *mm = current->mm; + struct xol_area *area; + + if (!mm->uprobes_state.xol_area) + __create_xol_area(0); + area = mm->uprobes_state.xol_area; - ret: - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ return area; } @@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) return 0; /* Initialize the slot */ - copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); + copy_to_page(area->page, xol_vaddr, + uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. @@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t) } /* - * Called in context of a new clone/fork from copy_process. - */ -void uprobe_copy_process(struct task_struct *t) -{ - t->utask = NULL; -} - -/* * Allocate a uprobe_task object for the task if if necessary. * Called when the thread hits a breakpoint. 
* @@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void) return current->utask; } +static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) +{ + struct uprobe_task *n_utask; + struct return_instance **p, *o, *n; + + n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + if (!n_utask) + return -ENOMEM; + t->utask = n_utask; + + p = &n_utask->return_instances; + for (o = o_utask->return_instances; o; o = o->next) { + n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!n) + return -ENOMEM; + + *n = *o; + atomic_inc(&n->uprobe->ref); + n->next = NULL; + + *p = n; + p = &n->next; + n_utask->depth++; + } + + return 0; +} + +static void uprobe_warn(struct task_struct *t, const char *msg) +{ + pr_warn("uprobe: %s:%d failed to %s\n", + current->comm, current->pid, msg); +} + +static void dup_xol_work(struct callback_head *work) +{ + kfree(work); + + if (current->flags & PF_EXITING) + return; + + if (!__create_xol_area(current->utask->vaddr)) + uprobe_warn(current, "dup xol area"); +} + +/* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t, unsigned long flags) +{ + struct uprobe_task *utask = current->utask; + struct mm_struct *mm = current->mm; + struct callback_head *work; + struct xol_area *area; + + t->utask = NULL; + + if (!utask || !utask->return_instances) + return; + + if (mm == t->mm && !(flags & CLONE_VFORK)) + return; + + if (dup_utask(t, utask)) + return uprobe_warn(t, "dup ret instances"); + + /* The task can fork() after dup_xol_work() fails */ + area = mm->uprobes_state.xol_area; + if (!area) + return uprobe_warn(t, "dup xol area"); + + if (mm == t->mm) + return; + + /* TODO: move it into the union in uprobe_task */ + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return uprobe_warn(t, "dup xol area"); + + t->utask->vaddr = area->vaddr; + init_task_work(work, dup_xol_work); + task_work_add(t, work, true); +} + /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. @@ -1857,9 +1941,4 @@ static int __init init_uprobes(void) return register_die_notifier(&uprobe_exception_nb); } -module_init(init_uprobes); - -static void __exit exit_uprobes(void) -{ -} -module_exit(exit_uprobes); +__initcall(init_uprobes); diff --git a/kernel/fork.c b/kernel/fork.c index c93be06dee87..f6d11fc67f72 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1370,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif - uprobe_copy_process(p); /* * sigaltstack should be cleared when sharing the same VM */ @@ -1487,6 +1486,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, perf_event_fork(p); trace_task_newtask(p, clone_flags); + uprobe_copy_process(p, clone_flags); return p; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d4da55d1fb65..d04ce8ac4399 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -46,4 +46,34 @@ config GCOV_PROFILE_ALL larger and run slower. Also be sure to exclude files from profiling which are not linked to the kernel image to prevent linker errors. +choice + prompt "Specify GCOV format" + depends on GCOV_KERNEL + default GCOV_FORMAT_AUTODETECT + ---help--- + The gcov format is usually determined by the GCC version, but there are + exceptions where format changes are integrated in lower-version GCCs. + In such a case use this option to adjust the format used in the kernel + accordingly. 
+ + If unsure, choose "Autodetect". + +config GCOV_FORMAT_AUTODETECT + bool "Autodetect" + ---help--- + Select this option to use the format that corresponds to your GCC + version. + +config GCOV_FORMAT_3_4 + bool "GCC 3.4 format" + ---help--- + Select this option to use the format defined by GCC 3.4. + +config GCOV_FORMAT_4_7 + bool "GCC 4.7 format" + ---help--- + Select this option to use the format defined by GCC 4.7. + +endchoice + endmenu diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index e97ca59e2520..52aa7e8de927 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,33 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' -obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o +# if-lt +# Usage VAR := $(call if-lt, $(a), $(b)) +# Returns 1 if (a < b) +if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) + +ifeq ($(CONFIG_GCOV_FORMAT_3_4),y) + cc-ver := 0304 +else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y) + cc-ver := 0407 +else +# Use cc-version if available, otherwise set 0 +# +# scripts/Kbuild.include, which contains cc-version function, is not included +# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov" +# Meaning cc-ver is empty causing if-lt test to fail with +# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage. +# This has no affect on the clean phase, but the error message could be +# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version +# is not available. We can probably move if-lt to Kbuild.include, so it's also +# not defined during clean or to include Kbuild.include in +# scripts/Makefile.clean. But the following workaround seems least invasive. + cc-ver := $(if $(call cc-version),$(call cc-version),0) +endif + +obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o + +ifeq ($(call if-lt, $(cc-ver), 0407),1) + obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o +else + obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o +endif diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9b22d03cc581..f45b75b713c0 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -20,7 +20,6 @@ #include <linux/mutex.h> #include "gcov.h" -static struct gcov_info *gcov_info_head; static int gcov_events_enabled; static DEFINE_MUTEX(gcov_lock); @@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info) mutex_lock(&gcov_lock); if (gcov_version == 0) { - gcov_version = info->version; + gcov_version = gcov_info_version(info); /* * Printing gcc's version magic may prove useful for debugging * incompatibility reports. @@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info) * Add new profiling data structure to list and inform event * listener. */ - info->next = gcov_info_head; - gcov_info_head = info; + gcov_info_link(info); if (gcov_events_enabled) gcov_event(GCOV_ADD, info); mutex_unlock(&gcov_lock); @@ -81,6 +79,12 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_delta); +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + /** * gcov_enable_events - enable event reporting through gcov_event() * @@ -91,13 +95,15 @@ EXPORT_SYMBOL(__gcov_merge_delta); */ void gcov_enable_events(void) { - struct gcov_info *info; + struct gcov_info *info = NULL; mutex_lock(&gcov_lock); gcov_events_enabled = 1; + /* Perform event callback for previously registered entries. 
*/ - for (info = gcov_info_head; info; info = info->next) + while ((info = gcov_info_next(info))) gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); } @@ -112,25 +118,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, void *data) { struct module *mod = data; - struct gcov_info *info; - struct gcov_info *prev; + struct gcov_info *info = NULL; + struct gcov_info *prev = NULL; if (event != MODULE_STATE_GOING) return NOTIFY_OK; mutex_lock(&gcov_lock); - prev = NULL; + /* Remove entries located in module from linked list. */ - for (info = gcov_info_head; info; info = info->next) { + while ((info = gcov_info_next(info))) { if (within(info, mod->module_core, mod->core_size)) { - if (prev) - prev->next = info->next; - else - gcov_info_head = info->next; + gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); } else prev = info; } + mutex_unlock(&gcov_lock); return NOTIFY_OK; diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 7a7d2ee96d42..15ff01a76379 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -75,7 +75,7 @@ static int __init gcov_persist_setup(char *str) unsigned long val; if (kstrtoul(str, 0, &val)) { - pr_warning("invalid gcov_persist parameter '%s'\n", str); + pr_warn("invalid gcov_persist parameter '%s'\n", str); return 0; } gcov_persist = val; @@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name) list_for_each_entry(node, &all_head, all) { info = get_node_info(node); - if (info && (strcmp(info->filename, name) == 0)) + if (info && (strcmp(gcov_info_filename(info), name) == 0)) return node; } @@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, seq = file->private_data; info = gcov_iter_get_info(seq->private); mutex_lock(&node_lock); - node = get_node_by_name(info->filename); + node = get_node_by_name(gcov_info_filename(info)); if (node) { /* Reset counts or remove node for unloaded modules. 
*/ if (node->num_loaded == 0) @@ -365,7 +365,7 @@ static const char *deskew(const char *basename) */ static void add_links(struct gcov_node *node, struct dentry *parent) { - char *basename; + const char *basename; char *target; int num; int i; @@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent) if (!node->links) return; for (i = 0; i < num; i++) { - target = get_link_target(get_node_info(node)->filename, - &gcov_link[i]); + target = get_link_target( + gcov_info_filename(get_node_info(node)), + &gcov_link[i]); if (!target) goto out_err; - basename = strrchr(target, '/'); - if (!basename) + basename = kbasename(target); + if (basename == target) goto out_err; - basename++; node->links[i] = debugfs_create_symlink(deskew(basename), parent, target); if (!node->links[i]) @@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent, } else node->dentry = debugfs_create_dir(node->name, parent->dentry); if (!node->dentry) { - pr_warning("could not create file\n"); + pr_warn("could not create file\n"); kfree(node); return NULL; } @@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent, err_nomem: kfree(node); - pr_warning("out of memory\n"); + pr_warn("out of memory\n"); return NULL; } @@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info) struct gcov_node *parent; struct gcov_node *node; - filename = kstrdup(info->filename, GFP_KERNEL); + filename = kstrdup(gcov_info_filename(info), GFP_KERNEL); if (!filename) return; parent = &root_node; @@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) */ loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); if (!loaded_info) { - pr_warning("could not add '%s' (out of memory)\n", - info->filename); + pr_warn("could not add '%s' (out of memory)\n", + gcov_info_filename(info)); return; } memcpy(loaded_info, node->loaded_info, @@ -644,8 +644,9 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) * data set replaces the copy of the last one. */ if (!gcov_info_is_compatible(node->unloaded_info, info)) { - pr_warning("discarding saved data for %s " - "(incompatible version)\n", info->filename); + pr_warn("discarding saved data for %s " + "(incompatible version)\n", + gcov_info_filename(info)); gcov_info_free(node->unloaded_info); node->unloaded_info = NULL; } @@ -655,8 +656,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info) * The initial one takes precedence. 
*/ if (!gcov_info_is_compatible(node->loaded_info[0], info)) { - pr_warning("could not add '%s' (incompatible " - "version)\n", info->filename); + pr_warn("could not add '%s' (incompatible " + "version)\n", gcov_info_filename(info)); kfree(loaded_info); return; } @@ -691,8 +692,9 @@ static void save_info(struct gcov_node *node, struct gcov_info *info) else { node->unloaded_info = gcov_info_dup(info); if (!node->unloaded_info) { - pr_warning("could not save data for '%s' " - "(out of memory)\n", info->filename); + pr_warn("could not save data for '%s' " + "(out of memory)\n", + gcov_info_filename(info)); } } } @@ -707,8 +709,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info) i = get_info_index(node, info); if (i < 0) { - pr_warning("could not remove '%s' (not found)\n", - info->filename); + pr_warn("could not remove '%s' (not found)\n", + gcov_info_filename(info)); return; } if (gcov_persist) @@ -735,7 +737,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) struct gcov_node *node; mutex_lock(&node_lock); - node = get_node_by_name(info->filename); + node = get_node_by_name(gcov_info_filename(info)); switch (action) { case GCOV_ADD: if (node) @@ -747,8 +749,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) if (node) remove_info(node, info); else { - pr_warning("could not remove '%s' (not found)\n", - info->filename); + pr_warn("could not remove '%s' (not found)\n", + gcov_info_filename(info)); } break; } diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index ae5bb4260033..27bc88a35013 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -21,6 +21,121 @@ #include <linux/vmalloc.h> #include "gcov.h" +#define GCOV_COUNTERS 5 + +static struct gcov_info *gcov_info_head; + +/** + * struct gcov_fn_info - profiling meta data per function + * @ident: object file-unique function identifier + * @checksum: function checksum + * @n_ctrs: number of values per counter type belonging to this function + * + * This data is generated by gcc during compilation and doesn't change + * at run-time. + */ +struct gcov_fn_info { + unsigned int ident; + unsigned int checksum; + unsigned int n_ctrs[0]; +}; + +/** + * struct gcov_ctr_info - profiling data per counter type + * @num: number of counter values for this type + * @values: array of counter values for this type + * @merge: merge function for counter values of this type (unused) + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the values array. + */ +struct gcov_ctr_info { + unsigned int num; + gcov_type *values; + void (*merge)(gcov_type *, unsigned int); +}; + +/** + * struct gcov_info - profiling data per object file + * @version: gcov version magic indicating the gcc version used for compilation + * @next: list head for a singly-linked list + * @stamp: time stamp + * @filename: name of the associated gcov data file + * @n_functions: number of instrumented functions + * @functions: function data + * @ctr_mask: mask specifying which counter types are active + * @counts: counter data per counter type + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the next pointer. 
+ */ +struct gcov_info { + unsigned int version; + struct gcov_info *next; + unsigned int stamp; + const char *filename; + unsigned int n_functions; + const struct gcov_fn_info *functions; + unsigned int ctr_mask; + struct gcov_ctr_info counts[0]; +}; + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return gcov_info_head; + + return info->next; +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + info->next = gcov_info_head; + gcov_info_head = info; +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + if (prev) + prev->next = info->next; + else + gcov_info_head = info->next; +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c new file mode 100644 index 000000000000..2c6e4631c814 --- /dev/null +++ b/kernel/gcov/gcc_4_7.c @@ -0,0 +1,560 @@ +/* + * This code provides functions to handle gcc's profiling data format + * introduced with gcc 4.7. + * + * This file is based heavily on gcc_3_4.c file. + * + * For a better understanding, refer to gcc source: + * gcc/gcov-io.h + * libgcc/libgcov.c + * + * Uses gcc-internal data definitions. + */ + +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/vmalloc.h> +#include "gcov.h" + +#define GCOV_COUNTERS 8 +#define GCOV_TAG_FUNCTION_LENGTH 3 + +static struct gcov_info *gcov_info_head; + +/** + * struct gcov_ctr_info - information about counters for a single function + * @num: number of counter values for this type + * @values: array of counter values for this type + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the values array. + */ +struct gcov_ctr_info { + unsigned int num; + gcov_type *values; +}; + +/** + * struct gcov_fn_info - profiling meta data per function + * @key: comdat key + * @ident: unique ident of function + * @lineno_checksum: function lineo_checksum + * @cfg_checksum: function cfg checksum + * @ctrs: instrumented counters + * + * This data is generated by gcc during compilation and doesn't change + * at run-time. + * + * Information about a single function. This uses the trailing array + * idiom. The number of counters is determined from the merge pointer + * array in gcov_info. The key is used to detect which of a set of + * comdat functions was selected -- it points to the gcov_info object + * of the object file containing the selected comdat function. 
+ */ +struct gcov_fn_info { + const struct gcov_info *key; + unsigned int ident; + unsigned int lineno_checksum; + unsigned int cfg_checksum; + struct gcov_ctr_info ctrs[0]; +}; + +/** + * struct gcov_info - profiling data per object file + * @version: gcov version magic indicating the gcc version used for compilation + * @next: list head for a singly-linked list + * @stamp: uniquifying time stamp + * @filename: name of the associated gcov data file + * @merge: merge functions (null for unused counter type) + * @n_functions: number of instrumented functions + * @functions: pointer to pointers to function information + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the next pointer. + */ +struct gcov_info { + unsigned int version; + struct gcov_info *next; + unsigned int stamp; + const char *filename; + void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int); + unsigned int n_functions; + struct gcov_fn_info **functions; +}; + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return gcov_info_head; + + return info->next; +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + info->next = gcov_info_head; + gcov_info_head = info; +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + if (prev) + prev->next = info->next; + else + gcov_info_head = info->next; +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/* + * Determine whether a counter is active. Doesn't change at run-time. + */ +static int counter_active(struct gcov_info *info, unsigned int type) +{ + return info->merge[type] ? 1 : 0; +} + +/* Determine number of active counters. Based on gcc magic. 
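+ * A counter type is active iff its merge hook in info->merge[] is non-NULL + * (see counter_active() above), so this simply counts the non-NULL slots.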
*/ +static unsigned int num_counter_active(struct gcov_info *info) +{ + unsigned int i; + unsigned int result = 0; + + for (i = 0; i < GCOV_COUNTERS; i++) { + if (counter_active(info, i)) + result++; + } + return result; +} + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + struct gcov_ctr_info *ci_ptr; + unsigned int fi_idx; + unsigned int ct_idx; + + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { + ci_ptr = info->functions[fi_idx]->ctrs; + + for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { + if (!counter_active(info, ct_idx)) + continue; + + memset(ci_ptr->values, 0, + sizeof(gcov_type) * ci_ptr->num); + ci_ptr++; + } + } +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + return (info1->stamp == info2->stamp); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ + struct gcov_ctr_info *dci_ptr; + struct gcov_ctr_info *sci_ptr; + unsigned int fi_idx; + unsigned int ct_idx; + unsigned int val_idx; + + for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) { + dci_ptr = dst->functions[fi_idx]->ctrs; + sci_ptr = src->functions[fi_idx]->ctrs; + + for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { + if (!counter_active(src, ct_idx)) + continue; + + for (val_idx = 0; val_idx < sci_ptr->num; val_idx++) + dci_ptr->values[val_idx] += + sci_ptr->values[val_idx]; + + dci_ptr++; + sci_ptr++; + } + } +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. 
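+ * + * The copy is built in three steps: kmemdup() of the top-level struct, + * kstrdup()/kcalloc() for the filename and function array, then one kzalloc() + * per function and one vmalloc() per active counter set. Any allocation + * failure unwinds through gcov_info_free() on the partial duplicate.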
+ */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + struct gcov_ctr_info *dci_ptr; /* dst counter info */ + struct gcov_ctr_info *sci_ptr; /* src counter info */ + unsigned int active; + unsigned int fi_idx; /* function info idx */ + unsigned int ct_idx; /* counter type idx */ + size_t fi_size; /* function info size */ + size_t cv_size; /* counter values size */ + + dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); + if (!dup) + return NULL; + + dup->next = NULL; + dup->filename = NULL; + dup->functions = NULL; + + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err_free; + + dup->functions = kcalloc(info->n_functions, + sizeof(struct gcov_fn_info *), GFP_KERNEL); + if (!dup->functions) + goto err_free; + + active = num_counter_active(info); + fi_size = sizeof(struct gcov_fn_info); + fi_size += sizeof(struct gcov_ctr_info) * active; + + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { + dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL); + if (!dup->functions[fi_idx]) + goto err_free; + + *(dup->functions[fi_idx]) = *(info->functions[fi_idx]); + + sci_ptr = info->functions[fi_idx]->ctrs; + dci_ptr = dup->functions[fi_idx]->ctrs; + + for (ct_idx = 0; ct_idx < active; ct_idx++) { + + cv_size = sizeof(gcov_type) * sci_ptr->num; + + dci_ptr->values = vmalloc(cv_size); + + if (!dci_ptr->values) + goto err_free; + + dci_ptr->num = sci_ptr->num; + memcpy(dci_ptr->values, sci_ptr->values, cv_size); + + sci_ptr++; + dci_ptr++; + } + } + + return dup; +err_free: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + unsigned int active; + unsigned int fi_idx; + unsigned int ct_idx; + struct gcov_ctr_info *ci_ptr; + + if (!info->functions) + goto free_info; + + active = num_counter_active(info); + + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { + if (!info->functions[fi_idx]) + continue; + + ci_ptr = info->functions[fi_idx]->ctrs; + + for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++) + vfree(ci_ptr->values); + + kfree(info->functions[fi_idx]); + } + +free_info: + kfree(info->functions); + kfree(info->filename); + kfree(info); +} + +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. 
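+ * + * Callers accumulate a byte position, as convert_to_gcda() below does, e.g.: + * + *	pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + * + * and may run the same sequence with buffer == NULL first to compute the + * total size before allocating.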
+ */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ + struct gcov_fn_info *fi_ptr; + struct gcov_ctr_info *ci_ptr; + unsigned int fi_idx; + unsigned int ct_idx; + unsigned int cv_idx; + size_t pos = 0; + + /* File header. */ + pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + pos += store_gcov_u32(buffer, pos, info->version); + pos += store_gcov_u32(buffer, pos, info->stamp); + + for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { + fi_ptr = info->functions[fi_idx]; + + /* Function record. */ + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH); + pos += store_gcov_u32(buffer, pos, fi_ptr->ident); + pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum); + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + + ci_ptr = fi_ptr->ctrs; + + for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { + if (!counter_active(info, ct_idx)) + continue; + + /* Counter record. */ + pos += store_gcov_u32(buffer, pos, + GCOV_TAG_FOR_COUNTER(ct_idx)); + pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2); + + for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) { + pos += store_gcov_u64(buffer, pos, + ci_ptr->values[cv_idx]); + } + + ci_ptr++; + } + } + + return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. 
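+ * convert_to_gcda() only measures when passed a NULL buffer, so the iterator + * is sized and filled in two passes (the calls below): measure, vmalloc(), + * then convert again into the allocated buffer.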
*/ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_free - release memory for file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 060073ebf7a6..92c8e22a29ed 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -21,7 +21,6 @@ * gcc and need to be kept as close to the original definition as possible to * remain compatible. */ -#define GCOV_COUNTERS 5 #define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) #define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) #define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) @@ -34,60 +33,18 @@ typedef long gcov_type; typedef long long gcov_type; #endif -/** - * struct gcov_fn_info - profiling meta data per function - * @ident: object file-unique function identifier - * @checksum: function checksum - * @n_ctrs: number of values per counter type belonging to this function - * - * This data is generated by gcc during compilation and doesn't change - * at run-time. - */ -struct gcov_fn_info { - unsigned int ident; - unsigned int checksum; - unsigned int n_ctrs[0]; -}; - -/** - * struct gcov_ctr_info - profiling data per counter type - * @num: number of counter values for this type - * @values: array of counter values for this type - * @merge: merge function for counter values of this type (unused) - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the values array. - */ -struct gcov_ctr_info { - unsigned int num; - gcov_type *values; - void (*merge)(gcov_type *, unsigned int); -}; +/* Opaque gcov_info. The gcov structures can change between gcc versions, as + * they did in gcc 4.7, so the full definition cannot live here; it has to stay + * in the gcc-specific implementation of gcov. 
This also means generic code must + * not access the members directly and must use the accessor interface below. */ +struct gcov_info; + +/* Interface to access gcov_info data */ +const char *gcov_info_filename(struct gcov_info *info); +unsigned int gcov_info_version(struct gcov_info *info); +struct gcov_info *gcov_info_next(struct gcov_info *info); +void gcov_info_link(struct gcov_info *info); +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); /* Base interface. */ enum gcov_action { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 706724e9835d..cf68bb36fe58 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, } EXPORT_SYMBOL_GPL(irq_create_strict_mappings); -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) +unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; unsigned int virq; - domain = controller ? irq_find_host(controller) : irq_default_domain; + domain = irq_data->np ? 
irq_find_host(irq_data->np) : irq_default_domain; if (!domain) { pr_warn("no irq domain found for %s !\n", - of_node_full_name(controller)); + of_node_full_name(irq_data->np)); return 0; } /* If domain has no translation, then we assume interrupt line */ if (domain->ops->xlate == NULL) - hwirq = intspec[0]; + hwirq = irq_data->args[0]; else { - if (domain->ops->xlate(domain, controller, intspec, intsize, - &hwirq, &type)) + if (domain->ops->xlate(domain, irq_data->np, irq_data->args, + irq_data->args_count, &hwirq, &type)) return 0; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 514bcfd855a8..3e59f951d42f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_mput; } - sched_setscheduler(t, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); /* * We keep the reference to the task struct even if diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 297a9247a3b3..9019f15deab2 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,6 +58,7 @@ static void jump_label_update(struct static_key *key, int enable); void static_key_slow_inc(struct static_key *key) { + STATIC_KEY_CHECK_USE(); if (atomic_inc_not_zero(&key->enabled)) return; @@ -103,12 +104,14 @@ static void jump_label_update_timeout(struct work_struct *work) void static_key_slow_dec(struct static_key *key) { + STATIC_KEY_CHECK_USE(); __static_key_slow_dec(key, 0, NULL); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_deferred(struct static_key_deferred *key) { + STATIC_KEY_CHECK_USE(); __static_key_slow_dec(&key->key, key->timeout, &key->work); } EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); @@ -116,6 +119,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { + STATIC_KEY_CHECK_USE(); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } @@ -212,6 +216,7 @@ void __init jump_label_init(void) key->next = NULL; #endif } + static_key_initialized = true; jump_label_unlock(); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a0d367a49122..ceeadfcabb76 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2066,7 +2066,7 @@ static int __init init_kprobes(void) { int i, err = 0; unsigned long offset = 0, size = 0; - char *modname, namebuf[128]; + char *modname, namebuf[KSYM_NAME_LEN]; const char *symbol_name; void *addr; struct kprobe_blackpoint *kb; @@ -2192,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) const char *sym = NULL; unsigned int i = *(loff_t *) v; unsigned long offset = 0; - char *modname, namebuf[128]; + char *modname, namebuf[KSYM_NAME_LEN]; head = &kprobe_table[i]; preempt_disable(); diff --git a/kernel/kthread.c b/kernel/kthread.c index 760e86df8c20..b5ae3ee860a9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -33,7 +33,7 @@ struct kthread_create_info /* Result passed back to kthread_create() from kthreadd. 
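 * The completion below becomes a pointer (see the hunks that follow): * kthread_create() and the kthreadd side both xchg() ->done with NULL, and * whichever side finds it already NULL knows the other has given up and * takes over freeing this structure.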
*/ struct task_struct *result; - struct completion done; + struct completion *done; struct list_head list; }; @@ -178,6 +178,7 @@ static int kthread(void *_create) struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; + struct completion *done; struct kthread self; int ret; @@ -187,10 +188,16 @@ static int kthread(void *_create) init_completion(&self.parked); current->vfork_done = &self.exited; + /* If user was SIGKILLed, I release the structure. */ + done = xchg(&create->done, NULL); + if (!done) { + kfree(create); + do_exit(-EINTR); + } /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; - complete(&create->done); + complete(done); schedule(); ret = -EINTR; @@ -223,8 +230,15 @@ static void create_kthread(struct kthread_create_info *create) /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { + /* If user was SIGKILLed, I release the structure. */ + struct completion *done = xchg(&create->done, NULL); + + if (!done) { + kfree(create); + return; + } create->result = ERR_PTR(pid); - complete(&create->done); + complete(done); } } @@ -255,36 +269,59 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), const char namefmt[], ...) { - struct kthread_create_info create; - - create.threadfn = threadfn; - create.data = data; - create.node = node; - init_completion(&create.done); + DECLARE_COMPLETION_ONSTACK(done); + struct task_struct *task; + struct kthread_create_info *create = kmalloc(sizeof(*create), + GFP_KERNEL); + + if (!create) + return ERR_PTR(-ENOMEM); + create->threadfn = threadfn; + create->data = data; + create->node = node; + create->done = &done; spin_lock(&kthread_create_lock); - list_add_tail(&create.list, &kthread_create_list); + list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); - wait_for_completion(&create.done); - - if (!IS_ERR(create.result)) { + /* + * Wait for completion in killable state, for I might be chosen by + * the OOM killer while kthreadd is trying to allocate memory for + * new kernel thread. + */ + if (unlikely(wait_for_completion_killable(&done))) { + /* + * If I was SIGKILLed before kthreadd (or new kernel thread) + * calls complete(), leave the cleanup of this structure to + * that thread. + */ + if (xchg(&create->done, NULL)) + return ERR_PTR(-ENOMEM); + /* + * kthreadd (or new kernel thread) will call complete() + * shortly. + */ + wait_for_completion(&done); + } + task = create->result; + if (!IS_ERR(task)) { static const struct sched_param param = { .sched_priority = 0 }; va_list args; va_start(args, namefmt); - vsnprintf(create.result->comm, sizeof(create.result->comm), - namefmt, args); + vsnprintf(task->comm, sizeof(task->comm), namefmt, args); va_end(args); /* * root may have changed our (kthreadd's) priority or CPU mask. * The kernel thread should not inherit these properties. 
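 * sched_setscheduler_nocheck() is used rather than sched_setscheduler() * because this is an in-kernel caller; the permission checks against the * current task's credentials do not apply here.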
*/ - sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(create.result, cpu_all_mask); + sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); + set_cpus_allowed_ptr(task, cpu_all_mask); } - return create.result; + kfree(create); + return task; } EXPORT_SYMBOL(kthread_create_on_node); diff --git a/kernel/module.c b/kernel/module.c index dc582749fa13..af5ebd21d77b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -378,23 +378,21 @@ static bool check_symbol(const struct symsearch *syms, if (syms->licence == GPL_ONLY) return false; if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", fsa->name); + pr_warn("Symbol %s is being used by a non-GPL module, " + "which will not be allowed in the future\n", + fsa->name); } } #ifdef CONFIG_UNUSED_SYMBOLS if (syms->unused && fsa->warn) { - printk(KERN_WARNING "Symbol %s is marked as UNUSED, " - "however this module is using it.\n", fsa->name); - printk(KERN_WARNING - "This symbol will go away in the future.\n"); - printk(KERN_WARNING - "Please evalute if this is the right api to use and if " - "it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); + pr_warn("Symbol %s is marked as UNUSED, however this module is " + "using it.\n", fsa->name); + pr_warn("This symbol will go away in the future.\n"); + pr_warn("Please evalute if this is the right api to use and if " + "it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); } #endif @@ -492,16 +490,15 @@ static int percpu_modalloc(struct module *mod, struct load_info *info) return 0; if (align > PAGE_SIZE) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", - mod->name, align, PAGE_SIZE); + pr_warn("%s: per-cpu alignment %li > %li\n", + mod->name, align, PAGE_SIZE); align = PAGE_SIZE; } mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align); if (!mod->percpu) { - printk(KERN_WARNING - "%s: Could not allocate %lu bytes percpu data\n", - mod->name, (unsigned long)pcpusec->sh_size); + pr_warn("%s: Could not allocate %lu bytes percpu data\n", + mod->name, (unsigned long)pcpusec->sh_size); return -ENOMEM; } mod->percpu_size = pcpusec->sh_size; @@ -679,7 +676,7 @@ static int add_module_usage(struct module *a, struct module *b) pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); if (!use) { - printk(KERN_WARNING "%s: out of memory loading\n", a->name); + pr_warn("%s: out of memory loading\n", a->name); return -ENOMEM; } @@ -1145,8 +1142,7 @@ static int try_to_force_load(struct module *mod, const char *reason) { #ifdef CONFIG_MODULE_FORCE_LOAD if (!test_taint(TAINT_FORCED_MODULE)) - printk(KERN_WARNING "%s: %s: kernel tainted.\n", - mod->name, reason); + pr_warn("%s: %s: kernel tainted.\n", mod->name, reason); add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE); return 0; #else @@ -1199,8 +1195,7 @@ static int check_version(Elf_Shdr *sechdrs, goto bad_version; } - printk(KERN_WARNING "%s: no symbol version for %s\n", - mod->name, symname); + pr_warn("%s: no symbol version for %s\n", mod->name, symname); return 0; bad_version: @@ -1309,8 +1304,8 @@ resolve_symbol_wait(struct module *mod, !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) || PTR_ERR(ksym) != -EBUSY, 30 * HZ) <= 0) { - printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", 
- mod->name, owner); + pr_warn("%s: gave up waiting for init of module %s.\n", + mod->name, owner); } return ksym; } @@ -1626,15 +1621,14 @@ static int mod_sysfs_init(struct module *mod) struct kobject *kobj; if (!module_sysfs_initialized) { - printk(KERN_ERR "%s: module sysfs not initialized\n", - mod->name); + pr_err("%s: module sysfs not initialized\n", mod->name); err = -EINVAL; goto out; } kobj = kset_find_obj(module_kset, mod->name); if (kobj) { - printk(KERN_ERR "%s: module is already loaded\n", mod->name); + pr_err("%s: module is already loaded\n", mod->name); kobject_put(kobj); err = -EINVAL; goto out; @@ -1961,8 +1955,7 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { if (find_symbol(s->name, &owner, NULL, true, false)) { - printk(KERN_ERR - "%s: exports duplicate symbol %s" + pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", mod->name, s->name, module_name(owner)); return -ENOEXEC; @@ -2013,8 +2006,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", - mod->name, name, PTR_ERR(ksym)); + pr_warn("%s: Unknown symbol %s (err %li)\n", + mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; break; @@ -2168,8 +2161,8 @@ static void set_license(struct module *mod, const char *license) if (!license_is_gpl_compatible(license)) { if (!test_taint(TAINT_PROPRIETARY_MODULE)) - printk(KERN_WARNING "%s: module license '%s' taints " - "kernel.\n", mod->name, license); + pr_warn("%s: module license '%s' taints kernel.\n", + mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); } @@ -2405,8 +2398,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) return; #ifdef CONFIG_DYNAMIC_DEBUG if (ddebug_add_module(debug, num, debug->modname)) - printk(KERN_ERR "dynamic debug error adding module: %s\n", - debug->modname); + pr_err("dynamic debug error adding module: %s\n", + debug->modname); #endif } @@ -2619,8 +2612,7 @@ static int rewrite_section_headers(struct load_info *info, int flags) Elf_Shdr *shdr = &info->sechdrs[i]; if (shdr->sh_type != SHT_NOBITS && info->len < shdr->sh_offset + shdr->sh_size) { - printk(KERN_ERR "Module len %lu truncated\n", - info->len); + pr_err("Module len %lu truncated\n", info->len); return -ENOEXEC; } @@ -2682,15 +2674,14 @@ static struct module *setup_load_info(struct load_info *info, int flags) info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); if (!info->index.mod) { - printk(KERN_WARNING "No module found in object\n"); + pr_warn("No module found in object\n"); return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. 
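 * ("temporary" because mod still points into the copied ELF image here; the * struct is moved to its final location later in the loading sequence, once * the sections have been laid out.)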
*/ mod = (void *)info->sechdrs[info->index.mod].sh_addr; if (info->index.sym == 0) { - printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", - mod->name); + pr_warn("%s: module has no symbols (stripped?)\n", mod->name); return ERR_PTR(-ENOEXEC); } @@ -2717,7 +2708,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) if (err) return err; } else if (!same_magic(modmagic, vermagic, info->index.vers)) { - printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", + pr_err("%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); return -ENOEXEC; } @@ -2727,9 +2718,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); - printk(KERN_WARNING "%s: module is from the staging directory," - " the quality is unknown, you have been warned.\n", - mod->name); + pr_warn("%s: module is from the staging directory, the quality " + "is unknown, you have been warned.\n", mod->name); } /* Set up license info based on the info section */ @@ -2801,8 +2791,7 @@ static void find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->extable), &mod->num_exentries); if (section_addr(info, "__obsparm")) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - mod->name); + pr_warn("%s: Ignoring obsolete parameters\n", mod->name); info->debug = section_objs(info, "__verbose", sizeof(*info->debug), &info->num_debug); @@ -3078,11 +3067,10 @@ static int do_init_module(struct module *mod) return ret; } if (ret > 0) { - printk(KERN_WARNING -"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" -"%s: loading module anyway...\n", - __func__, mod->name, ret, - __func__); + pr_warn("%s: '%s'->init suspiciously returned %d, it should " + "follow 0/-E convention\n" + "%s: loading module anyway...\n", + __func__, mod->name, ret, __func__); dump_stack(); } @@ -3205,10 +3193,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname) { /* Check for magic 'dyndbg' arg */ int ret = ddebug_dyndbg_module_param_cb(param, val, modname); - if (ret != 0) { - printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", - modname, param); - } + if (ret != 0) + pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); return 0; } @@ -3243,10 +3229,9 @@ static int load_module(struct load_info *info, const char __user *uargs, #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; if (!mod->sig_ok) { - printk_once(KERN_NOTICE - "%s: module verification failed: signature and/or" - " required key missing - tainting kernel\n", - mod->name); + pr_notice_once("%s: module verification failed: signature " + "and/or required key missing - tainting " + "kernel\n", mod->name); add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); } #endif diff --git a/kernel/panic.c b/kernel/panic.c index b6c482ccc5db..c00b4ceb39e8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -233,7 +233,7 @@ static const struct tnt tnts[] = { */ const char *print_tainted(void) { - static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")]; if (tainted_mask) { char *s; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 42086551a24a..06c62de9c711 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -132,6 +132,12 @@ out: return ERR_PTR(err); } +static void delayed_free_pidns(struct rcu_head *p) +{ + kmem_cache_free(pid_ns_cachep, + 
container_of(p, struct pid_namespace, rcu)); +} + static void destroy_pid_namespace(struct pid_namespace *ns) { int i; @@ -140,7 +146,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); - kmem_cache_free(pid_ns_cachep, ns); + call_rcu(&ns->rcu, delayed_free_pidns); } struct pid_namespace *copy_pid_ns(unsigned long flags, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d444c4e834f4..2fac9cc79b3d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -178,6 +178,22 @@ config PM_SLEEP_DEBUG def_bool y depends on PM_DEBUG && PM_SLEEP +config DPM_WATCHDOG + bool "Device suspend/resume watchdog" + depends on PM_DEBUG && PSTORE + ---help--- + Sets up a watchdog timer to capture drivers that are + locked up attempting to suspend/resume a device. + A detected lockup causes system panic with message + captured in pstore device for inspection in subsequent + boot session. + +config DPM_WATCHDOG_TIMEOUT + int "Watchdog timeout in seconds" + range 1 120 + default 12 + depends on DPM_WATCHDOG + config PM_TRACE bool help diff --git a/kernel/power/qos.c b/kernel/power/qos.c index a394297f8b2f..8dff9b48075a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -558,30 +558,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) return -EFAULT; - } else if (count <= 11) { /* ASCII perhaps? */ - char ascii_value[11]; - unsigned long int ulval; + } else { int ret; - if (copy_from_user(ascii_value, buf, count)) - return -EFAULT; - - if (count > 10) { - if (ascii_value[10] == '\n') - ascii_value[10] = '\0'; - else - return -EINVAL; - } else { - ascii_value[count] = '\0'; - } - ret = kstrtoul(ascii_value, 16, &ulval); - if (ret) { - pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); - return -EINVAL; - } - value = (s32)lower_32_bits(ulval); - } else { - return -EINVAL; + ret = kstrtos32_from_user(buf, count, 16, &value); + if (ret) + return ret; } req = filp->private_data; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 98c3b34a4cff..10c22cae83a0 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1402,7 +1402,11 @@ int hibernate_preallocate_memory(void) * highmem and non-highmem zones separately. */ pages_highmem = preallocate_image_highmem(highmem / 2); - alloc = (count - max_size) - pages_highmem; + alloc = count - max_size; + if (alloc > pages_highmem) + alloc -= pages_highmem; + else + alloc = 0; pages = preallocate_image_memory(alloc, avail_normal); if (pages < alloc) { /* We have exhausted non-highmem pages, try highmem. 
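 * (The clamp a few lines up is the actual fix: with the old * alloc = (count - max_size) - pages_highmem, a large pages_highmem could * take the unsigned result past zero and turn the request into a huge * allocation.)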
*/ diff --git a/kernel/power/user.c b/kernel/power/user.c index 957f06164ad1..24850270c802 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -36,9 +36,9 @@ static struct snapshot_data { struct snapshot_handle handle; int swap; int mode; - char frozen; - char ready; - char platform_support; + bool frozen; + bool ready; + bool platform_support; bool free_bitmaps; } snapshot_state; @@ -93,9 +93,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) atomic_inc(&snapshot_device_available); - data->frozen = 0; - data->ready = 0; - data->platform_support = 0; + data->frozen = false; + data->ready = false; + data->platform_support = false; Unlock: unlock_system_sleep(); @@ -229,7 +229,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (error) thaw_processes(); else - data->frozen = 1; + data->frozen = true; break; @@ -240,7 +240,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, free_basic_memory_bitmaps(); data->free_bitmaps = false; thaw_processes(); - data->frozen = 0; + data->frozen = false; break; case SNAPSHOT_CREATE_IMAGE: @@ -270,7 +270,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); - data->ready = 0; + data->ready = false; /* * It is necessary to thaw kernel threads here, because * SNAPSHOT_CREATE_IMAGE may be invoked directly after @@ -334,7 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); - data->ready = 0; + data->ready = false; break; case SNAPSHOT_PLATFORM_SUPPORT: diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b4e8500afdb3..be7c86bae576 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = { #ifdef CONFIG_KEXEC /* - * This appends the listed symbols to /proc/vmcoreinfo + * This appends the listed symbols to /proc/vmcore * - * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to + * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. 
@@ -791,7 +791,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = 1; - printk(KERN_INFO "debug: ignoring loglevel setting.\n"); + pr_info("debug: ignoring loglevel setting.\n"); return 0; } @@ -820,9 +820,9 @@ static int __init boot_delay_setup(char *str) pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); - return 1; + return 0; } -__setup("boot_delay=", boot_delay_setup); +early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { @@ -2193,7 +2193,7 @@ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; - printk(KERN_INFO "debug: skip boot console de-registration.\n"); + pr_info("debug: skip boot console de-registration.\n"); return 0; } @@ -2241,7 +2241,7 @@ void register_console(struct console *newcon) /* find the last or real console */ for_each_console(bcon) { if (!(bcon->flags & CON_BOOT)) { - printk(KERN_INFO "Too late to register bootconsole %s%d\n", + pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); return; } @@ -2358,21 +2358,18 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ + pr_info("%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? "boot" : "" , + newcon->name, newcon->index); if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { - /* we need to iterate through twice, to make sure we print - * everything out, before we unregister the console(s) + /* We need to iterate through all boot consoles, to make + * sure we print everything out, before we unregister them. */ - printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", - newcon->name, newcon->index); for_each_console(bcon) if (bcon->flags & CON_BOOT) unregister_console(bcon); - } else { - printk(KERN_INFO "%sconsole [%s%d] enabled\n", - (newcon->flags & CON_BOOT) ? "boot" : "" , - newcon->name, newcon->index); } } EXPORT_SYMBOL(register_console); @@ -2382,6 +2379,10 @@ int unregister_console(struct console *console) struct console *a, *b; int res; + pr_info("%sconsole [%s%d] disabled\n", + (console->flags & CON_BOOT) ? 
"boot" : "" , + console->name, console->index); + res = _braille_unregister_console(console); if (res) return res; @@ -2421,8 +2422,6 @@ static int __init printk_late_init(void) for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { - printk(KERN_INFO "turn off boot console %s%d\n", - con->name, con->index); unregister_console(con); } } @@ -2449,7 +2448,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) if (pending & PRINTK_PENDING_SCHED) { char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); + pr_warn("[sched_delayed] %s", buf); } if (pending & PRINTK_PENDING_WAKEUP) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dd562e9aa2c8..1f4bcb3cc21c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -257,7 +257,8 @@ ok: if (task->mm) dumpable = get_dumpable(task->mm); rcu_read_lock(); - if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { + if (dumpable != SUID_DUMP_USER && + !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { rcu_read_unlock(); return -EPERM; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aa066f306be2..1deccd78be98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4883,6 +4883,8 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain *, sd_numa); +DEFINE_PER_CPU(struct sched_domain *, sd_busy); +DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { @@ -4894,6 +4896,7 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); + rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); @@ -4902,6 +4905,9 @@ static void update_top_cache_domain(int cpu) sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + + sd = highest_flag_domain(cpu, SD_ASYM_PACKING); + rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 41c02b6b090e..df77c605c7a6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6534,16 +6534,16 @@ static inline void nohz_balance_exit_idle(int cpu) static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; + int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq()->sd); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (!sd || !sd->nohz_idle) goto unlock; sd->nohz_idle = 0; - for (; sd; sd = sd->parent) - atomic_inc(&sd->groups->sgp->nr_busy_cpus); + atomic_inc(&sd->groups->sgp->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -6551,16 +6551,16 @@ unlock: void set_cpu_sd_state_idle(void) { struct sched_domain *sd; + int cpu = smp_processor_id(); rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq()->sd); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (!sd || sd->nohz_idle) goto unlock; sd->nohz_idle = 1; - for (; sd; sd = sd->parent) - atomic_dec(&sd->groups->sgp->nr_busy_cpus); + atomic_dec(&sd->groups->sgp->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -6767,6 +6767,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) { unsigned long now = jiffies; struct sched_domain *sd; + struct sched_group_power *sgp; + int nr_busy; if (unlikely(idle_cpu(cpu))) return 0; @@ -6792,22 +6794,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) goto need_kick; rcu_read_lock(); - for_each_domain(cpu, 
sd) { - struct sched_group *sg = sd->groups; - struct sched_group_power *sgp = sg->sgp; - int nr_busy = atomic_read(&sgp->nr_busy_cpus); + sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) - goto need_kick_unlock; + if (sd) { + sgp = sd->groups->sgp; + nr_busy = atomic_read(&sgp->nr_busy_cpus); - if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight - && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) + if (nr_busy > 1) goto need_kick_unlock; - - if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) - break; } + + sd = rcu_dereference(per_cpu(sd_asym, cpu)); + + if (sd && (cpumask_first_and(nohz.idle_cpus_mask, + sched_domain_span(sd)) < cpu)) + goto need_kick_unlock; + rcu_read_unlock(); return 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4e650acffed7..88c85b21d633 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -623,6 +623,8 @@ DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain *, sd_numa); +DECLARE_PER_CPU(struct sched_domain *, sd_busy); +DECLARE_PER_CPU(struct sched_domain *, sd_asym); struct sched_group_power { atomic_t ref; diff --git a/kernel/signal.c b/kernel/signal.c index ded28b91fa53..940b30ee9a30 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER -int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) +int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) { int err; diff --git a/kernel/smp.c b/kernel/smp.c index 0564571dcdf7..46116100f0ee 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -18,6 +18,7 @@ #ifdef CONFIG_USE_GENERIC_SMP_HELPERS enum { CSD_FLAG_LOCK = 0x01, + CSD_FLAG_WAIT = 0x02, }; struct call_function_data { @@ -124,7 +125,7 @@ static void csd_lock(struct call_single_data *csd) static void csd_unlock(struct call_single_data *csd) { - WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); + WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: @@ -146,6 +147,9 @@ void generic_exec_single(int cpu, struct call_single_data *csd, int wait) unsigned long flags; int ipi; + if (wait) + csd->flags |= CSD_FLAG_WAIT; + raw_spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); list_add_tail(&csd->list, &dst->list); @@ -340,6 +344,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd, } put_cpu(); } +EXPORT_SYMBOL_GPL(__smp_call_function_single); /** * smp_call_function_many(): Run a function on a set of other CPUs. @@ -524,6 +529,11 @@ void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } +void __weak smp_announce(void) +{ + printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); +} + /* Called by boot processor to activate the rest. */ void __init smp_init(void) { @@ -540,7 +550,7 @@ void __init smp_init(void) } /* Any cleanup work */ - printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_announce(); smp_cpus_done(setup_max_cpus); } diff --git a/kernel/softirq.c b/kernel/softirq.c index dcab1d3fb53d..b24988353458 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -29,7 +29,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/irq.h> -#include <asm/irq.h> /* - No shared variables, all the data are CPU local. 
- If a softirq needs serialization, let it serialize itself @@ -134,7 +133,6 @@ EXPORT_SYMBOL(local_bh_disable); static void __local_bh_enable(unsigned int cnt) { - WARN_ON_ONCE(in_irq()); WARN_ON_ONCE(!irqs_disabled()); if (softirq_count() == cnt) @@ -149,6 +147,7 @@ static void __local_bh_enable(unsigned int cnt) */ void _local_bh_enable(void) { + WARN_ON_ONCE(in_irq()); __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } @@ -171,8 +170,13 @@ static inline void _local_bh_enable_ip(unsigned long ip) */ preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); - if (unlikely(!in_interrupt() && local_softirq_pending())) + if (unlikely(!in_interrupt() && local_softirq_pending())) { + /* + * Run softirq if any pending. And do it in its own stack + * as we may be calling this deep in a task call stack already. + */ do_softirq(); + } preempt_count_dec(); #ifdef CONFIG_TRACE_IRQFLAGS @@ -280,10 +284,11 @@ restart: account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); + WARN_ON_ONCE(in_interrupt()); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } -#ifndef __ARCH_HAS_DO_SOFTIRQ + asmlinkage void do_softirq(void) { @@ -298,13 +303,11 @@ asmlinkage void do_softirq(void) pending = local_softirq_pending(); if (pending) - __do_softirq(); + do_softirq_own_stack(); local_irq_restore(flags); } -#endif - /* * Enter an interrupt context. */ @@ -329,15 +332,21 @@ void irq_enter(void) static inline void invoke_softirq(void) { if (!force_irqthreads) { +#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if * it is the irq stack, because it should be near empty - * at this stage. But we have no way to know if the arch - * calls irq_exit() on the irq stack. So call softirq - * in its own stack to prevent from any overrun on top - * of a potentially deep task stack. + * at this stage. */ - do_softirq(); + __do_softirq(); +#else + /* + * Otherwise, irq_exit() is called on the task stack that can + * be potentially deep already. So call softirq in its own stack + * to prevent from any overrun. + */ + do_softirq_own_stack(); +#endif } else { wakeup_softirqd(); } @@ -771,6 +780,10 @@ static void run_ksoftirqd(unsigned int cpu) { local_irq_disable(); if (local_softirq_pending()) { + /* + * We can safely run softirq on inline stack, as we are not deep + * in the task stack here. + */ __do_softirq(); rcu_note_context_switch(cpu); local_irq_enable(); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c530bc5be7cf..84571e09c907 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -20,6 +20,7 @@ #include <linux/kallsyms.h> #include <linux/smpboot.h> #include <linux/atomic.h> +#include <linux/lglock.h> /* * Structure to determine completion condition and record errors. May @@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); static bool stop_machine_initialized = false; +/* + * Avoids a race between stop_two_cpus and global stop_cpus, where + * the stoppers could get queued up in reverse order, leading to + * system deadlock. Using an lglock means stop_two_cpus remains + * relatively cheap. 
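+ * + * stop_two_cpus() only takes lg_local_lock() (a single per-CPU spinlock) + * while stop_cpus() takes lg_global_lock() (all of them), so the common + * two-CPU path stays cheap and the two queueing paths can never interleave.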
+ */ +DEFINE_STATIC_LGLOCK(stop_cpus_lock); + static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); @@ -276,6 +285,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * return -ENOENT; } + lg_local_lock(&stop_cpus_lock); /* * Queuing needs to be done by the lowest numbered CPU, to ensure * that works are always queued in the same order on every CPU. @@ -284,6 +294,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * smp_call_function_single(min(cpu1, cpu2), &irq_cpu_stop_queue_work, &call_args, 0); + lg_local_unlock(&stop_cpus_lock); preempt_enable(); wait_for_completion(&done.completion); @@ -335,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ - preempt_disable(); + lg_global_lock(&stop_cpus_lock); for_each_cpu(cpu, cpumask) cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); - preempt_enable(); + lg_global_unlock(&stop_cpus_lock); } static int __stop_cpus(const struct cpumask *cpumask, diff --git a/kernel/sys.c b/kernel/sys.c index c18ecca575b4..c72311324ea7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -16,7 +16,6 @@ #include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> -#include <linux/kexec.h> #include <linux/workqueue.h> #include <linux/capability.h> #include <linux/device.h> diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 339c003314f4..34a604726d0b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ -static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; +static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; static int sysrq_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -1057,6 +1057,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = perf_proc_update_handler, + .extra1 = &one, }, { .procname = "perf_cpu_time_max_percent", @@ -2222,8 +2223,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) + if (!first) { err = proc_put_char(&buffer, &left, '\t'); + if (err) + break; + } err = proc_put_long(&buffer, &left, val, false); if (err) break; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b609213ca9a2..653cbbd9e7ad 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file, if (get_user(value, vec + i)) goto out_kfree; - str += snprintf(str, end - str, "%lu\t", value); + str += scnprintf(str, end - str, "%lu\t", value); } result = kernel_write(file, buffer, str - buffer, 0); @@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file, if (get_user(value, vec + i)) goto out_kfree; - str += snprintf(str, end - str, "%lu\t", value); + str += scnprintf(str, end - str, "%lu\t", value); } result = kernel_write(file, buffer, str - buffer, 0); @@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file, if (get_user(dnaddr, (__le16 __user *)newval)) goto out; - len = snprintf(buf, sizeof(buf), "%hu.%hu", + len = scnprintf(buf, sizeof(buf), "%hu.%hu", le16_to_cpu(dnaddr) >> 10, le16_to_cpu(dnaddr) & 0x3ff); diff 
--git a/kernel/taskstats.c b/kernel/taskstats.c index 145bb4d3bd4d..9f4618eb51c8 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -290,6 +290,7 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) struct listener_list *listeners; struct listener *s, *tmp, *s2; unsigned int cpu; + int ret = 0; if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; @@ -304,9 +305,10 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, cpu_to_node(cpu)); - if (!s) + if (!s) { + ret = -ENOMEM; goto cleanup; - + } s->pid = pid; s->valid = 1; @@ -339,7 +341,7 @@ cleanup: } up_write(&listeners->sem); } - return 0; + return ret; } static int parse(struct nlattr *na, struct cpumask *mask) @@ -404,11 +406,15 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) if (!na) goto err; - if (nla_put(skb, type, sizeof(pid), &pid) < 0) + if (nla_put(skb, type, sizeof(pid), &pid) < 0) { + nla_nest_cancel(skb, na); goto err; + } ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); - if (!ret) + if (!ret) { + nla_nest_cancel(skb, na); goto err; + } nla_nest_end(skb, na); return nla_data(ret); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 2b62fe86f9ec..3ce6e8c5f3fc 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -100,7 +100,7 @@ config NO_HZ_FULL # RCU_USER_QS dependency depends on HAVE_CONTEXT_TRACKING # VIRT_CPU_ACCOUNTING_GEN dependency - depends on 64BIT + depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON select RCU_USER_QS select RCU_NOCB_CPU diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index eec50fcef9e4..88c9c65a430d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EINVAL; return hrtimer_get_res(baseid, tp); } @@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) - return -ENOTSUPP; + return -EINVAL; *tp = ktime_to_timespec(base->gettime()); return 0; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 662c5798a685..086ad6043bcb 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -619,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev, const char *buf, size_t count) { char name[CS_NAME_LEN]; - size_t ret = sysfs_get_uname(buf, name, count); + ssize_t ret = sysfs_get_uname(buf, name, count); struct clock_event_device *ce; if (ret < 0) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 50a8736757f3..ba3e502c955a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } +void clocksource_mark_unstable(struct clocksource *cs) { } #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) } /** - * clocksource_max_deferment - Returns max time the 
clocksource can be deferred - * @cs: Pointer to clocksource - * + * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + * @maxadj: maximum adjustment value to mult (~11%) + * @mask: bitmask for two's complement subtraction of non 64 bit counters */ -static u64 clocksource_max_deferment(struct clocksource *cs) +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) { u64 max_nsecs, max_cycles; /* * Calculate the maximum number of cycles that we can pass to the * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) + * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) * which is equivalent to the below. - * max_cycles < (2^63)/(cs->mult + cs->maxadj) - * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) - * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) - * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) - * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) + * max_cycles < (2^63)/(mult + maxadj) + * max_cycles < 2^(log2((2^63)/(mult + maxadj))) + * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) + * max_cycles < 2^(63 - log2(mult + maxadj)) + * max_cycles < 1 << (63 - log2(mult + maxadj)) * Please note that we add 1 to the result of the log2 to account for * any rounding errors, ensure the above inequality is satisfied and * no overflow will occur. */ - max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); + max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); /* * The actual maximum number of cycles we can defer the clocksource is - * determined by the minimum of max_cycles and cs->mask. + * determined by the minimum of max_cycles and mask. * Note: Here we subtract the maxadj to make sure we don't sleep for * too long if there's a large negative adjustment. */ - max_cycles = min_t(u64, max_cycles, (u64) cs->mask); - max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, - cs->shift); + max_cycles = min(max_cycles, mask); + max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + + return max_nsecs; +} + +/** + * clocksource_max_deferment - Returns max time the clocksource can be deferred + * @cs: Pointer to clocksource + * + */ +static u64 clocksource_max_deferment(struct clocksource *cs) +{ + u64 max_nsecs; + max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, + cs->mask); /* * To ensure that the clocksource does not wrap whilst we are idle, * limit the time the clocksource can be deferred by 12.5%. 
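The factored-out clocks_calc_max_nsecs() above is pure integer math, so the bound is easy to sanity-check outside the kernel. A userspace sketch under assumed inputs (ilog2() reimplemented; the mult/shift pair below roughly models a 24 MHz, 56-bit counter and is invented for illustration, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

/* Floor of log2(v), like the kernel's ilog2() for nonzero v. */
static unsigned int ilog2_u32(uint32_t v)
{
        unsigned int r = 0;
        while (v >>= 1)
                r++;
        return r;
}

static uint64_t calc_max_nsecs(uint32_t mult, uint32_t shift,
                               uint32_t maxadj, uint64_t mask)
{
        /* Largest cycle count whose cyc * (mult + maxadj) stays below 2^63. */
        uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));

        if (max_cycles > mask)          /* counter width also caps it */
                max_cycles = mask;
        /* Convert with the most pessimistic (smallest) multiplier. */
        return (max_cycles * (mult - maxadj)) >> shift;
}

int main(void)
{
        /* mult/shift such that mult / 2^shift ~= 41.67 ns per 24 MHz cycle */
        uint64_t ns = calc_max_nsecs(2796202667u, 26, 0, (1ULL << 56) - 1);

        printf("max deferment: %llu ns\n", (unsigned long long)ns); /* ~89 s */
        return 0;
}

clocksource_max_deferment() then trims a further 12.5% off the returned value, per the comment that continues below.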
Please @@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev, return count; } -size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) { size_t ret = cnt; @@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - size_t ret; + ssize_t ret; mutex_lock(&clocksource_mutex); @@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev, { struct clocksource *cs; char name[CS_NAME_LEN]; - size_t ret; + ssize_t ret; ret = sysfs_get_uname(buf, name, count); if (ret < 0) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bb2215174f05..af8d1d4f3d55 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work) * called as close as possible to 500 ms before the new second starts. * This code is run on a timer. If the clock is set, that timer * may not expire at the correct time. Thus, we adjust... + * We want the clock to be within a couple of ticks from the target. */ if (!ntp_synced()) { /* @@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work) } getnstimeofday(&now); - if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { + if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { struct timespec adjust = now; fail = -ENODEV; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0b479a6a22bb..68b799375981 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -8,25 +8,28 @@ #include <linux/clocksource.h> #include <linux/init.h> #include <linux/jiffies.h> +#include <linux/ktime.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/syscore_ops.h> -#include <linux/timer.h> +#include <linux/hrtimer.h> #include <linux/sched_clock.h> +#include <linux/seqlock.h> +#include <linux/bitops.h> struct clock_data { + ktime_t wrap_kt; u64 epoch_ns; - u32 epoch_cyc; - u32 epoch_cyc_copy; + u64 epoch_cyc; + seqcount_t seq; unsigned long rate; u32 mult; u32 shift; bool suspended; }; -static void sched_clock_poll(unsigned long wrap_ticks); -static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); +static struct hrtimer sched_clock_timer; static int irqtime = -1; core_param(irqtime, irqtime, int, 0400); @@ -35,42 +38,46 @@ static struct clock_data cd = { .mult = NSEC_PER_SEC / HZ, }; -static u32 __read_mostly sched_clock_mask = 0xffffffff; +static u64 __read_mostly sched_clock_mask; -static u32 notrace jiffy_sched_clock_read(void) +static u64 notrace jiffy_sched_clock_read(void) { - return (u32)(jiffies - INITIAL_JIFFIES); + /* + * We don't need to use get_jiffies_64 on 32-bit arches here + * because we register with BITS_PER_LONG + */ + return (u64)(jiffies - INITIAL_JIFFIES); } -static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static u32 __read_mostly (*read_sched_clock_32)(void); + +static u64 notrace read_sched_clock_32_wrapper(void) +{ + return read_sched_clock_32(); +} + +static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { return (cyc * mult) >> shift; } -static unsigned long long notrace sched_clock_32(void) +unsigned long long notrace sched_clock(void) { u64 epoch_ns; - u32 epoch_cyc; - u32 cyc; + u64 epoch_cyc; + u64 cyc; + unsigned long seq; if (cd.suspended) return cd.epoch_ns; - /* - * Load the 
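The size_t-to-ssize_t changes above (sysfs_get_uname() and its sysfs callers) fix a signedness bug: a negative errno assigned to an unsigned size_t becomes a huge positive value, so the existing "ret < 0" checks could never fire. A hypothetical userspace demonstration of why the unsigned comparison is dead code:

#include <stdio.h>
#include <sys/types.h>

static ssize_t may_fail(void)
{
        return -22;                     /* -EINVAL */
}

int main(void)
{
        size_t  ubad = may_fail();      /* wraps to a huge positive value */
        ssize_t good = may_fail();

        if (ubad < 0)                   /* always false: unsigned < 0 */
                puts("unsigned: caught");
        if (good < 0)
                puts("signed: caught"); /* only this line prints */
        return 0;
}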
epoch_cyc and epoch_ns atomically. We do this by - * ensuring that we always write epoch_cyc, epoch_ns and - * epoch_cyc_copy in strict order, and read them in strict order. - * If epoch_cyc and epoch_cyc_copy are not equal, then we're in - * the middle of an update, and we should repeat the load. - */ do { + seq = read_seqcount_begin(&cd.seq); epoch_cyc = cd.epoch_cyc; - smp_rmb(); epoch_ns = cd.epoch_ns; - smp_rmb(); - } while (epoch_cyc != cd.epoch_cyc_copy); + } while (read_seqcount_retry(&cd.seq, seq)); cyc = read_sched_clock(); cyc = (cyc - epoch_cyc) & sched_clock_mask; @@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void) static void notrace update_sched_clock(void) { unsigned long flags; - u32 cyc; + u64 cyc; u64 ns; cyc = read_sched_clock(); ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, cd.mult, cd.shift); - /* - * Write epoch_cyc and epoch_ns in a way that the update is - * detectable in cyc_to_fixed_sched_clock(). - */ + raw_local_irq_save(flags); - cd.epoch_cyc_copy = cyc; - smp_wmb(); + write_seqcount_begin(&cd.seq); cd.epoch_ns = ns; - smp_wmb(); cd.epoch_cyc = cyc; + write_seqcount_end(&cd.seq); raw_local_irq_restore(flags); } -static void sched_clock_poll(unsigned long wrap_ticks) +static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) { - mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); update_sched_clock(); + hrtimer_forward_now(hrt, cd.wrap_kt); + return HRTIMER_RESTART; } -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) +void __init sched_clock_register(u64 (*read)(void), int bits, + unsigned long rate) { - unsigned long r, w; + unsigned long r; u64 res, wrap; char r_unit; if (cd.rate > rate) return; - BUG_ON(bits > 32); WARN_ON(!irqs_disabled()); read_sched_clock = read; - sched_clock_mask = (1ULL << bits) - 1; + sched_clock_mask = CLOCKSOURCE_MASK(bits); cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns. */ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); + clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); r = rate; if (r >= 4000000) { @@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) r_unit = ' '; /* calculate how many ns until we wrap */ - wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); - do_div(wrap, NSEC_PER_MSEC); - w = wrap; + wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); + cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); /* calculate the ns resolution of this counter */ res = cyc_to_ns(1ULL, cd.mult, cd.shift); - pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", - bits, r, r_unit, res, w); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", + bits, r, r_unit, res, wrap); - /* - * Start the timer to keep sched_clock() properly updated and - * sets the initial epoch. 
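The seqcount conversion above retires the hand-rolled epoch_cyc/epoch_cyc_copy protocol in favor of the generic primitive: the writer brackets its update with write_seqcount_begin/end, and lockless readers loop until they observe a quiescent sequence. A minimal sketch of the pairing with illustrative names (seqcount writers must still be serialized externally, which sched_clock.c does by disabling interrupts):

#include <linux/seqlock.h>
#include <linux/types.h>

static seqcount_t snap_seq;             /* seqcount_init(&snap_seq) at setup */
static u64 snap_ns, snap_cyc;

static void snapshot_update(u64 ns, u64 cyc)
{
        write_seqcount_begin(&snap_seq);        /* sequence goes odd */
        snap_ns = ns;
        snap_cyc = cyc;
        write_seqcount_end(&snap_seq);          /* even again: stable */
}

static void snapshot_read(u64 *ns, u64 *cyc)
{
        unsigned long seq;

        do {
                seq = read_seqcount_begin(&snap_seq);
                *ns = snap_ns;
                *cyc = snap_cyc;
        } while (read_seqcount_retry(&snap_seq, seq));
}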
- */ - sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); update_sched_clock(); /* @@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pF as sched_clock source\n", read); } -unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; - -unsigned long long notrace sched_clock(void) +void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) { - return sched_clock_func(); + read_sched_clock_32 = read; + sched_clock_register(read_sched_clock_32_wrapper, bits, rate); } void __init sched_clock_postinit(void) @@ -180,14 +177,22 @@ void __init sched_clock_postinit(void) * make it the final one one. */ if (read_sched_clock == jiffy_sched_clock_read) - setup_sched_clock(jiffy_sched_clock_read, 32, HZ); + sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); - sched_clock_poll(sched_clock_timer.data); + update_sched_clock(); + + /* + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ + hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + sched_clock_timer.function = sched_clock_poll; + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); } static int sched_clock_suspend(void) { - sched_clock_poll(sched_clock_timer.data); + sched_clock_poll(&sched_clock_timer); cd.suspended = true; return 0; } @@ -195,7 +200,6 @@ static int sched_clock_suspend(void) static void sched_clock_resume(void) { cd.epoch_cyc = read_sched_clock(); - cd.epoch_cyc_copy = cd.epoch_cyc; cd.suspended = false; } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 218bcb565fed..9532690daaa9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev, struct clock_event_device *newdev) { if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || + (newdev->features & CLOCK_EVT_FEAT_PERCPU) || (newdev->features & CLOCK_EVT_FEAT_C3STOP)) return false; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bc906cad709b..18e71f7fbc2a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); -extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); /* * NO_HZ / high resolution timer shared code diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 947ba25a95a0..3abf53418b67 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * ktime_get_update_offsets - hrtimer helper * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset + * @offs_tai: pointer to storage for monotonic -> clock tai offset * * Returns current monotonic time and updates the offsets - * Called from hrtimer_interupt() or retrigger_next_event() + * Called from hrtimer_interrupt() or retrigger_next_event() */ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, ktime_t *offs_tai) diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 0b537f27b559..1fb08f21302e 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -298,15 
+298,15 @@ static int tstats_show(struct seq_file *m, void *v) period = ktime_to_timespec(time); ms = period.tv_nsec / 1000000; - seq_puts(m, "Timer Stats Version: v0.2\n"); + seq_puts(m, "Timer Stats Version: v0.3\n"); seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); if (atomic_read(&overflow_count)) - seq_printf(m, "Overflow: %d entries\n", - atomic_read(&overflow_count)); + seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); + seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive"); for (i = 0; i < nr_entries; i++) { entry = entries + i; - if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { seq_printf(m, "%4luD, %5d %-16s ", entry->count, entry->pid, entry->comm); } else { diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b8b8560bfb95..f785aef65799 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/time.h> #include <linux/uaccess.h> +#include <linux/list.h> #include <trace/events/block.h> @@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); + /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -107,10 +111,18 @@ record_it: * Send out a notify for this process, if we haven't done so since a trace * started */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +static void trace_note_tsk(struct task_struct *tsk) { + unsigned long flags; + struct blk_trace *bt; + tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); + spin_lock_irqsave(&running_trace_lock, flags); + list_for_each_entry(bt, &running_trace_list, running_list) { + trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, + sizeof(tsk->comm)); + } + spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, goto record_it; } + if (unlikely(tsk->btrace_seq != blktrace_seq)) + trace_note_tsk(tsk); + /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq * from coming in and stepping on our toes. 
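The blktrace rework above moves process-name notes from a single passed-in trace to a walk over a global running_trace_list, so one task gets announced to every active trace. A sketch of the underlying list-plus-spinlock pattern, with hypothetical types rather than the real struct blk_trace:

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct tracer {
        struct list_head node;
        /* ... per-trace state ... */
};

static LIST_HEAD(active_tracers);
static DEFINE_SPINLOCK(active_lock);

static void note_process(struct tracer *t, pid_t pid)
{
        /* emit a BLK_TN_PROCESS-style note for @t here */
}

static void tracer_start(struct tracer *t)
{
        spin_lock_irq(&active_lock);
        list_add(&t->node, &active_tracers);
        spin_unlock_irq(&active_lock);
}

static void tracer_stop(struct tracer *t)
{
        spin_lock_irq(&active_lock);
        list_del_init(&t->node);        /* safe against a repeated stop */
        spin_unlock_irq(&active_lock);
}

static void notify_all_tracers(pid_t pid)
{
        struct tracer *t;
        unsigned long flags;

        /* irqsave form: the notify path may run with unknown IRQ state */
        spin_lock_irqsave(&active_lock, flags);
        list_for_each_entry(t, &active_tracers, node)
                note_process(t, pid);
        spin_unlock_irqrestore(&active_lock, flags);
}

This mirrors the split visible in the patch: start/stop run in process context and take the lock with spin_lock_irq(), while the notify path uses the irqsave variant.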
*/ local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, bt->dir = dir; bt->dev = dev; atomic_set(&bt->dropped, 0); + INIT_LIST_HEAD(&bt->running_list); ret = -EIO; bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, @@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .end_lba = cbuts.end_lba, .pid = cbuts.pid, }; - memcpy(&buts.name, &cbuts.name, 32); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; - if (copy_to_user(arg, &buts.name, 32)) { + if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { blk_trace_remove(q); return -EFAULT; } @@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; + spin_lock_irq(&running_trace_lock); + list_add(&bt->running_list, &running_trace_list); + spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); + spin_lock_irq(&running_trace_lock); + list_del(&bt->running_list); + spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7974ba20557d..d9fea7dfd5d3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #endif ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | - (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | + (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0); } EXPORT_SYMBOL_GPL(tracing_generic_entry_update); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10c86fb7a2b4..73d08aa25b55 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -124,6 +124,7 @@ enum trace_flag_type { TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, }; #define TRACE_BUF_SIZE 1024 diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 80c36bcf66e8..78e27e3b52ac 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, { /* The ftrace function trace is allowed only for root. 
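A detail worth noting in the compat_blk_trace_setup() hunk above: ARRAY_SIZE(buts.name) counts elements, not bytes, so it is only interchangeable with sizeof here because name is a char array. A hedged sketch of the idiom with a hypothetical struct:

#include <linux/bug.h>
#include <linux/kernel.h>       /* ARRAY_SIZE() */
#include <linux/string.h>

struct setup_args {
        char name[32];
};

static void copy_name(struct setup_args *dst, const struct setup_args *src)
{
        /* For char[], element count == byte count; assert it anyway. */
        BUILD_BUG_ON(ARRAY_SIZE(dst->name) != sizeof(dst->name));
        /* Unlike a literal 32, this tracks the array declaration. */
        memcpy(dst->name, src->name, sizeof(dst->name));
}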
*/ if (ftrace_event_is_function(tp_event) && - perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) + perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return -EPERM; /* No tracing, just counting, so no obvious leak */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 34e7cbac0c9c..ed32284fbe32 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; - need_resched = - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + + switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | + TRACE_FLAG_PREEMPT_RESCHED)) { + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'N'; + break; + case TRACE_FLAG_NEED_RESCHED: + need_resched = 'n'; + break; + case TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'p'; + break; + default: + need_resched = '.'; + break; + } + hardsoft_irq = (hardirq && softirq) ? 'H' : hardirq ? 'h' :
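The new switch above folds the two reschedule bits into a single latency-format character. Using the flag values from the trace.h hunk (TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_PREEMPT_RESCHED = 0x20), a standalone decode that mirrors the mapping:

#include <stdio.h>

#define FLAG_NEED_RESCHED    0x04
#define FLAG_PREEMPT_RESCHED 0x20

static char resched_char(unsigned int flags)
{
        switch (flags & (FLAG_NEED_RESCHED | FLAG_PREEMPT_RESCHED)) {
        case FLAG_NEED_RESCHED | FLAG_PREEMPT_RESCHED:
                return 'N';     /* both TIF and preempt-count bit set */
        case FLAG_NEED_RESCHED:
                return 'n';     /* only the thread-info flag */
        case FLAG_PREEMPT_RESCHED:
                return 'p';     /* only the folded preempt-count bit */
        default:
                return '.';     /* neither */
        }
}

int main(void)
{
        printf("%c %c %c %c\n",
               resched_char(0x24), resched_char(0x04),
               resched_char(0x20), resched_char(0));    /* prints: N n p . */
        return 0;
}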