From 4c737b41de7f4eef2a593803bad1b918dd718b10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: cgroup: make cgroup_path() and friends behave in the style of strlcpy() cgroup_path() and friends used to format the path from the end and thus the resulting path usually didn't start at the start of the passed in buffer. Also, when the buffer was too small, the partial result was truncated from the head rather than tail and there was no way to tell how long the full path would be. These make the functions less robust and more awkward to use. With recent updates to kernfs_path(), cgroup_path() and friends can be made to behave in strlcpy() style. * cgroup_path(), cgroup_path_ns[_locked]() and task_cgroup_path() now always return the length of the full path. If buffer is too small, it contains nul terminated truncated output. * All users updated accordingly. v2: cgroup_path() usage in kernel/sched/debug.c converted. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: Serge Hallyn Cc: Peter Zijlstra --- kernel/cgroup.c | 48 ++++++++++++++++++++++-------------------------- kernel/cpuset.c | 12 ++++++------ kernel/sched/debug.c | 3 ++- 3 files changed, 30 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1c51b7f5221..3861161e460f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2315,22 +2315,18 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +static int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); - int ret; - ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; + return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); } -char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns) +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) { - char *ret; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2357,12 +2353,12 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * * Return value is the same as kernfs_path(). */ -char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; - char *path = NULL; + int ret; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); @@ -2371,16 +2367,15 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ - if (strlcpy(buf, "/", buflen) < buflen) - path = buf; + ret = strlcpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - return path; + return ret; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -5716,7 +5711,7 @@ core_initcall(cgroup_wq_init); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *path; + char *buf; int retval; struct cgroup_root *root; @@ -5759,18 +5754,18 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (!path) { + if (retval >= PATH_MAX) { retval = -ENAMETOOLONG; goto out_unlock; } + + seq_puts(m, buf); } else { - path = "/"; + seq_puts(m, "/"); } - seq_puts(m, path); - if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) seq_puts(m, " (deleted)\n"); else @@ -6035,8 +6030,9 @@ static void cgroup_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL, *path; + char *pathbuf = NULL, *agentbuf = NULL; char *argv[3], *envp[3]; + int ret; mutex_lock(&cgroup_mutex); @@ -6046,13 +6042,13 @@ static void cgroup_release_agent(struct work_struct *work) goto out; spin_lock_irq(&css_set_lock); - path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (!path) + if (ret >= PATH_MAX) goto out; argv[0] = agentbuf; - argv[1] = path; + argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c7fd2778ed50..793ae6fd96dc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2689,7 +2689,7 @@ void __cpuset_memory_pressure_bump(void) int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk) { - char *buf, *p; + char *buf; struct cgroup_subsys_state *css; int retval; @@ -2700,18 +2700,18 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, retval = -ENAMETOOLONG; css = task_get_css(tsk, cpuset_cgrp_id); - p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); + retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); css_put(css); - if (!p) + if (retval >= PATH_MAX) goto out_free; - seq_puts(m, p); + seq_puts(m, buf); seq_putc(m, '\n'); retval = 0; out_free: kfree(buf); out: - return retval; + return 0; } #endif /* CONFIG_PROC_PID_CPUSET */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a0a9995256d..23cb609ba4eb 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -410,7 +410,8 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif -- cgit v1.2.3 From ed1777de25e45bfb58fad63341904f8a77911785 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 10 Aug 2016 11:23:44 -0400 Subject: cgroup: add tracepoints for basic operations Debugging what goes wrong with cgroup setup can get hairy. Add tracepoints for cgroup hierarchy mount, cgroup creation/destruction and task migration operations for better visibility. Signed-off-by: Tejun Heo --- include/trace/events/cgroup.h | 163 ++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup.c | 25 +++++++ 2 files changed, 188 insertions(+) create mode 100644 include/trace/events/cgroup.h (limited to 'kernel') diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h new file mode 100644 index 000000000000..ab68640a18d0 --- /dev/null +++ b/include/trace/events/cgroup.h @@ -0,0 +1,163 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cgroup + +#if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CGROUP_H + +#include +#include + +DECLARE_EVENT_CLASS(cgroup_root, + + TP_PROTO(struct cgroup_root *root), + + TP_ARGS(root), + + TP_STRUCT__entry( + __field( int, root ) + __field( u16, ss_mask ) + __string( name, root->name ) + ), + + TP_fast_assign( + __entry->root = root->hierarchy_id; + __entry->ss_mask = root->subsys_mask; + __assign_str(name, root->name); + ), + + TP_printk("root=%d ss_mask=%#x name=%s", + __entry->root, __entry->ss_mask, __get_str(name)) +); + +DEFINE_EVENT(cgroup_root, cgroup_setup_root, + + TP_PROTO(struct cgroup_root *root), + + TP_ARGS(root) +); + +DEFINE_EVENT(cgroup_root, cgroup_destroy_root, + + TP_PROTO(struct cgroup_root *root), + + TP_ARGS(root) +); + +DEFINE_EVENT(cgroup_root, cgroup_remount, + + TP_PROTO(struct cgroup_root *root), + + TP_ARGS(root) +); + +DECLARE_EVENT_CLASS(cgroup, + + TP_PROTO(struct cgroup *cgrp), + + TP_ARGS(cgrp), + + TP_STRUCT__entry( + __field( int, root ) + __field( int, id ) + __field( int, level ) + __dynamic_array(char, path, + cgrp->kn ? cgroup_path(cgrp, NULL, 0) + 1 + : strlen("(null)")) + ), + + TP_fast_assign( + __entry->root = cgrp->root->hierarchy_id; + __entry->id = cgrp->id; + __entry->level = cgrp->level; + if (cgrp->kn) + cgroup_path(cgrp, __get_dynamic_array(path), + __get_dynamic_array_len(path)); + else + __assign_str(path, "(null)"); + ), + + TP_printk("root=%d id=%d level=%d path=%s", + __entry->root, __entry->id, __entry->level, __get_str(path)) +); + +DEFINE_EVENT(cgroup, cgroup_mkdir, + + TP_PROTO(struct cgroup *cgroup), + + TP_ARGS(cgroup) +); + +DEFINE_EVENT(cgroup, cgroup_rmdir, + + TP_PROTO(struct cgroup *cgroup), + + TP_ARGS(cgroup) +); + +DEFINE_EVENT(cgroup, cgroup_release, + + TP_PROTO(struct cgroup *cgroup), + + TP_ARGS(cgroup) +); + +DEFINE_EVENT(cgroup, cgroup_rename, + + TP_PROTO(struct cgroup *cgroup), + + TP_ARGS(cgroup) +); + +DECLARE_EVENT_CLASS(cgroup_migrate, + + TP_PROTO(struct cgroup *dst_cgrp, struct task_struct *task, bool threadgroup), + + TP_ARGS(dst_cgrp, task, threadgroup), + + TP_STRUCT__entry( + __field( int, dst_root ) + __field( int, dst_id ) + __field( int, dst_level ) + __dynamic_array(char, dst_path, + dst_cgrp->kn ? cgroup_path(dst_cgrp, NULL, 0) + 1 + : strlen("(null)")) + __field( int, pid ) + __string( comm, task->comm ) + ), + + TP_fast_assign( + __entry->dst_root = dst_cgrp->root->hierarchy_id; + __entry->dst_id = dst_cgrp->id; + __entry->dst_level = dst_cgrp->level; + if (dst_cgrp->kn) + cgroup_path(dst_cgrp, __get_dynamic_array(dst_path), + __get_dynamic_array_len(dst_path)); + else + __assign_str(dst_path, "(null)"); + __entry->pid = task->pid; + __assign_str(comm, task->comm); + ), + + TP_printk("dst_root=%d dst_id=%d dst_level=%d dst_path=%s pid=%d comm=%s", + __entry->dst_root, __entry->dst_id, __entry->dst_level, + __get_str(dst_path), __entry->pid, __get_str(comm)) +); + +DEFINE_EVENT(cgroup_migrate, cgroup_attach_task, + + TP_PROTO(struct cgroup *dst_cgrp, struct task_struct *task, bool threadgroup), + + TP_ARGS(dst_cgrp, task, threadgroup) +); + +DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks, + + TP_PROTO(struct cgroup *dst_cgrp, struct task_struct *task, bool threadgroup), + + TP_ARGS(dst_cgrp, task, threadgroup) +); + +#endif /* _TRACE_CGROUP_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3861161e460f..5e2e81ad9175 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -64,6 +64,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -1176,6 +1179,8 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; + trace_cgroup_destroy_root(root); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); @@ -1874,6 +1879,9 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); spin_unlock(&release_agent_path_lock); } + + trace_cgroup_remount(root); + out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -2031,6 +2039,8 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; + trace_cgroup_setup_root(root); + /* * There must be no failure case after here, since rebinding takes * care of subsystems' refcounts, which are explicitly dropped in @@ -2825,6 +2835,10 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); + + if (!ret) + trace_cgroup_attach_task(dst_cgrp, leader, threadgroup); + return ret; } @@ -3587,6 +3601,8 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) + trace_cgroup_rename(cgrp); mutex_unlock(&cgroup_mutex); @@ -4355,6 +4371,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) if (task) { ret = cgroup_migrate(task, false, to->root); + if (!ret) + trace_cgroup_transfer_tasks(to, task, false); put_task_struct(task); } } while (task && !ret); @@ -5020,6 +5038,8 @@ static void css_release_work_fn(struct work_struct *work) ss->css_released(css); } else { /* cgroup release path */ + trace_cgroup_release(cgrp); + cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5306,6 +5326,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (ret) goto out_destroy; + trace_cgroup_mkdir(cgrp); + /* let's create and online css's */ kernfs_activate(kn); @@ -5481,6 +5503,9 @@ static int cgroup_rmdir(struct kernfs_node *kn) ret = cgroup_destroy_locked(cgrp); + if (!ret) + trace_cgroup_rmdir(cgrp); + cgroup_kn_unlock(kn); return ret; } -- cgit v1.2.3 From 679a5e3f12830392d14f94b04bbe0f3cabdbd773 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Sep 2016 11:58:36 +0200 Subject: cpuset: fix error handling regression in proc_cpuset_show() 4c737b41de7f ("cgroup: make cgroup_path() and friends behave in the style of strlcpy()") botched the conversion of proc_cpuset_show() and broke its error handling. It made the function return 0 on failures and fail to handle error returns from cgroup_path_ns(). Fix it. Reported-by: Dan Carpenter Signed-off-by: Tejun Heo --- kernel/cpuset.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 793ae6fd96dc..97dd8e178786 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2698,12 +2698,13 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, if (!buf) goto out; - retval = -ENAMETOOLONG; css = task_get_css(tsk, cpuset_cgrp_id); retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX, current->nsproxy->cgroup_ns); css_put(css); if (retval >= PATH_MAX) + retval = -ENAMETOOLONG; + if (retval < 0) goto out_free; seq_puts(m, buf); seq_putc(m, '\n'); @@ -2711,7 +2712,7 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, out_free: kfree(buf); out: - return 0; + return retval; } #endif /* CONFIG_PROC_PID_CPUSET */ -- cgit v1.2.3 From e0223003e6e141533446d01a92784592a97a8552 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Sep 2016 15:49:40 +0200 Subject: cgroup: fix error handling regressions in proc_cgroup_show() and cgroup_release_agent() 4c737b41de7f ("cgroup: make cgroup_path() and friends behave in the style of strlcpy()") broke error handling in proc_cgroup_show() and cgroup_release_agent() by not handling negative return values from cgroup_path_ns_locked(). Fix it. Reported-by: Dan Carpenter Signed-off-by: Tejun Heo --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5e2e81ad9175..a7f9fb4e1fc7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5781,10 +5781,10 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, current->nsproxy->cgroup_ns); - if (retval >= PATH_MAX) { + if (retval >= PATH_MAX) retval = -ENAMETOOLONG; + if (retval < 0) goto out_unlock; - } seq_puts(m, buf); } else { @@ -6069,7 +6069,7 @@ static void cgroup_release_agent(struct work_struct *work) spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); spin_unlock_irq(&css_set_lock); - if (ret >= PATH_MAX) + if (ret < 0 || ret >= PATH_MAX) goto out; argv[0] = agentbuf; -- cgit v1.2.3 From 38addce8b600ca335dc86fa3d48c890f1c6fa1f4 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:41:19 +0200 Subject: gcc-plugins: Add latent_entropy plugin This adds a new gcc plugin named "latent_entropy". It is designed to extract as much possible uncertainty from a running system at boot time as possible, hoping to capitalize on any possible variation in CPU operation (due to runtime data differences, hardware differences, SMP ordering, thermal timing variation, cache behavior, etc). At the very least, this plugin is a much more comprehensive example for how to manipulate kernel code using the gcc plugin internals. The need for very-early boot entropy tends to be very architecture or system design specific, so this plugin is more suited for those sorts of special cases. The existing kernel RNG already attempts to extract entropy from reliable runtime variation, but this plugin takes the idea to a logical extreme by permuting a global variable based on any variation in code execution (e.g. a different value (and permutation function) is used to permute the global based on loop count, case statement, if/then/else branching, etc). To do this, the plugin starts by inserting a local variable in every marked function. The plugin then adds logic so that the value of this variable is modified by randomly chosen operations (add, xor and rol) and random values (gcc generates separate static values for each location at compile time and also injects the stack pointer at runtime). The resulting value depends on the control flow path (e.g., loops and branches taken). Before the function returns, the plugin mixes this local variable into the latent_entropy global variable. The value of this global variable is added to the kernel entropy pool in do_one_initcall() and _do_fork(), though it does not credit any bytes of entropy to the pool; the contents of the global are just used to mix the pool. Additionally, the plugin can pre-initialize arrays with build-time random contents, so that two different kernel builds running on identical hardware will not have the same starting values. Signed-off-by: Emese Revfy [kees: expanded commit message and code comments] Signed-off-by: Kees Cook --- arch/Kconfig | 18 + arch/powerpc/kernel/Makefile | 5 + include/linux/random.h | 11 + init/main.c | 1 + kernel/fork.c | 1 + mm/page_alloc.c | 5 + scripts/Makefile.gcc-plugins | 9 +- scripts/gcc-plugins/latent_entropy_plugin.c | 640 ++++++++++++++++++++++++++++ 8 files changed, 689 insertions(+), 1 deletion(-) create mode 100644 scripts/gcc-plugins/latent_entropy_plugin.c (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index fd6e9712af81..ae80cd24e8f0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -383,6 +383,24 @@ config GCC_PLUGIN_SANCOV gcc-4.5 on). It is based on the commit "Add fuzzing coverage support" by Dmitry Vyukov . +config GCC_PLUGIN_LATENT_ENTROPY + bool "Generate some entropy during boot and runtime" + depends on GCC_PLUGINS + help + By saying Y here the kernel will instrument some kernel code to + extract some entropy from both original and artificially created + program state. This will help especially embedded systems where + there is little 'natural' source of entropy normally. The cost + is some slowdown of the boot process (about 0.5%) and fork and + irq processing. + + Note that entropy extracted this way is not cryptographically + secure! + + This plugin was ported from grsecurity/PaX. More information at: + * https://grsecurity.net/ + * https://pax.grsecurity.net/ + config HAVE_CC_STACKPROTECTOR bool help diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fe4c075bcf50..62df36c3f138 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif +CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) + ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) diff --git a/include/linux/random.h b/include/linux/random.h index 3d6e9815cd85..a59c74cdb1eb 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -18,6 +18,17 @@ struct random_ready_callback { }; extern void add_device_randomness(const void *, unsigned int); + +#if defined(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) && !defined(__CHECKER__) +static inline void add_latent_entropy(void) +{ + add_device_randomness((const void *)&latent_entropy, + sizeof(latent_entropy)); +} +#else +static inline void add_latent_entropy(void) {} +#endif + extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); extern void add_interrupt_randomness(int irq, int irq_flags); diff --git a/init/main.c b/init/main.c index a8a58e2794a5..2858be732f6d 100644 --- a/init/main.c +++ b/init/main.c @@ -789,6 +789,7 @@ int __init_or_module do_one_initcall(initcall_t fn) } WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); + add_latent_entropy(); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e2..001b18473a07 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1780,6 +1780,7 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + add_latent_entropy(); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2214c64ed3c..248851d1fc86 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -91,6 +91,11 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); int _node_numa_mem_[MAX_NUMNODES]; #endif +#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY +volatile u64 latent_entropy; +EXPORT_SYMBOL(latent_entropy); +#endif + /* * Array of node states. */ diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 61f0e6db909b..060d2cb373db 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -6,6 +6,12 @@ ifdef CONFIG_GCC_PLUGINS gcc-plugin-$(CONFIG_GCC_PLUGIN_CYC_COMPLEXITY) += cyc_complexity_plugin.so + gcc-plugin-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) += latent_entropy_plugin.so + gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) += -DLATENT_ENTROPY_PLUGIN + ifdef CONFIG_PAX_LATENT_ENTROPY + DISABLE_LATENT_ENTROPY_PLUGIN += -fplugin-arg-latent_entropy_plugin-disable + endif + ifdef CONFIG_GCC_PLUGIN_SANCOV ifeq ($(CFLAGS_KCOV),) # It is needed because of the gcc-plugin.sh and gcc version checks. @@ -21,7 +27,8 @@ ifdef CONFIG_GCC_PLUGINS GCC_PLUGINS_CFLAGS := $(strip $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y)) $(gcc-plugin-cflags-y)) - export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR SANCOV_PLUGIN + export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN GCC_PLUGIN_SUBDIR + export SANCOV_PLUGIN DISABLE_LATENT_ENTROPY_PLUGIN ifneq ($(PLUGINCC),) # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication. diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c new file mode 100644 index 000000000000..ff1939b804ae --- /dev/null +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -0,0 +1,640 @@ +/* + * Copyright 2012-2016 by the PaX Team + * Copyright 2016 by Emese Revfy + * Licensed under the GPL v2 + * + * Note: the choice of the license means that the compilation process is + * NOT 'eligible' as defined by gcc's library exception to the GPL v3, + * but for the kernel it doesn't matter since it doesn't link against + * any of the gcc libraries + * + * This gcc plugin helps generate a little bit of entropy from program state, + * used throughout the uptime of the kernel. Here is an instrumentation example: + * + * before: + * void __latent_entropy test(int argc, char *argv[]) + * { + * if (argc <= 1) + * printf("%s: no command arguments :(\n", *argv); + * else + * printf("%s: %d command arguments!\n", *argv, args - 1); + * } + * + * after: + * void __latent_entropy test(int argc, char *argv[]) + * { + * // latent_entropy_execute() 1. + * unsigned long local_entropy; + * // init_local_entropy() 1. + * void *local_entropy_frameaddr; + * // init_local_entropy() 3. + * unsigned long tmp_latent_entropy; + * + * // init_local_entropy() 2. + * local_entropy_frameaddr = __builtin_frame_address(0); + * local_entropy = (unsigned long) local_entropy_frameaddr; + * + * // init_local_entropy() 4. + * tmp_latent_entropy = latent_entropy; + * // init_local_entropy() 5. + * local_entropy ^= tmp_latent_entropy; + * + * // latent_entropy_execute() 3. + * if (argc <= 1) { + * // perturb_local_entropy() + * local_entropy += 4623067384293424948; + * printf("%s: no command arguments :(\n", *argv); + * // perturb_local_entropy() + * } else { + * local_entropy ^= 3896280633962944730; + * printf("%s: %d command arguments!\n", *argv, args - 1); + * } + * + * // latent_entropy_execute() 4. + * tmp_latent_entropy = rol(tmp_latent_entropy, local_entropy); + * latent_entropy = tmp_latent_entropy; + * } + * + * TODO: + * - add ipa pass to identify not explicitly marked candidate functions + * - mix in more program state (function arguments/return values, + * loop variables, etc) + * - more instrumentation control via attribute parameters + * + * BUGS: + * - none known + * + * Options: + * -fplugin-arg-latent_entropy_plugin-disable + * + * Attribute: __attribute__((latent_entropy)) + * The latent_entropy gcc attribute can be only on functions and variables. + * If it is on a function then the plugin will instrument it. If the attribute + * is on a variable then the plugin will initialize it with a random value. + * The variable must be an integer, an integer array type or a structure + * with integer fields. + */ + +#include "gcc-common.h" + +int plugin_is_GPL_compatible; + +static GTY(()) tree latent_entropy_decl; + +static struct plugin_info latent_entropy_plugin_info = { + .version = "201606141920vanilla", + .help = "disable\tturn off latent entropy instrumentation\n", +}; + +static unsigned HOST_WIDE_INT seed; +/* + * get_random_seed() (this is a GCC function) generates the seed. + * This is a simple random generator without any cryptographic security because + * the entropy doesn't come from here. + */ +static unsigned HOST_WIDE_INT get_random_const(void) +{ + unsigned int i; + unsigned HOST_WIDE_INT ret = 0; + + for (i = 0; i < 8 * sizeof(ret); i++) { + ret = (ret << 1) | (seed & 1); + seed >>= 1; + if (ret & 1) + seed ^= 0xD800000000000000ULL; + } + + return ret; +} + +static tree tree_get_random_const(tree type) +{ + unsigned long long mask; + + mask = 1ULL << (TREE_INT_CST_LOW(TYPE_SIZE(type)) - 1); + mask = 2 * (mask - 1) + 1; + + if (TYPE_UNSIGNED(type)) + return build_int_cstu(type, mask & get_random_const()); + return build_int_cst(type, mask & get_random_const()); +} + +static tree handle_latent_entropy_attribute(tree *node, tree name, + tree args __unused, + int flags __unused, + bool *no_add_attrs) +{ + tree type; +#if BUILDING_GCC_VERSION <= 4007 + VEC(constructor_elt, gc) *vals; +#else + vec *vals; +#endif + + switch (TREE_CODE(*node)) { + default: + *no_add_attrs = true; + error("%qE attribute only applies to functions and variables", + name); + break; + + case VAR_DECL: + if (DECL_INITIAL(*node)) { + *no_add_attrs = true; + error("variable %qD with %qE attribute must not be initialized", + *node, name); + break; + } + + if (!TREE_STATIC(*node)) { + *no_add_attrs = true; + error("variable %qD with %qE attribute must not be local", + *node, name); + break; + } + + type = TREE_TYPE(*node); + switch (TREE_CODE(type)) { + default: + *no_add_attrs = true; + error("variable %qD with %qE attribute must be an integer or a fixed length integer array type or a fixed sized structure with integer fields", + *node, name); + break; + + case RECORD_TYPE: { + tree fld, lst = TYPE_FIELDS(type); + unsigned int nelt = 0; + + for (fld = lst; fld; nelt++, fld = TREE_CHAIN(fld)) { + tree fieldtype; + + fieldtype = TREE_TYPE(fld); + if (TREE_CODE(fieldtype) == INTEGER_TYPE) + continue; + + *no_add_attrs = true; + error("structure variable %qD with %qE attribute has a non-integer field %qE", + *node, name, fld); + break; + } + + if (fld) + break; + +#if BUILDING_GCC_VERSION <= 4007 + vals = VEC_alloc(constructor_elt, gc, nelt); +#else + vec_alloc(vals, nelt); +#endif + + for (fld = lst; fld; fld = TREE_CHAIN(fld)) { + tree random_const, fld_t = TREE_TYPE(fld); + + random_const = tree_get_random_const(fld_t); + CONSTRUCTOR_APPEND_ELT(vals, fld, random_const); + } + + /* Initialize the fields with random constants */ + DECL_INITIAL(*node) = build_constructor(type, vals); + break; + } + + /* Initialize the variable with a random constant */ + case INTEGER_TYPE: + DECL_INITIAL(*node) = tree_get_random_const(type); + break; + + case ARRAY_TYPE: { + tree elt_type, array_size, elt_size; + unsigned int i, nelt; + HOST_WIDE_INT array_size_int, elt_size_int; + + elt_type = TREE_TYPE(type); + elt_size = TYPE_SIZE_UNIT(TREE_TYPE(type)); + array_size = TYPE_SIZE_UNIT(type); + + if (TREE_CODE(elt_type) != INTEGER_TYPE || !array_size + || TREE_CODE(array_size) != INTEGER_CST) { + *no_add_attrs = true; + error("array variable %qD with %qE attribute must be a fixed length integer array type", + *node, name); + break; + } + + array_size_int = TREE_INT_CST_LOW(array_size); + elt_size_int = TREE_INT_CST_LOW(elt_size); + nelt = array_size_int / elt_size_int; + +#if BUILDING_GCC_VERSION <= 4007 + vals = VEC_alloc(constructor_elt, gc, nelt); +#else + vec_alloc(vals, nelt); +#endif + + for (i = 0; i < nelt; i++) { + tree cst = size_int(i); + tree rand_cst = tree_get_random_const(elt_type); + + CONSTRUCTOR_APPEND_ELT(vals, cst, rand_cst); + } + + /* + * Initialize the elements of the array with random + * constants + */ + DECL_INITIAL(*node) = build_constructor(type, vals); + break; + } + } + break; + + case FUNCTION_DECL: + break; + } + + return NULL_TREE; +} + +static struct attribute_spec latent_entropy_attr = { + .name = "latent_entropy", + .min_length = 0, + .max_length = 0, + .decl_required = true, + .type_required = false, + .function_type_required = false, + .handler = handle_latent_entropy_attribute, +#if BUILDING_GCC_VERSION >= 4007 + .affects_type_identity = false +#endif +}; + +static void register_attributes(void *event_data __unused, void *data __unused) +{ + register_attribute(&latent_entropy_attr); +} + +static bool latent_entropy_gate(void) +{ + tree list; + + /* don't bother with noreturn functions for now */ + if (TREE_THIS_VOLATILE(current_function_decl)) + return false; + + /* gcc-4.5 doesn't discover some trivial noreturn functions */ + if (EDGE_COUNT(EXIT_BLOCK_PTR_FOR_FN(cfun)->preds) == 0) + return false; + + list = DECL_ATTRIBUTES(current_function_decl); + return lookup_attribute("latent_entropy", list) != NULL_TREE; +} + +static tree create_var(tree type, const char *name) +{ + tree var; + + var = create_tmp_var(type, name); + add_referenced_var(var); + mark_sym_for_renaming(var); + return var; +} + +/* + * Set up the next operation and its constant operand to use in the latent + * entropy PRNG. When RHS is specified, the request is for perturbing the + * local latent entropy variable, otherwise it is for perturbing the global + * latent entropy variable where the two operands are already given by the + * local and global latent entropy variables themselves. + * + * The operation is one of add/xor/rol when instrumenting the local entropy + * variable and one of add/xor when perturbing the global entropy variable. + * Rotation is not used for the latter case because it would transmit less + * entropy to the global variable than the other two operations. + */ +static enum tree_code get_op(tree *rhs) +{ + static enum tree_code op; + unsigned HOST_WIDE_INT random_const; + + random_const = get_random_const(); + + switch (op) { + case BIT_XOR_EXPR: + op = PLUS_EXPR; + break; + + case PLUS_EXPR: + if (rhs) { + op = LROTATE_EXPR; + /* + * This code limits the value of random_const to + * the size of a wide int for the rotation + */ + random_const &= HOST_BITS_PER_WIDE_INT - 1; + break; + } + + case LROTATE_EXPR: + default: + op = BIT_XOR_EXPR; + break; + } + if (rhs) + *rhs = build_int_cstu(unsigned_intDI_type_node, random_const); + return op; +} + +static gimple create_assign(enum tree_code code, tree lhs, tree op1, + tree op2) +{ + return gimple_build_assign_with_ops(code, lhs, op1, op2); +} + +static void perturb_local_entropy(basic_block bb, tree local_entropy) +{ + gimple_stmt_iterator gsi; + gimple assign; + tree rhs; + enum tree_code op; + + op = get_op(&rhs); + assign = create_assign(op, local_entropy, local_entropy, rhs); + gsi = gsi_after_labels(bb); + gsi_insert_before(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); +} + +static void __perturb_latent_entropy(gimple_stmt_iterator *gsi, + tree local_entropy) +{ + gimple assign; + tree temp; + enum tree_code op; + + /* 1. create temporary copy of latent_entropy */ + temp = create_var(unsigned_intDI_type_node, "tmp_latent_entropy"); + + /* 2. read... */ + add_referenced_var(latent_entropy_decl); + mark_sym_for_renaming(latent_entropy_decl); + assign = gimple_build_assign(temp, latent_entropy_decl); + gsi_insert_before(gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 3. ...modify... */ + op = get_op(NULL); + assign = create_assign(op, temp, temp, local_entropy); + gsi_insert_after(gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 4. ...write latent_entropy */ + assign = gimple_build_assign(latent_entropy_decl, temp); + gsi_insert_after(gsi, assign, GSI_NEW_STMT); + update_stmt(assign); +} + +static bool handle_tail_calls(basic_block bb, tree local_entropy) +{ + gimple_stmt_iterator gsi; + + for (gsi = gsi_start_bb(bb); !gsi_end_p(gsi); gsi_next(&gsi)) { + gcall *call; + gimple stmt = gsi_stmt(gsi); + + if (!is_gimple_call(stmt)) + continue; + + call = as_a_gcall(stmt); + if (!gimple_call_tail_p(call)) + continue; + + __perturb_latent_entropy(&gsi, local_entropy); + return true; + } + + return false; +} + +static void perturb_latent_entropy(tree local_entropy) +{ + edge_iterator ei; + edge e, last_bb_e; + basic_block last_bb; + + gcc_assert(single_pred_p(EXIT_BLOCK_PTR_FOR_FN(cfun))); + last_bb_e = single_pred_edge(EXIT_BLOCK_PTR_FOR_FN(cfun)); + + FOR_EACH_EDGE(e, ei, last_bb_e->src->preds) { + if (ENTRY_BLOCK_PTR_FOR_FN(cfun) == e->src) + continue; + if (EXIT_BLOCK_PTR_FOR_FN(cfun) == e->src) + continue; + + handle_tail_calls(e->src, local_entropy); + } + + last_bb = single_pred(EXIT_BLOCK_PTR_FOR_FN(cfun)); + if (!handle_tail_calls(last_bb, local_entropy)) { + gimple_stmt_iterator gsi = gsi_last_bb(last_bb); + + __perturb_latent_entropy(&gsi, local_entropy); + } +} + +static void init_local_entropy(basic_block bb, tree local_entropy) +{ + gimple assign, call; + tree frame_addr, rand_const, tmp, fndecl, udi_frame_addr; + enum tree_code op; + unsigned HOST_WIDE_INT rand_cst; + gimple_stmt_iterator gsi = gsi_after_labels(bb); + + /* 1. create local_entropy_frameaddr */ + frame_addr = create_var(ptr_type_node, "local_entropy_frameaddr"); + + /* 2. local_entropy_frameaddr = __builtin_frame_address() */ + fndecl = builtin_decl_implicit(BUILT_IN_FRAME_ADDRESS); + call = gimple_build_call(fndecl, 1, integer_zero_node); + gimple_call_set_lhs(call, frame_addr); + gsi_insert_before(&gsi, call, GSI_NEW_STMT); + update_stmt(call); + + udi_frame_addr = fold_convert(unsigned_intDI_type_node, frame_addr); + assign = gimple_build_assign(local_entropy, udi_frame_addr); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 3. create temporary copy of latent_entropy */ + tmp = create_var(unsigned_intDI_type_node, "tmp_latent_entropy"); + + /* 4. read the global entropy variable into local entropy */ + add_referenced_var(latent_entropy_decl); + mark_sym_for_renaming(latent_entropy_decl); + assign = gimple_build_assign(tmp, latent_entropy_decl); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + /* 5. mix local_entropy_frameaddr into local entropy */ + assign = create_assign(BIT_XOR_EXPR, local_entropy, local_entropy, tmp); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); + + rand_cst = get_random_const(); + rand_const = build_int_cstu(unsigned_intDI_type_node, rand_cst); + op = get_op(NULL); + assign = create_assign(op, local_entropy, local_entropy, rand_const); + gsi_insert_after(&gsi, assign, GSI_NEW_STMT); + update_stmt(assign); +} + +static bool create_latent_entropy_decl(void) +{ + varpool_node_ptr node; + + if (latent_entropy_decl != NULL_TREE) + return true; + + FOR_EACH_VARIABLE(node) { + tree name, var = NODE_DECL(node); + + if (DECL_NAME_LENGTH(var) < sizeof("latent_entropy") - 1) + continue; + + name = DECL_NAME(var); + if (strcmp(IDENTIFIER_POINTER(name), "latent_entropy")) + continue; + + latent_entropy_decl = var; + break; + } + + return latent_entropy_decl != NULL_TREE; +} + +static unsigned int latent_entropy_execute(void) +{ + basic_block bb; + tree local_entropy; + + if (!create_latent_entropy_decl()) + return 0; + + /* prepare for step 2 below */ + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + if (!single_pred_p(bb)) { + split_edge(single_succ_edge(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + gcc_assert(single_succ_p(ENTRY_BLOCK_PTR_FOR_FN(cfun))); + bb = single_succ(ENTRY_BLOCK_PTR_FOR_FN(cfun)); + } + + /* 1. create the local entropy variable */ + local_entropy = create_var(unsigned_intDI_type_node, "local_entropy"); + + /* 2. initialize the local entropy variable */ + init_local_entropy(bb, local_entropy); + + bb = bb->next_bb; + + /* + * 3. instrument each BB with an operation on the + * local entropy variable + */ + while (bb != EXIT_BLOCK_PTR_FOR_FN(cfun)) { + perturb_local_entropy(bb, local_entropy); + bb = bb->next_bb; + }; + + /* 4. mix local entropy into the global entropy variable */ + perturb_latent_entropy(local_entropy); + return 0; +} + +static void latent_entropy_start_unit(void *gcc_data __unused, + void *user_data __unused) +{ + tree type, id; + int quals; + + seed = get_random_seed(false); + + if (in_lto_p) + return; + + /* extern volatile u64 latent_entropy */ + gcc_assert(TYPE_PRECISION(long_long_unsigned_type_node) == 64); + quals = TYPE_QUALS(long_long_unsigned_type_node) | TYPE_QUAL_VOLATILE; + type = build_qualified_type(long_long_unsigned_type_node, quals); + id = get_identifier("latent_entropy"); + latent_entropy_decl = build_decl(UNKNOWN_LOCATION, VAR_DECL, id, type); + + TREE_STATIC(latent_entropy_decl) = 1; + TREE_PUBLIC(latent_entropy_decl) = 1; + TREE_USED(latent_entropy_decl) = 1; + DECL_PRESERVE_P(latent_entropy_decl) = 1; + TREE_THIS_VOLATILE(latent_entropy_decl) = 1; + DECL_EXTERNAL(latent_entropy_decl) = 1; + DECL_ARTIFICIAL(latent_entropy_decl) = 1; + lang_hooks.decls.pushdecl(latent_entropy_decl); +} + +#define PASS_NAME latent_entropy +#define PROPERTIES_REQUIRED PROP_gimple_leh | PROP_cfg +#define TODO_FLAGS_FINISH TODO_verify_ssa | TODO_verify_stmts | TODO_dump_func \ + | TODO_update_ssa +#include "gcc-generate-gimple-pass.h" + +int plugin_init(struct plugin_name_args *plugin_info, + struct plugin_gcc_version *version) +{ + bool enabled = true; + const char * const plugin_name = plugin_info->base_name; + const int argc = plugin_info->argc; + const struct plugin_argument * const argv = plugin_info->argv; + int i; + + struct register_pass_info latent_entropy_pass_info; + + latent_entropy_pass_info.pass = make_latent_entropy_pass(); + latent_entropy_pass_info.reference_pass_name = "optimized"; + latent_entropy_pass_info.ref_pass_instance_number = 1; + latent_entropy_pass_info.pos_op = PASS_POS_INSERT_BEFORE; + static const struct ggc_root_tab gt_ggc_r_gt_latent_entropy[] = { + { + .base = &latent_entropy_decl, + .nelt = 1, + .stride = sizeof(latent_entropy_decl), + .cb = >_ggc_mx_tree_node, + .pchw = >_pch_nx_tree_node + }, + LAST_GGC_ROOT_TAB + }; + + if (!plugin_default_version_check(version, &gcc_version)) { + error(G_("incompatible gcc/plugin versions")); + return 1; + } + + for (i = 0; i < argc; ++i) { + if (!(strcmp(argv[i].key, "disable"))) { + enabled = false; + continue; + } + error(G_("unkown option '-fplugin-arg-%s-%s'"), plugin_name, argv[i].key); + } + + register_callback(plugin_name, PLUGIN_INFO, NULL, + &latent_entropy_plugin_info); + if (enabled) { + register_callback(plugin_name, PLUGIN_START_UNIT, + &latent_entropy_start_unit, NULL); + register_callback(plugin_name, PLUGIN_REGISTER_GGC_ROOTS, + NULL, (void *)>_ggc_r_gt_latent_entropy); + register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, + &latent_entropy_pass_info); + } + register_callback(plugin_name, PLUGIN_ATTRIBUTES, register_attributes, + NULL); + + return 0; +} -- cgit v1.2.3 From 0766f788eb727e2e330d55d30545db65bcf2623f Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Mon, 20 Jun 2016 20:42:34 +0200 Subject: latent_entropy: Mark functions with __latent_entropy The __latent_entropy gcc attribute can be used only on functions and variables. If it is on a function then the plugin will instrument it for gathering control-flow entropy. If the attribute is on a variable then the plugin will initialize it with random contents. The variable must be an integer, an integer array type or a structure with integer fields. These specific functions have been selected because they are init functions (to help gather boot-time entropy), are called at unpredictable times, or they have variable loops, each of which provide some level of latent entropy. Signed-off-by: Emese Revfy [kees: expanded commit message] Signed-off-by: Kees Cook --- block/blk-softirq.c | 2 +- drivers/char/random.c | 4 ++-- fs/namespace.c | 1 + include/linux/compiler-gcc.h | 7 +++++++ include/linux/compiler.h | 4 ++++ include/linux/fdtable.h | 2 +- include/linux/genhd.h | 2 +- include/linux/init.h | 5 +++-- include/linux/random.h | 4 ++-- kernel/fork.c | 6 ++++-- kernel/rcu/tiny.c | 2 +- kernel/rcu/tree.c | 2 +- kernel/sched/fair.c | 2 +- kernel/softirq.c | 4 ++-- kernel/time/timer.c | 2 +- lib/irq_poll.c | 2 +- lib/random32.c | 2 +- mm/page_alloc.c | 2 +- net/core/dev.c | 4 ++-- 19 files changed, 37 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 53b1737e978d..489eab825a8a 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -18,7 +18,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); * Softirq action handler - move entries to local list and loop over them * while passing them to the queue registered handler. */ -static void blk_done_softirq(struct softirq_action *h) +static __latent_entropy void blk_done_softirq(struct softirq_action *h) { struct list_head *cpu_list, local_list; diff --git a/drivers/char/random.c b/drivers/char/random.c index 3efb3bf0ab83..7274ae89ddb3 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -479,8 +479,8 @@ static ssize_t _extract_entropy(struct entropy_store *r, void *buf, static void crng_reseed(struct crng_state *crng, struct entropy_store *r); static void push_to_pool(struct work_struct *work); -static __u32 input_pool_data[INPUT_POOL_WORDS]; -static __u32 blocking_pool_data[OUTPUT_POOL_WORDS]; +static __u32 input_pool_data[INPUT_POOL_WORDS] __latent_entropy; +static __u32 blocking_pool_data[OUTPUT_POOL_WORDS] __latent_entropy; static struct entropy_store input_pool = { .poolinfo = &poolinfo_table[0], diff --git a/fs/namespace.c b/fs/namespace.c index 7bb2cda3bfef..4a9568b81138 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2759,6 +2759,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) return new_ns; } +__latent_entropy struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 573c5a18908f..432f5c97e18f 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -188,6 +188,13 @@ #endif /* GCC_VERSION >= 40300 */ #if GCC_VERSION >= 40500 + +#ifndef __CHECKER__ +#ifdef LATENT_ENTROPY_PLUGIN +#define __latent_entropy __attribute__((latent_entropy)) +#endif +#endif + /* * Mark a position in code as unreachable. This can be used to * suppress control flow warnings after asm blocks that transfer diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 668569844d37..ceaddaf76ff1 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -406,6 +406,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s # define __attribute_const__ /* unimplemented */ #endif +#ifndef __latent_entropy +# define __latent_entropy +#endif + /* * Tell gcc if a function is cold. The compiler will assume any path * directly leading to the call is unlikely. diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 5295535b60c6..9852c7e33466 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -105,7 +105,7 @@ struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); int unshare_files(struct files_struct **); -struct files_struct *dup_fd(struct files_struct *, int *); +struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 1dbf52f9c24b..e0341af6950e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -437,7 +437,7 @@ extern void disk_flush_events(struct gendisk *disk, unsigned int mask); extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); /* drivers/char/random.c */ -extern void add_disk_randomness(struct gendisk *disk); +extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; extern void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) diff --git a/include/linux/init.h b/include/linux/init.h index 6935d02474aa..1e5c131d5c9a 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -39,7 +39,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold notrace +#define __init __section(.init.text) __cold notrace __latent_entropy #define __initdata __section(.init.data) #define __initconst __constsection(.init.rodata) #define __exitdata __section(.exit.data) @@ -86,7 +86,8 @@ #define __exit __section(.exit.text) __exitused __cold notrace /* Used for MEMORY_HOTPLUG */ -#define __meminit __section(.meminit.text) __cold notrace +#define __meminit __section(.meminit.text) __cold notrace \ + __latent_entropy #define __meminitdata __section(.meminit.data) #define __meminitconst __constsection(.meminit.rodata) #define __memexit __section(.memexit.text) __exitused __cold notrace diff --git a/include/linux/random.h b/include/linux/random.h index a59c74cdb1eb..d80a4388a4fd 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -30,8 +30,8 @@ static inline void add_latent_entropy(void) {} #endif extern void add_input_randomness(unsigned int type, unsigned int code, - unsigned int value); -extern void add_interrupt_randomness(int irq, int irq_flags); + unsigned int value) __latent_entropy; +extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; extern void get_random_bytes(void *buf, int nbytes); extern int add_random_ready_callback(struct random_ready_callback *rdy); diff --git a/kernel/fork.c b/kernel/fork.c index 001b18473a07..05393881ef39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -404,7 +404,8 @@ free_tsk: } #ifdef CONFIG_MMU -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; @@ -1296,7 +1297,8 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static struct task_struct *copy_process(unsigned long clone_flags, +static __latent_entropy struct task_struct *copy_process( + unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 944b1b491ed8..1898559e6b60 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -170,7 +170,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) false)); } -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5d80925e7fc8..e5164deb51e1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3013,7 +3013,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* * Do RCU core processing for the current CPU. */ -static void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 039de34f1521..004996df2f10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8283,7 +8283,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). */ -static void run_rebalance_domains(struct softirq_action *h) +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? diff --git a/kernel/softirq.c b/kernel/softirq.c index 17caf4b63342..34033fd09c8c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -482,7 +482,7 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule_first); -static void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -518,7 +518,7 @@ static void tasklet_action(struct softirq_action *a) } } -static void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 32bf6f75a8fe..2d47980a1bc4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1633,7 +1633,7 @@ static inline void __run_timers(struct timer_base *base) /* * This function runs timers and the timer-tq in bottom half context. */ -static void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 836f7db4e548..63be7495dbb2 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -74,7 +74,7 @@ void irq_poll_complete(struct irq_poll *iop) } EXPORT_SYMBOL(irq_poll_complete); -static void irq_poll_softirq(struct softirq_action *h) +static void __latent_entropy irq_poll_softirq(struct softirq_action *h) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = irq_poll_budget; diff --git a/lib/random32.c b/lib/random32.c index 69ed593aab07..a30923573676 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -47,7 +47,7 @@ static inline void prandom_state_selftest(void) } #endif -static DEFINE_PER_CPU(struct rnd_state, net_rand_state); +static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; /** * prandom_u32_state - seeded pseudo-random number generator. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 248851d1fc86..901121af4e54 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -92,7 +92,7 @@ int _node_numa_mem_[MAX_NUMNODES]; #endif #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY -volatile u64 latent_entropy; +volatile u64 latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); #endif diff --git a/net/core/dev.c b/net/core/dev.c index ea6312057a71..ee076c2791f9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3855,7 +3855,7 @@ int netif_rx_ni(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx_ni); -static void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5187,7 +5187,7 @@ out_unlock: return work; } -static void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + 2; -- cgit v1.2.3 From 9cfb38a7ba5a9c27c1af8093fb1af4b699c0a441 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 9 Oct 2016 08:04:03 +0800 Subject: sched/fair: Fix sched domains NULL dereference in select_idle_sibling() Commit: 10e2f1acd01 ("sched/core: Rewrite and improve select_idle_siblings()") ... improved select_idle_sibling(), but also triggered a regression (crash) during CPU-hotplug: BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 IP: [] select_idle_sibling+0x1c2/0x4f0 Call Trace: select_task_rq_fair+0x749/0x930 ? select_task_rq_fair+0xb4/0x930 ? __lock_is_held+0x54/0x70 try_to_wake_up+0x19a/0x5b0 default_wake_function+0x12/0x20 autoremove_wake_function+0x12/0x40 __wake_up_common+0x55/0x90 __wake_up+0x39/0x50 wake_up_klogd_work_func+0x40/0x60 irq_work_run_list+0x57/0x80 irq_work_run+0x2c/0x30 smp_irq_work_interrupt+0x2e/0x40 irq_work_interrupt+0x96/0xa0 ? _raw_spin_unlock_irqrestore+0x45/0x80 try_to_wake_up+0x4a/0x5b0 wake_up_state+0x10/0x20 __kthread_unpark+0x67/0x70 kthread_unpark+0x22/0x30 cpuhp_online_idle+0x3e/0x70 cpu_startup_entry+0x6a/0x450 start_secondary+0x154/0x180 This can be reproduced by running the ftrace test case of kselftest, the test case will hot-unplug the CPU and the CPU will attach to the NULL sched-domain during scheduler teardown. The step 2 for the rewrite select_idle_siblings(): | Step 2) tracks the average cost of the scan and compares this to the | average idle time guestimate for the CPU doing the wakeup. If the CPU which doing the wakeup is the going hot-unplug CPU, then NULL sched domain will be dereferenced to acquire the average cost of the scan. This patch fix it by failing the search of an idle CPU in the LLC process if this sched domain is NULL. Tested-by: Catalin Marinas Signed-off-by: Wanpeng Li Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1475971443-3187-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 502e95a6e927..8b03fb5d1b9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5471,13 +5471,18 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { - struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - u64 avg_idle = this_rq()->avg_idle; - u64 avg_cost = this_sd->avg_scan_cost; + struct sched_domain *this_sd; + u64 avg_cost, avg_idle = this_rq()->avg_idle; u64 time, cost; s64 delta; int cpu, wrap; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + + avg_cost = this_sd->avg_scan_cost; + /* * Due to large variance we need a large fuzz factor; hackbench in * particularly is sensitive here. -- cgit v1.2.3 From a705e07b9c80df27b6bb12f7a4cd4cf4ed2f728b Mon Sep 17 00:00:00 2001 From: Joonas Lahtinen Date: Wed, 12 Oct 2016 13:18:56 +0300 Subject: cpu/hotplug: Use distinct name for cpu_hotplug.dep_map Use distinctive name for cpu_hotplug.dep_map to avoid the actual cpu_hotplug.lock appearing as cpu_hotplug.lock#2 in lockdep splats. Signed-off-by: Joonas Lahtinen Reviewed-by: Chris Wilson Acked-by: Gautham R. Shenoy Cc: Andrew Morton Cc: Daniel Vetter Cc: Gautham R . Shenoy Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: intel-gfx@lists.freedesktop.org Cc: trivial@kernel.org Signed-off-by: Ingo Molnar --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5df20d6d1520..29de1a9352c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,7 +228,7 @@ static struct { .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq), .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), #ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = {.name = "cpu_hotplug.lock" }, + .dep_map = STATIC_LOCKDEP_MAP_INIT("cpu_hotplug.dep_map", &cpu_hotplug.dep_map), #endif }; -- cgit v1.2.3 From 54e23845e965898f65f76aba79fa9db76d830fa9 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 17 Oct 2016 11:47:02 +0200 Subject: alarmtimer: Remove unused but set variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the set but unused variable base in alarm_clock_get to fix the following warning when building with 'W=1': kernel/time/alarmtimer.c: In function ‘alarm_timer_create’: kernel/time/alarmtimer.c:545:21: warning: variable ‘base’ set but not used [-Wunused-but-set-variable] Signed-off-by: Tobias Klauser Cc: John Stultz Link: http://lkml.kernel.org/r/20161017094702.10873-1-tklauser@distanz.ch Signed-off-by: Thomas Gleixner --- kernel/time/alarmtimer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c3aad685bbc0..12dd190634ab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -542,7 +542,6 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) static int alarm_timer_create(struct k_itimer *new_timer) { enum alarmtimer_type type; - struct alarm_base *base; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -551,7 +550,6 @@ static int alarm_timer_create(struct k_itimer *new_timer) return -EPERM; type = clock2alarm(new_timer->it_clock); - base = &alarm_bases[type]; alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); return 0; } -- cgit v1.2.3 From b5a9b340789b2b24c6896bcf7a065c31a4db671c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 19 Oct 2016 14:45:23 +0200 Subject: sched/fair: Fix incorrect task group ->load_avg A scheduler performance regression has been reported by Joseph Salisbury, which he bisected back to: 3d30544f0212 ("sched/fair: Apply more PELT fixes) The regression triggers when several levels of task groups are involved (read: SystemD) and cpu_possible_mask != cpu_present_mask. The root cause is that group entity's load (tg_child->se[i]->avg.load_avg) is initialized to scale_load_down(se->load.weight). During the creation of a child task group, its group entities on possible CPUs are attached to parent's cfs_rq (tg_parent) and their loads are added to the parent's load (tg_parent->load_avg) with update_tg_load_avg(). But only the load on online CPUs will then be updated to reflect real load, whereas load on other CPUs will stay at the initial value. The result is a tg_parent->load_avg that is higher than the real load, the weight of group entities (tg_parent->se[i]->load.weight) on online CPUs is smaller than it should be, and the task group gets a less running time than what it could expect. ( This situation can be detected with /proc/sched_debug. The ".tg_load_avg" of the task group will be much higher than sum of ".tg_load_avg_contrib" of online cfs_rqs of the task group. ) The load of group entities don't have to be intialized to something else than 0 because their load will increase when an entity is attached. Reported-by: Joseph Salisbury Tested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Acked-by: Peter Zijlstra Cc: # 4.8.x Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: joonwoop@codeaurora.org Fixes: 3d30544f0212 ("sched/fair: Apply more PELT fixes) Link: http://lkml.kernel.org/r/1476881123-10159-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76ee7de1859d..d941c97dfbc3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are intialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are intialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway -- cgit v1.2.3 From 9beae1ea89305a9667ceaab6d0bf46a045ad71e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:17 +0100 Subject: mm: replace get_user_pages_remote() write/force parameters with gup_flags This removes the 'write' and 'force' from get_user_pages_remote() and replaces them with 'gup_flags' to make the use of FOLL_FORCE explicit in callers as use of this flag can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Reviewed-by: Jan Kara Signed-off-by: Linus Torvalds --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 7 +++++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 6 +++++- drivers/infiniband/core/umem_odp.c | 7 +++++-- fs/exec.c | 9 +++++++-- include/linux/mm.h | 2 +- kernel/events/uprobes.c | 6 ++++-- mm/gup.c | 22 +++++++--------------- mm/memory.c | 6 +++++- security/tomoyo/domain.c | 2 +- 9 files changed, 40 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 5ce3603e6eac..0370b842d9cc 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -748,19 +748,22 @@ static struct page **etnaviv_gem_userptr_do_get_pages( int ret = 0, pinned, npages = etnaviv_obj->base.size >> PAGE_SHIFT; struct page **pvec; uintptr_t ptr; + unsigned int flags = 0; pvec = drm_malloc_ab(npages, sizeof(struct page *)); if (!pvec) return ERR_PTR(-ENOMEM); + if (!etnaviv_obj->userptr.ro) + flags |= FOLL_WRITE; + pinned = 0; ptr = etnaviv_obj->userptr.ptr; down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - !etnaviv_obj->userptr.ro, 0, - pvec + pinned, NULL); + flags, pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index e537930c64b5..c6f780f5abc9 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -508,6 +508,10 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) pvec = drm_malloc_gfp(npages, sizeof(struct page *), GFP_TEMPORARY); if (pvec != NULL) { struct mm_struct *mm = obj->userptr.mm->mm; + unsigned int flags = 0; + + if (!obj->userptr.read_only) + flags |= FOLL_WRITE; ret = -EFAULT; if (atomic_inc_not_zero(&mm->mm_users)) { @@ -517,7 +521,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) (work->task, mm, obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, - !obj->userptr.read_only, 0, + flags, pvec + pinned, NULL); if (ret < 0) break; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 75077a018675..1f0fe3217f23 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -527,6 +527,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, u64 off; int j, k, ret = 0, start_idx, npages = 0; u64 base_virt_addr; + unsigned int flags = 0; if (access_mask == 0) return -EINVAL; @@ -556,6 +557,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, goto out_put_task; } + if (access_mask & ODP_WRITE_ALLOWED_BIT) + flags |= FOLL_WRITE; + start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; k = start_idx; @@ -574,8 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - access_mask & ODP_WRITE_ALLOWED_BIT, - 0, local_page_list, NULL); + flags, local_page_list, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/fs/exec.c b/fs/exec.c index 6fcfb3f7b137..4e497b9ee71e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; + unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -199,12 +200,16 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif + + if (write) + gup_flags |= FOLL_WRITE; + /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ - ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, - 1, &page, NULL); + ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, + &page, NULL); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index 30bb5d9631bb..ecc4be7b67e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1276,7 +1276,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct **vmas, int *nonblocking); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d4129bb05e5d..f9ec9add2164 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -300,7 +300,8 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, + &vma); if (ret <= 0) return ret; @@ -1710,7 +1711,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * but we treat this as a 'remote' access since it is * essentially a kernel access to the memory. */ - result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, + NULL); if (result < 0) return result; diff --git a/mm/gup.c b/mm/gup.c index bbc2e76cdd77..7aa113c2d373 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -915,9 +915,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force access even when user mapping is currently - * protected (but never forces write access to shared mapping). + * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. @@ -946,9 +944,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. + * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page + * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must + * be called after the page is finished with, and before put_page is called. * * get_user_pages is typically used for fewer-copy IO operations, to get a * handle on the memory by some means other than accesses via the user virtual @@ -965,18 +963,12 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, + unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { - unsigned int flags = FOLL_TOUCH | FOLL_REMOTE; - - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, flags); + NULL, false, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); diff --git a/mm/memory.c b/mm/memory.c index fc1987dfd8cc..20a9adb7b36e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3873,6 +3873,10 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, { struct vm_area_struct *vma; void *old_buf = buf; + unsigned int flags = FOLL_FORCE; + + if (write) + flags |= FOLL_WRITE; down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ @@ -3882,7 +3886,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - write, 1, &page, &vma); + flags, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index ade7c6cad172..682b73af7766 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). */ if (get_user_pages_remote(current, bprm->mm, pos, 1, - 0, 1, &page, NULL) <= 0) + FOLL_FORCE, &page, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; -- cgit v1.2.3 From f307ab6dcea03f9d8e4d70508fd7d1ca57cfa7f9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 13 Oct 2016 01:20:20 +0100 Subject: mm: replace access_process_vm() write parameter with gup_flags This removes the 'write' argument from access_process_vm() and replaces it with 'gup_flags' as use of this function previously silently implied FOLL_FORCE, whereas after this patch callers explicitly pass this flag. We make this explicit as use of FOLL_FORCE can result in surprising behaviour (and hence bugs) within the mm subsystem. Signed-off-by: Lorenzo Stoakes Acked-by: Jesper Nilsson Acked-by: Michal Hocko Acked-by: Michael Ellerman Signed-off-by: Linus Torvalds --- arch/alpha/kernel/ptrace.c | 9 ++++++--- arch/blackfin/kernel/ptrace.c | 5 +++-- arch/cris/arch-v32/kernel/ptrace.c | 4 ++-- arch/ia64/kernel/ptrace.c | 14 +++++++++----- arch/m32r/kernel/ptrace.c | 15 ++++++++++----- arch/mips/kernel/ptrace32.c | 5 +++-- arch/powerpc/kernel/ptrace32.c | 5 +++-- arch/score/kernel/ptrace.c | 10 ++++++---- arch/sparc/kernel/ptrace_64.c | 24 ++++++++++++++++-------- arch/x86/kernel/step.c | 3 ++- arch/x86/um/ptrace_32.c | 3 ++- arch/x86/um/ptrace_64.c | 3 ++- include/linux/mm.h | 3 ++- kernel/ptrace.c | 16 ++++++++++------ mm/memory.c | 8 ++------ mm/nommu.c | 6 +++--- mm/util.c | 5 +++-- 17 files changed, 84 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index d9ee81769899..940dfb406591 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -157,14 +157,16 @@ put_reg(struct task_struct *task, unsigned long regno, unsigned long data) static inline int read_int(struct task_struct *task, unsigned long addr, int * data) { - int copied = access_process_vm(task, addr, data, sizeof(int), 0); + int copied = access_process_vm(task, addr, data, sizeof(int), + FOLL_FORCE); return (copied == sizeof(int)) ? 0 : -EIO; } static inline int write_int(struct task_struct *task, unsigned long addr, int data) { - int copied = access_process_vm(task, addr, &data, sizeof(int), 1); + int copied = access_process_vm(task, addr, &data, sizeof(int), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(int)) ? 0 : -EIO; } @@ -281,7 +283,8 @@ long arch_ptrace(struct task_struct *child, long request, /* When I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. */ case PTRACE_PEEKDATA: - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), + FOLL_FORCE); ret = -EIO; if (copied != sizeof(tmp)) break; diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index 8b8fe671b1a6..8d79286ee4e8 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -271,7 +271,7 @@ long arch_ptrace(struct task_struct *child, long request, case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: copied = access_process_vm(child, addr, &tmp, - to_copy, 0); + to_copy, FOLL_FORCE); if (copied) break; @@ -324,7 +324,8 @@ long arch_ptrace(struct task_struct *child, long request, case BFIN_MEM_ACCESS_CORE: case BFIN_MEM_ACCESS_CORE_ONLY: copied = access_process_vm(child, addr, &data, - to_copy, 1); + to_copy, + FOLL_FORCE | FOLL_WRITE); break; case BFIN_MEM_ACCESS_DMA: if (safe_dma_memcpy(paddr, &data, to_copy)) diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c index f085229cf870..f0df654ac6fc 100644 --- a/arch/cris/arch-v32/kernel/ptrace.c +++ b/arch/cris/arch-v32/kernel/ptrace.c @@ -147,7 +147,7 @@ long arch_ptrace(struct task_struct *child, long request, /* The trampoline page is globally mapped, no page table to traverse.*/ tmp = *(unsigned long*)addr; } else { - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(child, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; @@ -279,7 +279,7 @@ static int insn_size(struct task_struct *child, unsigned long pc) int opsize = 0; /* Read the opcode at pc (do what PTRACE_PEEKTEXT would do). */ - copied = access_process_vm(child, pc, &opcode, sizeof(opcode), 0); + copied = access_process_vm(child, pc, &opcode, sizeof(opcode), FOLL_FORCE); if (copied != sizeof(opcode)) return 0; diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 6f54d511cc50..31aa8c0f68e1 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -453,7 +453,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack, return 0; } } - copied = access_process_vm(child, addr, &ret, sizeof(ret), 0); + copied = access_process_vm(child, addr, &ret, sizeof(ret), FOLL_FORCE); if (copied != sizeof(ret)) return -EIO; *val = ret; @@ -489,7 +489,8 @@ ia64_poke (struct task_struct *child, struct switch_stack *child_stack, *ia64_rse_skip_regs(krbs, regnum) = val; } } - } else if (access_process_vm(child, addr, &val, sizeof(val), 1) + } else if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE) != sizeof(val)) return -EIO; return 0; @@ -543,7 +544,8 @@ ia64_sync_user_rbs (struct task_struct *child, struct switch_stack *sw, ret = ia64_peek(child, sw, user_rbs_end, addr, &val); if (ret < 0) return ret; - if (access_process_vm(child, addr, &val, sizeof(val), 1) + if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE) != sizeof(val)) return -EIO; } @@ -559,7 +561,8 @@ ia64_sync_kernel_rbs (struct task_struct *child, struct switch_stack *sw, /* now copy word for word from user rbs to kernel rbs: */ for (addr = user_rbs_start; addr < user_rbs_end; addr += 8) { - if (access_process_vm(child, addr, &val, sizeof(val), 0) + if (access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE) != sizeof(val)) return -EIO; @@ -1156,7 +1159,8 @@ arch_ptrace (struct task_struct *child, long request, case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: /* read word at location addr */ - if (access_process_vm(child, addr, &data, sizeof(data), 0) + if (access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE) != sizeof(data)) return -EIO; /* ensure return value is not mistaken for error code */ diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index 51f5e9aa4901..c145605a981f 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -493,7 +493,8 @@ unregister_all_debug_traps(struct task_struct *child) int i; for (i = 0; i < p->nr_trap; i++) - access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), 1); + access_process_vm(child, p->addr[i], &p->insn[i], sizeof(p->insn[i]), + FOLL_FORCE | FOLL_WRITE); p->nr_trap = 0; } @@ -537,7 +538,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc) unsigned long next_insn, code; unsigned long addr = next_pc & ~3; - if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), 0) + if (access_process_vm(child, addr, &next_insn, sizeof(next_insn), + FOLL_FORCE) != sizeof(next_insn)) { return -1; /* error */ } @@ -546,7 +548,8 @@ embed_debug_trap(struct task_struct *child, unsigned long next_pc) if (register_debug_trap(child, next_pc, next_insn, &code)) { return -1; /* error */ } - if (access_process_vm(child, addr, &code, sizeof(code), 1) + if (access_process_vm(child, addr, &code, sizeof(code), + FOLL_FORCE | FOLL_WRITE) != sizeof(code)) { return -1; /* error */ } @@ -562,7 +565,8 @@ withdraw_debug_trap(struct pt_regs *regs) addr = (regs->bpc - 2) & ~3; regs->bpc -= 2; if (unregister_debug_trap(current, addr, &code)) { - access_process_vm(current, addr, &code, sizeof(code), 1); + access_process_vm(current, addr, &code, sizeof(code), + FOLL_FORCE | FOLL_WRITE); invalidate_cache(); } } @@ -589,7 +593,8 @@ void user_enable_single_step(struct task_struct *child) /* Compute next pc. */ pc = get_stack_long(child, PT_BPC); - if (access_process_vm(child, pc&~3, &insn, sizeof(insn), 0) + if (access_process_vm(child, pc&~3, &insn, sizeof(insn), + FOLL_FORCE) != sizeof(insn)) return; diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 283b5a1967d1..7e71a4e0281b 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -70,7 +70,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; copied = access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 0); + sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; ret = put_user(tmp, (u32 __user *) (unsigned long) data); @@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; ret = 0; if (access_process_vm(child, (u64)addrOthers, &data, - sizeof(data), 1) == sizeof(data)) + sizeof(data), + FOLL_FORCE | FOLL_WRITE) == sizeof(data)) break; ret = -EIO; break; diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index f52b7db327c8..010b7b310237 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -74,7 +74,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; copied = access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 0); + sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) break; ret = put_user(tmp, (u32 __user *)data); @@ -179,7 +179,8 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; ret = 0; if (access_process_vm(child, (u64)addrOthers, &tmp, - sizeof(tmp), 1) == sizeof(tmp)) + sizeof(tmp), + FOLL_FORCE | FOLL_WRITE) == sizeof(tmp)) break; ret = -EIO; break; diff --git a/arch/score/kernel/ptrace.c b/arch/score/kernel/ptrace.c index 55836188b217..4f7314d5f334 100644 --- a/arch/score/kernel/ptrace.c +++ b/arch/score/kernel/ptrace.c @@ -131,7 +131,7 @@ read_tsk_long(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, res, sizeof(*res), 0); + copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE); return copied != sizeof(*res) ? -EIO : 0; } @@ -142,7 +142,7 @@ read_tsk_short(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, res, sizeof(*res), 0); + copied = access_process_vm(child, addr, res, sizeof(*res), FOLL_FORCE); return copied != sizeof(*res) ? -EIO : 0; } @@ -153,7 +153,8 @@ write_tsk_short(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, &val, sizeof(val), 1); + copied = access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE); return copied != sizeof(val) ? -EIO : 0; } @@ -164,7 +165,8 @@ write_tsk_long(struct task_struct *child, { int copied; - copied = access_process_vm(child, addr, &val, sizeof(val), 1); + copied = access_process_vm(child, addr, &val, sizeof(val), + FOLL_FORCE | FOLL_WRITE); return copied != sizeof(val) ? -EIO : 0; } diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 9ddc4928a089..ac082dd8c67d 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -127,7 +127,8 @@ static int get_from_target(struct task_struct *target, unsigned long uaddr, if (copy_from_user(kbuf, (void __user *) uaddr, len)) return -EFAULT; } else { - int len2 = access_process_vm(target, uaddr, kbuf, len, 0); + int len2 = access_process_vm(target, uaddr, kbuf, len, + FOLL_FORCE); if (len2 != len) return -EFAULT; } @@ -141,7 +142,8 @@ static int set_to_target(struct task_struct *target, unsigned long uaddr, if (copy_to_user((void __user *) uaddr, kbuf, len)) return -EFAULT; } else { - int len2 = access_process_vm(target, uaddr, kbuf, len, 1); + int len2 = access_process_vm(target, uaddr, kbuf, len, + FOLL_FORCE | FOLL_WRITE); if (len2 != len) return -EFAULT; } @@ -505,7 +507,8 @@ static int genregs32_get(struct task_struct *target, if (access_process_vm(target, (unsigned long) ®_window[pos], - k, sizeof(*k), 0) + k, sizeof(*k), + FOLL_FORCE) != sizeof(*k)) return -EFAULT; k++; @@ -531,12 +534,14 @@ static int genregs32_get(struct task_struct *target, if (access_process_vm(target, (unsigned long) ®_window[pos], - ®, sizeof(reg), 0) + ®, sizeof(reg), + FOLL_FORCE) != sizeof(reg)) return -EFAULT; if (access_process_vm(target, (unsigned long) u, - ®, sizeof(reg), 1) + ®, sizeof(reg), + FOLL_FORCE | FOLL_WRITE) != sizeof(reg)) return -EFAULT; pos++; @@ -615,7 +620,8 @@ static int genregs32_set(struct task_struct *target, (unsigned long) ®_window[pos], (void *) k, - sizeof(*k), 1) + sizeof(*k), + FOLL_FORCE | FOLL_WRITE) != sizeof(*k)) return -EFAULT; k++; @@ -642,13 +648,15 @@ static int genregs32_set(struct task_struct *target, if (access_process_vm(target, (unsigned long) u, - ®, sizeof(reg), 0) + ®, sizeof(reg), + FOLL_FORCE) != sizeof(reg)) return -EFAULT; if (access_process_vm(target, (unsigned long) ®_window[pos], - ®, sizeof(reg), 1) + ®, sizeof(reg), + FOLL_FORCE | FOLL_WRITE) != sizeof(reg)) return -EFAULT; pos++; diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index c9a073866ca7..a23ce84a3f6c 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -57,7 +57,8 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) unsigned char opcode[15]; unsigned long addr = convert_ip_to_linear(child, regs); - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + copied = access_process_vm(child, addr, opcode, sizeof(opcode), + FOLL_FORCE); for (i = 0; i < copied; i++) { switch (opcode[i]) { /* popf and iret */ diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index 5766ead6fdb9..60a5a5a85505 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -36,7 +36,8 @@ int is_syscall(unsigned long addr) * slow, but that doesn't matter, since it will be called only * in case of singlestepping, if copy_from_user failed. */ - n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + n = access_process_vm(current, addr, &instr, sizeof(instr), + FOLL_FORCE); if (n != sizeof(instr)) { printk(KERN_ERR "is_syscall : failed to read " "instruction from 0x%lx\n", addr); diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index 0b5c184dd5b3..e30202b1716e 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -212,7 +212,8 @@ int is_syscall(unsigned long addr) * slow, but that doesn't matter, since it will be called only * in case of singlestepping, if copy_from_user failed. */ - n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + n = access_process_vm(current, addr, &instr, sizeof(instr), + FOLL_FORCE); if (n != sizeof(instr)) { printk("is_syscall : failed to read instruction from " "0x%lx\n", addr); diff --git a/include/linux/mm.h b/include/linux/mm.h index f31bf9058587..ffbd72979ee7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1266,7 +1266,8 @@ static inline int fixup_user_fault(struct task_struct *tsk, } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2a99027312a6..e6474f7272ec 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -537,7 +537,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst int this_len, retval; this_len = (len > sizeof(buf)) ? sizeof(buf) : len; - retval = access_process_vm(tsk, src, buf, this_len, 0); + retval = access_process_vm(tsk, src, buf, this_len, FOLL_FORCE); if (!retval) { if (copied) break; @@ -564,7 +564,8 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds this_len = (len > sizeof(buf)) ? sizeof(buf) : len; if (copy_from_user(buf, src, this_len)) return -EFAULT; - retval = access_process_vm(tsk, dst, buf, this_len, 1); + retval = access_process_vm(tsk, dst, buf, this_len, + FOLL_FORCE | FOLL_WRITE); if (!retval) { if (copied) break; @@ -1127,7 +1128,7 @@ int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long tmp; int copied; - copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), FOLL_FORCE); if (copied != sizeof(tmp)) return -EIO; return put_user(tmp, (unsigned long __user *)data); @@ -1138,7 +1139,8 @@ int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, { int copied; - copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + copied = access_process_vm(tsk, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); return (copied == sizeof(data)) ? 0 : -EIO; } @@ -1155,7 +1157,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, switch (request) { case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: - ret = access_process_vm(child, addr, &word, sizeof(word), 0); + ret = access_process_vm(child, addr, &word, sizeof(word), + FOLL_FORCE); if (ret != sizeof(word)) ret = -EIO; else @@ -1164,7 +1167,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, case PTRACE_POKETEXT: case PTRACE_POKEDATA: - ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = access_process_vm(child, addr, &data, sizeof(data), + FOLL_FORCE | FOLL_WRITE); ret = (ret != sizeof(data) ? -EIO : 0); break; diff --git a/mm/memory.c b/mm/memory.c index bac2d994850e..e18c57bdc75c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3951,20 +3951,16 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr, * Do not walk the page table directly, use get_user_pages */ int access_process_vm(struct task_struct *tsk, unsigned long addr, - void *buf, int len, int write) + void *buf, int len, unsigned int gup_flags) { struct mm_struct *mm; int ret; - unsigned int flags = FOLL_FORCE; mm = get_task_mm(tsk); if (!mm) return 0; - if (write) - flags |= FOLL_WRITE; - - ret = __access_remote_vm(tsk, mm, addr, buf, len, flags); + ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); diff --git a/mm/nommu.c b/mm/nommu.c index 93d5bb53fc63..db5fd1795298 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1861,7 +1861,8 @@ int access_remote_vm(struct mm_struct *mm, unsigned long addr, * Access another process' address space. * - source/target buffer must be kernel space */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags) { struct mm_struct *mm; @@ -1872,8 +1873,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, - write ? FOLL_WRITE : 0); + len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); mmput(mm); return len; diff --git a/mm/util.c b/mm/util.c index 4c685bde5ebc..952cbe7ad7b7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -624,7 +624,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen) len = buflen; - res = access_process_vm(task, arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then @@ -639,7 +639,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, - buffer+res, len, 0); + buffer+res, len, + FOLL_FORCE); res = strnlen(buffer, res); } } -- cgit v1.2.3 From 8835ca59dac2bc1e0136791abf3ccd51588803ce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 19 Oct 2016 09:11:24 -0700 Subject: printk: suppress empty continuation lines We have a fairly common pattern where you print several things as continuations on one single line in a loop, and then at the end you do printk(KERN_CONT "\n"); to flush the buffered output. But if the output was flushed by something else (concurrent printk activity, or just system logging), we don't want that final flushing to just print an empty line. So just suppress empty continuation lines when they couldn't be merged into the line they are a continuation of. Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d5e397315473..de08fc90baaf 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1769,6 +1769,10 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c cont_flush(); } + /* Skip empty continuation lines that couldn't be added - they just flush */ + if (!text_len && (lflags & LOG_CONT)) + return 0; + /* If it doesn't end in a newline, try to buffer the current line */ if (!(lflags & LOG_NEWLINE)) { if (cont_add(facility, level, lflags, text, text_len)) -- cgit v1.2.3 From 3118dac501bc0317de099db81618d589503351e1 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 6 Oct 2016 23:06:43 +0530 Subject: kernel/irq: Export irq_set_parent() The TPS65217 driver grew interrupt support which uses irq_set_parent(). While it's not yet clear why this is used in the first place, building the driver as a module fails with: ERROR: ".irq_set_parent" [drivers/mfd/tps65217.ko] undefined! The correctness of the driver change is still investigated, but for now it's less trouble to export irq_set_parent() than dealing with the build wreckage. [ tglx: Rewrote changelog and made the export GPL ] Fixes: 6556bdacf646 ("mfd: tps65217: Add support for IRQs") Signed-off-by: Sudip Mukherjee Cc: Sudip Mukherjee Cc: Marcin Niestroj Cc: Grygorii Strashko Cc: Tony Lindgren Cc: Lee Jones Link: http://lkml.kernel.org/r/1475775403-27207-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0c5f1a5db654..9c4d30483264 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -721,6 +721,7 @@ int irq_set_parent(int irq, int parent_irq) irq_put_desc_unlock(desc, flags); return 0; } +EXPORT_SYMBOL_GPL(irq_set_parent); #endif /* -- cgit v1.2.3 From f660f6066716b700148f60dba3461e65efff3123 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 10 Oct 2016 15:10:51 +0300 Subject: softirq: Display IRQ_POLL for irq-poll statistics This library was moved to the generic area and was renamed to irq-poll. Hence, update proc/softirqs output accordingly. Signed-off-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 1bf81ef91375..744fa611cae0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -58,7 +58,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); const char * const softirq_to_name[NR_SOFTIRQS] = { - "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; -- cgit v1.2.3 From 1adb469b9b76276d7e5ea36a20a24c47d6618a0b Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Fri, 21 Oct 2016 16:24:09 +0100 Subject: PM / suspend: Fix missing KERN_CONT for suspend message Commit 4bcc595ccd80 (printk: reinstate KERN_CONT for printing continuation lines) exposed a missing KERN_CONT from one of the messages shown on entering suspend. With v4.9-rc1, the 'done.' shown after syncing the filesystems no longer appears as a continuation but a new message with its own timestamp. [ 9.259566] PM: Syncing filesystems ... [ 9.264119] done. Fix this by adding the KERN_CONT log level for the 'done.' part of the message seen after syncing filesystems. While we are at it, convert these suspend printks to pr_info and pr_cont, respectively. Signed-off-by: Jon Hunter Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1e7f5da648d9..6ccb08f57fcb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -498,9 +498,9 @@ static int enter_state(suspend_state_t state) #ifndef CONFIG_SUSPEND_SKIP_SYNC trace_suspend_resume(TPS("sync_filesystems"), 0, true); - printk(KERN_INFO "PM: Syncing filesystems ... "); + pr_info("PM: Syncing filesystems ... "); sys_sync(); - printk("done.\n"); + pr_cont("done.\n"); trace_suspend_resume(TPS("sync_filesystems"), 0, false); #endif -- cgit v1.2.3 From b831275a3553c32091222ac619cfddd73a5553fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Oct 2016 11:41:56 +0200 Subject: timers: Plug locking race vs. timer migration Linus noticed that lock_timer_base() lacks a READ_ONCE() for accessing the timer flags. As a consequence the compiler is allowed to reload the flags between the initial check for TIMER_MIGRATION and the following timer base computation and the spin lock of the base. While this has not been observed (yet), we need to make sure that it never happens. Fixes: 0eeda71bc30d ("timer: Replace timer base by a cpu index") Reported-by: Linus Torvalds Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: Peter Zijlstra --- kernel/time/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d47980a1bc4..0d4b91c5a374 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -943,7 +943,14 @@ static struct timer_base *lock_timer_base(struct timer_list *timer, { for (;;) { struct timer_base *base; - u32 tf = timer->flags; + u32 tf; + + /* + * We need to use READ_ONCE() here, otherwise the compiler + * might re-read @tf between the check for TIMER_MIGRATING + * and spin_lock(). + */ + tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); -- cgit v1.2.3 From 4da9152a4308dcbf611cde399c695c359fc9145f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Oct 2016 11:55:10 +0200 Subject: timers: Lock base for same bucket optimization Linus stumbled over the unlocked modification of the timer expiry value in mod_timer() which is an optimization for timers which stay in the same bucket - due to the bucket granularity - despite their expiry time getting updated. The optimization itself still makes sense even if we take the lock, because in case that the bucket stays the same, we avoid the pointless queue/enqueue dance. Make the check and the modification of timer->expires protected by the base lock and shuffle the remaining code around so we can keep the lock held when we actually have to requeue the timer to a different bucket. Fixes: f00c0afdfa62 ("timers: Implement optimization for same expiry time in mod_timer()") Reported-by: Linus Torvalds Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: Peter Zijlstra --- kernel/time/timer.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0d4b91c5a374..ccf913038f9f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -971,6 +971,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) unsigned long clk = 0, flags; int ret = 0; + BUG_ON(!timer->function); + /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the @@ -979,13 +981,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) if (timer_pending(timer)) { if (timer->expires == expires) return 1; + /* - * Take the current timer_jiffies of base, but without holding - * the lock! + * We lock timer base and calculate the bucket index right + * here. If the timer ends up in the same bucket, then we + * just update the expiry time and avoid the whole + * dequeue/enqueue dance. */ - base = get_timer_base(timer->flags); - clk = base->clk; + base = lock_timer_base(timer, &flags); + clk = base->clk; idx = calc_wheel_index(expires, clk); /* @@ -995,14 +1000,14 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) */ if (idx == timer_get_idx(timer)) { timer->expires = expires; - return 1; + ret = 1; + goto out_unlock; } + } else { + base = lock_timer_base(timer, &flags); } timer_stats_timer_set_start_info(timer); - BUG_ON(!timer->function); - - base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, false); if (!ret && pending_only) @@ -1035,9 +1040,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance - * between calculating 'idx' and taking the lock, only enqueue_timer() - * and trigger_dyntick_cpu() is required. Otherwise we need to - * (re)calculate the wheel index via internal_add_timer(). + * between calculating 'idx' and possibly switching the base, only + * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise + * we need to (re)calculate the wheel index via + * internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) { enqueue_timer(base, timer, idx); -- cgit v1.2.3 From 041ad7bc758db259bb960ef795197dd14aab19a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Oct 2016 11:07:35 +0000 Subject: timers: Prevent base clock rewind when forwarding clock Ashton and Michael reported, that kernel versions 4.8 and later suffer from USB timeouts which are caused by the timer wheel rework. This is caused by a bug in the base clock forwarding mechanism, which leads to timers expiring early. The scenario which leads to this is: run_timers() while (jiffies >= base->clk) { collect_expired_timers(); base->clk++; expire_timers(); } So base->clk = jiffies + 1. Now the cpu goes idle: idle() get_next_timer_interrupt() nextevt = __next_time_interrupt(); if (time_after(nextevt, base->clk)) base->clk = jiffies; jiffies has not advanced since run_timers(), so this assignment effectively decrements base->clk by one. base->clk is the index into the timer wheel arrays. So let's assume the following state after the base->clk increment in run_timers(): jiffies = 0 base->clk = 1 A timer gets enqueued with an expiry delta of 63 ticks (which is the case with the USB timeout and HZ=250) so the resulting bucket index is: base->clk + delta = 1 + 63 = 64 The timer goes into the first wheel level. The array size is 64 so it ends up in bucket 0, which is correct as it takes 63 ticks to advance base->clk to index into bucket 0 again. If the cpu goes idle before jiffies advance, then the bug in the forwarding mechanism sets base->clk back to 0, so the next invocation of run_timers() at the next tick will index into bucket 0 and therefore expire the timer 62 ticks too early. Instead of blindly setting base->clk to jiffies we must make the forwarding conditional on jiffies > base->clk, but we cannot use jiffies for this as we might run into the following issue: if (time_after(jiffies, base->clk) { if (time_after(nextevt, base->clk)) base->clk = jiffies; jiffies can increment between the check and the assigment far enough to advance beyond nextevt. So we need to use a stable value for checking. get_next_timer_interrupt() has the basej argument which is the jiffies value snapshot taken in the calling code. So we can just that. Thanks to Ashton for bisecting and providing trace data! Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible") Reported-by: Ashton Holmes Reported-by: Michael Thayer Signed-off-by: Thomas Gleixner Cc: Michal Necasek Cc: Peter Zijlstra Cc: knut.osmundsen@oracle.com Cc: stable@vger.kernel.org Cc: stern@rowland.harvard.edu Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161022110552.175308322@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ccf913038f9f..7c446fb5163a 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1523,12 +1523,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); base->next_expiry = nextevt; /* - * We have a fresh next event. Check whether we can forward the base: + * We have a fresh next event. Check whether we can forward the + * base. We can only do that when @basej is past base->clk + * otherwise we might rewind base->clk. */ - if (time_after(nextevt, jiffies)) - base->clk = jiffies; - else if (time_after(nextevt, base->clk)) - base->clk = nextevt; + if (time_after(basej, base->clk)) { + if (time_after(nextevt, basej)) + base->clk = basej; + else if (time_after(nextevt, base->clk)) + base->clk = nextevt; + } if (time_before_eq(nextevt, basej)) { expires = basem; -- cgit v1.2.3 From 6bad6bccf2d717f652d37e63cf261eaa23466009 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Oct 2016 11:07:37 +0000 Subject: timers: Prevent base clock corruption when forwarding When a timer is enqueued we try to forward the timer base clock. This mechanism has two issues: 1) Forwarding a remote base unlocked The forwarding function is called from get_target_base() with the current timer base lock held. But if the new target base is a different base than the current base (can happen with NOHZ, sigh!) then the forwarding is done on an unlocked base. This can lead to corruption of base->clk. Solution is simple: Invoke the forwarding after the target base is locked. 2) Possible corruption due to jiffies advancing This is similar to the issue in get_net_timer_interrupt() which was fixed in the previous patch. jiffies can advance between check and assignement and therefore advancing base->clk beyond the next expiry value. So we need to read jiffies into a local variable once and do the checks and assignment with the local copy. Fixes: a683f390b93f("timers: Forward the wheel clock whenever possible") Reported-by: Ashton Holmes Reported-by: Michael Thayer Signed-off-by: Thomas Gleixner Cc: Michal Necasek Cc: Peter Zijlstra Cc: knut.osmundsen@oracle.com Cc: stable@vger.kernel.org Cc: stern@rowland.harvard.edu Cc: rt@linutronix.de Link: http://lkml.kernel.org/r/20161022110552.253640125@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/time/timer.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 7c446fb5163a..c611c47de884 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -878,7 +878,7 @@ static inline struct timer_base *get_timer_base(u32 tflags) #ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { #ifdef CONFIG_SMP if ((tflags & TIMER_PINNED) || !base->migration_enabled) @@ -891,25 +891,27 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { + unsigned long jnow = READ_ONCE(jiffies); + /* * We only forward the base when it's idle and we have a delta between * base clock and jiffies. */ - if (!base->is_idle || (long) (jiffies - base->clk) < 2) + if (!base->is_idle || (long) (jnow - base->clk) < 2) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jiffies)) - base->clk = jiffies; + if (time_after(base->next_expiry, jnow)) + base->clk = jnow; else base->clk = base->next_expiry; } #else static inline struct timer_base * -__get_target_base(struct timer_base *base, unsigned tflags) +get_target_base(struct timer_base *base, unsigned tflags) { return get_timer_this_cpu_base(tflags); } @@ -917,14 +919,6 @@ __get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { } #endif -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - struct timer_base *target = __get_target_base(base, tflags); - - forward_timer_base(target); - return target; -} /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means @@ -1037,6 +1031,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } + /* Try to forward a stale timer base clock */ + forward_timer_base(base); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance -- cgit v1.2.3 From f5d6d2da0d9098a4aa0ebcc187aa0fc167045d6b Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 26 Oct 2016 13:37:04 +0200 Subject: sched/fair: Remove unused but set variable 'rq' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit: 8663e24d56dc ("sched/fair: Reorder cgroup creation code") ... the variable 'rq' in alloc_fair_sched_group() is set but no longer used. Remove it to fix the following GCC warning when building with 'W=1': kernel/sched/fair.c:8842:13: warning: variable ‘rq’ set but not used [-Wunused-but-set-variable] Signed-off-by: Tobias Klauser Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026113704.8981-1-tklauser@distanz.ch Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d941c97dfbc3..c242944f5cbd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8839,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct sched_entity *se; struct cfs_rq *cfs_rq; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8854,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) -- cgit v1.2.3 From 9dcb8b685fc30813b35ab4b4bf39244430753190 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 26 Oct 2016 10:15:30 -0700 Subject: mm: remove per-zone hashtable of bitlock waitqueues The per-zone waitqueues exist because of a scalability issue with the page waitqueues on some NUMA machines, but it turns out that they hurt normal loads, and now with the vmalloced stacks they also end up breaking gfs2 that uses a bit_wait on a stack object: wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE) where 'gh' can be a reference to the local variable 'mount_gh' on the stack of fill_super(). The reason the per-zone hash table breaks for this case is that there is no "zone" for virtual allocations, and trying to look up the physical page to get at it will fail (with a BUG_ON()). It turns out that I actually complained to the mm people about the per-zone hash table for another reason just a month ago: the zone lookup also hurts the regular use of "unlock_page()" a lot, because the zone lookup ends up forcing several unnecessary cache misses and generates horrible code. As part of that earlier discussion, we had a much better solution for the NUMA scalability issue - by just making the page lock have a separate contention bit, the waitqueue doesn't even have to be looked at for the normal case. Peter Zijlstra already has a patch for that, but let's see if anybody even notices. In the meantime, let's fix the actual gfs2 breakage by simplifying the bitlock waitqueues and removing the per-zone issue. Reported-by: Andreas Gruenbacher Tested-by: Bob Peterson Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Steven Whitehouse Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 30 +------------ kernel/sched/core.c | 16 +++++++ kernel/sched/wait.c | 10 ----- mm/filemap.c | 4 +- mm/memory_hotplug.c | 28 ------------ mm/page_alloc.c | 115 +------------------------------------------------ 6 files changed, 21 insertions(+), 182 deletions(-) (limited to 'kernel') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7f2ae99e5daf..0f088f3a2fed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -440,33 +440,7 @@ struct zone { seqlock_t span_seqlock; #endif - /* - * wait_table -- the array holding the hash table - * wait_table_hash_nr_entries -- the size of the hash table array - * wait_table_bits -- wait_table_size == (1 << wait_table_bits) - * - * The purpose of all these is to keep track of the people - * waiting for a page to become available and make them - * runnable again when possible. The trouble is that this - * consumes a lot of space, especially when so few things - * wait on pages at a given time. So instead of using - * per-page waitqueues, we use a waitqueue hash table. - * - * The bucket discipline is to sleep on the same queue when - * colliding and wake all in that wait queue when removing. - * When something wakes, it must check to be sure its page is - * truly available, a la thundering herd. The cost of a - * collision is great, but given the expected load of the - * table, they should be so rare as to be outweighed by the - * benefits from the saved space. - * - * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the - * primary users of these fields, and in mm/page_alloc.c - * free_area_init_core() performs the initialization of them. - */ - wait_queue_head_t *wait_table; - unsigned long wait_table_hash_nr_entries; - unsigned long wait_table_bits; + int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) @@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) static inline bool zone_is_initialized(struct zone *zone) { - return !!zone->wait_table; + return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..42d4027f9e26 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7515,11 +7515,27 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#define WAIT_TABLE_BITS 8 +#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) +static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + unsigned long val = (unsigned long)word << shift | bit; + + return bit_wait_table + hash_long(val, WAIT_TABLE_BITS); +} +EXPORT_SYMBOL(bit_waitqueue); + void __init sched_init(void) { int i, j; unsigned long alloc_size = 0, ptr; + for (i = 0; i < WAIT_TABLE_SIZE; i++) + init_waitqueue_head(bit_wait_table + i); + #ifdef CONFIG_FAIR_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); #endif diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4f7053579fe3..9453efe9b25a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -480,16 +480,6 @@ void wake_up_bit(void *word, int bit) } EXPORT_SYMBOL(wake_up_bit); -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - /* * Manipulate the atomic_t address to produce a better bit waitqueue table hash * index (we're keying off bit -1, but that would produce a horrible hash diff --git a/mm/filemap.c b/mm/filemap.c index 849f459ad078..c7fe2f16503f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -790,9 +790,7 @@ EXPORT_SYMBOL(__page_cache_alloc); */ wait_queue_head_t *page_waitqueue(struct page *page) { - const struct zone *zone = page_zone(page); - - return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; + return bit_waitqueue(page, 0); } EXPORT_SYMBOL(page_waitqueue); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962927309b6e..b18dab401be6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -268,7 +268,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; struct page *page; - struct zone *zone; nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; page = virt_to_page(pgdat); @@ -276,19 +275,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) for (i = 0; i < nr_pages; i++, page++) get_page_bootmem(node, page, NODE_INFO); - zone = &pgdat->node_zones[0]; - for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone_is_initialized(zone)) { - nr_pages = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; - page = virt_to_page(zone->wait_table); - - for (i = 0; i < nr_pages; i++, page++) - get_page_bootmem(node, page, NODE_INFO); - } - } - pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); @@ -2158,20 +2144,6 @@ void try_offline_node(int nid) */ node_set_offline(nid); unregister_one_node(nid); - - /* free waittable in each zone */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - /* - * wait_table may be allocated from boot memory, - * here only free if it's allocated by vmalloc. - */ - if (is_vmalloc_addr(zone->wait_table)) { - vfree(zone->wait_table); - zone->wait_table = NULL; - } - } } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b3bf6767d54..de7c6e43b1c9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4976,72 +4976,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) #endif } -/* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. - * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -#ifndef CONFIG_MEMORY_HOTPLUG -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return max(size, 4UL); -} -#else -/* - * A zone's size might be changed by hot-add, so it is not possible to determine - * a suitable size for its wait_table. So we use the maximum size now. - * - * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: - * - * i386 (preemption config) : 4096 x 16 = 64Kbyte. - * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. - * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. - * - * The maximum entries are prepared when a zone's memory is (512K + 256) pages - * or more by the traditional way. (See above). It equals: - * - * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. - * ia64(16K page size) : = ( 8G + 4M)byte. - * powerpc (64K page size) : = (32G +16M)byte. - */ -static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) -{ - return 4096UL; -} -#endif - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. - */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -5304,49 +5238,6 @@ void __init setup_per_cpu_pageset(void) alloc_percpu(struct per_cpu_nodestat); } -static noinline __ref -int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) -{ - int i; - size_t alloc_size; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_hash_nr_entries = - wait_table_hash_nr_entries(zone_size_pages); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_hash_nr_entries); - alloc_size = zone->wait_table_hash_nr_entries - * sizeof(wait_queue_head_t); - - if (!slab_is_available()) { - zone->wait_table = (wait_queue_head_t *) - memblock_virt_alloc_node_nopanic( - alloc_size, zone->zone_pgdat->node_id); - } else { - /* - * This case means that a zone whose size was 0 gets new memory - * via memory hot-add. - * But it may be the case that a new node was hot-added. In - * this case vmalloc() will not be able to use this new node's - * memory - this wait_table must be initialized to use this new - * node itself as well. - * To use this new node's memory, further consideration will be - * necessary. - */ - zone->wait_table = vmalloc(alloc_size); - } - if (!zone->wait_table) - return -ENOMEM; - - for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) - init_waitqueue_head(zone->wait_table + i); - - return 0; -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -5367,10 +5258,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - int ret; - ret = zone_wait_table_init(zone, size); - if (ret) - return ret; + pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -5382,6 +5270,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_start_pfn, (zone_start_pfn + size)); zone_init_free_lists(zone); + zone->initialized = 1; return 0; } -- cgit v1.2.3 From b274c0bb394c6a69ac12feac7c2db81f5aff5a55 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 27 Oct 2016 17:46:21 -0700 Subject: kcov: properly check if we are in an interrupt in_interrupt() returns a nonzero value when we are either in an interrupt or have bh disabled via local_bh_disable(). Since we are interested in only ignoring coverage from actual interrupts, do a proper check instead of just calling in_interrupt(). As a result of this change, kcov will start to collect coverage from within local_bh_disable()/local_bh_enable() sections. Link: http://lkml.kernel.org/r/1476115803-20712-1-git-send-email-andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Dmitry Vyukov Cc: Nicolai Stange Cc: Andrey Ryabinin Cc: Kees Cook Cc: James Morse Cc: Vegard Nossum Cc: Quentin Casasnovas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 8d44b3fea9d0..30e6d05aa5a9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -53,8 +53,15 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. + * The checks for whether we are in an interrupt are open-coded, because + * 1. We can't use in_interrupt() here, since it also returns true + * when we are inside local_bh_disable() section. + * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), + * since that leads to slower generated code (three separate tests, + * one for each of the flags). */ - if (!t || in_interrupt()) + if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET + | NMI_MASK))) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { -- cgit v1.2.3 From 0933840acf7b65d6d30a5b6089d882afea57aca3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 20 Oct 2016 13:10:11 +0200 Subject: perf/core: Protect PMU device removal with a 'pmu_bus_running' check, to fix CONFIG_DEBUG_TEST_DRIVER_REMOVE=y kernel panic CAI Qian reported a crash in the PMU uncore device removal code, enabled by the CONFIG_DEBUG_TEST_DRIVER_REMOVE=y option: https://marc.info/?l=linux-kernel&m=147688837328451 The reason for the crash is that perf_pmu_unregister() tries to remove a PMU device which is not added at this point. We add PMU devices only after pmu_bus is registered, which happens in the perf_event_sysfs_init() call and sets the 'pmu_bus_running' flag. The fix is to get the 'pmu_bus_running' flag state at the point the PMU is taken out of the PMU list and remove the device later only if it's set. Reported-by: CAI Qian Tested-by: CAI Qian Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Greg Kroah-Hartman Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rob Herring Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161020111011.GA13361@krava Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c6e47e97b33f..a5d2e62faf7e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8855,7 +8855,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { + int remove_device; + mutex_lock(&pmus_lock); + remove_device = pmu_bus_running; list_del_rcu(&pmu->entry); mutex_unlock(&pmus_lock); @@ -8869,10 +8872,12 @@ void perf_pmu_unregister(struct pmu *pmu) free_percpu(pmu->pmu_disable_count); if (pmu->type >= PERF_TYPE_MAX) idr_remove(&pmu_idr, pmu->type); - if (pmu->nr_addr_filters) - device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); - device_del(pmu->dev); - put_device(pmu->dev); + if (remove_device) { + if (pmu->nr_addr_filters) + device_remove_file(pmu->dev, &dev_attr_nr_addr_filters); + device_del(pmu->dev); + put_device(pmu->dev); + } free_pmu_context(pmu); } EXPORT_SYMBOL_GPL(perf_pmu_unregister); -- cgit v1.2.3 From 5aab90ce1ec449912a2ebc4d45e0c85dac29e9dd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 26 Oct 2016 11:48:24 +0200 Subject: perf/powerpc: Don't call perf_event_disable() from atomic context The trinity syscall fuzzer triggered following WARN() on powerpc: WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278 ... NIP [c00000000093aedc] .hw_breakpoint_handler+0x28c/0x2b0 LR [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 Call Trace: [c0000002f7933580] [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 (unreliable) [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 Followed by a lockdep warning: =============================== [ INFO: suspicious RCU usage. ] 4.8.0-rc5+ #7 Tainted: G W ------------------------------- ./include/linux/rcupdate.h:556 Illegal context switch in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by ls/2998: #0: (rcu_read_lock){......}, at: [] .__atomic_notifier_call_chain+0x0/0x1c0 #1: (rcu_read_lock){......}, at: [] .hw_breakpoint_handler+0x0/0x2b0 stack backtrace: CPU: 9 PID: 2998 Comm: ls Tainted: G W 4.8.0-rc5+ #7 Call Trace: [c0000002f7933150] [c00000000094b1f8] .dump_stack+0xe0/0x14c (unreliable) [c0000002f79331e0] [c00000000013c468] .lockdep_rcu_suspicious+0x138/0x180 [c0000002f7933270] [c0000000001005d8] .___might_sleep+0x278/0x2e0 [c0000002f7933300] [c000000000935584] .mutex_lock_nested+0x64/0x5a0 [c0000002f7933410] [c00000000023084c] .perf_event_ctx_lock_nested+0x16c/0x380 [c0000002f7933500] [c000000000230a80] .perf_event_disable+0x20/0x60 [c0000002f7933580] [c00000000093aeec] .hw_breakpoint_handler+0x29c/0x2b0 [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0 [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0 [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0 [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100 [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48 While it looks like the first WARN() is probably valid, the other one is triggered by disabling event via perf_event_disable() from atomic context. The event is disabled here in case we were not able to emulate the instruction that hit the breakpoint. By disabling the event we unschedule the event and make sure it's not scheduled back. But we can't call perf_event_disable() from atomic context, instead we need to use the event's pending_disable irq_work method to disable it. Reported-by: Jan Stancek Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Huang Ying Cc: Jiri Olsa Cc: Linus Torvalds Cc: Michael Neuling Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161026094824.GA21397@krava Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/hw_breakpoint.c | 2 +- include/linux/perf_event.h | 1 + kernel/events/core.c | 10 ++++++++-- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 9781c69eae57..03d089b3ed72 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -275,7 +275,7 @@ int hw_breakpoint_handler(struct die_args *args) if (!stepped) { WARN(1, "Unable to handle hardware breakpoint. Breakpoint at " "0x%lx will be disabled.", info->address); - perf_event_disable(bp); + perf_event_disable_inatomic(bp); goto out; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 060d0ede88df..4741ecdb9817 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1257,6 +1257,7 @@ extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); +extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/kernel/events/core.c b/kernel/events/core.c index a5d2e62faf7e..0e292132efac 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1960,6 +1960,12 @@ void perf_event_disable(struct perf_event *event) } EXPORT_SYMBOL_GPL(perf_event_disable); +void perf_event_disable_inatomic(struct perf_event *event) +{ + event->pending_disable = 1; + irq_work_queue(&event->pending); +} + static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, u64 tstamp) @@ -7075,8 +7081,8 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - event->pending_disable = 1; - irq_work_queue(&event->pending); + + perf_event_disable_inatomic(event); } READ_ONCE(event->overflow_handler)(event, data, regs); -- cgit v1.2.3