From 9b20a352d78a7651aa68a9220f77ccb03009d892 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Sun, 27 Jul 2014 07:24:01 +0930 Subject: module: add within_module() function It is just a small optimization that allows to replace few occurrences of within_module_init() || within_module_core() with a single call. Signed-off-by: Petr Mladek Signed-off-by: Rusty Russell --- kernel/module.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 81e727cf6df9..e87fdd2fc3c2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3448,8 +3448,7 @@ const char *module_address_lookup(unsigned long addr, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { if (modname) *modname = mod->name; ret = get_ksymbol(mod, addr, size, offset); @@ -3473,8 +3472,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, NULL, NULL); @@ -3499,8 +3497,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_init(addr, mod) || - within_module_core(addr, mod)) { + if (within_module(addr, mod)) { const char *sym; sym = get_ksymbol(mod, addr, size, offset); @@ -3764,8 +3761,7 @@ struct module *__module_address(unsigned long addr) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if (within_module_core(addr, mod) - || within_module_init(addr, mod)) + if (within_module(addr, mod)) return mod; } return NULL; -- cgit v1.2.3 
From 2e3a10a1551d6ceea005e6a62ca58183b8976217 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 27 Jul 2014 07:29:01 +0930 Subject: ARM: avoid ARM binutils leaking ELF local symbols Symbols starting with .L are ELF local symbols and should not appear in ELF symbol tables. However, unfortunately ARM binutils leaks the .LANCHOR symbols into the symbol table, which leads kallsyms to report these symbols rather than the real name. It is not very useful when %pf reports symbols against these leaked .LANCHOR symbols. Arrange for kallsyms to ignore these symbols using the same mechanism that is used for the ARM mapping symbols. Signed-off-by: Russell King Signed-off-by: Rusty Russell --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e87fdd2fc3c2..cd9bce918cdf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3385,6 +3385,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size) */ static inline int is_arm_mapping_symbol(const char *str) { + if (str[0] == '.' && str[1] == 'L') + return true; return str[0] == '$' && strchr("atd", str[1]) && (str[2] == '\0' || str[2] == '.'); } -- cgit v1.2.3 From 728dba3a39c66b3d8ac889ddbe38b5b1c264aec3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Feb 2014 19:13:49 -0800 Subject: namespaces: Use task_lock and not rcu to protect nsproxy The synchronous synchronize_rcu in switch_task_namespaces makes setns a sufficiently expensive system call that people have complained. Upon inspection, nsproxy no longer needs rcu protection for remote reads. Remote reads are rare. So optimize for same process reads and write by switching to using task_lock instead. This yields a simpler to understand lock, and a faster setns system call. In particular this fixes a performance regression observed by Rafael David Tinoco. 
This is effectively a revert of Pavel Emelyanov's commit cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter from 2007. The race this originally fixed no longer exists as do_notify_parent uses task_active_pid_ns(parent) instead of parent->nsproxy. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 6 +++--- fs/proc/proc_net.c | 4 +++- fs/proc_namespace.c | 8 +++----- include/linux/nsproxy.h | 16 ++++++---------- ipc/namespace.c | 6 +++--- kernel/nsproxy.c | 15 ++++----------- kernel/utsname.c | 6 +++--- net/core/net_namespace.c | 10 ++++++---- 8 files changed, 31 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41cd887..7187d01329c3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2972,13 +2972,13 @@ static void *mntns_get(struct task_struct *task) struct mnt_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->mnt_ns; get_mnt_ns(ns); } - rcu_read_unlock(); + task_unlock(task); return ns; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 4677bb7dc7c2..a63af3e0a612 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir) rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); if (task != NULL) { - ns = task_nsproxy(task); + task_lock(task); + ns = task->nsproxy; if (ns != NULL) net = get_net(ns->net_ns); + task_unlock(task); } rcu_read_unlock(); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 1a81373947f3..73ca1740d839 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file, if (!task) goto err; - rcu_read_lock(); - nsp = task_nsproxy(task); + task_lock(task); + nsp = task->nsproxy; if (!nsp || !nsp->mnt_ns) { - rcu_read_unlock(); + task_unlock(task); 
put_task_struct(task); goto err; } ns = nsp->mnt_ns; get_mnt_ns(ns); - rcu_read_unlock(); - task_lock(task); if (!task->fs) { task_unlock(task); put_task_struct(task); diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index b4ec59d159ac..35fa08fd7739 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -40,32 +40,28 @@ extern struct nsproxy init_nsproxy; * the namespaces access rules are: * * 1. only current task is allowed to change tsk->nsproxy pointer or - * any pointer on the nsproxy itself + * any pointer on the nsproxy itself. Current must hold the task_lock + * when changing tsk->nsproxy. * * 2. when accessing (i.e. reading) current task's namespaces - no * precautions should be taken - just dereference the pointers * * 3. the access to other task namespaces is performed like this - * rcu_read_lock(); - * nsproxy = task_nsproxy(tsk); + * task_lock(task); + * nsproxy = task->nsproxy; * if (nsproxy != NULL) { * / * * * work with the namespaces here * * e.g. 
get the reference on one of them * * / * } / * - * * NULL task_nsproxy() means that this task is + * * NULL task->nsproxy means that this task is * * almost dead (zombie) * * / - * rcu_read_unlock(); + * task_unlock(task); * */ -static inline struct nsproxy *task_nsproxy(struct task_struct *tsk) -{ - return rcu_dereference(tsk->nsproxy); -} - int copy_namespaces(unsigned long flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); diff --git a/ipc/namespace.c b/ipc/namespace.c index 59451c1e214d..b54468e48e32 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -154,11 +154,11 @@ static void *ipcns_get(struct task_struct *task) struct ipc_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) ns = get_ipc_ns(nsproxy->ipc_ns); - rcu_read_unlock(); + task_unlock(task); return ns; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8e7811086b82..ef42d0ab3115 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + task_lock(p); ns = p->nsproxy; + p->nsproxy = new; + task_unlock(p); - rcu_assign_pointer(p->nsproxy, new); - - if (ns && atomic_dec_and_test(&ns->count)) { - /* - * wait for others to get what they want from this nsproxy. 
- * - * cannot release this nsproxy via the call_rcu() since - * put_mnt_ns() will want to sleep - */ - synchronize_rcu(); + if (ns && atomic_dec_and_test(&ns->count)) free_nsproxy(ns); - } } void exit_task_namespaces(struct task_struct *p) diff --git a/kernel/utsname.c b/kernel/utsname.c index fd393124e507..883aaaa7de8a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task) struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->uts_ns; get_uts_ns(ns); } - rcu_read_unlock(); + task_unlock(task); return ns; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 85b62691f4f2..7c6b51a58968 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid) tsk = find_task_by_vpid(pid); if (tsk) { struct nsproxy *nsproxy; - nsproxy = task_nsproxy(tsk); + task_lock(tsk); + nsproxy = tsk->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); + task_unlock(tsk); } rcu_read_unlock(); return net; @@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task) struct net *net = NULL; struct nsproxy *nsproxy; - rcu_read_lock(); - nsproxy = task_nsproxy(task); + task_lock(task); + nsproxy = task->nsproxy; if (nsproxy) net = get_net(nsproxy->net_ns); - rcu_read_unlock(); + task_unlock(task); return net; } -- cgit v1.2.3 From df5601f9c3d831b4c478b004a1ed90a18643adbe Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 7 Oct 2013 15:37:19 +0200 Subject: tracehook_signal_handler: Remove sig, info, ka and regs These parameters are nowhere used, so we can remove them. 
Signed-off-by: Richard Weinberger --- include/linux/tracehook.h | 8 +------- kernel/signal.c | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 6f8ab7da27c4..84d497297c5f 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -133,10 +133,6 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) /** * tracehook_signal_handler - signal handler setup is complete - * @sig: number of signal being delivered - * @info: siginfo_t of signal being delivered - * @ka: sigaction setting that chose the handler - * @regs: user register state * @stepping: nonzero if debugger single-step or block-step in use * * Called by the arch code after a signal handler has been set up. @@ -146,9 +142,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) * Called without locks, shortly before returning to user mode * (or handling more signals). 
*/ -static inline void tracehook_signal_handler(int sig, siginfo_t *info, - const struct k_sigaction *ka, - struct pt_regs *regs, int stepping) +static inline void tracehook_signal_handler(int stepping) { if (stepping) ptrace_notify(SIGTRAP); diff --git a/kernel/signal.c b/kernel/signal.c index a4077e90f19f..c4d47661cc86 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2379,7 +2379,7 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, if (!(ka->sa.sa_flags & SA_NODEFER)) sigaddset(&blocked, sig); set_current_blocked(&blocked); - tracehook_signal_handler(sig, info, ka, regs, stepping); + tracehook_signal_handler(stepping); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) -- cgit v1.2.3 From 10b1c7ac8bfed429cf3dcb0225482c8dc1485d8e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 13 Jul 2014 13:36:04 +0200 Subject: Clean up signal_delivered() - Pass a ksignal struct to it - Remove unused regs parameter - Make it private as it's nowhere outside of kernel/signal.c is used Signed-off-by: Richard Weinberger --- include/linux/signal.h | 1 - kernel/signal.c | 21 ++++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index c9e65360c49a..b005cc3dc1dc 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -282,7 +282,6 @@ struct ksignal { extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); -extern void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); diff --git a/kernel/signal.c b/kernel/signal.c index c4d47661cc86..0d75cf875d44 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2353,19 +2353,15 @@ 
relock: /** * signal_delivered - - * @sig: number of signal being delivered - * @info: siginfo_t of signal being delivered - * @ka: sigaction setting that chose the handler - * @regs: user register state + * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been - * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER - * is set in @ka->sa.sa_flags. Tracing is notified. + * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ -void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, - struct pt_regs *regs, int stepping) +static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; @@ -2375,9 +2371,9 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, simply clear the restore sigmask flag. */ clear_restore_sigmask(); - sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); + sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask); + if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); tracehook_signal_handler(stepping); } @@ -2387,8 +2383,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping) if (failed) force_sigsegv(ksig->sig, current); else - signal_delivered(ksig->sig, &ksig->info, &ksig->ka, - signal_pt_regs(), stepping); + signal_delivered(ksig, stepping); } /* -- cgit v1.2.3 From 828b1f65d23cf8a68795739f6dd08fc8abd9ee64 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 7 Oct 2013 15:26:57 +0200 Subject: Rip out get_signal_to_deliver() Now we can turn get_signal() to the main function. 
Signed-off-by: Richard Weinberger --- include/linux/signal.h | 14 +------------- kernel/signal.c | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index b005cc3dc1dc..750196fcc0a5 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -280,7 +280,7 @@ struct ksignal { int sig; }; -extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); +extern int get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); @@ -300,18 +300,6 @@ static inline void disallow_signal(int sig) kernel_sigaction(sig, SIG_IGN); } -/* - * Eventually that'll replace get_signal_to_deliver(); macro for now, - * to avoid nastiness with include order. - */ -#define get_signal(ksig) \ -({ \ - struct ksignal *p = (ksig); \ - p->sig = get_signal_to_deliver(&p->info, &p->ka, \ - signal_pt_regs(), NULL);\ - p->sig > 0; \ -}) - extern struct kmem_cache *sighand_cachep; int unhandled_signal(struct task_struct *tsk, int sig); diff --git a/kernel/signal.c b/kernel/signal.c index 0d75cf875d44..5c6020040388 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2166,8 +2166,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, - struct pt_regs *regs, void *cookie) +int get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2237,13 +2236,13 @@ relock: goto relock; } - signr = dequeue_signal(current, &current->blocked, info); + signr = dequeue_signal(current, &current->blocked, &ksig->info); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && signr != SIGKILL) { - signr = 
ptrace_signal(signr, info); + signr = ptrace_signal(signr, &ksig->info); if (!signr) continue; } @@ -2251,13 +2250,13 @@ relock: ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ - trace_signal_deliver(signr, info, ka); + trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ - *return_ka = *ka; + ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; @@ -2307,7 +2306,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(info->si_signo))) { + if (likely(do_signal_stop(ksig->info.si_signo))) { /* It released the siglock. */ goto relock; } @@ -2328,7 +2327,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(info->si_signo); + print_fatal_signal(ksig->info.si_signo); proc_coredump_connector(current); /* * If it was able to dump core, this kills all @@ -2338,17 +2337,19 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(info); + do_coredump(&ksig->info); } /* * Death signals, no core dump. */ - do_group_exit(info->si_signo); + do_group_exit(ksig->info.si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); - return signr; + + ksig->sig = signr; + return ksig->sig > 0; } /** -- cgit v1.2.3 From 372ba8cb46b271a7662b92cbefedee56725f6bd0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Aug 2014 14:19:21 +0100 Subject: cpuidle: menu: Lookup CPU runqueues less The menu governor makes separate lookups of the CPU runqueue to get load and number of IO waiters but it can be done with a single lookup. Signed-off-by: Mel Gorman Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governors/menu.c | 17 +++++++---------- include/linux/sched.h | 3 +-- kernel/sched/core.c | 7 +++++++ kernel/sched/proc.c | 7 ------- 4 files changed, 15 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index f55d8260ec43..27702742b319 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -134,12 +134,9 @@ struct menu_device { #define LOAD_INT(x) ((x) >> FSHIFT) #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) -static int get_loadavg(void) +static inline int get_loadavg(unsigned long load) { - unsigned long this = this_cpu_load(); - - - return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10; + return LOAD_INT(load) * 10 + LOAD_FRAC(load) / 10; } static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters) @@ -175,13 +172,13 @@ static inline int which_bucket(unsigned int duration, unsigned long nr_iowaiters * to be, the higher this multiplier, and thus the higher * the barrier to go to an expensive C state. */ -static inline int performance_multiplier(unsigned long nr_iowaiters) +static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned long load) { int mult = 1; /* for higher loadavg, we are more reluctant */ - mult += 2 * get_loadavg(); + mult += 2 * get_loadavg(load); /* for IO wait tasks (per cpu!) 
we add 5x each */ mult += 10 * nr_iowaiters; @@ -296,7 +293,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); int i; unsigned int interactivity_req; - unsigned long nr_iowaiters; + unsigned long nr_iowaiters, cpu_load; if (data->needs_update) { menu_update(drv, dev); @@ -312,7 +309,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) /* determine the expected residency time, round up */ data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length()); - nr_iowaiters = nr_iowait_cpu(smp_processor_id()); + get_iowait_load(&nr_iowaiters, &cpu_load); data->bucket = which_bucket(data->next_timer_us, nr_iowaiters); /* @@ -331,7 +328,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) * duration / latency ratio. Adjust the latency limit if * necessary. */ - interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters); + interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); if (latency_req > interactivity_req) latency_req = interactivity_req; diff --git a/include/linux/sched.h b/include/linux/sched.h index 306f4f0c987a..641bd954bb5d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -168,8 +168,7 @@ extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); -extern unsigned long this_cpu_load(void); - +extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); extern void update_cpu_load_nohz(void); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3bdf01b494fe..863ef1d19563 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2385,6 +2385,13 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +void get_iowait_load(unsigned long *nr_waiters, 
unsigned long *load) +{ + struct rq *this = this_rq(); + *nr_waiters = atomic_read(&this->nr_iowait); + *load = this->cpu_load[0]; +} + #ifdef CONFIG_SMP /* diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 16f5a30f9c88..8ecd552fe4f2 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -8,13 +8,6 @@ #include "sched.h" -unsigned long this_cpu_load(void) -{ - struct rq *this = this_rq(); - return this->cpu_load[0]; -} - - /* * Global load-average calculations * -- cgit v1.2.3 From 021de3d904b88b1771a3a2cfc5b75023c391e646 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 6 Aug 2014 15:36:31 -0400 Subject: ring-buffer: Up rb_iter_peek() loop count to 3 After writting a test to try to trigger the bug that caused the ring buffer iterator to become corrupted, I hit another bug: WARNING: CPU: 1 PID: 5281 at kernel/trace/ring_buffer.c:3766 rb_iter_peek+0x113/0x238() Modules linked in: ipt_MASQUERADE sunrpc [...] CPU: 1 PID: 5281 Comm: grep Tainted: G W 3.16.0-rc3-test+ #143 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 0000000000000000 ffffffff81809a80 ffffffff81503fb0 0000000000000000 ffffffff81040ca1 ffff8800796d6010 ffffffff810c138d ffff8800796d6010 ffff880077438c80 ffff8800796d6010 ffff88007abbe600 0000000000000003 Call Trace: [] ? dump_stack+0x4a/0x75 [] ? warn_slowpath_common+0x7e/0x97 [] ? rb_iter_peek+0x113/0x238 [] ? rb_iter_peek+0x113/0x238 [] ? ring_buffer_iter_peek+0x2d/0x5c [] ? tracing_iter_reset+0x6e/0x96 [] ? s_start+0xd7/0x17b [] ? kmem_cache_alloc_trace+0xda/0xea [] ? seq_read+0x148/0x361 [] ? vfs_read+0x93/0xf1 [] ? SyS_read+0x60/0x8e [] ? tracesys+0xdd/0xe2 Debugging this bug, which triggers when the rb_iter_peek() loops too many times (more than 2 times), I discovered there's a case that can cause that function to legitimately loop 3 times! 
rb_iter_peek() is different than rb_buffer_peek() as the rb_buffer_peek() only deals with the reader page (it's for consuming reads). The rb_iter_peek() is for traversing the buffer without consuming it, and as such, it can loop for one more reason. That is, if we hit the end of the reader page or any page, it will go to the next page and try again. That is, we have this: 1. iter->head > iter->head_page->page->commit (rb_inc_iter() which moves the iter to the next page) try again 2. event = rb_iter_head_event() event->type_len == RINGBUF_TYPE_TIME_EXTEND rb_advance_iter() try again 3. read the event. But we never get to 3, because the count is greater than 2 and we cause the WARNING and return NULL. Up the counter to 3. Cc: stable@vger.kernel.org # 2.6.37+ Fixes: 69d1b839f7ee "ring-buffer: Bind time extend and data events together" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ff7027199a9a..31a9edd7aa93 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1984,7 +1984,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) /** * rb_update_event - update event type and data - * @event: the even to update + * @event: the event to update * @type: the type of event * @length: the size of the event field in the ring buffer * @@ -3764,12 +3764,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a time extend is encountered. - * Since the time extend is always attached to a data event, - * we should never loop more than once. - * (We never hit the following condition more than twice). + * We repeat when a time extend is encountered or we hit + * the end of the page. Since the time extend is always attached + * to a data event, we should never loop more than three times. 
+ * Once for going to next page, once on time extend, and + * finally once to get the event. + * (We never hit the following condition more than thrice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) -- cgit v1.2.3 From 651e22f2701b4113989237c3048d17337dd2185c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 6 Aug 2014 14:11:33 -0400 Subject: ring-buffer: Always reset iterator to reader page When performing a consuming read, the ring buffer swaps out a page from the ring buffer with a empty page and this page that was swapped out becomes the new reader page. The reader page is owned by the reader and since it was swapped out of the ring buffer, writers do not have access to it (there's an exception to that rule, but it's out of scope for this commit). When reading the "trace" file, it is a non consuming read, which means that the data in the ring buffer will not be modified. When the trace file is opened, a ring buffer iterator is allocated and writes to the ring buffer are disabled, such that the iterator will not have issues iterating over the data. Although the ring buffer disabled writes, it does not disable other reads, or even consuming reads. If a consuming read happens, then the iterator is reset and starts reading from the beginning again. My tests would sometimes trigger this bug on my i386 box: WARNING: CPU: 0 PID: 5175 at kernel/trace/trace.c:1527 __trace_find_cmdline+0x66/0xaa() Modules linked in: CPU: 0 PID: 5175 Comm: grep Not tainted 3.16.0-rc3-test+ #8 Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006 00000000 00000000 f09c9e1c c18796b3 c1b5d74c f09c9e4c c103a0e3 c1b5154b f09c9e78 00001437 c1b5d74c 000005f7 c10bd85a c10bd85a c1cac57c f09c9eb0 ed0e0000 f09c9e64 c103a185 00000009 f09c9e5c c1b5154b f09c9e78 f09c9e80^M Call Trace: [] dump_stack+0x4b/0x75 [] warn_slowpath_common+0x7e/0x95 [] ? 
__trace_find_cmdline+0x66/0xaa [] ? __trace_find_cmdline+0x66/0xaa [] warn_slowpath_fmt+0x33/0x35 [] __trace_find_cmdline+0x66/0xaa^M [] trace_find_cmdline+0x40/0x64 [] trace_print_context+0x27/0xec [] ? trace_seq_printf+0x37/0x5b [] print_trace_line+0x319/0x39b [] ? ring_buffer_read+0x47/0x50 [] s_show+0x192/0x1ab [] ? s_next+0x5a/0x7c [] seq_read+0x267/0x34c [] vfs_read+0x8c/0xef [] ? seq_lseek+0x154/0x154 [] SyS_read+0x54/0x7f [] syscall_call+0x7/0xb ---[ end trace 3f507febd6b4cc83 ]--- >>>> ##### CPU 1 buffer started #### Which was the __trace_find_cmdline() function complaining about the pid in the event record being negative. After adding more test cases, this would trigger more often. Strangely enough, it would never trigger on a single test, but instead would trigger only when running all the tests. I believe that was the case because it required one of the tests to be shutting down via delayed instances while a new test started up. After spending several days debugging this, I found that it was caused by the iterator becoming corrupted. Debugging further, I found out why the iterator became corrupted. It happened with the rb_iter_reset(). As consuming reads may not read the full reader page, and only part of it, there's a "read" field to know where the last read took place. The iterator, must also start at the read position. In the rb_iter_reset() code, if the reader page was disconnected from the ring buffer, the iterator would start at the head page within the ring buffer (where writes still happen). But the mistake there was that it still used the "read" field to start the iterator on the head page, where it should always start at zero because readers never read from within the ring buffer where writes occur. I originally wrote a patch to have it set the iter->head to 0 instead of iter->head_page->read, but then I questioned why it wasn't always setting the iter to point to the reader page, as the reader page is still valid. 
The list_empty(reader_page->list) just means that it was successful in swapping out. But the reader_page may still have data. There was a bug report a long time ago that was not reproducible that had something about trace_pipe (consuming read) not matching trace (iterator read). This may explain why that happened. Anyway, the correct answer to this bug is to always use the reader page and not reset the iterator to inside the writable ring buffer. Cc: stable@vger.kernel.org # 2.6.28+ Fixes: d769041f8653 "ring_buffer: implement new locking" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 31a9edd7aa93..b95381ebdd5e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3357,21 +3357,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; /* Iterator usage is expected to have record disabled */ - if (list_empty(&cpu_buffer->reader_page->list)) { - iter->head_page = rb_set_head_page(cpu_buffer); - if (unlikely(!iter->head_page)) - return; - iter->head = iter->head_page->read; - } else { - iter->head_page = cpu_buffer->reader_page; - iter->head = cpu_buffer->reader_page->read; - } + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + + iter->cache_reader_page = iter->head_page; + iter->cache_read = iter->head; + if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; - iter->cache_reader_page = cpu_buffer->reader_page; - iter->cache_read = cpu_buffer->read; } /** -- cgit v1.2.3 From 84c91b7ae07c62cf6dee7fde3277f4be21331f85 Mon Sep 17 00:00:00 2001 From: "Lee, Chun-Yi" Date: Mon, 4 Aug 2014 23:23:21 +0800 Subject: PM / hibernate: avoid unsafe pages in e820 reserved regions When the machine doesn't well handle 
the e820 persistent when hibernate resuming, then it may cause page fault when writing image to snapshot buffer: [ 17.929495] BUG: unable to handle kernel paging request at ffff880069d4f000 [ 17.933469] IP: [] load_image_lzo+0x810/0xe40 [ 17.933469] PGD 2194067 PUD 77ffff067 PMD 2197067 PTE 0 [ 17.933469] Oops: 0002 [#1] SMP ... The ffff880069d4f000 page is in e820 reserved region of resume boot kernel: [ 0.000000] BIOS-e820: [mem 0x0000000069d4f000-0x0000000069e12fff] reserved ... [ 0.000000] PM: Registered nosave memory: [mem 0x69d4f000-0x69e12fff] So snapshot.c mark the pfn to forbidden pages map. But, this page is also in the memory bitmap in snapshot image because it's an original page used by image kernel, so it will also mark as an unsafe(free) page in prepare_image(). That means the page in e820 when resuming mark as "forbidden" and "free", it causes get_buffer() treat it as an allocated unsafe page. Then snapshot_write_next() return this page to load_image, load_image writing content to this address, but this page didn't really allocated . So, we got page fault. Although the root cause is from BIOS, I think aggressive check and significant message in kernel will better then a page fault for issue tracking, especially when serial console unavailable. This patch adds code in mark_unsafe_pages() for check does free pages in nosave region. If so, then it print message and return fault to stop whole S4 resume process: [ 8.166004] PM: Image loading progress: 0% [ 8.658717] PM: 0x6796c000 in e820 nosave region: [mem 0x6796c000-0x6796cfff] [ 8.918737] PM: Read 2511940 kbytes in 1.04 seconds (2415.32 MB/s) [ 8.926633] PM: Error -14 resuming [ 8.933534] PM: Failed to load hibernation image, recovering. Reviewed-by: Takashi Iwai Acked-by: Pavel Machek Signed-off-by: Lee, Chun-Yi [rjw: Subject] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/snapshot.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4fc5c32422b3..c4b8093c80b3 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -954,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm) } } +static bool is_nosave_page(unsigned long pfn) +{ + struct nosave_region *region; + + list_for_each_entry(region, &nosave_regions, list) { + if (pfn >= region->start_pfn && pfn < region->end_pfn) { + pr_err("PM: %#010llx in e820 nosave region: " + "[mem %#010llx-%#010llx]\n", + (unsigned long long) pfn << PAGE_SHIFT, + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); + return true; + } + } + + return false; +} + /** * create_basic_memory_bitmaps - create bitmaps needed for marking page * frames that should not be saved and free page frames. The pointers @@ -2015,7 +2034,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) do { pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn))) + if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) swsusp_set_page_free(pfn_to_page(pfn)); else return -EFAULT; -- cgit v1.2.3 From bab5e2d6522bc3cb892c1e8aaafecab05bed9d85 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:03:22 -0700 Subject: kernel/auditfilter.c: replace count*size kmalloc by kcalloc kcalloc manages count*sizeof overflow. 
Signed-off-by: Fabian Frederick Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c3dbb7..c447cd9848d1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) if (unlikely(!entry)) return NULL; - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; @@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { -- cgit v1.2.3 From 656c3b79f782a235413087168b61ff279034d860 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 6 Aug 2014 16:04:03 -0700 Subject: kernel/watchdog.c: convert printk/pr_warning to pr_foo() Replace some obsolete functions. 
Signed-off-by: Fabian Frederick Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd1b040..51b29e9d2ba6 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -484,7 +486,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", -- cgit v1.2.3 From ed4d4902ebdd7ca8b5a51daaf6bebf4b172895cc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 Aug 2014 16:06:54 -0700 Subject: mm, hugetlb: remove hugetlb_zero and hugetlb_infinity They are unnecessary: "zero" can be used in place of "hugetlb_zero" and passing extra2 == NULL is equivalent to infinity. Signed-off-by: David Rientjes Cc: Joonsoo Kim Reviewed-by: Naoya Horiguchi Reviewed-by: Luiz Capitulino Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 - kernel/sysctl.c | 9 +++------ mm/hugetlb.c | 1 - 3 files changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a23c096b3080..6e6d338641fe 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -87,7 +87,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); #endif extern unsigned long hugepages_treat_as_movable; -extern const unsigned long hugetlb_zero, hugetlb_infinity; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e22a72c..75875a741b5e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a0fcb33973e..d9ad93b55585 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,6 @@ #include #include "internal.h" -const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; -- cgit v1.2.3 From fb794bcbb4e5552242f9a4c5e1ffe4c6da29a968 Mon Sep 17 00:00:00 2001 From: 
David Rientjes Date: Wed, 6 Aug 2014 16:07:58 -0700 Subject: mm, oom: remove unnecessary exit_state check The oom killer scans each process and determines whether it is eligible for oom kill or whether the oom killer should abort because of concurrent memory freeing. It will abort when an eligible process is found to have TIF_MEMDIE set, meaning it has already been oom killed and we're waiting for it to exit. Processes with task->mm == NULL should not be considered because they are either kthreads or have already detached their memory and killing them would not lead to memory freeing. That memory is only freed after exit_mm() has returned, however, and not when task->mm is first set to NULL. Clear TIF_MEMDIE after exit_mm()'s mmput() so that an oom killed process is no longer considered for oom kill, but only until exit_mm() has returned. This was fragile in the past because it relied on exit_notify() to be reached before no longer considering TIF_MEMDIE processes. Signed-off-by: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 + mm/oom_kill.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668f1799..88c6b3e42583 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -455,6 +455,7 @@ static void exit_mm(struct task_struct * tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); + clear_thread_flag(TIF_MEMDIE); } /* diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d33aca1552ad..1e11df8fa7ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -258,8 +258,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill) { - if (task->exit_state) - return OOM_SCAN_CONTINUE; if (oom_unkillable_task(task, NULL, nodemask)) return OOM_SCAN_CONTINUE; -- cgit v1.2.3 From 618fde872163e782183ce574c77f1123e2be8887 Mon Sep 17 00:00:00 2001 From: Sasha Levin 
Date: Wed, 6 Aug 2014 16:08:14 -0700 Subject: kernel/smp.c:on_each_cpu_cond(): fix warning in fallback path The rarely-executed memory-allocation-failed callback path generates a WARN_ON_ONCE() when smp_call_function_single() succeeds. Presumably it's supposed to warn on failures. Signed-off-by: Sasha Levin Cc: Christoph Lameter Cc: Gilad Ben-Yossef Cc: David Rientjes Cc: Joonsoo Kim Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 487653b5844f..aff8aa14f547 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -670,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), if (cond_func(cpu, info)) { ret = smp_call_function_single(cpu, func, info, wait); - WARN_ON_ONCE(!ret); + WARN_ON_ONCE(ret); } preempt_enable(); } -- cgit v1.2.3 From 7030017752437cebc3ec5590735bd89ead1e4cb8 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:49 -0700 Subject: printk: make dynamic kernel ring buffer alignment explicit We have to consider alignment for the ring buffer both for the default static size, and then also for when a dynamic allocation is made when the log_buf_len=n kernel parameter is passed to set the size specifically to a size larger than the default size set by the architecture through CONFIG_LOG_BUF_SHIFT. The default static kernel ring buffer can be aligned properly if architectures set CONFIG_LOG_BUF_SHIFT properly, we provide ranges for the size though so even if CONFIG_LOG_BUF_SHIFT has a sensible aligned value it can be reduced to a non aligned value. Commit 6ebb017de9 ("printk: Fix alignment of buf causing crash on ARM EABI") by Andrew Lunn ensures the static buffer is always aligned and the decision of alignment is done by the compiler by using __alignof__(struct log). When log_buf_len=n is used we allocate the ring buffer dynamically.
Dynamic allocation varies: for the early allocation called before setup_arch() memblock_virt_alloc() requests a page alignment and for the default kernel allocation memblock_virt_alloc_nopanic() requests no special alignment, which in turn ends up aligning the allocation to SMP_CACHE_BYTES, which is L1 cache aligned. Since we already have the required alignment for the kernel ring buffer though we can do better and request explicit alignment for LOG_ALIGN. This does that to be safe and make dynamic allocation alignment explicit. Signed-off-by: Luis R. Rodriguez Tested-by: Petr Mladek Acked-by: Petr Mladek Cc: Andrew Lunn Cc: Stephen Warren Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839dbca07..6f598f92f2a1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -853,9 +853,10 @@ void __init setup_log_buf(int early) if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); + memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + LOG_ALIGN); } if (unlikely(!new_log_buf)) { -- cgit v1.2.3 From c0a318a361e7652b8c4f7b91d3a31c771cf34e4f Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:52 -0700 Subject: printk: move power of 2 practice of ring buffer size to a helper In practice the power of 2 practice of the size of the kernel ring buffer remains purely historical but not a requirement, especially now that we have LOG_ALIGN and use it for both static and dynamic allocations.
It could have helped with implicit alignment back in the days given the even the dynamically sized ring buffer was guaranteed to be aligned so long as CONFIG_LOG_BUF_SHIFT was set to produce a __LOG_BUF_LEN which is architecture aligned, since log_buf_len=n would be allowed only if it was > __LOG_BUF_LEN and we always ended up rounding the log_buf_len=n to the next power of 2 with roundup_pow_of_two(), any multiple of 2 then should be also architecture aligned. These assumptions of course relied heavily on CONFIG_LOG_BUF_SHIFT producing an aligned value but users can always change this. We now have precise alignment requirements set for the log buffer size for both static and dynamic allocations, but lets upkeep the old practice of using powers of 2 for its size to help with easy expected scalable values and the allocators for dynamic allocations. We'll reuse this later so move this into a helper. Signed-off-by: Luis R. Rodriguez Cc: Andrew Lunn Cc: Stephen Warren Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6f598f92f2a1..32ad0c7a0cd3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -828,15 +828,21 @@ void log_buf_kexec_setup(void) /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size) { - unsigned size = memparse(str, &str); - if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = size; +} + +/* save 
requested log_buf_len since it's too early to process it */ +static int __init log_buf_len_setup(char *str) +{ + unsigned size = memparse(str, &str); + + log_buf_len_update(size); return 0; } -- cgit v1.2.3 From f54051722e5715d24cd4469606ebdf488b6d5779 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:54 -0700 Subject: printk: make dynamic units clear for the kernel ring buffer Signed-off-by: Luis R. Rodriguez Suggested-by: Davidlohr Bueso Cc: Andrew Lunn Cc: Stephen Warren Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 32ad0c7a0cd3..db290be32984 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -879,7 +879,7 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); - pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", free, (free * 100) / __LOG_BUF_LEN); } -- cgit v1.2.3 From 23b2899f7f194f06e09b52a1f46f027a21fae17c Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Wed, 6 Aug 2014 16:08:56 -0700 Subject: printk: allow increasing the ring buffer depending on the number of CPUs The default size of the ring buffer is too small for machines with a large amount of CPUs under heavy load. What ends up happening when debugging is the ring buffer overlaps and chews up old messages making debugging impossible unless the size is passed as a kernel parameter. 
An idle system upon boot up will on average spew out only about one or two extra lines but where this really matters is on heavy load and that will vary widely depending on the system and environment. There are mechanisms to help increase the kernel ring buffer for tracing through debugfs, and those interfaces even allow growing the kernel ring buffer per CPU. We also have a static value which can be passed upon boot. Relying on debugfs however is not ideal for production, and the value passed upon bootup can only be used *after* an issue has crept up. Instead of being reactive this adds a proactive measure which lets you scale the amount of contributions you'd expect to the kernel ring buffer under load by each CPU in the worst case scenario. We use num_possible_cpus() to avoid complexities which could be introduced by dynamically changing the ring buffer size at run time; num_possible_cpus() lets us use the upper limit on the possible number of CPUs, therefore avoiding having to deal with hotplugging CPUs on and off. This introduces the kernel configuration option LOG_CPU_MAX_BUF_SHIFT which is used to specify the maximum amount of contributions to the kernel ring buffer in the worst case before the kernel ring buffer flips over; the size is specified as a power of 2. The total amount of contributions made by each CPU must be greater than half of the default kernel ring buffer size (1 << LOG_BUF_SHIFT bytes) in order to trigger an increase upon bootup. The kernel ring buffer is increased to the next power of two that would fit the required minimum kernel ring buffer size plus the additional CPU contribution. For example if LOG_BUF_SHIFT is 18 (256 KB) you'd require at least 128 KB contributions by other CPUs in order to trigger an increase of the kernel ring buffer. With a LOG_CPU_MAX_BUF_SHIFT of 12 (4 KB) you'd require more than 64 possible CPUs to trigger an increase.
If you had 128 possible CPUs the amount of minimum required kernel ring buffer bumps to: ((1 << 18) + ((128 - 1) * (1 << 12))) / 1024 = 764 KB Since we require the ring buffer to be a power of two the new required size would be 1024 KB. This CPU contributions are ignored when the "log_buf_len" kernel parameter is used as it forces the exact size of the ring buffer to an expected power of two value. [pmladek@suse.cz: fix build] Signed-off-by: Luis R. Rodriguez Signed-off-by: Petr Mladek Tested-by: Davidlohr Bueso Tested-by: Petr Mladek Reviewed-by: Davidlohr Bueso Cc: Andrew Lunn Cc: Stephen Warren Cc: Michal Hocko Cc: Petr Mladek Cc: Joe Perches Cc: Arun KS Cc: Kees Cook Cc: Davidlohr Bueso Cc: Chris Metcalf Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 8 +++++-- init/Kconfig | 46 +++++++++++++++++++++++++++++++++---- kernel/printk/printk.c | 34 +++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 883901b9ac4f..9344d833b7ea 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1716,8 +1716,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 7 (KERN_DEBUG) debug-level messages log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two. The default - size is set in the kernel config file. + in bytes. n must be a power of two and greater + than the minimal size. The minimal size is defined + by LOG_BUF_SHIFT kernel config parameter. There is + also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter + that allows to increase the default size depending on + the number of CPUs. See init/Kconfig for more details. logo.nologo [FB] Disables display of the built-in Linux logo. 
This may be used to provide more screen space for diff --git a/init/Kconfig b/init/Kconfig index 41066e49e880..a291b7ef4738 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -807,15 +807,53 @@ config LOG_BUF_SHIFT range 12 21 default 17 help - Select kernel log buffer size as a power of 2. + Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + Examples: - 17 => 128 KB + 17 => 128 KB 16 => 64 KB - 15 => 32 KB - 14 => 16 KB + 15 => 32 KB + 14 => 16 KB 13 => 8 KB 12 => 4 KB +config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + help + This option allows to increase the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only few + lines however it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than the half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 64 CPUs are needed to trigger the allocation. + + Also this option is ignored when "log_buf_len" kernel parameter is + used as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation ignoring + hotplugging making the compuation optimal for the the worst case + scenerio while allowing a simple algorithm to be used from bootup. 
+ + Examples shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + # # Architectures with an unreliable sched_clock() should select this: # diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index db290be32984..f855ec36dff9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -266,6 +266,7 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -848,12 +849,45 @@ static int __init log_buf_len_setup(char *str) } early_param("log_buf_len", log_buf_len_setup); +static void __init log_buf_add_cpu(void) +{ + unsigned int cpu_extra; + + /* + * archs should set up cpu_possible_bits properly with + * set_cpu_possible() after setup_arch() but just in + * case lets ensure this is valid. 
+ */ + if (num_possible_cpus() == 1) + return; + + cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + + /* by default this will only continue through for large > 64 CPUs */ + if (cpu_extra <= __LOG_BUF_LEN / 2) + return; + + pr_info("log_buf_len individual max cpu contribution: %d bytes\n", + __LOG_CPU_MAX_BUF_LEN); + pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", + cpu_extra); + pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + + log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; int free; + if (log_buf != __log_buf) + return; + + if (!early && !new_log_buf_len) + log_buf_add_cpu(); + if (!new_log_buf_len) return; -- cgit v1.2.3 From e97e1267e9faa6480898a1fc34c8e40d74d702f2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:08:59 -0700 Subject: printk: tweak do_syslog() to match comments In do_syslog() there's a path used by kmsg_poll() and kmsg_read() that only needs to know whether there's any data available to read (and not its size). These callers only check for non-zero return. As a shortcut, do_syslog() returns the difference between what has been logged and what has been "seen." The comments say that the "count of records" should be returned but it's not. Instead it returns (log_next_idx - syslog_idx), which is a difference between buffer offsets--and the result could be negative. The behavior is the same (it'll be zero or not in the same cases), but the count of records is more meaningful and it matches what the comments say. So change the code to return that. 
Signed-off-by: Alex Elder Cc: Petr Mladek Cc: Jan Kara Cc: Joe Perches Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index f855ec36dff9..ec3bfb0b1f62 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1351,7 +1351,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_idx - syslog_idx; + error = log_next_seq - syslog_seq; } else { u64 seq = syslog_seq; u32 idx = syslog_idx; -- cgit v1.2.3 From 42a9dc0b3d0f749375c767c7d5cab56e89160576 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:01 -0700 Subject: printk: rename DEFAULT_MESSAGE_LOGLEVEL Commit a8fe19ebfbfd ("kernel/printk: use symbolic defines for console loglevels") makes consistent use of symbolic values for printk() log levels. The naming scheme used is different from the one used for DEFAULT_MESSAGE_LOGLEVEL though. Change that symbol name to be MESSAGE_LOGLEVEL_DEFAULT for consistency. And because the value of that symbol comes from a similarly-named config option, rename CONFIG_DEFAULT_MESSAGE_LOGLEVEL as well. Signed-off-by: Alex Elder Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Petr Mladek Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 2 +- kernel/printk/printk.c | 2 +- lib/Kconfig.debug | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 319ff7e53efb..0990997a5304 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -31,7 +31,7 @@ static inline const char *printk_skip_level(const char *buffer) } /* printk's without a loglevel use this.. 
*/ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL +#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT /* We show everything that is MORE important than this.. */ #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ec3bfb0b1f62..770ed4821ba9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -56,7 +56,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cfe7df8f62cc..cb45f59685e6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -15,7 +15,7 @@ config PRINTK_TIME The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt -config DEFAULT_MESSAGE_LOGLEVEL +config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" range 1 7 default "4" -- cgit v1.2.3 From 0b90fec3b990b50d77944bc73c1ba4b031dfa52f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:03 -0700 Subject: printk: fix some comments Fix a few comments that don't accurately describe their corresponding code. It also fixes some minor typographical errors. 
Signed-off-by: Alex Elder Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 770ed4821ba9..4bae344c1ec3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -113,9 +113,9 @@ static int __down_trylock_console_sem(unsigned long ip) * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held). */ static int console_locked, console_suspended; @@ -146,8 +146,8 @@ static int console_may_schedule; * the overall length of the record. * * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored. * * If the heads indicate available messages, the length in the header * tells the start next message. 
A length == 0 for the next message @@ -345,7 +345,7 @@ static int log_make_free_space(u32 msg_size) while (log_first_seq < log_next_seq) { if (logbuf_has_space(msg_size, false)) return 0; - /* drop old messages until we have enough continuous space */ + /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } @@ -1517,7 +1517,7 @@ static struct cont { struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ + u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1922,11 +1922,12 @@ static int __add_preferred_console(char *name, int idx, char *options, return 0; } /* - * Set up a list of consoles. Called from init/main.c + * Set up a console. Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; @@ -2086,8 +2087,8 @@ EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. 
*/ -- cgit v1.2.3 From e99aa461660a6413b11da887fb499e04a0f46803 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:05 -0700 Subject: printk: use a clever macro Use the IS_ENABLED() macro rather than #ifdef blocks to set certain global values. Signed-off-by: Alex Elder Acked-by: Borislav Petkov Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Jan Kara Cc: John Stultz Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4bae344c1ec3..ac86838227ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -454,11 +454,7 @@ static int log_store(int facility, int level, return msg->text_len; } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { @@ -988,11 +984,7 @@ static inline void boot_delay_msec(int level) } #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) -- cgit v1.2.3 From 249771b8307e7a91659d8b273f8b70d48c3a7bfc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Aug 2014 16:09:08 -0700 Subject: printk: miscellaneous cleanups Some small cleanups to kernel/printk/printk.c. None of them should cause any change in behavior. - When CONFIG_PRINTK is defined, parenthesize the value of LOG_LINE_MAX. - When CONFIG_PRINTK is *not* defined, there is an extra LOG_LINE_MAX definition; delete it. - Pull an assignment out of a conditional expression in console_setup(). - Use isdigit() in console_setup() rather than open coding it. 
- In update_console_cmdline(), drop a NUL-termination assignment; the strlcpy() call that precedes it guarantees it's not needed. - Simplify some logic in printk_timed_ratelimit(). Signed-off-by: Alex Elder Reviewed-by: Petr Mladek Cc: Andi Kleen Cc: Borislav Petkov Cc: Jan Kara Cc: John Stultz Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ac86838227ed..5eb0e6c800bb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -257,7 +258,7 @@ static u64 clear_seq; static u32 clear_idx; #define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX +#define LOG_LINE_MAX (1024 - PREFIX_MAX) /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -1835,7 +1836,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 + static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; @@ -1936,7 +1937,8 @@ static int __init console_setup(char *str) strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) + options = strchr(str, ','); + if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) @@ -1945,7 +1947,7 @@ static int __init console_setup(char *str) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') + if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; @@ -1984,7 +1986,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha i++, c++) if (strcmp(c->name, name) == 0 && c->index == idx) { strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; c->options = options; c->index = idx_new; return i; @@ -2652,14 +2653,13 @@ 
EXPORT_SYMBOL(__printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; + unsigned long elapsed = jiffies - *caller_jiffies; + + if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) + return false; + + *caller_jiffies = jiffies; + return true; } EXPORT_SYMBOL(printk_timed_ratelimit); -- cgit v1.2.3 From 5874af2003b1aaaa053128d655710140e3187226 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Aug 2014 16:09:10 -0700 Subject: printk: enable interrupts before calling console_trylock_for_printk() We need interrupts disabled when calling console_trylock_for_printk() only so that cpu id we pass to can_use_console() remains valid (for other things console_sem provides all the exclusion we need and deadlocks on console_sem due to interrupts are impossible because we use down_trylock()). However if we are rescheduled, we are guaranteed to run on an online cpu so we can easily just get the cpu id in can_use_console(). We can lose a bit of performance when we enable interrupts in vprintk_emit() and then disable them again in console_unlock() but OTOH it can somewhat reduce interrupt latency caused by console_unlock(). We differ from (reverted) commit 939f04bec1a4 in that we avoid calling console_unlock() from vprintk_emit() with lockdep enabled as that has unveiled quite some bugs leading to system freezes during boot (e.g. https://lkml.org/lkml/2014/5/30/242, https://lkml.org/lkml/2014/6/28/521). 
Signed-off-by: Jan Kara Tested-by: Andreas Bombe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5eb0e6c800bb..df202fe0974a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1450,10 +1450,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1466,8 +1465,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1642,7 +1643,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1750,21 +1752,30 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); /* If called from the scheduler, we can not call up(). 
*/ if (!in_sched) { + lockdep_off(); + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); + preempt_enable(); + lockdep_on(); } - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -- cgit v1.2.3 From d25d9feced6c94398979a035868f03e8e8d49ce8 Mon Sep 17 00:00:00 2001 From: Neil Zhang Date: Wed, 6 Aug 2014 16:09:12 -0700 Subject: kernel/printk/printk.c: fix bool assignements Fix coccinelle warnings. Signed-off-by: Neil Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index df202fe0974a..de1a6bb6861d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -919,7 +919,7 @@ static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { - ignore_loglevel = 1; + ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; @@ -2005,12 +2005,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha return -1; } -bool console_suspend_enabled = 1; +bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { - console_suspend_enabled = 0; + console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); -- cgit v1.2.3 From ecfdb33d1fbc7e6e095ba24dac2930208494e734 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 14:44:49 -0400 Subject: acct: encode_comp_t(0) is 
0, fortunately... There was an amusing bogosity in ac_rw calculation - it tried to do encode_comp_t(encode_comp_t(0) / 1024). Seeing that comp_t is a 3-bit exponent + 13-bit mantissa... it's a good thing that 0 is represented by all-bits-clear. The history of that one is interesting - it was introduced in 2.1.68pre1, when acct.c had been reworked and moved to separate file. Two months later (2.1.86) somebody has noticed that the sucker won't compile - there was no task_struct::io_usage. At which point the ac_io calculation had changed from encode_comp_t(current->io_usage) to encode_comp_t(0) and the bug in the next line (absolutely real back then, had it ever managed to compile) become a harmless bogosity. Looks like nobody has ever noticed until now. Anyway, let's bury that idiocy now that it got noticed. 17 years is long enough... Signed-off-by: Al Viro --- kernel/acct.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a1844f14c6d6..807ebc5d8333 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -531,9 +531,6 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; spin_unlock_irq(&current->sighand->siglock); - ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ - ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_swaps = encode_comp_t(0); /* * Get freeze protection.
If the fs is frozen, just skip the write -- cgit v1.2.3 From ed44724b79d8e03a40665436019cf22baba80d30 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 14:37:20 -0400 Subject: acct: switch to __kernel_write() Signed-off-by: Al Viro --- fs/internal.h | 1 - include/linux/fs.h | 1 + kernel/acct.c | 31 ++++++++++++------------------- 3 files changed, 13 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/fs/internal.h b/fs/internal.h index 465742407466..9a2edba87c2b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -131,7 +131,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, /* * read_write.c */ -extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); extern int rw_verify_area(int, struct file *, const loff_t *, size_t); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index e11d60cc867b..4b7d57cf7863 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2335,6 +2335,7 @@ extern int do_pipe_flags(int *, int); extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern ssize_t kernel_write(struct file *, const char *, size_t, loff_t); +extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ diff --git a/kernel/acct.c b/kernel/acct.c index 807ebc5d8333..8082d9875d6b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -456,12 +456,16 @@ static void do_acct_process(struct bsd_acct_struct *acct, { struct pacct_struct *pacct = &current->signal->pacct; acct_t ac; - mm_segment_t fs; unsigned long flim; u64 elapsed, run_time; struct tty_struct *tty; const struct cred *orig_cred; + /* + * Accounting records are not subject to resource limits.
+ */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; /* Perform file operations on behalf of whoever enabled accounting */ orig_cred = override_creds(file->f_cred); @@ -536,25 +540,14 @@ static void do_acct_process(struct bsd_acct_struct *acct, * Get freeze protection. If the fs is frozen, just skip the write * as we could deadlock the system otherwise. */ - if (!file_start_write_trylock(file)) - goto out; - /* - * Kernel segment override to datasegment and write it - * to the accounting file. - */ - fs = get_fs(); - set_fs(KERNEL_DS); - /* - * Accounting records are not subject to resource limits. - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - file->f_op->write(file, (char *)&ac, - sizeof(acct_t), &file->f_pos); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; - set_fs(fs); - file_end_write(file); + if (file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos); + file_end_write(file); + } out: + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; revert_creds(orig_cred); } -- cgit v1.2.3 From cdd37e23092c3c6fbbb2e611f8c3d18e676bf28f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Apr 2014 23:45:53 -0400 Subject: separate namespace-independent parts of filling acct_t Signed-off-by: Al Viro --- kernel/acct.c | 98 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 8082d9875d6b..efa891beeaa3 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -448,42 +448,20 @@ static u32 encode_float(u64 value) * do_exit() or when switching to a different output file. */ -/* - * do_acct_process does all actual work. Caller holds the reference to file. 
- */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) +static void fill_ac(acct_t *ac) { struct pacct_struct *pacct = &current->signal->pacct; - acct_t ac; - unsigned long flim; u64 elapsed, run_time; struct tty_struct *tty; - const struct cred *orig_cred; - - /* - * Accounting records are not subject to resource limits. - */ - flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - /* Perform file operations on behalf of whoever enabled accounting */ - orig_cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. - */ - if (!check_free_space(acct, file)) - goto out; /* * Fill the accounting struct with the needed info as recorded * by the different kernel functions. */ - memset(&ac, 0, sizeof(acct_t)); + memset(ac, 0, sizeof(acct_t)); - ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; - strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); + ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER; + strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm)); /* calculate run_time in nsec*/ run_time = ktime_get_ns(); @@ -491,27 +469,66 @@ static void do_acct_process(struct bsd_acct_struct *acct, /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 - ac.ac_etime = encode_float(elapsed); + ac->ac_etime = encode_float(elapsed); #else - ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? + ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
(unsigned long) elapsed : (unsigned long) -1l); #endif #if ACCT_VERSION==1 || ACCT_VERSION==2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); - ac.ac_etime_hi = etime >> 16; - ac.ac_etime_lo = (u16) etime; + ac->ac_etime_hi = etime >> 16; + ac->ac_etime_lo = (u16) etime; } #endif do_div(elapsed, AHZ); - ac.ac_btime = get_seconds() - elapsed; + ac->ac_btime = get_seconds() - elapsed; +#if ACCT_VERSION==2 + ac->ac_ahz = AHZ; +#endif + + spin_lock_irq(&current->sighand->siglock); + tty = current->signal->tty; /* Safe as we hold the siglock */ + ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; + ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac->ac_flag = pacct->ac_flag; + ac->ac_mem = encode_comp_t(pacct->ac_mem); + ac->ac_minflt = encode_comp_t(pacct->ac_minflt); + ac->ac_majflt = encode_comp_t(pacct->ac_majflt); + ac->ac_exitcode = pacct->ac_exitcode; + spin_unlock_irq(&current->sighand->siglock); +} +/* + * do_acct_process does all actual work. Caller holds the reference to file. + */ +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *file) +{ + acct_t ac; + unsigned long flim; + const struct cred *orig_cred; + + /* + * Accounting records are not subject to resource limits. + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); + + /* + * First check to see if there is enough free_space to continue + * the process accounting system.
+ */ + if (!check_free_space(acct, file)) + goto out; + + fill_ac(&ac); /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 - ac.ac_ahz = AHZ; -#endif #if ACCT_VERSION==1 || ACCT_VERSION==2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; @@ -523,19 +540,6 @@ static void do_acct_process(struct bsd_acct_struct *acct, ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); #endif - - spin_lock_irq(&current->sighand->siglock); - tty = current->signal->tty; /* Safe as we hold the siglock */ - ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); - ac.ac_flag = pacct->ac_flag; - ac.ac_mem = encode_comp_t(pacct->ac_mem); - ac.ac_minflt = encode_comp_t(pacct->ac_minflt); - ac.ac_majflt = encode_comp_t(pacct->ac_majflt); - ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock_irq(&current->sighand->siglock); - /* * Get freeze protection. If the fs is frozen, just skip the write * as we could deadlock the system otherwise.
-- cgit v1.2.3 From e25ff11ff16aba000dfe9e568d867e5142c31f16 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 May 2014 05:12:09 -0400 Subject: split the slow path in acct_process() off Signed-off-by: Al Viro --- kernel/acct.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index efa891beeaa3..51188603b258 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -599,34 +599,35 @@ void acct_collect(long exitcode, int group_dead) spin_unlock_irq(&current->sighand->siglock); } -static void acct_process_in_ns(struct pid_namespace *ns) +static void slow_acct_process(struct pid_namespace *ns) { - struct file *file = NULL; - struct bsd_acct_struct *acct; + for ( ; ns; ns = ns->parent) { + struct file *file = NULL; + struct bsd_acct_struct *acct; - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - return; + acct = ns->bacct; + /* + * accelerate the common fastpath: + */ + if (!acct || !acct->file) + continue; - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { + spin_lock(&acct_lock); + file = acct->file; + if (unlikely(!file)) { + spin_unlock(&acct_lock); + continue; + } + get_file(file); spin_unlock(&acct_lock); - return; - } - get_file(file); - spin_unlock(&acct_lock); - do_acct_process(acct, ns, file); - fput(file); + do_acct_process(acct, ns, file); + fput(file); + } } /** - * acct_process - now just a wrapper around acct_process_in_ns, - * which in turn is a wrapper around do_acct_process. + * acct_process * * handles process accounting for an exiting task */ @@ -639,6 +640,11 @@ void acct_process(void) * alive and holds its namespace, which in turn holds * its parent.
*/ - for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) - acct_process_in_ns(ns); + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { + struct bsd_acct_struct *acct = ns->bacct; + if (acct && acct->file) + break; + } + if (unlikely(ns)) + slow_acct_process(ns); } -- cgit v1.2.3 From 795a2f22a8eaf749e20a11271a8821bf04ac6d90 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 May 2014 05:23:41 -0400 Subject: acct() should honour the limits from the very beginning We need to check free space on the first write to freshly opened log. Signed-off-by: Al Viro --- kernel/acct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 51188603b258..87773725a0dc 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -180,8 +180,8 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, if (file) { acct->file = file; acct->ns = ns; - acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - acct->active = 1; + acct->needcheck = jiffies; + acct->active = 0; list_add(&acct->list, &acct_list); } if (old_acct) { -- cgit v1.2.3 From 9df7fa16ee956bf0cdf4a711eac827be92d584bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 May 2014 06:49:45 -0400 Subject: acct: serialize acct_on() brute-force - on a global mutex that isn't nested into anything. 
Signed-off-by: Al Viro --- kernel/acct.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 87773725a0dc..08963a292878 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -241,6 +241,8 @@ static int acct_on(struct filename *pathname) return 0; } +static DEFINE_MUTEX(acct_on_mutex); + /** * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting @@ -263,7 +265,9 @@ SYSCALL_DEFINE1(acct, const char __user *, name) struct filename *tmp = getname(name); if (IS_ERR(tmp)) return PTR_ERR(tmp); + mutex_lock(&acct_on_mutex); error = acct_on(tmp); + mutex_unlock(&acct_on_mutex); putname(tmp); } else { struct bsd_acct_struct *acct; -- cgit v1.2.3 From b8f00e6be46f4c9a112e05fd692712873c4c4048 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:51:03 -0400 Subject: acct: new lifetime rules Do not reuse bsd_acct_struct after closing the damn thing. Structure lifetime is controlled by refcount now. We also have a mutex in there, held over closing and writing (the file is O_APPEND, so we are not losing any concurrency). As the result, we do not need to bother with get_file()/fput() on log write anymore. Moreover, do_acct_process() only needs acct itself; file and pidns are picked from it. Killed instances are distinguished by having NULL ->ns. Refcount is protected by acct_lock; anybody taking the mutex needs to grab a reference first. The things will get a lot simpler in the next commits - this is just the minimal chunk switching to the new lifetime rules. 
Signed-off-by: Al Viro --- kernel/acct.c | 220 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 114 insertions(+), 106 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 08963a292878..f9ef9db55c0e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,15 +75,11 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *); +static void do_acct_process(struct bsd_acct_struct *acct); -/* - * This structure is used so that all the data protected by lock - * can be placed in the same cache line as the lock. This primes - * the cache line to have the data after getting the lock. - */ struct bsd_acct_struct { + long count; + struct mutex lock; int active; unsigned long needcheck; struct file *file; @@ -157,39 +153,59 @@ out: return res; } -/* - * Close the old accounting file (if currently open) and then replace - * it with file (if non-NULL). - * - * NOTE: acct_lock MUST be held on entry and exit. 
- */ -static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, - struct pid_namespace *ns) +static void acct_put(struct bsd_acct_struct *p) { - struct file *old_acct = NULL; - struct pid_namespace *old_ns = NULL; - - if (acct->file) { - old_acct = acct->file; - old_ns = acct->ns; - acct->active = 0; - acct->file = NULL; - acct->ns = NULL; - list_del(&acct->list); - } - if (file) { - acct->file = file; - acct->ns = ns; - acct->needcheck = jiffies; - acct->active = 0; - list_add(&acct->list, &acct_list); + spin_lock(&acct_lock); + if (!--p->count) + kfree(p); + spin_unlock(&acct_lock); +} + +static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p) +{ + struct bsd_acct_struct *res; + spin_lock(&acct_lock); +again: + res = *p; + if (res) + res->count++; + spin_unlock(&acct_lock); + if (res) { + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + spin_lock(&acct_lock); + if (!--res->count) + kfree(res); + goto again; + } } - if (old_acct) { - mnt_unpin(old_acct->f_path.mnt); + return res; +} + +static void acct_kill(struct bsd_acct_struct *acct, + struct bsd_acct_struct *new) +{ + if (acct) { + struct file *file = acct->file; + struct pid_namespace *ns = acct->ns; + spin_lock(&acct_lock); + list_del(&acct->list); + mnt_unpin(file->f_path.mnt); spin_unlock(&acct_lock); - do_acct_process(acct, old_ns, old_acct); - filp_close(old_acct, NULL); + do_acct_process(acct); + filp_close(file, NULL); spin_lock(&acct_lock); + ns->bacct = new; + if (new) { + mnt_pin(new->file->f_path.mnt); + list_add(&new->list, &acct_list); + } + acct->ns = NULL; + mutex_unlock(&acct->lock); + if (!(acct->count -= 2)) + kfree(acct); + spin_unlock(&acct_lock); } } @@ -197,47 +213,50 @@ static int acct_on(struct filename *pathname) { struct file *file; struct vfsmount *mnt; - struct pid_namespace *ns; - struct bsd_acct_struct *acct = NULL; + struct pid_namespace *ns = task_active_pid_ns(current); + struct bsd_acct_struct *acct, *old; + + acct = 
kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (!acct) + return -ENOMEM; /* Difference from BSD - they don't do O_APPEND */ file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); - if (IS_ERR(file)) + if (IS_ERR(file)) { + kfree(acct); return PTR_ERR(file); + } if (!S_ISREG(file_inode(file)->i_mode)) { + kfree(acct); filp_close(file, NULL); return -EACCES; } if (!file->f_op->write) { + kfree(acct); filp_close(file, NULL); return -EIO; } - ns = task_active_pid_ns(current); - if (ns->bacct == NULL) { - acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); - if (acct == NULL) { - filp_close(file, NULL); - return -ENOMEM; - } - } + acct->count = 1; + acct->file = file; + acct->needcheck = jiffies; + acct->ns = ns; + mutex_init(&acct->lock); + mnt = file->f_path.mnt; - spin_lock(&acct_lock); - if (ns->bacct == NULL) { + old = acct_get(&ns->bacct); + if (old) { + acct_kill(old, acct); + } else { + spin_lock(&acct_lock); ns->bacct = acct; - acct = NULL; + mnt_pin(mnt); + list_add(&acct->list, &acct_list); + spin_unlock(&acct_lock); } - - mnt = file->f_path.mnt; - mnt_pin(mnt); - acct_file_reopen(ns->bacct, file, ns); - spin_unlock(&acct_lock); - mntput(mnt); /* it's pinned, now give up active reference */ - kfree(acct); - return 0; } @@ -270,15 +289,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) mutex_unlock(&acct_on_mutex); putname(tmp); } else { - struct bsd_acct_struct *acct; - - acct = task_active_pid_ns(current)->bacct; - if (acct == NULL) - return 0; - - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); + acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL); } return error; @@ -298,8 +309,19 @@ void acct_auto_close_mnt(struct vfsmount *m) spin_lock(&acct_lock); restart: list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.mnt == m) { - acct_file_reopen(acct, NULL, NULL); + if (acct->file->f_path.mnt == m) { + acct->count++; + spin_unlock(&acct_lock); + 
mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + spin_lock(&acct_lock); + if (!--acct->count) + kfree(acct); + goto restart; + } + acct_kill(acct, NULL); + spin_lock(&acct_lock); goto restart; } spin_unlock(&acct_lock); @@ -319,8 +341,19 @@ void acct_auto_close(struct super_block *sb) spin_lock(&acct_lock); restart: list_for_each_entry(acct, &acct_list, list) - if (acct->file && acct->file->f_path.dentry->d_sb == sb) { - acct_file_reopen(acct, NULL, NULL); + if (acct->file->f_path.dentry->d_sb == sb) { + acct->count++; + spin_unlock(&acct_lock); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + spin_lock(&acct_lock); + if (!--acct->count) + kfree(acct); + goto restart; + } + acct_kill(acct, NULL); + spin_lock(&acct_lock); goto restart; } spin_unlock(&acct_lock); @@ -328,17 +361,7 @@ restart: void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct = ns->bacct; - - if (acct == NULL) - return; - - spin_lock(&acct_lock); - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - - kfree(acct); + acct_kill(acct_get(&ns->bacct), NULL); } /* @@ -507,12 +530,13 @@ static void fill_ac(acct_t *ac) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(struct bsd_acct_struct *acct, - struct pid_namespace *ns, struct file *file) +static void do_acct_process(struct bsd_acct_struct *acct) { acct_t ac; unsigned long flim; const struct cred *orig_cred; + struct pid_namespace *ns = acct->ns; + struct file *file = acct->file; /* * Accounting records are not subject to resource limits. 
@@ -606,27 +630,12 @@ void acct_collect(long exitcode, int group_dead) static void slow_acct_process(struct pid_namespace *ns) { for ( ; ns; ns = ns->parent) { - struct file *file = NULL; - struct bsd_acct_struct *acct; - - acct = ns->bacct; - /* - * accelerate the common fastpath: - */ - if (!acct || !acct->file) - continue; - - spin_lock(&acct_lock); - file = acct->file; - if (unlikely(!file)) { - spin_unlock(&acct_lock); - continue; + struct bsd_acct_struct *acct = acct_get(&ns->bacct); + if (acct) { + do_acct_process(acct); + mutex_unlock(&acct->lock); + acct_put(acct); } - get_file(file); - spin_unlock(&acct_lock); - - do_acct_process(acct, ns, file); - fput(file); } } @@ -645,8 +654,7 @@ void acct_process(void) * its parent. */ for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) { - struct bsd_acct_struct *acct = ns->bacct; - if (acct && acct->file) + if (ns->bacct) break; } if (unlikely(ns)) -- cgit v1.2.3 From 54a4d58a6459a93fc6ee898354b3d2ffb80dd03a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 14:24:18 -0400 Subject: acct: simplify check_free_space() a) file can't be NULL b) file can't be changed under us c) all writes are serialized by acct->lock; no need to mess with spinlock there. Signed-off-by: Al Viro --- kernel/acct.c | 50 +++++++++++--------------------------------------- 1 file changed, 11 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f9ef9db55c0e..019f012a3c6f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -93,64 +93,36 @@ static LIST_HEAD(acct_list); /* * Check the amount of free space and suspend/resume accordingly. 
*/ -static int check_free_space(struct bsd_acct_struct *acct, struct file *file) +static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - int res; - int act; - u64 resume; - u64 suspend; - spin_lock(&acct_lock); - res = acct->active; - if (!file || time_is_before_jiffies(acct->needcheck)) + if (time_is_before_jiffies(acct->needcheck)) goto out; - spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(&file->f_path, &sbuf)) - return res; - suspend = sbuf.f_blocks * SUSPEND; - resume = sbuf.f_blocks * RESUME; - - do_div(suspend, 100); - do_div(resume, 100); - - if (sbuf.f_bavail <= suspend) - act = -1; - else if (sbuf.f_bavail >= resume) - act = 1; - else - act = 0; - - /* - * If some joker switched acct->file under us we'ld better be - * silent and _not_ touch anything. - */ - spin_lock(&acct_lock); - if (file != acct->file) { - if (act) - res = act > 0; + if (vfs_statfs(&acct->file->f_path, &sbuf)) goto out; - } if (acct->active) { - if (act < 0) { + u64 suspend = sbuf.f_blocks * SUSPEND; + do_div(suspend, 100); + if (sbuf.f_bavail <= suspend) { acct->active = 0; printk(KERN_INFO "Process accounting paused\n"); } } else { - if (act > 0) { + u64 resume = sbuf.f_blocks * RESUME; + do_div(resume, 100); + if (sbuf.f_bavail >= resume) { acct->active = 1; printk(KERN_INFO "Process accounting resumed\n"); } } acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; - res = acct->active; out: - spin_unlock(&acct_lock); - return res; + return acct->active; } static void acct_put(struct bsd_acct_struct *p) @@ -550,7 +522,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) * First check to see if there is enough free_space to continue * the process accounting system. 
*/ - if (!check_free_space(acct, file)) + if (!check_free_space(acct)) goto out; fill_ac(&ac); -- cgit v1.2.3 From 215752fce31c80f3b3a1530bc7cddb3ba6a69b3a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 06:23:41 -0400 Subject: acct: get rid of acct_list Put these suckers on per-vfsmount and per-superblock lists instead. Note: right now it's still acct_lock for everything, but that's going to change. Signed-off-by: Al Viro --- fs/mount.h | 1 + fs/namespace.c | 2 +- fs/super.c | 2 +- include/linux/acct.h | 6 +-- include/linux/fs.h | 1 + kernel/acct.c | 135 +++++++++++++++++++++------------------------------ 6 files changed, 62 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/fs/mount.h b/fs/mount.h index d55297f2fa05..0a2d1458681f 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -56,6 +56,7 @@ struct mount { int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; + struct hlist_head mnt_pins; struct path mnt_ex_mountpoint; }; diff --git a/fs/namespace.c b/fs/namespace.c index 182bc41cd887..22e530addfaf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -956,7 +956,7 @@ put_again: mnt->mnt_pinned = 0; rcu_read_unlock(); unlock_mount_hash(); - acct_auto_close_mnt(&mnt->mnt); + acct_auto_close_mnt(&mnt->mnt_pins); goto put_again; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { diff --git a/fs/super.c b/fs/super.c index d20d5b11dedf..52ed93eb63df 100644 --- a/fs/super.c +++ b/fs/super.c @@ -703,7 +703,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) #endif if (flags & MS_RDONLY) - acct_auto_close(sb); + acct_auto_close(&sb->s_pins); shrink_dcache_sb(sb); remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); diff --git a/include/linux/acct.h b/include/linux/acct.h index 4a5b7cb56079..65a4f889182e 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -24,14 +24,14 @@ struct super_block; struct pacct_struct; struct pid_namespace; 
extern int acct_parm[]; /* for sysctl */ -extern void acct_auto_close_mnt(struct vfsmount *m); -extern void acct_auto_close(struct super_block *sb); +extern void acct_auto_close(struct hlist_head *); +extern void acct_auto_close_mnt(struct hlist_head *); extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); #else -#define acct_auto_close_mnt(x) do { } while (0) #define acct_auto_close(x) do { } while (0) +#define acct_auto_close_mnt(x) do { } while (0) #define acct_collect(x,y) do { } while (0) #define acct_process() do { } while (0) #define acct_exit_ns(ns) do { } while (0) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b7d57cf7863..17f70872a4a5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1250,6 +1250,7 @@ struct super_block { /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; + struct hlist_head s_pins; /* * Keep the lru lists last in the structure so they always sit on their diff --git a/kernel/acct.c b/kernel/acct.c index 019f012a3c6f..21fbb3c27c2a 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,6 +59,7 @@ #include #include /* sector_div */ #include +#include <../fs/mount.h> /* will go away when we refactor */ /* * These constants control the amount of freespace that suspend and @@ -79,16 +80,16 @@ static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { long count; + struct hlist_node s_list; + struct hlist_node m_list; struct mutex lock; int active; unsigned long needcheck; struct file *file; struct pid_namespace *ns; - struct list_head list; }; static DEFINE_SPINLOCK(acct_lock); -static LIST_HEAD(acct_list); /* * Check the amount of free space and suspend/resume accordingly. 
@@ -133,25 +134,33 @@ static void acct_put(struct bsd_acct_struct *p) spin_unlock(&acct_lock); } -static struct bsd_acct_struct *acct_get(struct bsd_acct_struct **p) +static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) +{ + res->count++; + spin_unlock(&acct_lock); + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + spin_lock(&acct_lock); + if (!--res->count) + kfree(res); + return NULL; + } + return res; +} + +static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; spin_lock(&acct_lock); again: - res = *p; - if (res) - res->count++; - spin_unlock(&acct_lock); - if (res) { - mutex_lock(&res->lock); - if (!res->ns) { - mutex_unlock(&res->lock); - spin_lock(&acct_lock); - if (!--res->count) - kfree(res); - goto again; - } + if (!ns->bacct) { + spin_unlock(&acct_lock); + return NULL; } + res = __acct_get(ns->bacct); + if (!res) + goto again; return res; } @@ -162,7 +171,8 @@ static void acct_kill(struct bsd_acct_struct *acct, struct file *file = acct->file; struct pid_namespace *ns = acct->ns; spin_lock(&acct_lock); - list_del(&acct->list); + hlist_del(&acct->m_list); + hlist_del(&acct->s_list); mnt_unpin(file->f_path.mnt); spin_unlock(&acct_lock); do_acct_process(acct); @@ -170,8 +180,10 @@ static void acct_kill(struct bsd_acct_struct *acct, spin_lock(&acct_lock); ns->bacct = new; if (new) { - mnt_pin(new->file->f_path.mnt); - list_add(&new->list, &acct_list); + struct vfsmount *m = new->file->f_path.mnt; + mnt_pin(m); + hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); + hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); } acct->ns = NULL; mutex_unlock(&acct->lock); @@ -218,14 +230,15 @@ static int acct_on(struct filename *pathname) mutex_init(&acct->lock); mnt = file->f_path.mnt; - old = acct_get(&ns->bacct); + old = acct_get(ns); if (old) { acct_kill(old, acct); } else { spin_lock(&acct_lock); ns->bacct = acct; mnt_pin(mnt); - list_add(&acct->list, &acct_list); + 
hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); + hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); } mntput(mnt); /* it's pinned, now give up active reference */ @@ -261,79 +274,41 @@ SYSCALL_DEFINE1(acct, const char __user *, name) mutex_unlock(&acct_on_mutex); putname(tmp); } else { - acct_kill(acct_get(&task_active_pid_ns(current)->bacct), NULL); + acct_kill(acct_get(task_active_pid_ns(current)), NULL); } return error; } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @m: vfsmount being shut down - * - * If the accounting is turned on for a file in the subtree pointed to - * to by m, turn accounting off. Done when m is about to die. - */ -void acct_auto_close_mnt(struct vfsmount *m) +void acct_auto_close_mnt(struct hlist_head *list) { - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file->f_path.mnt == m) { - acct->count++; - spin_unlock(&acct_lock); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - spin_lock(&acct_lock); - if (!--acct->count) - kfree(acct); - goto restart; - } - acct_kill(acct, NULL); - spin_lock(&acct_lock); - goto restart; - } + while (1) { + spin_lock(&acct_lock); + if (!list->first) + break; + acct_kill(__acct_get(hlist_entry(list->first, + struct bsd_acct_struct, + m_list)), NULL); + } spin_unlock(&acct_lock); } -/** - * acct_auto_close - turn off a filesystem's accounting if it is on - * @sb: super block for the filesystem - * - * If the accounting is turned on for a file in the filesystem pointed - * to by sb, turn accounting off. 
- */ -void acct_auto_close(struct super_block *sb) +void acct_auto_close(struct hlist_head *list) { - struct bsd_acct_struct *acct; - - spin_lock(&acct_lock); -restart: - list_for_each_entry(acct, &acct_list, list) - if (acct->file->f_path.dentry->d_sb == sb) { - acct->count++; - spin_unlock(&acct_lock); - mutex_lock(&acct->lock); - if (!acct->ns) { - mutex_unlock(&acct->lock); - spin_lock(&acct_lock); - if (!--acct->count) - kfree(acct); - goto restart; - } - acct_kill(acct, NULL); - spin_lock(&acct_lock); - goto restart; - } + while (1) { + spin_lock(&acct_lock); + if (!list->first) + break; + acct_kill(__acct_get(hlist_entry(list->first, + struct bsd_acct_struct, + s_list)), NULL); + } spin_unlock(&acct_lock); } void acct_exit_ns(struct pid_namespace *ns) { - acct_kill(acct_get(&ns->bacct), NULL); + acct_kill(acct_get(ns), NULL); } /* @@ -602,7 +577,7 @@ void acct_collect(long exitcode, int group_dead) static void slow_acct_process(struct pid_namespace *ns) { for ( ; ns; ns = ns->parent) { - struct bsd_acct_struct *acct = acct_get(&ns->bacct); + struct bsd_acct_struct *acct = acct_get(ns); if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); -- cgit v1.2.3 From 2798d4ce61601808b965253d60624bbf201b51b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:04:28 -0400 Subject: acct: get rid of acct_lock for acct->count * make acct->count atomic and acct freeing - rcu-delayed. * instead of grabbing acct_lock around the places where we take a reference, do that under rcu_read_lock() with atomic_long_inc_not_zero(). 
* have the new acct locked before making ns->bacct point to it Signed-off-by: Al Viro --- kernel/acct.c | 85 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 21fbb3c27c2a..6fd375f15626 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -79,9 +79,14 @@ int acct_parm[3] = {4, 2, 30}; static void do_acct_process(struct bsd_acct_struct *acct); struct bsd_acct_struct { - long count; - struct hlist_node s_list; - struct hlist_node m_list; + atomic_long_t count; + union { + struct { + struct hlist_node s_list; + struct hlist_node m_list; + }; + struct rcu_head rcu; + }; struct mutex lock; int active; unsigned long needcheck; @@ -89,6 +94,11 @@ struct bsd_acct_struct { struct pid_namespace *ns; }; +static void acct_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct bsd_acct_struct, rcu)); +} + static DEFINE_SPINLOCK(acct_lock); /* @@ -128,22 +138,22 @@ out: static void acct_put(struct bsd_acct_struct *p) { - spin_lock(&acct_lock); - if (!--p->count) - kfree(p); - spin_unlock(&acct_lock); + if (atomic_long_dec_and_test(&p->count)) + call_rcu(&p->rcu, acct_free_rcu); } static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) { - res->count++; - spin_unlock(&acct_lock); + if (!atomic_long_inc_not_zero(&res->count)) { + rcu_read_unlock(); + cpu_relax(); + return NULL; + } + rcu_read_unlock(); mutex_lock(&res->lock); if (!res->ns) { mutex_unlock(&res->lock); - spin_lock(&acct_lock); - if (!--res->count) - kfree(res); + acct_put(res); return NULL; } return res; @@ -152,13 +162,15 @@ static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; - spin_lock(&acct_lock); again: - if (!ns->bacct) { - spin_unlock(&acct_lock); + smp_rmb(); + rcu_read_lock(); + res = ACCESS_ONCE(ns->bacct); + if (!res) { + 
rcu_read_unlock(); return NULL; } - res = __acct_get(ns->bacct); + res = __acct_get(res); if (!res) goto again; return res; @@ -170,26 +182,27 @@ static void acct_kill(struct bsd_acct_struct *acct, if (acct) { struct file *file = acct->file; struct pid_namespace *ns = acct->ns; + do_acct_process(acct); + mnt_unpin(file->f_path.mnt); + filp_close(file, NULL); spin_lock(&acct_lock); hlist_del(&acct->m_list); hlist_del(&acct->s_list); - mnt_unpin(file->f_path.mnt); spin_unlock(&acct_lock); - do_acct_process(acct); - filp_close(file, NULL); - spin_lock(&acct_lock); ns->bacct = new; if (new) { struct vfsmount *m = new->file->f_path.mnt; mnt_pin(m); + spin_lock(&acct_lock); hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); + spin_unlock(&acct_lock); + mutex_unlock(&new->lock); } acct->ns = NULL; + atomic_long_dec(&acct->count); mutex_unlock(&acct->lock); - if (!(acct->count -= 2)) - kfree(acct); - spin_unlock(&acct_lock); + acct_put(acct); } } @@ -223,7 +236,7 @@ static int acct_on(struct filename *pathname) return -EIO; } - acct->count = 1; + atomic_long_set(&acct->count, 1); acct->file = file; acct->needcheck = jiffies; acct->ns = ns; @@ -231,15 +244,17 @@ static int acct_on(struct filename *pathname) mnt = file->f_path.mnt; old = acct_get(ns); + mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ if (old) { acct_kill(old, acct); } else { - spin_lock(&acct_lock); ns->bacct = acct; + spin_lock(&acct_lock); mnt_pin(mnt); hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); + mutex_unlock(&acct->lock); } mntput(mnt); /* it's pinned, now give up active reference */ return 0; @@ -282,28 +297,32 @@ SYSCALL_DEFINE1(acct, const char __user *, name) void acct_auto_close_mnt(struct hlist_head *list) { + rcu_read_lock(); while (1) { - spin_lock(&acct_lock); - if (!list->first) + struct hlist_node *p = ACCESS_ONCE(list->first); + 
if (!p) break; - acct_kill(__acct_get(hlist_entry(list->first, + acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, m_list)), NULL); + rcu_read_lock(); } - spin_unlock(&acct_lock); + rcu_read_unlock(); } void acct_auto_close(struct hlist_head *list) { + rcu_read_lock(); while (1) { - spin_lock(&acct_lock); - if (!list->first) + struct hlist_node *p = ACCESS_ONCE(list->first); + if (!p) break; - acct_kill(__acct_get(hlist_entry(list->first, + acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, s_list)), NULL); + rcu_read_lock(); } - spin_unlock(&acct_lock); + rcu_read_unlock(); } void acct_exit_ns(struct pid_namespace *ns) -- cgit v1.2.3 From 17c0a5aaffa63da6b5c73a31e36616bdcd12d143 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:35:19 -0400 Subject: make acct_kill() wait for file closing. Do actual closing of file via schedule_work(). And use __fput_sync() there. Signed-off-by: Al Viro --- kernel/acct.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 6fd375f15626..d9ebc96b1126 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -92,6 +92,8 @@ struct bsd_acct_struct { unsigned long needcheck; struct file *file; struct pid_namespace *ns; + struct work_struct work; + struct completion done; }; static void acct_free_rcu(struct rcu_head *head) @@ -176,15 +178,27 @@ again: return res; } +static void close_work(struct work_struct *work) +{ + struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); + struct file *file = acct->file; + mnt_unpin(file->f_path.mnt); + if (file->f_op->flush) + file->f_op->flush(file, NULL); + __fput_sync(file); + complete(&acct->done); +} + static void acct_kill(struct bsd_acct_struct *acct, struct bsd_acct_struct *new) { if (acct) { - struct file *file = acct->file; struct pid_namespace *ns = acct->ns; do_acct_process(acct); - mnt_unpin(file->f_path.mnt); - filp_close(file, NULL); + 
INIT_WORK(&acct->work, close_work); + init_completion(&acct->done); + schedule_work(&acct->work); + wait_for_completion(&acct->done); spin_lock(&acct_lock); hlist_del(&acct->m_list); hlist_del(&acct->s_list); -- cgit v1.2.3 From 215748e67d893169de9e62c3416e9e035e9e9c5f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 07:51:29 -0400 Subject: acct: move mnt_pin() upwards. Signed-off-by: Al Viro --- kernel/acct.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index d9ebc96b1126..2d9e04d98998 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -206,7 +206,6 @@ static void acct_kill(struct bsd_acct_struct *acct, ns->bacct = new; if (new) { struct vfsmount *m = new->file->f_path.mnt; - mnt_pin(m); spin_lock(&acct_lock); hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); @@ -256,6 +255,7 @@ static int acct_on(struct filename *pathname) acct->ns = ns; mutex_init(&acct->lock); mnt = file->f_path.mnt; + mnt_pin(mnt); old = acct_get(ns); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ @@ -264,7 +264,6 @@ static int acct_on(struct filename *pathname) } else { ns->bacct = acct; spin_lock(&acct_lock); - mnt_pin(mnt); hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); -- cgit v1.2.3 From 1629d0eb3ead0e0c49e4402049ec7b5b31b81cd7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 08:00:52 -0400 Subject: start carving bsd_acct_struct up pull generic parts into struct fs_pin. Eventually we want those to replace mnt_pin()/mnt_unpin() mess; that stuff will move to fs/*. 
Signed-off-by: Al Viro --- kernel/acct.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 2d9e04d98998..afeaaa6f49bf 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -78,7 +78,7 @@ int acct_parm[3] = {4, 2, 30}; */ static void do_acct_process(struct bsd_acct_struct *acct); -struct bsd_acct_struct { +struct fs_pin { atomic_long_t count; union { struct { @@ -87,6 +87,10 @@ struct bsd_acct_struct { }; struct rcu_head rcu; }; +}; + +struct bsd_acct_struct { + struct fs_pin pin; struct mutex lock; int active; unsigned long needcheck; @@ -96,9 +100,9 @@ struct bsd_acct_struct { struct completion done; }; -static void acct_free_rcu(struct rcu_head *head) +static void pin_free_rcu(struct rcu_head *head) { - kfree(container_of(head, struct bsd_acct_struct, rcu)); + kfree(container_of(head, struct fs_pin, rcu)); } static DEFINE_SPINLOCK(acct_lock); @@ -138,15 +142,15 @@ out: return acct->active; } -static void acct_put(struct bsd_acct_struct *p) +static void pin_put(struct fs_pin *p) { if (atomic_long_dec_and_test(&p->count)) - call_rcu(&p->rcu, acct_free_rcu); + call_rcu(&p->rcu, pin_free_rcu); } static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) { - if (!atomic_long_inc_not_zero(&res->count)) { + if (!atomic_long_inc_not_zero(&res->pin.count)) { rcu_read_unlock(); cpu_relax(); return NULL; @@ -155,7 +159,7 @@ static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) mutex_lock(&res->lock); if (!res->ns) { mutex_unlock(&res->lock); - acct_put(res); + pin_put(&res->pin); return NULL; } return res; @@ -200,22 +204,22 @@ static void acct_kill(struct bsd_acct_struct *acct, schedule_work(&acct->work); wait_for_completion(&acct->done); spin_lock(&acct_lock); - hlist_del(&acct->m_list); - hlist_del(&acct->s_list); + hlist_del(&acct->pin.m_list); + hlist_del(&acct->pin.s_list); spin_unlock(&acct_lock); ns->bacct = new; if (new) { 
struct vfsmount *m = new->file->f_path.mnt; spin_lock(&acct_lock); - hlist_add_head(&new->s_list, &m->mnt_sb->s_pins); - hlist_add_head(&new->m_list, &real_mount(m)->mnt_pins); + hlist_add_head(&new->pin.s_list, &m->mnt_sb->s_pins); + hlist_add_head(&new->pin.m_list, &real_mount(m)->mnt_pins); spin_unlock(&acct_lock); mutex_unlock(&new->lock); } acct->ns = NULL; - atomic_long_dec(&acct->count); + atomic_long_dec(&acct->pin.count); mutex_unlock(&acct->lock); - acct_put(acct); + pin_put(&acct->pin); } } @@ -249,7 +253,7 @@ static int acct_on(struct filename *pathname) return -EIO; } - atomic_long_set(&acct->count, 1); + atomic_long_set(&acct->pin.count, 1); acct->file = file; acct->needcheck = jiffies; acct->ns = ns; @@ -264,8 +268,8 @@ static int acct_on(struct filename *pathname) } else { ns->bacct = acct; spin_lock(&acct_lock); - hlist_add_head(&acct->s_list, &mnt->mnt_sb->s_pins); - hlist_add_head(&acct->m_list, &real_mount(mnt)->mnt_pins); + hlist_add_head(&acct->pin.s_list, &mnt->mnt_sb->s_pins); + hlist_add_head(&acct->pin.m_list, &real_mount(mnt)->mnt_pins); spin_unlock(&acct_lock); mutex_unlock(&acct->lock); } @@ -317,7 +321,7 @@ void acct_auto_close_mnt(struct hlist_head *list) break; acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, - m_list)), NULL); + pin.m_list)), NULL); rcu_read_lock(); } rcu_read_unlock(); @@ -332,7 +336,7 @@ void acct_auto_close(struct hlist_head *list) break; acct_kill(__acct_get(hlist_entry(p, struct bsd_acct_struct, - s_list)), NULL); + pin.s_list)), NULL); rcu_read_lock(); } rcu_read_unlock(); @@ -613,7 +617,7 @@ static void slow_acct_process(struct pid_namespace *ns) if (acct) { do_acct_process(acct); mutex_unlock(&acct->lock); - acct_put(acct); + pin_put(&acct->pin); } } } -- cgit v1.2.3 From efb170c22867cdc6f770de441bdefecec6712199 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 08:39:04 -0400 Subject: take fs_pin stuff to fs/* Add a new field to fs_pin - kill(pin). 
That's what umount and r/o remount will be calling for all pins attached to vfsmount and superblock resp. Called after bumping the refcount, so it won't go away under us. Dropping the refcount is responsibility of the instance. All generic stuff moved to fs/fs_pin.c; the next step will rip all the knowledge of kernel/acct.c from fs/super.c and fs/namespace.c. After that - death to mnt_pin(); it was intended to be usable as generic mechanism for code that wants to attach objects to vfsmount, so that they would not make the sucker busy and would get killed on umount. Never got it right; it remained acct.c-specific all along. Now it's very close to being killable. Signed-off-by: Al Viro --- fs/Makefile | 2 +- fs/fs_pin.c | 77 ++++++++++++++++++++++++++++++ include/linux/acct.h | 6 +-- include/linux/fs_pin.h | 17 +++++++ kernel/acct.c | 127 +++++++++++++------------------------------------ 5 files changed, 129 insertions(+), 100 deletions(-) create mode 100644 fs/fs_pin.c create mode 100644 include/linux/fs_pin.h (limited to 'kernel') diff --git a/fs/Makefile b/fs/Makefile index 4030cbfbc9af..90c88529892b 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o \ - stack.o fs_struct.o statfs.o + stack.o fs_struct.o statfs.o fs_pin.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/fs_pin.c b/fs/fs_pin.c new file mode 100644 index 000000000000..f3ce0b874a44 --- /dev/null +++ b/fs/fs_pin.c @@ -0,0 +1,77 @@ +#include +#include +#include +#include "mount.h" + +static void pin_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct fs_pin, rcu)); +} + +static DEFINE_SPINLOCK(pin_lock); + +void pin_put(struct fs_pin *p) +{ + if (atomic_long_dec_and_test(&p->count)) + call_rcu(&p->rcu, pin_free_rcu); +} + +void pin_remove(struct fs_pin 
*pin) +{ + spin_lock(&pin_lock); + hlist_del(&pin->m_list); + hlist_del(&pin->s_list); + spin_unlock(&pin_lock); +} + +void pin_insert(struct fs_pin *pin, struct vfsmount *m) +{ + spin_lock(&pin_lock); + hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); + hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); + spin_unlock(&pin_lock); +} + +void acct_auto_close_mnt(struct hlist_head *list) +{ + while (1) { + struct hlist_node *p; + struct fs_pin *pin; + rcu_read_lock(); + p = ACCESS_ONCE(list->first); + if (!p) { + rcu_read_unlock(); + break; + } + pin = hlist_entry(p, struct fs_pin, m_list); + if (!atomic_long_inc_not_zero(&pin->count)) { + rcu_read_unlock(); + cpu_relax(); + continue; + } + rcu_read_unlock(); + pin->kill(pin); + } +} + +void acct_auto_close(struct hlist_head *list) +{ + while (1) { + struct hlist_node *p; + struct fs_pin *pin; + rcu_read_lock(); + p = ACCESS_ONCE(list->first); + if (!p) { + rcu_read_unlock(); + break; + } + pin = hlist_entry(p, struct fs_pin, s_list); + if (!atomic_long_inc_not_zero(&pin->count)) { + rcu_read_unlock(); + cpu_relax(); + continue; + } + rcu_read_unlock(); + pin->kill(pin); + } +} diff --git a/include/linux/acct.h b/include/linux/acct.h index 65a4f889182e..137837929dbe 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -24,18 +24,16 @@ struct super_block; struct pacct_struct; struct pid_namespace; extern int acct_parm[]; /* for sysctl */ -extern void acct_auto_close(struct hlist_head *); -extern void acct_auto_close_mnt(struct hlist_head *); extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); #else -#define acct_auto_close(x) do { } while (0) -#define acct_auto_close_mnt(x) do { } while (0) #define acct_collect(x,y) do { } while (0) #define acct_process() do { } while (0) #define acct_exit_ns(ns) do { } while (0) #endif +extern void acct_auto_close(struct hlist_head *); +extern void acct_auto_close_mnt(struct 
hlist_head *); /* * ACCT_VERSION numbers as yet defined: diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h new file mode 100644 index 000000000000..f66525e72ccf --- /dev/null +++ b/include/linux/fs_pin.h @@ -0,0 +1,17 @@ +#include + +struct fs_pin { + atomic_long_t count; + union { + struct { + struct hlist_node s_list; + struct hlist_node m_list; + }; + struct rcu_head rcu; + }; + void (*kill)(struct fs_pin *); +}; + +void pin_put(struct fs_pin *); +void pin_remove(struct fs_pin *); +void pin_insert(struct fs_pin *, struct vfsmount *); diff --git a/kernel/acct.c b/kernel/acct.c index afeaaa6f49bf..a7993a6cb604 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -59,7 +59,7 @@ #include #include /* sector_div */ #include -#include <../fs/mount.h> /* will go away when we refactor */ +#include /* * These constants control the amount of freespace that suspend and @@ -78,17 +78,6 @@ int acct_parm[3] = {4, 2, 30}; */ static void do_acct_process(struct bsd_acct_struct *acct); -struct fs_pin { - atomic_long_t count; - union { - struct { - struct hlist_node s_list; - struct hlist_node m_list; - }; - struct rcu_head rcu; - }; -}; - struct bsd_acct_struct { struct fs_pin pin; struct mutex lock; @@ -100,13 +89,6 @@ struct bsd_acct_struct { struct completion done; }; -static void pin_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct fs_pin, rcu)); -} - -static DEFINE_SPINLOCK(acct_lock); - /* * Check the amount of free space and suspend/resume accordingly. 
*/ @@ -142,29 +124,6 @@ out: return acct->active; } -static void pin_put(struct fs_pin *p) -{ - if (atomic_long_dec_and_test(&p->count)) - call_rcu(&p->rcu, pin_free_rcu); -} - -static struct bsd_acct_struct *__acct_get(struct bsd_acct_struct *res) -{ - if (!atomic_long_inc_not_zero(&res->pin.count)) { - rcu_read_unlock(); - cpu_relax(); - return NULL; - } - rcu_read_unlock(); - mutex_lock(&res->lock); - if (!res->ns) { - mutex_unlock(&res->lock); - pin_put(&res->pin); - return NULL; - } - return res; -} - static struct bsd_acct_struct *acct_get(struct pid_namespace *ns) { struct bsd_acct_struct *res; @@ -176,9 +135,18 @@ again: rcu_read_unlock(); return NULL; } - res = __acct_get(res); - if (!res) + if (!atomic_long_inc_not_zero(&res->pin.count)) { + rcu_read_unlock(); + cpu_relax(); goto again; + } + rcu_read_unlock(); + mutex_lock(&res->lock); + if (!res->ns) { + mutex_unlock(&res->lock); + pin_put(&res->pin); + goto again; + } return res; } @@ -203,19 +171,8 @@ static void acct_kill(struct bsd_acct_struct *acct, init_completion(&acct->done); schedule_work(&acct->work); wait_for_completion(&acct->done); - spin_lock(&acct_lock); - hlist_del(&acct->pin.m_list); - hlist_del(&acct->pin.s_list); - spin_unlock(&acct_lock); + pin_remove(&acct->pin); ns->bacct = new; - if (new) { - struct vfsmount *m = new->file->f_path.mnt; - spin_lock(&acct_lock); - hlist_add_head(&new->pin.s_list, &m->mnt_sb->s_pins); - hlist_add_head(&new->pin.m_list, &real_mount(m)->mnt_pins); - spin_unlock(&acct_lock); - mutex_unlock(&new->lock); - } acct->ns = NULL; atomic_long_dec(&acct->pin.count); mutex_unlock(&acct->lock); @@ -223,6 +180,19 @@ static void acct_kill(struct bsd_acct_struct *acct, } } +static void acct_pin_kill(struct fs_pin *pin) +{ + struct bsd_acct_struct *acct; + acct = container_of(pin, struct bsd_acct_struct, pin); + mutex_lock(&acct->lock); + if (!acct->ns) { + mutex_unlock(&acct->lock); + pin_put(pin); + acct = NULL; + } + acct_kill(acct, NULL); +} + static int 
acct_on(struct filename *pathname) { struct file *file; @@ -254,25 +224,22 @@ static int acct_on(struct filename *pathname) } atomic_long_set(&acct->pin.count, 1); + acct->pin.kill = acct_pin_kill; acct->file = file; acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); mnt = file->f_path.mnt; mnt_pin(mnt); + mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ + pin_insert(&acct->pin, mnt); old = acct_get(ns); - mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ - if (old) { + if (old) acct_kill(old, acct); - } else { + else ns->bacct = acct; - spin_lock(&acct_lock); - hlist_add_head(&acct->pin.s_list, &mnt->mnt_sb->s_pins); - hlist_add_head(&acct->pin.m_list, &real_mount(mnt)->mnt_pins); - spin_unlock(&acct_lock); - mutex_unlock(&acct->lock); - } + mutex_unlock(&acct->lock); mntput(mnt); /* it's pinned, now give up active reference */ return 0; } @@ -312,36 +279,6 @@ SYSCALL_DEFINE1(acct, const char __user *, name) return error; } -void acct_auto_close_mnt(struct hlist_head *list) -{ - rcu_read_lock(); - while (1) { - struct hlist_node *p = ACCESS_ONCE(list->first); - if (!p) - break; - acct_kill(__acct_get(hlist_entry(p, - struct bsd_acct_struct, - pin.m_list)), NULL); - rcu_read_lock(); - } - rcu_read_unlock(); -} - -void acct_auto_close(struct hlist_head *list) -{ - rcu_read_lock(); - while (1) { - struct hlist_node *p = ACCESS_ONCE(list->first); - if (!p) - break; - acct_kill(__acct_get(hlist_entry(p, - struct bsd_acct_struct, - pin.s_list)), NULL); - rcu_read_lock(); - } - rcu_read_unlock(); -} - void acct_exit_ns(struct pid_namespace *ns) { acct_kill(acct_get(ns), NULL); -- cgit v1.2.3 From 3064c3563ba4c23e2c7a47254ec056ed9ba0098a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Aug 2014 09:12:31 -0400 Subject: death to mnt_pinned Rather than playing silly buggers with vfsmount refcounts, just have acct_on() ask fs/namespace.c for internal clone of file->f_path.mnt and replace it with said clone. 
Then attach the pin to original vfsmount. Voila - the clone will be alive until the file gets closed, making sure that underlying superblock remains active, etc., and we can drop the original vfsmount, so that it's not kept busy. If the file lives until the final mntput of the original vfsmount, we'll notice that there's an fs_pin (one in bsd_acct_struct that holds that file) and mnt_pin_kill() will take it out. Since ->kill() is synchronous, we won't proceed past that point until these files are closed (and private clones of our vfsmount are gone), so we get the same ordering warranties we used to get. mnt_pin()/mnt_unpin()/->mnt_pinned is gone now, and good riddance - it never became usable outside of kernel/acct.c (and racy wrt umount even there). Signed-off-by: Al Viro --- fs/mount.h | 1 - fs/namespace.c | 35 +++++++++-------------------------- include/linux/mount.h | 4 ++-- kernel/acct.c | 24 +++++++++++++++++++----- 4 files changed, 30 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/fs/mount.h b/fs/mount.h index 0a2d1458681f..6740a6215529 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -55,7 +55,6 @@ struct mount { int mnt_id; /* mount identifier */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ - int mnt_pinned; struct hlist_head mnt_pins; struct path mnt_ex_mountpoint; }; diff --git a/fs/namespace.c b/fs/namespace.c index 0e4ce51c5277..65af9d0e0d67 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -937,7 +937,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, static void mntput_no_expire(struct mount *mnt) { -put_again: rcu_read_lock(); mnt_add_count(mnt, -1); if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ @@ -950,14 +949,6 @@ put_again: unlock_mount_hash(); return; } - if (unlikely(mnt->mnt_pinned)) { - mnt_add_count(mnt, mnt->mnt_pinned + 1); - mnt->mnt_pinned = 0; - rcu_read_unlock(); - unlock_mount_hash(); - mnt_pin_kill(mnt); - goto put_again; - } 
if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); @@ -980,6 +971,8 @@ put_again: * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); + if (unlikely(mnt->mnt_pins.first)) + mnt_pin_kill(mnt); fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); @@ -1007,25 +1000,15 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -void mnt_pin(struct vfsmount *mnt) +struct vfsmount *mnt_clone_internal(struct path *path) { - lock_mount_hash(); - real_mount(mnt)->mnt_pinned++; - unlock_mount_hash(); -} -EXPORT_SYMBOL(mnt_pin); - -void mnt_unpin(struct vfsmount *m) -{ - struct mount *mnt = real_mount(m); - lock_mount_hash(); - if (mnt->mnt_pinned) { - mnt_add_count(mnt, 1); - mnt->mnt_pinned--; - } - unlock_mount_hash(); + struct mount *p; + p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); + if (IS_ERR(p)) + return ERR_CAST(p); + p->mnt.mnt_flags |= MNT_INTERNAL; + return &p->mnt; } -EXPORT_SYMBOL(mnt_unpin); static inline void mangle(struct seq_file *m, const char *s) { diff --git a/include/linux/mount.h b/include/linux/mount.h index 839bac270904..864b120c1345 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -62,6 +62,7 @@ struct vfsmount { }; struct file; /* forward dec */ +struct path; extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); @@ -70,8 +71,7 @@ extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); -extern void mnt_pin(struct vfsmount *mnt); -extern void mnt_unpin(struct vfsmount *mnt); +extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); struct file_system_type; diff --git a/kernel/acct.c b/kernel/acct.c index a7993a6cb604..2e6cf818021d 100644 --- a/kernel/acct.c 
+++ b/kernel/acct.c @@ -154,7 +154,6 @@ static void close_work(struct work_struct *work) { struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work); struct file *file = acct->file; - mnt_unpin(file->f_path.mnt); if (file->f_op->flush) file->f_op->flush(file, NULL); __fput_sync(file); @@ -196,9 +195,10 @@ static void acct_pin_kill(struct fs_pin *pin) static int acct_on(struct filename *pathname) { struct file *file; - struct vfsmount *mnt; + struct vfsmount *mnt, *internal; struct pid_namespace *ns = task_active_pid_ns(current); struct bsd_acct_struct *acct, *old; + int err; acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); if (!acct) @@ -222,6 +222,21 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return -EIO; } + internal = mnt_clone_internal(&file->f_path); + if (IS_ERR(internal)) { + kfree(acct); + filp_close(file, NULL); + return PTR_ERR(internal); + } + err = mnt_want_write(internal); + if (err) { + mntput(internal); + kfree(acct); + filp_close(file, NULL); + return err; + } + mnt = file->f_path.mnt; + file->f_path.mnt = internal; atomic_long_set(&acct->pin.count, 1); acct->pin.kill = acct_pin_kill; @@ -229,8 +244,6 @@ static int acct_on(struct filename *pathname) acct->needcheck = jiffies; acct->ns = ns; mutex_init(&acct->lock); - mnt = file->f_path.mnt; - mnt_pin(mnt); mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */ pin_insert(&acct->pin, mnt); @@ -240,7 +253,8 @@ static int acct_on(struct filename *pathname) else ns->bacct = acct; mutex_unlock(&acct->lock); - mntput(mnt); /* it's pinned, now give up active reference */ + mnt_drop_write(mnt); + mntput(mnt); return 0; } -- cgit v1.2.3 From 2577d92ebd28dd9b3dacdfad6dcd81be0d21bbdf Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Thu, 31 Jul 2014 09:28:36 +1000 Subject: kernel/acct.c: fix coding style warnings and errors Signed-off-by: Ionut Alexa Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- kernel/acct.c | 28 
++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 2e6cf818021d..b4c667d22e79 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -108,14 +108,14 @@ static int check_free_space(struct bsd_acct_struct *acct) do_div(suspend, 100); if (sbuf.f_bavail <= suspend) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { u64 resume = sbuf.f_blocks * RESUME; do_div(resume, 100); if (sbuf.f_bavail >= resume) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } @@ -280,6 +280,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); mutex_lock(&acct_on_mutex); @@ -337,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -350,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 < 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -429,16 +431,17 @@ static void fill_ac(acct_t *ac) run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac->ac_etime = encode_float(elapsed); #else ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? 
- (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac->ac_etime_hi = etime >> 16; ac->ac_etime_lo = (u16) etime; } @@ -491,12 +494,12 @@ static void do_acct_process(struct bsd_acct_struct *acct) /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@ -530,6 +533,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(&current->mm->mmap_sem); vma = current->mm->mmap; while (vma) { -- cgit v1.2.3 From f96f56780ca584930bb3a2769d73fd9a101bcbbe Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 4 Aug 2014 03:10:16 +0000 Subject: kprobes: Skip kretprobe hit in NMI context to avoid deadlock Skip kretprobe hit in NMI context, because if an NMI happens inside the critical section protected by kretprobe_table.lock and another(or same) kretprobe hit, pre_kretprobe_handler tries to lock kretprobe_table.lock again. Normal interrupts have no problem because they are disabled with the lock. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: David S. Miller Link: http://lkml.kernel.org/r/20140804031016.11433.65539.stgit@kbuild-fedora.novalocal [ Minor edits for clarity.
] Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 734e9a7d280b..3995f546d0f3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1778,7 +1778,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) unsigned long hash, flags = 0; struct kretprobe_instance *ri; - /*TODO: consider to only swap the RA after the last pre_handler fired */ + /* + * To avoid deadlocks, prohibit return probing in NMI contexts, + * just skip the probe and increase the (inexact) 'nmissed' + * statistical counter, so that the user is informed that + * something happened: + */ + if (unlikely(in_nmi())) { + rp->nmissed++; + return 0; + } + + /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { -- cgit v1.2.3 From 00501b531c4723972aa11d6d4ebcf8d6552007c8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 8 Aug 2014 14:19:20 -0700 Subject: mm: memcontrol: rewrite charge API These patches rework memcg charge lifetime to integrate more naturally with the lifetime of user pages. This drastically simplifies the code and reduces charging and uncharging overhead. The most expensive part of charging and uncharging is the page_cgroup bit spinlock, which is removed entirely after this series. Here are the top-10 profile entries of a stress test that reads a 128G sparse file on a freshly booted box, without even a dedicated cgroup (i.e. executing in the root memcg). 
Before: 15.36% cat [kernel.kallsyms] [k] copy_user_generic_string 13.31% cat [kernel.kallsyms] [k] memset 11.48% cat [kernel.kallsyms] [k] do_mpage_readpage 4.23% cat [kernel.kallsyms] [k] get_page_from_freelist 2.38% cat [kernel.kallsyms] [k] put_page 2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge 2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common 1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn After: 15.67% cat [kernel.kallsyms] [k] copy_user_generic_string 13.48% cat [kernel.kallsyms] [k] memset 11.42% cat [kernel.kallsyms] [k] do_mpage_readpage 3.98% cat [kernel.kallsyms] [k] get_page_from_freelist 2.46% cat [kernel.kallsyms] [k] put_page 2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn 1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk 1.30% cat [kernel.kallsyms] [k] kfree As you can see, the memcg footprint has shrunk quite a bit. text data bss dec hex filename 37970 9892 400 48262 bc86 mm/memcontrol.o.old 35239 9892 400 45531 b1db mm/memcontrol.o This patch (of 4): The memcg charge API charges pages before they are rmapped - i.e. have an actual "type" - and so every callsite needs its own set of charge and uncharge functions to know what type is being operated on. Worse, uncharge has to happen from a context that is still type-specific, rather than at the end of the page's lifetime with exclusive access, and so requires a lot of synchronization. Rewrite the charge API to provide a generic set of try_charge(), commit_charge() and cancel_charge() transaction operations, much like what's currently done for swap-in: mem_cgroup_try_charge() attempts to reserve a charge, reclaiming pages from the memcg if necessary. 
mem_cgroup_commit_charge() commits the page to the charge once it has a valid page->mapping and PageAnon() reliably tells the type. mem_cgroup_cancel_charge() aborts the transaction. This reduces the charge API and enables subsequent patches to drastically simplify uncharging. As pages need to be committed after rmap is established but before they are added to the LRU, page_add_new_anon_rmap() must stop doing LRU additions again. Revive lru_cache_add_active_or_unevictable(). [hughd@google.com: fix shmem_unuse] [hughd@google.com: Add comments on the private use of -EAGAIN] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Hugh Dickins Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memcg_test.txt | 32 +-- include/linux/memcontrol.h | 53 ++--- include/linux/swap.h | 3 + kernel/events/uprobes.c | 15 +- mm/filemap.c | 21 +- mm/huge_memory.c | 57 +++-- mm/memcontrol.c | 407 ++++++++++++++--------------------- mm/memory.c | 41 ++-- mm/rmap.c | 19 -- mm/shmem.c | 37 ++-- mm/swap.c | 34 +++ mm/swapfile.c | 14 +- 12 files changed, 338 insertions(+), 395 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 80ac454704b8..bcf750d3cecd 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -24,24 +24,7 @@ Please note that implementation details can be changed. a page/swp_entry may be charged (usage += PAGE_SIZE) at - mem_cgroup_charge_anon() - Called at new page fault and Copy-On-Write. - - mem_cgroup_try_charge_swapin() - Called at do_swap_page() (page fault on swap entry) and swapoff. - Followed by charge-commit-cancel protocol. (With swap accounting) - At commit, a charge recorded in swap_cgroup is removed. - - mem_cgroup_charge_file() - Called at add_to_page_cache() - - mem_cgroup_cache_charge_swapin() - Called at shmem's swapin. 
- - mem_cgroup_prepare_migration() - Called before migration. "extra" charge is done and followed by - charge-commit-cancel protocol. - At commit, charge against oldpage or newpage will be committed. + mem_cgroup_try_charge() 2. Uncharge a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by @@ -69,19 +52,14 @@ Please note that implementation details can be changed. to new page is committed. At failure, charge to old page is committed. 3. charge-commit-cancel - In some case, we can't know this "charge" is valid or not at charging - (because of races). - To handle such case, there are charge-commit-cancel functions. - mem_cgroup_try_charge_XXX - mem_cgroup_commit_charge_XXX - mem_cgroup_cancel_charge_XXX - these are used in swap-in and migration. + Memcg pages are charged in two steps: + mem_cgroup_try_charge() + mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() At try_charge(), there are no flags to say "this page is charged". at this point, usage += PAGE_SIZE. - At commit(), the function checks the page should be charged or not - and set flags or avoid charging.(usage -= PAGE_SIZE) + At commit(), the page is associated with the memcg. At cancel(), simply usage -= PAGE_SIZE. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index eb65d29516ca..1a9a096858e0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -54,28 +54,11 @@ struct mem_cgroup_reclaim_cookie { }; #ifdef CONFIG_MEMCG -/* - * All "charge" functions with gfp_mask should use GFP_KERNEL or - * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't - * alloc memory but reclaims memory from all available zones. So, "where I want - * memory from" bits of gfp_mask has no meaning. So any bits of that field is - * available but adding a rule is better. charge functions' gfp_mask should - * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous - * codes. 
- * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) - */ - -extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask); -/* for swap handling */ -extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t mask, struct mem_cgroup **memcgp); -extern void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg); -extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); - -extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask); +int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp); +void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare); +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); @@ -233,30 +216,22 @@ void mem_cgroup_print_bad_page(struct page *page); #else /* CONFIG_MEMCG */ struct mem_cgroup; -static inline int mem_cgroup_charge_anon(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - return 0; -} - -static inline int mem_cgroup_charge_file(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - return 0; -} - -static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) +static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, + struct mem_cgroup **memcgp) { + *memcgp = NULL; return 0; } -static inline void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) +static inline void mem_cgroup_commit_charge(struct page *page, + struct mem_cgroup *memcg, + bool lrucare) { } -static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) +static inline void 
mem_cgroup_cancel_charge(struct page *page, + struct mem_cgroup *memcg) { } diff --git a/include/linux/swap.h b/include/linux/swap.h index 1eb64043c076..46a649e4e8cd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -320,6 +320,9 @@ extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); +extern void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma); + /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e8c137..1d0af8a2c646 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; + struct mem_cgroup *memcg; + + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + if (err) + return err; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); + mem_cgroup_commit_charge(kpage, memcg, false); + lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { dec_mm_counter(mm, MM_FILEPAGES); @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -315,18 +323,11 @@ retry: if (!new_page) goto put_old; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) - goto put_new; - __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, 
old_page, new_page); - if (ret) - mem_cgroup_uncharge_page(new_page); - -put_new: page_cache_release(new_page); put_old: put_page(old_page); diff --git a/mm/filemap.c b/mm/filemap.c index af19a6b079f5..349a40e35545 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,6 +31,7 @@ #include #include #include /* for BUG_ON(!in_atomic()) only */ +#include #include #include #include @@ -548,19 +549,24 @@ static int __add_to_page_cache_locked(struct page *page, pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + int huge = PageHuge(page); + struct mem_cgroup *memcg; int error; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); - error = mem_cgroup_charge_file(page, current->mm, - gfp_mask & GFP_RECLAIM_MASK); - if (error) - return error; + if (!huge) { + error = mem_cgroup_try_charge(page, current->mm, + gfp_mask, &memcg); + if (error) + return error; + } error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); return error; } @@ -575,13 +581,16 @@ static int __add_to_page_cache_locked(struct page *page, goto err_insert; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); + if (!huge) + mem_cgroup_commit_charge(page, memcg, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); + if (!huge) + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); return error; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3630d577e987..d9a21d06b862 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -715,13 +715,20 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, unsigned long haddr, pmd_t *pmd, struct page *page) { + struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; 
VM_BUG_ON_PAGE(!PageCompound(page), page); + + if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) + return VM_FAULT_OOM; + pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) + if (unlikely(!pgtable)) { + mem_cgroup_cancel_charge(page, memcg); return VM_FAULT_OOM; + } clear_huge_page(page, haddr, HPAGE_PMD_NR); /* @@ -734,7 +741,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge(page, memcg); put_page(page); pte_free(mm, pgtable); } else { @@ -742,6 +749,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -827,13 +836,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { - mem_cgroup_uncharge_page(page); put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -979,6 +982,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + struct mem_cgroup *memcg; spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; @@ -999,20 +1003,21 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_charge_anon(pages[i], mm, - GFP_KERNEL))) { + 
mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, + &memcg))) { if (pages[i]) put_page(pages[i]); - mem_cgroup_uncharge_start(); while (--i >= 0) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); ret |= VM_FAULT_OOM; goto out; } + set_page_private(pages[i], (unsigned long)memcg); } for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1041,7 +1046,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pte_t *pte, entry; entry = mk_pte(pages[i], vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); page_add_new_anon_rmap(pages[i], vma, haddr); + mem_cgroup_commit_charge(pages[i], memcg, false); + lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1065,12 +1074,12 @@ out: out_free_pages: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); goto out; } @@ -1081,6 +1090,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; + struct mem_cgroup *memcg; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -1132,7 +1142,8 @@ alloc: goto out; } - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) { put_page(new_page); if (page) { split_huge_page(page); 
@@ -1161,7 +1172,7 @@ alloc: put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); put_page(new_page); goto out_mn; } else { @@ -1170,6 +1181,8 @@ alloc: entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); if (!page) { @@ -2413,6 +2426,7 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; + struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2423,7 +2437,8 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) return; /* @@ -2510,6 +2525,8 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); @@ -2523,7 +2540,7 @@ out_up_write: return; out: - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); goto out_up_write; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 90dc501eaf3f..1cbe1e54ff5f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2551,17 +2551,8 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } -/** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge 
- * @nr_pages: number of pages to charge - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ -static int mem_cgroup_try_charge(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned int nr_pages) +static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -2660,41 +2651,7 @@ done: return ret; } -/** - * mem_cgroup_try_charge_mm - try charging a mm - * @mm: mm_struct to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns the charged mem_cgroup associated with the given mm_struct or - * NULL the charge failed. - */ -static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages) - -{ - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - memcg = NULL; - - return memcg; -} - -/* - * Somemtimes we have to undo a charge we got by try_charge(). - * This function is for that and do uncharge, put css's refcnt. - * gotten by try_charge(). 
- */ -static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) +static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { unsigned long bytes = nr_pages * PAGE_SIZE; @@ -2760,17 +2717,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return memcg; } -static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, - struct page *page, - unsigned int nr_pages, - enum charge_type ctype, - bool lrucare) +static void commit_charge(struct page *page, struct mem_cgroup *memcg, + unsigned int nr_pages, bool anon, bool lrucare) { struct page_cgroup *pc = lookup_page_cgroup(page); struct zone *uninitialized_var(zone); struct lruvec *lruvec; bool was_on_lru = false; - bool anon; lock_page_cgroup(pc); VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); @@ -2807,11 +2760,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, spin_unlock_irq(&zone->lru_lock); } - if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) - anon = true; - else - anon = false; - mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); unlock_page_cgroup(pc); @@ -2882,21 +2830,21 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) if (ret) return ret; - ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT); + ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* - * mem_cgroup_try_charge() chosed to bypass to root due to - * OOM kill or fatal signal. Since our only options are to - * either fail the allocation or charge it to this cgroup, do - * it as a temporary condition. But we can't fail. From a - * kmem/slab perspective, the cache has already been selected, - * by mem_cgroup_kmem_get_cache(), so it is too late to change + * try_charge() chose to bypass to root due to OOM kill or + * fatal signal. Since our only options are to either fail + * the allocation or charge it to this cgroup, do it as a + * temporary condition. But we can't fail. 
From a kmem/slab + * perspective, the cache has already been selected, by + * mem_cgroup_kmem_get_cache(), so it is too late to change * our minds. * * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed during - * mem_cgroup_try_charge() above. Tasks that were already - * dying when the allocation triggers should have been already + * memcg_charge_kmem in a sane state, but was OOM-killed + * during try_charge() above. Tasks that were already dying + * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ res_counter_charge_nofail(&memcg->res, size, &fail_res); @@ -3618,164 +3566,6 @@ out: return ret; } -int mem_cgroup_charge_anon(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - unsigned int nr_pages = 1; - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return 0; - - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, nr_pages, - MEM_CGROUP_CHARGE_TYPE_ANON, false); - return 0; -} - -/* - * While swap-in, try_charge -> commit or cancel, the page is locked. - * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is acquired. This refcnt will be consumed by - * "commit()" or removed by "cancel()" - */ -static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, - gfp_t mask, - struct mem_cgroup **memcgp) -{ - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - int ret; - - pc = lookup_page_cgroup(page); - /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. 
shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. - */ - if (PageCgroupUsed(pc)) - goto out; - if (do_swap_account) - memcg = try_get_mem_cgroup_from_page(page); - if (!memcg) - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - return ret; -out: - *memcgp = memcg; - return 0; -} - -int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t gfp_mask, struct mem_cgroup **memcgp) -{ - if (mem_cgroup_disabled()) { - *memcgp = NULL; - return 0; - } - /* - * A racing thread's fault, or swapoff, may have already - * updated the pte, and even removed page from swap cache: in - * those cases unuse_pte()'s pte_same() test will fail; but - * there's also a KSM case which does need to charge the page. - */ - if (!PageSwapCache(page)) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); - if (!memcg) - return -ENOMEM; - *memcgp = memcg; - return 0; - } - return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); -} - -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) -{ - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); -} - -static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) -{ - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - - __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); - /* - * Now swap is on-memory. This means this page may be - * counted both as mem and swap....double count. - * Fix it by uncharging from memsw. Basically, this SwapCache is stable - * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() - * may call delete_from_swap_cache() before reach here. 
- */ - if (do_swap_account && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - mem_cgroup_uncharge_swap(ent); - } -} - -void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) -{ - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_ANON); -} - -int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) -{ - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - struct mem_cgroup *memcg; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (PageSwapCache(page)) { /* shmem */ - ret = __mem_cgroup_try_charge_swapin(mm, page, - gfp_mask, &memcg); - if (ret) - return ret; - __mem_cgroup_commit_charge_swapin(page, memcg, type); - return 0; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, 1, type, false); - return 0; -} - static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages, const enum charge_type ctype) @@ -4122,7 +3912,6 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; struct page_cgroup *pc; - enum charge_type ctype; *memcgp = NULL; @@ -4184,16 +3973,12 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, * page. In the case new page is migrated but not remapped, new page's * mapcount will be finally 0 and we call uncharge in end_migration(). */ - if (PageAnon(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_ANON; - else - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; /* * The page is committed to the memcg, but it's not actually * charged to the res_counter since we plan on replacing the * old one and only one page is going to be left afterwards. 
*/ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); + commit_charge(newpage, memcg, nr_pages, PageAnon(page), false); } /* remove redundant charge if migration failed*/ @@ -4252,7 +4037,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, { struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; if (mem_cgroup_disabled()) return; @@ -4278,7 +4062,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, * the newpage may be on LRU(or pagevec for LRU) already. We lock * LRU while we overwrite pc->mem_cgroup. */ - __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); + commit_charge(newpage, memcg, 1, false, true); } #ifdef CONFIG_DEBUG_VM @@ -6319,20 +6103,19 @@ static int mem_cgroup_do_precharge(unsigned long count) int ret; /* Try a single bulk charge without reclaim first */ - ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); if (!ret) { mc.precharge += count; return ret; } if (ret == -EINTR) { - __mem_cgroup_cancel_charge(root_mem_cgroup, count); + cancel_charge(root_mem_cgroup, count); return ret; } /* Try charges one by one with reclaim */ while (count--) { - ret = mem_cgroup_try_charge(mc.to, - GFP_KERNEL & ~__GFP_NORETRY, 1); + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); /* * In case of failure, any residual charges against * mc.to will be dropped by mem_cgroup_clear_mc() @@ -6340,7 +6123,7 @@ static int mem_cgroup_do_precharge(unsigned long count) * bypassed to root right away or they'll be lost. 
*/ if (ret == -EINTR) - __mem_cgroup_cancel_charge(root_mem_cgroup, 1); + cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; @@ -6609,7 +6392,7 @@ static void __mem_cgroup_clear_mc(void) /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - __mem_cgroup_cancel_charge(mc.to, mc.precharge); + cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@ -6617,7 +6400,7 @@ static void __mem_cgroup_clear_mc(void) * we must uncharge here. */ if (mc.moved_charge) { - __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ @@ -6946,6 +6729,150 @@ static void __init enable_swap_cgroup(void) } #endif +/** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge(). Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. + */ +int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp) +{ + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + int ret = 0; + + if (mem_cgroup_disabled()) + goto out; + + if (PageSwapCache(page)) { + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Every swap fault against a single page tries to charge the + * page, bail as early as possible. shmem_unuse() encounters + * already charged pages, too. The USED bit is protected by + * the page lock, which serializes swap cache removal, which + * in turn serializes uncharging. 
+ */ + if (PageCgroupUsed(pc)) + goto out; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + if (do_swap_account && PageSwapCache(page)) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) + memcg = get_mem_cgroup_from_mm(mm); + + ret = try_charge(memcg, gfp_mask, nr_pages); + + css_put(&memcg->css); + + if (ret == -EINTR) { + memcg = root_mem_cgroup; + ret = 0; + } +out: + *memcgp = memcg; + return ret; +} + +/** + * mem_cgroup_commit_charge - commit a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * @lrucare: page might be on LRU already + * + * Finalize a charge transaction started by mem_cgroup_try_charge(), + * after page->mapping has been set up. This must happen atomically + * as part of the page instantiation, i.e. under the page table lock + * for anonymous pages, under the page lock for page and swap cache. + * + * In addition, the page must not be on the LRU during the commit, to + * prevent racing with task migration. If it might be, use @lrucare. + * + * Use mem_cgroup_cancel_charge() to cancel the transaction instead. + */ +void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) +{ + unsigned int nr_pages = 1; + + VM_BUG_ON_PAGE(!page->mapping, page); + VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). 
+ */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + commit_charge(page, memcg, nr_pages, PageAnon(page), lrucare); + + if (do_swap_account && PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. + */ + mem_cgroup_uncharge_swap(entry); + } +} + +/** + * mem_cgroup_cancel_charge - cancel a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * + * Cancel a charge transaction started by mem_cgroup_try_charge(). + */ +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +{ + unsigned int nr_pages = 1; + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + cancel_charge(memcg, nr_pages); +} + /* * subsys_initcall() for memory controller. 
* diff --git a/mm/memory.c b/mm/memory.c index 5c55270729f7..6d7648773dc4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2049,6 +2049,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *dirty_page = NULL; unsigned long mmun_start = 0; /* For mmu_notifiers */ unsigned long mmun_end = 0; /* For mmu_notifiers */ + struct mem_cgroup *memcg; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2204,7 +2205,7 @@ gotten: } __SetPageUptodate(new_page); - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; mmun_start = address & PAGE_MASK; @@ -2234,6 +2235,8 @@ gotten: */ ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -2271,7 +2274,7 @@ gotten: new_page = old_page; ret |= VM_FAULT_WRITE; } else - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); if (new_page) page_cache_release(new_page); @@ -2410,10 +2413,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, { spinlock_t *ptl; struct page *page, *swapcache; + struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; - struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; @@ -2489,7 +2492,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page; } - if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { ret = VM_FAULT_OOM; goto out_page; } @@ -2514,10 +2517,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * while the page is counted on swap but not yet in mapcount i.e. 
* before page_add_anon_rmap() and swap_free(); try_to_free_swap() * must be called after the swap_free(), or it will never succeed. - * Because delete_from_swap_page() may be called by reuse_swap_page(), - * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry - * in page->private. In this case, a record in swap_cgroup is silently - * discarded at swap_free(). */ inc_mm_counter_fast(mm, MM_ANONPAGES); @@ -2533,12 +2532,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); - if (page == swapcache) + if (page == swapcache) { do_page_add_anon_rmap(page, vma, address, exclusive); - else /* ksm created a completely new copy */ + mem_cgroup_commit_charge(page, memcg, true); + } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, address); - /* It's better to call commit-charge after rmap is established */ - mem_cgroup_commit_charge_swapin(page, ptr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + } swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) @@ -2571,7 +2572,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge_swapin(ptr); + mem_cgroup_cancel_charge(page, memcg); pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); @@ -2627,6 +2628,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags) { + struct mem_cgroup *memcg; struct page *page; spinlock_t *ptl; pte_t entry; @@ -2660,7 +2662,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -2673,6 
+2675,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(mm, address, page_table, entry); @@ -2682,7 +2686,7 @@ unlock: pte_unmap_unlock(page_table, ptl); return 0; release: - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); goto unlock; oom_free_page: @@ -2919,6 +2923,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { struct page *fault_page, *new_page; + struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret; @@ -2930,7 +2935,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { page_cache_release(new_page); return VM_FAULT_OOM; } @@ -2950,12 +2955,14 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); unlock_page(fault_page); page_cache_release(fault_page); return ret; uncharge_out: - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); page_cache_release(new_page); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 22a4a7699cdb..f56b5ed78128 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1032,25 +1032,6 @@ void page_add_new_anon_rmap(struct page *page, __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); - - VM_BUG_ON_PAGE(PageLRU(page), page); - if (likely((vma->vm_flags & (VM_LOCKED | 
VM_SPECIAL)) != VM_LOCKED)) { - SetPageActive(page); - lru_cache_add(page); - return; - } - - if (!TestSetPageMlocked(page)) { - /* - * We use the irq-unsafe __mod_zone_page_stat because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled. - */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, - hpage_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); - } - add_page_to_unevictable_list(page); } /** diff --git a/mm/shmem.c b/mm/shmem.c index 302d1cf7ad07..1f1a8085538b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -621,7 +621,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); if (index == -1) - return 0; + return -EAGAIN; /* tell shmem_unuse we found nothing */ /* * Move _head_ to start search for next from here. @@ -680,7 +680,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, spin_unlock(&info->lock); swap_free(swap); } - error = 1; /* not an error, but entry was found */ } return error; } @@ -692,7 +691,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) { struct list_head *this, *next; struct shmem_inode_info *info; - int found = 0; + struct mem_cgroup *memcg; int error = 0; /* @@ -707,26 +706,32 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. 
*/ - error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); + error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ + error = -EAGAIN; mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, &page); + error = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); - if (found) + if (error != -EAGAIN) break; + /* found nothing in this: move on to search the next */ } mutex_unlock(&shmem_swaplist_mutex); - if (found < 0) - error = found; + if (error) { + if (error != -ENOMEM) + error = 0; + mem_cgroup_cancel_charge(page, memcg); + } else + mem_cgroup_commit_charge(page, memcg, true); out: unlock_page(page); page_cache_release(page); @@ -1030,6 +1035,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; + struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@ -1108,8 +1114,7 @@ repeat: goto failed; } - error = mem_cgroup_charge_file(page, current->mm, - gfp & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, swp_to_radix_entry(swap)); @@ -1125,12 +1130,16 @@ repeat: * Reset swap.val? No, leave it so "failed" goes back to * "repeat": reading a hole and writing should succeed. 
*/ - if (error) + if (error) { + mem_cgroup_cancel_charge(page, memcg); delete_from_swap_cache(page); + } } if (error) goto failed; + mem_cgroup_commit_charge(page, memcg, true); + spin_lock(&info->lock); info->swapped--; shmem_recalc_inode(inode); @@ -1168,8 +1177,7 @@ repeat: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_charge_file(page, current->mm, - gfp & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (error) goto decused; error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); @@ -1179,9 +1187,10 @@ repeat: radix_tree_preload_end(); } if (error) { - mem_cgroup_uncharge_cache_page(page); + mem_cgroup_cancel_charge(page, memcg); goto decused; } + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_anon(page); spin_lock(&info->lock); diff --git a/mm/swap.c b/mm/swap.c index c789d01c9ec3..3baca701bb78 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -687,6 +687,40 @@ void add_page_to_unevictable_list(struct page *page) spin_unlock_irq(&zone->lru_lock); } +/** + * lru_cache_add_active_or_unevictable + * @page: the page to be added to LRU + * @vma: vma in which page is mapped for determining reclaimability + * + * Place @page on the active or unevictable LRU list, depending on its + * evictability. Note that if the page is not evictable, it goes + * directly back onto it's zone's unevictable list, it does NOT use a + * per cpu pagevec. + */ +void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { + SetPageActive(page); + lru_cache_add(page); + return; + } + + if (!TestSetPageMlocked(page)) { + /* + * We use the irq-unsafe __mod_zone_page_stat because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. 
+ */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + add_page_to_unevictable_list(page); +} + /* * If the page can not be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the diff --git a/mm/swapfile.c b/mm/swapfile.c index 4c524f7bd0bf..0883b4912ff7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1106,15 +1106,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, - GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge_swapin(memcg); + mem_cgroup_cancel_charge(page, memcg); ret = 0; goto out; } @@ -1124,11 +1123,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); - if (page == swapcache) + if (page == swapcache) { page_add_anon_rmap(page, vma, addr); - else /* ksm created a completely new copy */ + mem_cgroup_commit_charge(page, memcg, true); + } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge_swapin(page, memcg); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + } swap_free(entry); /* * Move the page to the active list so it is not -- cgit v1.2.3 From 747db954cab64c6b7a95b121b517165f34751898 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 8 Aug 2014 14:19:24 -0700 Subject: mm: memcontrol: use page lists for uncharge batching Pages are now uncharged at release time, and all sources of batched uncharges operate on lists of pages. Directly use those lists, and get rid of the per-task batching state. 
This also batches statistics accounting, in addition to the res counter charges, to reduce IRQ-disabling and re-enabling. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Cc: Naoya Horiguchi Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 +-- include/linux/sched.h | 6 -- kernel/fork.c | 4 - mm/memcontrol.c | 206 ++++++++++++++++++++++++--------------------- mm/swap.c | 6 +- mm/vmscan.c | 12 ++- 6 files changed, 117 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 806b8fa15c5f..e0752d204d9e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -59,12 +59,8 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare); void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); - void mem_cgroup_uncharge(struct page *page); - -/* Batched uncharging */ -void mem_cgroup_uncharge_start(void); -void mem_cgroup_uncharge_end(void); +void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, bool lrucare); @@ -233,11 +229,7 @@ static inline void mem_cgroup_uncharge(struct page *page) { } -static inline void mem_cgroup_uncharge_start(void) -{ -} - -static inline void mem_cgroup_uncharge_end(void) +static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 7c19d552dc3f..4fcf82a4d243 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1628,12 +1628,6 @@ struct task_struct { unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - struct memcg_batch_info { - int do_batch; /* incremented when batch uncharge started */ - 
struct mem_cgroup *memcg; /* target memcg of uncharge */ - unsigned long nr_pages; /* uncharged usage */ - unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ - } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { struct mem_cgroup *memcg; diff --git a/kernel/fork.c b/kernel/fork.c index fbd3497b221f..f6f5086c9e7d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1346,10 +1346,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9106f1b12f56..a6e2be0241af 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3581,53 +3581,6 @@ out: return ret; } -/* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - -void mem_cgroup_uncharge_start(void) -{ - unsigned long flags; - - local_irq_save(flags); - current->memcg_batch.do_batch++; - /* We can do nest. */ - if (current->memcg_batch.do_batch == 1) { - current->memcg_batch.memcg = NULL; - current->memcg_batch.nr_pages = 0; - current->memcg_batch.memsw_nr_pages = 0; - } - local_irq_restore(flags); -} - -void mem_cgroup_uncharge_end(void) -{ - struct memcg_batch_info *batch = &current->memcg_batch; - unsigned long flags; - - local_irq_save(flags); - VM_BUG_ON(!batch->do_batch); - if (--batch->do_batch) /* If stacked, do nothing */ - goto out; - /* - * This "batch->memcg" is valid without any css_get/put etc... - * bacause we hide charges behind us. 
- */ - if (batch->nr_pages) - res_counter_uncharge(&batch->memcg->res, - batch->nr_pages * PAGE_SIZE); - if (batch->memsw_nr_pages) - res_counter_uncharge(&batch->memcg->memsw, - batch->memsw_nr_pages * PAGE_SIZE); - memcg_oom_recover(batch->memcg); -out: - local_irq_restore(flags); -} - #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) @@ -6554,6 +6507,98 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) cancel_charge(memcg, nr_pages); } +static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_mem, unsigned long nr_memsw, + unsigned long nr_anon, unsigned long nr_file, + unsigned long nr_huge, struct page *dummy_page) +{ + unsigned long flags; + + if (nr_mem) + res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); + if (nr_memsw) + res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); + + memcg_oom_recover(memcg); + + local_irq_save(flags); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + memcg_check_events(memcg, dummy_page); + local_irq_restore(flags); +} + +static void uncharge_list(struct list_head *page_list) +{ + struct mem_cgroup *memcg = NULL; + unsigned long nr_memsw = 0; + unsigned long nr_anon = 0; + unsigned long nr_file = 0; + unsigned long nr_huge = 0; + unsigned long pgpgout = 0; + unsigned long nr_mem = 0; + struct list_head *next; + struct page *page; + + next = page_list->next; + do { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + page = list_entry(next, struct page, lru); + next = page->lru.next; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + pc = 
lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + continue; + + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point, we have + * fully exclusive access to the page. + */ + + if (memcg != pc->mem_cgroup) { + if (memcg) { + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + pgpgout = nr_mem = nr_memsw = 0; + nr_anon = nr_file = nr_huge = 0; + } + memcg = pc->mem_cgroup; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + nr_huge += nr_pages; + } + + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + + if (pc->flags & PCG_MEM) + nr_mem += nr_pages; + if (pc->flags & PCG_MEMSW) + nr_memsw += nr_pages; + pc->flags = 0; + + pgpgout++; + } while (next != page_list); + + if (memcg) + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); +} + /** * mem_cgroup_uncharge - uncharge a page * @page: page to uncharge @@ -6563,67 +6608,34 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) */ void mem_cgroup_uncharge(struct page *page) { - struct memcg_batch_info *batch; - unsigned int nr_pages = 1; - struct mem_cgroup *memcg; struct page_cgroup *pc; - unsigned long pc_flags; - unsigned long flags; - - VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page), page); if (mem_cgroup_disabled()) return; + /* Don't touch page->lru of any random page, pre-check: */ pc = lookup_page_cgroup(page); - - /* Every final put_page() ends up here */ if (!PageCgroupUsed(pc)) return; - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Nobody should be changing or seriously looking at - * pc->mem_cgroup and pc->flags at this point, we have fully - * exclusive access to the page. 
- */ - memcg = pc->mem_cgroup; - pc_flags = pc->flags; - pc->flags = 0; - - local_irq_save(flags); + INIT_LIST_HEAD(&page->lru); + uncharge_list(&page->lru); +} - if (nr_pages > 1) - goto direct; - if (unlikely(test_thread_flag(TIF_MEMDIE))) - goto direct; - batch = &current->memcg_batch; - if (!batch->do_batch) - goto direct; - if (batch->memcg && batch->memcg != memcg) - goto direct; - if (!batch->memcg) - batch->memcg = memcg; - if (pc_flags & PCG_MEM) - batch->nr_pages++; - if (pc_flags & PCG_MEMSW) - batch->memsw_nr_pages++; - goto out; -direct: - if (pc_flags & PCG_MEM) - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (pc_flags & PCG_MEMSW) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - memcg_oom_recover(memcg); -out: - mem_cgroup_charge_statistics(memcg, page, -nr_pages); - memcg_check_events(memcg, page); +/** + * mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 
+ */ +void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; - local_irq_restore(flags); + if (!list_empty(page_list)) + uncharge_list(page_list); } /** diff --git a/mm/swap.c b/mm/swap.c index 00523fffa5ed..6b2dc3897cd5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -908,8 +908,6 @@ void release_pages(struct page **pages, int nr, bool cold) struct lruvec *lruvec; unsigned long uninitialized_var(flags); - mem_cgroup_uncharge_start(); - for (i = 0; i < nr; i++) { struct page *page = pages[i]; @@ -941,7 +939,6 @@ void release_pages(struct page **pages, int nr, bool cold) __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); } - mem_cgroup_uncharge(page); /* Clear Active bit in case of parallel mark_page_accessed */ __ClearPageActive(page); @@ -951,8 +948,7 @@ void release_pages(struct page **pages, int nr, bool cold) if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); - mem_cgroup_uncharge_end(); - + mem_cgroup_uncharge_list(&pages_to_free); free_hot_cold_page_list(&pages_to_free, cold); } EXPORT_SYMBOL(release_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7068e838d22b..2836b5373b2e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -822,7 +822,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, cond_resched(); - mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -1103,7 +1102,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ __clear_page_locked(page); free_it: - mem_cgroup_uncharge(page); nr_reclaimed++; /* @@ -1133,8 +1131,8 @@ keep: list_add(&page->lru, &ret_pages); VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - mem_cgroup_uncharge_end(); + mem_cgroup_uncharge_list(&free_pages); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); @@ -1437,10 +1435,9 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) 
__ClearPageActive(page); del_page_from_lru_list(page, lruvec, lru); - mem_cgroup_uncharge(page); - if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1548,6 +1545,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge_list(&page_list); free_hot_cold_page_list(&page_list, true); /* @@ -1660,10 +1658,9 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, __ClearPageActive(page); del_page_from_lru_list(page, lruvec, lru); - mem_cgroup_uncharge(page); - if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1771,6 +1768,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); } -- cgit v1.2.3 From 9a3f4d85d58cb4e02e226f9be946d54c33eb715b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:19:28 -0700 Subject: page-cgroup: get rid of NR_PCG_FLAGS It's not used anywhere today, so let's remove it. 
Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 6 ------ kernel/bounds.c | 2 -- 2 files changed, 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index b8f8c9e36a3e..9d9f540658f5 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -6,12 +6,8 @@ enum { PCG_USED = 0x01, /* This page is charged to a memcg */ PCG_MEM = 0x02, /* This page holds a memory charge */ PCG_MEMSW = 0x04, /* This page holds a memory+swap charge */ - __NR_PCG_FLAGS, }; -#ifndef __GENERATING_BOUNDS_H -#include - struct pglist_data; #ifdef CONFIG_MEMCG @@ -107,6 +103,4 @@ static inline void swap_cgroup_swapoff(int type) #endif /* CONFIG_MEMCG_SWAP */ -#endif /* !__GENERATING_BOUNDS_H */ - #endif /* __LINUX_PAGE_CGROUP_H */ diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246b04b8..e1d1d1952bfa 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -18,7 +17,6 @@ void foo(void) /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); - DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif -- cgit v1.2.3 From b86280aa48b67c8119ed8f6c6bebd8c0af13a269 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 8 Aug 2014 14:19:41 -0700 Subject: kernel/kallsyms.c: fix %pB when there's no symbol at the address __sprint_symbol() should restore original address when kallsyms_lookup() failed to find a symbol. It's reported when dumpstack shows an address in a dynamically allocated trampoline for ftrace. [ 1314.612287] [] dump_stack+0x45/0x56 [ 1314.612290] [] ? 
meminfo_proc_open+0x30/0x30 [ 1314.612293] [] kpatch_ftrace_handler+0x14/0xf0 [kpatch] [ 1314.612306] [] 0xffffffffa00160c3 You can see a difference in the hex address - c4 and c3. Fix it. Signed-off-by: Namhyung Kim Reported-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37dac3a..ae5167087845 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) - return sprintf(buffer, "0x%lx", address); + return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); -- cgit v1.2.3 From 4878b14b43188ffeceecfc32295ed2a783b7aa7a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 8 Aug 2014 14:19:48 -0700 Subject: kernel/test_kprobes.c: use current logging functions - Add pr_fmt - Coalesce formats - Use current pr_foo() functions instead of printk - Remove unnecessary "failed" display (already in log level). Signed-off-by: Fabian Frederick Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/test_kprobes.c | 87 ++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbfdd83..0dbab6d1acb4 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -14,6 +14,8 @@ * the GNU General Public License for more details. 
*/ +#define pr_fmt(fmt) "Kprobe smoke test: " fmt + #include #include #include @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); + pr_err("incorrect value in post_handler\n"); } posth_val = preh_val + div_factor; } @@ -59,8 +60,7 @@ static int test_kprobe(void) ret = register_kprobe(&kp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); + pr_err("register_kprobe returned %d\n", ret); return ret; } @@ -68,14 +68,12 @@ static int test_kprobe(void) unregister_kprobe(&kp); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler2\n"); + pr_err("incorrect value in post_handler2\n"); } posth_val = preh_val + div_factor; } @@ -120,8 +117,7 @@ static int test_kprobes(void) kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobes returned %d\n", ret); + pr_err("register_kprobes returned %d\n", ret); return ret; } @@ -130,14 +126,12 @@ static int test_kprobes(void) ret = target(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + 
pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -146,14 +140,12 @@ static int test_kprobes(void) ret = target2(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler2 not called\n"); + pr_err("kprobe pre_handler2 not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler2 not called\n"); + pr_err("kprobe post_handler2 not called\n"); handler_errors++; } @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) { if (value != rand1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); + pr_err("incorrect value in jprobe handler\n"); } jph_val = rand1; @@ -186,16 +177,14 @@ static int test_jprobe(void) ret = register_jprobe(&jp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); + pr_err("register_jprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } @@ -217,24 +206,21 @@ static int test_jprobes(void) jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobes returned %d\n", ret); + pr_err("register_jprobes returned %d\n", ret); return ret; } jph_val = 0; ret = target(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } jph_val = 0; ret = target2(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler2 not called\n"); + pr_err("jprobe handler2 not called\n"); handler_errors++; } unregister_jprobes(jps, 2); @@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != 
(rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); + pr_err("incorrect value in kretprobe handler\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -281,16 +265,14 @@ static int test_kretprobe(void) ret = register_kretprobe(&rp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler2\n"); + pr_err("incorrect value in kretprobe handler2\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -332,24 +312,21 @@ static int test_kretprobes(void) rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } krph_val = 0; ret = target(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } krph_val = 0; ret = target2(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe 
handler2 not called\n"); + pr_err("kretprobe handler2 not called\n"); handler_errors++; } unregister_kretprobes(rps, 2); @@ -368,7 +345,7 @@ int init_test_probes(void) rand1 = prandom_u32(); } while (rand1 <= div_factor); - printk(KERN_INFO "Kprobe smoke test started\n"); + pr_info("started\n"); num_tests++; ret = test_kprobe(); if (ret < 0) @@ -402,13 +379,11 @@ int init_test_probes(void) #endif /* CONFIG_KRETPROBES */ if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); + pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); + pr_err("BUG: %d error(s) running handlers\n", handler_errors); else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + pr_info("passed successfully\n"); return 0; } -- cgit v1.2.3 From a0be55dee71d437f7593c8c3673edd92962bafaf Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Fri, 8 Aug 2014 14:21:18 -0700 Subject: kernel/exit.c: fix coding style warnings and errors Fixed coding style warnings and errors. 
Signed-off-by: Ionut Alexa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 88c6b3e42583..32c58f7433a3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,7 +59,7 @@ #include #include -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { @@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk) spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); @@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; @@ -192,7 +192,8 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader != p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" 
*/ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -405,7 +407,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk) core_state = mm->core_state; if (core_state) { struct core_thread self; + up_read(&mm->mmap_sem); self.task = tsk; @@ -566,6 +569,7 @@ static void forget_original_parent(struct task_struct *father) list_for_each_entry_safe(p, n, &father->children, sibling) { struct task_struct *t = p; + do { t->real_parent = reaper; if (t->parent == father) { @@ -599,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* * This does two things: * - * A. Make init inherit all the child processes + * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. 
(POSIX 3.2.2.2) @@ -649,9 +653,8 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -692,8 +695,7 @@ void do_exit(long code) * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); + pr_alert("Fixing recursive fault but reboot is needed!\n"); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state @@ -717,9 +719,9 @@ void do_exit(long code) raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ @@ -837,7 +839,6 @@ void do_exit(long code) for (;;) cpu_relax(); /* For when BUG is null */ } - EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) @@ -847,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code) do_exit(code); } - EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) @@ -870,6 +870,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. 
*/ @@ -1034,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); @@ -1418,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1431,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } -- cgit v1.2.3 From ccf94f1b4a8560ffdc221840535bae5e5a91a53c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 8 Aug 2014 14:21:22 -0700 Subject: proc: constify seq_operations proc_uid_seq_operations, proc_gid_seq_operations and proc_projid_seq_operations are only called in proc_id_map_open with seq_open as const struct seq_operations so we can constify the 3 structures and update proc_id_map_open prototype. 
text data bss dec hex filename 6817 404 1984 9205 23f5 kernel/user_namespace.o-before 6913 308 1984 9205 23f5 kernel/user_namespace.o-after Signed-off-by: Fabian Frederick Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- include/linux/user_namespace.h | 6 +++--- kernel/user_namespace.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 2d696b0c93bf..79df9ff71afd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2449,7 +2449,7 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, - struct seq_operations *seq_ops) + const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4836ba3c1cd8..e95372654f09 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -57,9 +57,9 @@ static inline void put_user_ns(struct user_namespace *ns) } struct seq_operations; -extern struct seq_operations proc_uid_seq_operations; -extern struct seq_operations proc_gid_seq_operations; -extern struct seq_operations proc_projid_seq_operations; +extern const struct seq_operations proc_uid_seq_operations; +extern const struct seq_operations proc_gid_seq_operations; +extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc02560fd6b..aa312b0dc3ec 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -526,21 +526,21 @@ static void m_stop(struct 
seq_file *seq, void *v) return; } -struct seq_operations proc_uid_seq_operations = { +const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; -struct seq_operations proc_gid_seq_operations = { +const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; -struct seq_operations proc_projid_seq_operations = { +const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, -- cgit v1.2.3 From 41f727fde1fe40efeb4fef6fdce74ff794be5aeb Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:21:56 -0700 Subject: fork/exec: cleanup mm initialization mm initialization on fork/exec is spread all over the place, which makes the code look inconsistent. We have mm_init(), which is supposed to init/nullify mm's internals, but it doesn't init all the fields it should: - on fork ->mmap,mm_rb,vmacache_seqnum,map_count,mm_cpumask,locked_vm are zeroed in dup_mmap(); - on fork ->pmd_huge_pte is zeroed in dup_mm(), immediately before calling mm_init(); - ->cpu_vm_mask_var ptr is initialized by mm_init_cpumask(), which is called before mm_init() on both fork and exec; - ->context is initialized by init_new_context(), which is called after mm_init() on both fork and exec; Let's consolidate all the initializations in mm_init() to make the code look cleaner. 
Signed-off-by: Vladimir Davydov Cc: Oleg Nesterov Cc: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 ---- include/linux/mm_types.h | 1 + kernel/fork.c | 47 ++++++++++++++++++++--------------------------- 3 files changed, 21 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index ab1f1200ce5d..a2b42a98c743 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -368,10 +368,6 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; - err = init_new_context(current, mm); - if (err) - goto err; - err = __bprm_mm_init(bprm); if (err) goto err; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 796deac19fcf..6e0b286649f1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -461,6 +461,7 @@ static inline void mm_init_cpumask(struct mm_struct *mm) #ifdef CONFIG_CPUMASK_OFFSTACK mm->cpu_vm_mask_var = &mm->cpumask_allocation; #endif + cpumask_clear(mm->cpu_vm_mask_var); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. 
*/ diff --git a/kernel/fork.c b/kernel/fork.c index f6f5086c9e7d..418b52a9ec6a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -374,12 +374,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@ -538,17 +532,27 @@ static void mm_init_aio(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; +#endif if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -558,11 +562,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->def_flags = 0; } - if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + return mm; + +fail_nocontext: + mm_free_pgd(mm); +fail_nopgd: free_mm(mm); return NULL; } @@ -596,7 +606,6 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); } @@ -828,17 +837,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); -#if 
defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - mm->pmd_huge_pte = NULL; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; - if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm); @@ -860,15 +862,6 @@ free_pt: fail_nomem: return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) -- cgit v1.2.3 From ce65cefa5debefc0e81d0a533bda467f0aa67350 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:21:58 -0700 Subject: fork: reset mm->pinned_vm mm->pinned_vm counts pages of mm's address space that were permanently pinned in memory by increasing their reference counter. The counter was introduced by commit bc3e53f682d9 ("mm: distinguish between mlocked and pinned pages"), while before it locked_vm had been used for such pages. Obviously, we should reset the counter on fork if !CLONE_VM, just like we do with locked_vm, but currently we don't. Let's fix it. This patch will fix the contents of /proc/pid/status:VmPin. ib_umem_get[infiniband] and perf_mmap still check pinned_vm against RLIMIT_MEMLOCK. It's left from the times when pinned pages were accounted under locked_vm, but today it looks wrong. It isn't clear how we should deal with it. We still have some drivers accounting pinned pages under mm->locked_vm - this is what commit bc3e53f682d9 was fighting against. It's infiniband/usnic and vfio. 
Signed-off-by: Vladimir Davydov Cc: Oleg Nesterov Cc: David Rientjes Cc: Christoph Lameter Cc: Roland Dreier Cc: Sean Hefty Cc: Hal Rosenstock Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 418b52a9ec6a..5a547a59a38a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -543,6 +543,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) atomic_long_set(&mm->nr_ptes, 0); mm->map_count = 0; mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_cpumask(mm); -- cgit v1.2.3 From 4f7d461433bb4a4deee61baefdac6cd1a1ecb546 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:22:01 -0700 Subject: fork: copy mm's vm usage counters under mmap_sem If a forking process has a thread calling (un)mmap (silly but still), the child process may have some of its mm's vm usage counters (total_vm and friends) screwed up, because currently they are copied from oldmm w/o holding any locks (memcpy in dup_mm). This patch moves the counters initialization to dup_mmap() to be called under oldmm->mmap_sem, which eliminates any possibility of race. 
Signed-off-by: Vladimir Davydov Cc: Oleg Nesterov Cc: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5a547a59a38a..aff84f84b0d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -374,6 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; -- cgit v1.2.3 From 33144e8429bd7fceacbb869a7f5061db42e13fe6 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 8 Aug 2014 14:22:03 -0700 Subject: kernel/fork.c: make mm_init_owner static It's only used in fork.c:mm_init(). Signed-off-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 ----- kernel/fork.c | 14 +++++++------- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4fcf82a4d243..b21e9218c0fd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2961,15 +2961,10 @@ static inline void inc_syscw(struct task_struct *tsk) #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); -extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } - -static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ -} #endif /* CONFIG_MEMCG */ static inline unsigned long task_rlimit(const struct task_struct *tsk, diff --git a/kernel/fork.c b/kernel/fork.c index aff84f84b0d3..86da59e165ad 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -535,6 +535,13 @@ static void mm_init_aio(struct mm_struct *mm) 
#endif } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + mm->owner = p; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { mm->mmap = NULL; @@ -1139,13 +1146,6 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ -- cgit v1.2.3 From 834b18b23e1012e6c2987af703490bc60956d211 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Fri, 8 Aug 2014 14:22:20 -0700 Subject: kernel/gcov/fs.c: remove unnecessary null test before debugfs_remove This fixes checkpatch warning: WARNING: debugfs_remove(NULL) is safe this check is probably not required Signed-off-by: Fabian Frederick Cc: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/fs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a76379..edf67c493a8e 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) err_remove: pr_err("init failed\n"); - if (root_node.dentry) - debugfs_remove(root_node.dentry); + debugfs_remove(root_node.dentry); return rc; } -- cgit v1.2.3 From 69361eef9056b0babb507798c2135ad1572f0ef7 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Fri, 8 Aug 2014 14:22:31 -0700 Subject: panic: add TAINT_SOFTLOCKUP This taint flag will be set if the system has ever entered a softlockup state. Similar to TAINT_WARN it is useful to know whether or not the system has been in a softlockup state when debugging. 
[akpm@linux-foundation.org: apply the taint before calling panic()] Signed-off-by: Josh Hunt Cc: Jason Baron Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/oops-tracing.txt | 2 ++ Documentation/sysctl/kernel.txt | 1 + include/linux/kernel.h | 1 + kernel/panic.c | 1 + kernel/watchdog.c | 1 + 5 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index e3155995ddd8..beefb9f82902 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -268,6 +268,8 @@ characters, each representing a particular tainted value. 14: 'E' if an unsigned module has been loaded in a kernel supporting module signature. + 15: 'L' if a soft lockup has previously occurred on the system. + The primary reason for the 'Tainted: ' string is to tell kernel debuggers if this is a clean kernel or if anything unusual has occurred. Tainting is permanent: even if an offending module is diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index c14374e71775..f79eb9666379 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -826,6 +826,7 @@ can be ORed together: 4096 - An out-of-tree module has been loaded. 8192 - An unsigned module has been loaded in a kernel supporting module signature. +16384 - A soft lockup has previously occurred on the system. 
============================================================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3dc22abbc68a..31ae66f34235 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -470,6 +470,7 @@ extern enum system_states { #define TAINT_FIRMWARE_WORKAROUND 11 #define TAINT_OOT_MODULE 12 #define TAINT_UNSIGNED_MODULE 13 +#define TAINT_SOFTLOCKUP 14 extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] diff --git a/kernel/panic.c b/kernel/panic.c index 62e16cef9cc2..d09dc5c32c67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -224,6 +224,7 @@ static const struct tnt tnts[] = { { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, + { TAINT_SOFTLOCKUP, 'L', ' ' }, }; /** diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 51b29e9d2ba6..a8d6914030fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -368,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) smp_mb__after_atomic(); } + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); -- cgit v1.2.3 From ab602f799159393143d567e5c04b936fec79d6bd Mon Sep 17 00:00:00 2001 From: Jack Miller Date: Fri, 8 Aug 2014 14:23:19 -0700 Subject: shm: make exit_shm work proportional to task activity This is small set of patches our team has had kicking around for a few versions internally that fixes tasks getting hung on shm_exit when there are many threads hammering it at once. Anton wrote a simple test to cause the issue: http://ozlabs.org/~anton/junkcode/bust_shm_exit.c Before applying this patchset, this test code will cause either hanging tracebacks or pthread out of memory errors. After this patchset, it will still produce output like: root@somehost:~# ./bust_shm_exit 1024 160 ... 
INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 116, t=2111 jiffies, g=241, c=240, q=7113) INFO: Stall ended before state dump start ... But the task will continue to run along happily, so we consider this an improvement over hanging, even if it's a bit noisy. This patch (of 3): exit_shm obtains the ipc_ns shm rwsem for write and holds it while it walks every shared memory segment in the namespace. Thus the amount of work is related to the number of shm segments in the namespace, not the number of segments that might need to be cleaned. In addition, this occurs after the task has been notified the thread has exited, so the number of tasks waiting for the ns shm rwsem can grow without bound until memory is exhausted. Add a list to the task struct of all shmids allocated by this task. Init the list head in copy_process. Use the ns->rwsem for locking. Add segments after id is added, remove before removing from id. On unshare of NEW_IPCNS orphan any ids as if the task had exited, similar to handling of semaphore undo. I chose a define for the init sequence since it's a simple list init, otherwise it would require a function call to avoid include loops between the semaphore code and the task struct. Converting the list_del to list_del_init for the unshare cases would remove the exit followed by init, but I left it to blow up if not initialized. 
Signed-off-by: Milton Miller Signed-off-by: Jack Miller Cc: Davidlohr Bueso Cc: Manfred Spraul Cc: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 ++ include/linux/shm.h | 16 +++++++++++++++- ipc/shm.c | 22 +++++++++++----------- kernel/fork.c | 6 ++++++ 4 files changed, 34 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b21e9218c0fd..db2f6474e95e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -33,6 +33,7 @@ struct sched_param { #include #include +#include #include #include #include @@ -1385,6 +1386,7 @@ struct task_struct { #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ diff --git a/include/linux/shm.h b/include/linux/shm.h index 57d77709fbe2..fd206387048a 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -1,6 +1,7 @@ #ifndef _LINUX_SHM_H_ #define _LINUX_SHM_H_ +#include #include #include #include @@ -20,6 +21,7 @@ struct shmid_kernel /* private to the kernel */ /* The task created the shm object. NULL if the task is dead. 
*/ struct task_struct *shm_creator; + struct list_head shm_clist; /* list by creator */ }; /* shm_mode upper byte flags */ @@ -44,11 +46,20 @@ struct shmid_kernel /* private to the kernel */ #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) #ifdef CONFIG_SYSVIPC +struct sysv_shm { + struct list_head shm_clist; +}; + long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba); extern int is_file_shm_hugepages(struct file *file); -extern void exit_shm(struct task_struct *task); +void exit_shm(struct task_struct *task); +#define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist) #else +struct sysv_shm { + /* empty */ +}; + static inline long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba) @@ -62,6 +73,9 @@ static inline int is_file_shm_hugepages(struct file *file) static inline void exit_shm(struct task_struct *task) { } +static inline void shm_init_task(struct task_struct *task) +{ +} #endif #endif /* _LINUX_SHM_H_ */ diff --git a/ipc/shm.c b/ipc/shm.c index 89fc354156cb..1fc3a61b443b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -178,6 +178,7 @@ static void shm_rcu_free(struct rcu_head *head) static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { + list_del(&s->shm_clist); ipc_rmid(&shm_ids(ns), &s->shm_perm); } @@ -268,14 +269,10 @@ static void shm_close(struct vm_area_struct *vma) } /* Called with ns->shm_ids(ns).rwsem locked */ -static int shm_try_destroy_current(int id, void *p, void *data) +static void shm_mark_orphan(struct shmid_kernel *shp, struct ipc_namespace *ns) { - struct ipc_namespace *ns = data; - struct kern_ipc_perm *ipcp = p; - struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); - - if (shp->shm_creator != current) - return 0; + if (WARN_ON(shp->shm_creator != current)) /* Remove me when it works */ + return; /* * Mark it as orphaned to destroy the segment when @@ -289,13 +286,12 @@ static int 
shm_try_destroy_current(int id, void *p, void *data) * is not set, it shouldn't be deleted here. */ if (!ns->shm_rmid_forced) - return 0; + return; if (shm_may_destroy(ns, shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } - return 0; } /* Called with ns->shm_ids(ns).rwsem locked */ @@ -333,14 +329,17 @@ void shm_destroy_orphaned(struct ipc_namespace *ns) void exit_shm(struct task_struct *task) { struct ipc_namespace *ns = task->nsproxy->ipc_ns; + struct shmid_kernel *shp, *n; if (shm_ids(ns).in_use == 0) return; /* Destroy all already created segments, but not mapped yet */ down_write(&shm_ids(ns).rwsem); - if (shm_ids(ns).in_use) - idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); + list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) + shm_mark_orphan(shp, ns); + /* remove the list head from any segments still attached */ + list_del(&task->sysvshm.shm_clist); up_write(&shm_ids(ns).rwsem); } @@ -561,6 +560,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_nattch = 0; shp->shm_file = file; shp->shm_creator = current; + list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); /* * shmid gets reported as "inode#" in /proc/pid/maps. diff --git a/kernel/fork.c b/kernel/fork.c index 86da59e165ad..fa9124322cd4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1362,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1913,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). 
*/ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); -- cgit v1.2.3 From 934fc295b30ea8ce5d5e0ab9024a10fab9b6f200 Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Fri, 8 Aug 2014 14:23:42 -0700 Subject: kernel/acct.c: fix coding style warnings and errors Signed-off-by: Ionut Alexa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a1844f14c6d6..51793520566f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -141,12 +141,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) if (acct->active) { if (act < 0) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { if (act > 0) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } @@ -261,6 +261,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); error = acct_on(tmp); @@ -376,7 +377,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -389,7 +390,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. 
*/ -#define MAXEXP2 ((1 < 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -486,16 +488,17 @@ static void do_acct_process(struct bsd_acct_struct *acct, run_time -= current->group_leader->start_time; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_etime = encode_float(elapsed); #else ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac.ac_etime_hi = etime >> 16; ac.ac_etime_lo = (u16) etime; } @@ -505,15 +508,15 @@ static void do_acct_process(struct bsd_acct_struct *acct, /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 +#if ACCT_VERSION == 2 ac.ac_ahz = AHZ; #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@ -574,6 +577,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; while (vma) { -- cgit v1.2.3 From 4bb5f5d9395bc112d93a134d8f5b05611eddc9c0 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:25 -0700 Subject: mm: allow drivers to prevent new writable mappings This patch (of 6): The i_mmap_writable field counts existing writable mappings of an address_space. 
To allow drivers to prevent new writable mappings, make this counter signed and prevent new writable mappings if it is negative. This is modelled after i_writecount and DENYWRITE. This will be required by the shmem-sealing infrastructure to prevent any new writable mappings after the WRITE seal has been set. In case there exists a writable mapping, this operation will fail with EBUSY. Note that we rely on the fact that iff you already own a writable mapping, you can increase the counter without using the helpers. This is the same that we do for i_writecount. Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 1 + include/linux/fs.h | 29 +++++++++++++++++++++++++++-- kernel/fork.c | 2 +- mm/mmap.c | 30 ++++++++++++++++++++++++------ mm/swap_state.c | 1 + 5 files changed, 54 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/inode.c b/fs/inode.c index 5938f3928944..26753ba7b6d6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -165,6 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->backing_dev_info = &default_backing_dev_info; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ab6c6913040..f0890e4a7c25 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -387,7 +387,7 @@ struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ spinlock_t tree_lock; /* and lock protecting it */ - unsigned int i_mmap_writable;/* count VM_SHARED mappings */ + atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ 
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct mutex i_mmap_mutex; /* protect tree, count, list */ @@ -470,10 +470,35 @@ static inline int mapping_mapped(struct address_space *mapping) * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap_pgoff * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. + * + * If i_mmap_writable is negative, no new writable mappings are allowed. You + * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { - return mapping->i_mmap_writable != 0; + return atomic_read(&mapping->i_mmap_writable) > 0; +} + +static inline int mapping_map_writable(struct address_space *mapping) +{ + return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? + 0 : -EPERM; +} + +static inline void mapping_unmap_writable(struct address_space *mapping) +{ + atomic_dec(&mapping->i_mmap_writable); +} + +static inline int mapping_deny_writable(struct address_space *mapping) +{ + return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? 
+ 0 : -EBUSY; +} + +static inline void mapping_allow_writable(struct address_space *mapping) +{ + atomic_inc(&mapping->i_mmap_writable); } /* diff --git a/kernel/fork.c b/kernel/fork.c index fa9124322cd4..1380d8ace334 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -429,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_dec(&inode->i_writecount); mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ if (unlikely(tmp->vm_flags & VM_NONLINEAR)) diff --git a/mm/mmap.c b/mm/mmap.c index 64c9d736155c..c1f2ea4a0b99 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -221,7 +221,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable--; + mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -622,7 +622,7 @@ static void __vma_link_file(struct vm_area_struct *vma) if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file_inode(file)->i_writecount); if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; + atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) @@ -1577,6 +1577,17 @@ munmap_back: if (error) goto free_vma; } + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto allow_write_and_free_vma; + } + + /* ->mmap() can change vma->vm_file, but must guarantee that + * vma_link() below can deny write-access if VM_DENYWRITE is set + * and map writably if VM_SHARED is set. This usually means the + * new file must not have been exposed to user-space, yet. 
+ */ vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); if (error) @@ -1616,8 +1627,12 @@ munmap_back: vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); + if (file) { + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + } file = vma->vm_file; out: perf_event_mmap(vma); @@ -1646,14 +1661,17 @@ out: return addr; unmap_and_free_vma: - if (vm_flags & VM_DENYWRITE) - allow_write_access(file); vma->vm_file = NULL; fput(file); /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); charged = 0; + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +allow_write_and_free_vma: + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); free_vma: kmem_cache_free(vm_area_cachep, vma); unacct_error: diff --git a/mm/swap_state.c b/mm/swap_state.c index e160151da6b8..3e0ec83d000c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -39,6 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = { struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, } -- cgit v1.2.3 From 9183df25fe7b194563db3fec6dc3202a5855839c Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Fri, 8 Aug 2014 14:25:29 -0700 Subject: shm: add memfd_create() syscall memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor that you can pass to mmap(). It can support sealing and avoids any connection to user-visible mount-points. Thus, it's not subject to quotas on mounted file-systems, but can be used like malloc()'ed memory, but with a file-descriptor to it. 
memfd_create() returns the raw shmem file, so calls like ftruncate() can be used to modify the underlying inode. Also calls like fstat() will return proper information and mark the file as regular file. If you want sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not supported (like on all other regular files). Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not subject to a filesystem size limit. It is still properly accounted to memcg limits, though, and to the same overcommit or no-overcommit accounting as all user memory. Signed-off-by: David Herrmann Acked-by: Hugh Dickins Cc: Michael Kerrisk Cc: Ryan Lortie Cc: Lennart Poettering Cc: Daniel Mack Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 1 + include/uapi/linux/memfd.h | 8 +++++ kernel/sys_ni.c | 1 + mm/shmem.c | 73 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+) create mode 100644 include/uapi/linux/memfd.h (limited to 'kernel') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index d1b4a119d4a5..028b78168d85 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -362,3 +362,4 @@ 353 i386 renameat2 sys_renameat2 354 i386 seccomp sys_seccomp 355 i386 getrandom sys_getrandom +356 i386 memfd_create sys_memfd_create diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 252c804bb1aa..ca2b9aa78c81 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -325,6 +325,7 @@ 316 common renameat2 sys_renameat2 317 common seccomp sys_seccomp 318 common getrandom sys_getrandom +319 common memfd_create sys_memfd_create # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 701daff5d899..15a069425cbf 
100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -802,6 +802,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags, asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); +asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h new file mode 100644 index 000000000000..534e364bda92 --- /dev/null +++ b/include/uapi/linux/memfd.h @@ -0,0 +1,8 @@ +#ifndef _UAPI_LINUX_MEMFD_H +#define _UAPI_LINUX_MEMFD_H + +/* flags for memfd_create(2) (unsigned int) */ +#define MFD_CLOEXEC 0x0001U +#define MFD_ALLOW_SEALING 0x0002U + +#endif /* _UAPI_LINUX_MEMFD_H */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 2904a2105914..1f79e3714533 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -197,6 +197,7 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create); /* performance counters: */ cond_syscall(sys_perf_event_open); diff --git a/mm/shmem.c b/mm/shmem.c index 8b43bb7a4efe..4a5498795a2b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,7 +66,9 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include +#include #include #include @@ -2732,6 +2734,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) shmem_show_mpol(seq, sbinfo->mpol); return 0; } + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS 
(MFD_CLOEXEC | MFD_ALLOW_SEALING) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct shmem_inode_info *info; + struct file *file; + int fd, error; + char *name; + long len; + + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + info = SHMEM_I(file_inode(file)); + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + if (flags & MFD_ALLOW_SEALING) + info->seals &= ~F_SEAL_SEAL; + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} + #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) -- cgit v1.2.3 From 8370edea81e321b8a976969753d6b2811e6d5ed6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:38 -0700 Subject: bin2c: move bin2c in scripts/basic This patch series does not do kernel signature verification yet. I plan to post another patch series for that. Now distributions are already signing PE/COFF bzImage with PKCS7 signature I plan to parse and verify those signatures. 
Primary goal of this patchset is to prepare groundwork so that kernel image can be signed and signatures be verified during kexec load. This should help with two things. - It should allow kexec/kdump on secureboot enabled machines. - In general it can help even without secureboot. By being able to verify kernel image signature in kexec, it should help with avoiding module signing restrictions. Matthew Garrett showed how to boot into a custom kernel, modify first kernel's memory and then jump back to old kernel and bypass any policy one wants to. This patch (of 15): Kexec wants to use bin2c and it wants to use it really early in the build process. See arch/x86/purgatory/ code in later patches. So move bin2c in scripts/basic so that it can be built very early and be usable by arch/x86/purgatory/ Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 2 +- scripts/.gitignore | 1 - scripts/Makefile | 1 - scripts/basic/.gitignore | 1 + scripts/basic/Makefile | 1 + scripts/basic/bin2c.c | 35 +++++++++++++++++++++++++++++++++++ scripts/bin2c.c | 36 ------------------------------------ 7 files changed, 38 insertions(+), 39 deletions(-) create mode 100644 scripts/basic/bin2c.c delete mode 100644 scripts/bin2c.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0026cf531769..dc5c77544fd6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -105,7 +105,7 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; 
echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) diff --git a/scripts/.gitignore b/scripts/.gitignore index fb070fa1038f..5ecfe93f2028 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -4,7 +4,6 @@ conmakehash kallsyms pnmtologo -bin2c unifdef ihex2fw recordmcount diff --git a/scripts/Makefile b/scripts/Makefile index 890df5c6adfb..72902b5f2721 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -13,7 +13,6 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash -hostprogs-$(CONFIG_IKCONFIG) += bin2c hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable hostprogs-$(CONFIG_ASN1) += asn1_compiler diff --git a/scripts/basic/.gitignore b/scripts/basic/.gitignore index a776371a3502..9528ec9e5adc 100644 --- a/scripts/basic/.gitignore +++ b/scripts/basic/.gitignore @@ -1 +1,2 @@ fixdep +bin2c diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile index 4fcef87bb875..afbc1cd69ac5 100644 --- a/scripts/basic/Makefile +++ b/scripts/basic/Makefile @@ -9,6 +9,7 @@ # fixdep: Used to generate dependency information during build process hostprogs-y := fixdep +hostprogs-$(CONFIG_IKCONFIG) += bin2c always := $(hostprogs-y) # fixdep is needed to compile other host programs diff --git a/scripts/basic/bin2c.c b/scripts/basic/bin2c.c new file mode 100644 index 000000000000..af187e695345 --- /dev/null +++ b/scripts/basic/bin2c.c @@ -0,0 +1,35 @@ +/* + * Unloved program to convert a binary on stdin to a C include on stdout + * + * Jan 1999 Matt Mackall + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. 
+ */ + +#include + +int main(int argc, char *argv[]) +{ + int ch, total = 0; + + if (argc > 1) + printf("const char %s[] %s=\n", + argv[1], argc > 2 ? argv[2] : ""); + + do { + printf("\t\""); + while ((ch = getchar()) != EOF) { + total++; + printf("\\x%02x", ch); + if (total % 16 == 0) + break; + } + printf("\"\n"); + } while (ch != EOF); + + if (argc > 1) + printf("\t;\n\nconst int %s_size = %d;\n", argv[1], total); + + return 0; +} diff --git a/scripts/bin2c.c b/scripts/bin2c.c deleted file mode 100644 index 96dd2bcbb407..000000000000 --- a/scripts/bin2c.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Unloved program to convert a binary on stdin to a C include on stdout - * - * Jan 1999 Matt Mackall - * - * This software may be used and distributed according to the terms - * of the GNU General Public License, incorporated herein by reference. - */ - -#include - -int main(int argc, char *argv[]) -{ - int ch, total=0; - - if (argc > 1) - printf("const char %s[] %s=\n", - argv[1], argc > 2 ? argv[2] : ""); - - do { - printf("\t\""); - while ((ch = getchar()) != EOF) - { - total++; - printf("\\x%02x",ch); - if (total % 16 == 0) - break; - } - printf("\"\n"); - } while (ch != EOF); - - if (argc > 1) - printf("\t;\n\nconst int %s_size = %d;\n", argv[1], total); - - return 0; -} -- cgit v1.2.3 From 7d3e2bca22feb1f4a624009ff6c15e6f724cb4e7 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:43 -0700 Subject: kexec: rename unusebale_pages to unusable_pages Let's use the more common "unusable". This patch was originally written and posted by Boris. I am including it in this patch series. Signed-off-by: Borislav Petkov Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. 
Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 2 +- kernel/kexec.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index a75641930049..d9bb0a57d208 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -100,7 +100,7 @@ struct kimage { struct list_head control_pages; struct list_head dest_pages; - struct list_head unuseable_pages; + struct list_head unusable_pages; /* Address of next control page to allocate for crash kernels. */ unsigned long control_page; diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c925884..c7cc2a00181c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -154,7 +154,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, INIT_LIST_HEAD(&image->dest_pages); /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); + INIT_LIST_HEAD(&image->unusable_pages); /* Read in the segments */ image->nr_segments = nr_segments; @@ -609,7 +609,7 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); + kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kimage *image) @@ -732,7 +732,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page cannot be used file it away */ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); + list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_pfn(page) << PAGE_SHIFT; -- cgit v1.2.3 From dabe78628dd886c4b71971d1d78f1cecc674b760 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:45 -0700 Subject: 
kexec: move segment verification code in a separate function Previously do_kimage_alloc() will allocate a kimage structure, copy segment list from user space and then do the segment list sanity verification. Break down this function in 3 parts. do_kimage_alloc_init() to do actual allocation and basic initialization of kimage structure. copy_user_segment_list() to copy segment list from user space and sanity_check_segment_list() to verify the sanity of segment list as passed by user space. In later patches, I need to only allocate kimage and not copy segment list from user space. So breaking down in smaller functions enables re-use of code at other places. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 182 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index c7cc2a00181c..062e5567750e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -125,45 +125,27 @@ static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, + unsigned long nr_segments, + struct kexec_segment __user *segments) { + int ret; size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply 
*/ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unusable_pages); /* Read in the segments */ image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } + ret = copy_from_user(image->segment, segments, segment_bytes); + if (ret) + ret = -EFAULT; + + return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; /* * Verify we have good destination addresses. The caller is @@ -185,9 +167,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; + return result; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; + return result; } /* Verify our destination addresses do not overlap. @@ -208,7 +190,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - goto out; + return result; } } @@ -220,18 +202,61 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; + return result; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. 
But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ - return result; + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + + return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; } static void kimage_free_page_list(struct list_head *list); @@ -244,10 +269,19 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, struct kimage *image; /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; + + result = copy_user_segment_list(image, nr_segments, segments); if (result) - goto out; + goto out_free_image; + + result = sanity_check_segment_list(image); + if (result) + goto out_free_image; /* * Find a location for the control code buffer, and add it @@ -259,22 +293,21 @@ static int kimage_normal_alloc(struct kimage **rimage, 
unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { pr_err("Could not allocate swap buffer\n"); - goto out_free; + goto out_free_control_pages; } *rimage = image; return 0; - -out_free: +out_free_control_pages: kimage_free_page_list(&image->control_pages); +out_free_image: kfree(image); -out: return result; } @@ -284,19 +317,17 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, { int result; struct kimage *image; - unsigned long i; - image = NULL; /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; - goto out; - } + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; /* Enable the special crash kernel control page * allocation policy. @@ -304,25 +335,13 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. 
- */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; + result = copy_user_segment_list(image, nr_segments, segments); + if (result) + goto out_free_image; - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out_free; - } + result = sanity_check_segment_list(image); + if (result) + goto out_free_image; /* * Find a location for the control code buffer, and add @@ -334,15 +353,14 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } *rimage = image; return 0; -out_free: +out_free_image: kfree(image); -out: return result; } -- cgit v1.2.3 From 255aedd90e3e804fb52e1a71636a3b22cf12f81b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:48 -0700 Subject: kexec: use common function for kimage_normal_alloc() and kimage_crash_alloc() kimage_normal_alloc() and kimage_crash_alloc() are doing lot of similar things and differ only little. So instead of having two separate functions create a common function kimage_alloc_init() and pass it the "flags" argument which tells whether it is normal kexec or kexec_on_panic. And this function should be able to deal with both the cases. This consolidation also helps later where we can use a common function kimage_file_alloc_init() to handle normal and crash cases for new file based kexec syscall. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. 
Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 105 +++++++++++++++++++-------------------------------------- 1 file changed, 34 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 062e5567750e..bfdda316697d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -261,12 +261,20 @@ static struct kimage *do_kimage_alloc_init(void) static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) { - int result; + int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_ON_CRASH; + + if (kexec_on_panic) { + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; + } /* Allocate and initialize a controlling structure */ image = do_kimage_alloc_init(); @@ -275,20 +283,26 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, image->start = entry; - result = copy_user_segment_list(image, nr_segments, segments); - if (result) + ret = copy_user_segment_list(image, nr_segments, segments); + if (ret) goto out_free_image; - result = sanity_check_segment_list(image); - if (result) + ret = sanity_check_segment_list(image); + if (ret) goto out_free_image; + /* Enable the special crash kernel control page allocation policy. 
*/ + if (kexec_on_panic) { + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be * counted as destination pages. */ - result = -ENOMEM; + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { @@ -296,10 +310,12 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, goto out_free_image; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free_control_pages; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; @@ -308,60 +324,7 @@ out_free_control_pages: kimage_free_page_list(&image->control_pages); out_free_image: kfree(image); - return result; -} - -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) -{ - int result; - struct kimage *image; - - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) - return -EADDRNOTAVAIL; - - /* Allocate and initialize a controlling structure */ - image = do_kimage_alloc_init(); - if (!image) - return -ENOMEM; - - image->start = entry; - - /* Enable the special crash kernel control page - * allocation policy. 
- */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; - - result = copy_user_segment_list(image, nr_segments, segments); - if (result) - goto out_free_image; - - result = sanity_check_segment_list(image); - if (result) - goto out_free_image; - - /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. - */ - result = -ENOMEM; - image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_PAGE_SIZE)); - if (!image->control_code_page) { - pr_err("Could not allocate control_code_buffer\n"); - goto out_free_image; - } - - *rimage = image; - return 0; - -out_free_image: - kfree(image); - return result; + return ret; } static int kimage_is_destination_range(struct kimage *image, @@ -1004,16 +967,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, /* Loading another kernel to reboot into */ if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); /* Loading another kernel to switch to if this one crashes */ else if (flags & KEXEC_ON_CRASH) { /* Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); crash_map_reserved_pages(); } if (result) -- cgit v1.2.3 From 8c86e70acead629aacb4afcd818add66bf6844d9 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:50 -0700 Subject: resource: provide new functions to walk through resources I have added two more functions to walk through resources. Currently walk_system_ram_range() deals with pfn and /proc/iomem can contain partial pages. 
By dealing in pfn, callback function loses the info that last page of a memory range is a partial page and not the full page. So I implemented walk_system_ram_res() which returns u64 values to callback functions and now it properly returns the start and end addresses. walk_system_ram_range() uses find_next_system_ram() to find the next ram resource. This in turn only travels through siblings of top level child and does not traverse through all the nodes of the resource tree. I also need another function where I can walk through all the resources, for example figure out where "GART" aperture is. Figure out where ACPI memory is. So I wrote another function walk_iomem_res() which walks through all /proc/iomem resources and returns matches as asked by caller. Caller can specify "name" of resource, start and end and flags. Got rid of find_next_system_ram_res() and instead implemented more generic find_next_iomem_res() which can be used to traverse top level children only based on an argument. Signed-off-by: Vivek Goyal Cc: Yinghai Lu Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Eric Biederman Cc: H. 
Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 6 +++ kernel/resource.c | 101 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 98 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5e3a906cc089..142ec544167c 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -237,6 +237,12 @@ extern int iomem_is_exclusive(u64 addr); extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); +extern int +walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)); +extern int +walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237ac32db..da14b8d09296 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only) { - struct resource *p = v; - (*pos)++; + /* Caller wants to traverse through siblings only */ + if (sibling_only) + return p->sibling; + if (p->child) return p->child; while (!p->sibling && p->parent) @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + return (void *)next_resource(p, false); +} + #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL 
= 5 }; @@ -322,16 +331,19 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true. */ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, + bool first_level_children_only) { resource_size_t start, end; struct resource *p; + bool sibling_only = false; BUG_ON(!res); @@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name) BUG_ON(start >= end); read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ + + if (first_level_children_only) { + p = iomem_resource.child; + sibling_only = true; + } else + p = &iomem_resource; + + while ((p = next_resource(p, sibling_only))) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) @@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name) if ((p->end >= start) && (p->start < end)) break; } + read_unlock(&resource_lock); if (!p) return -1; @@ -364,6 +383,70 @@ static int find_next_system_ram(struct resource *res, char *name) return 0; } +/* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. 
+ * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, name, false))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, "System RAM", true))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + /* * This function calls callback against all memory range of "System RAM" * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. 
@@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { + (find_next_iomem_res(&res, "System RAM", true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) -- cgit v1.2.3 From f0895685c7fd8c938c91a9d8a6f7c11f22df58d2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:55 -0700 Subject: kexec: new syscall kexec_file_load() declaration This is the new syscall kexec_file_load() declaration/interface. I have reserved the syscall number only for x86_64 so far. Other architectures (including i386) can reserve syscall number when they enable the support for this new syscall. Signed-off-by: Vivek Goyal Cc: Michael Kerrisk Cc: Borislav Petkov Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 4 ++++ kernel/kexec.c | 7 +++++++ kernel/sys_ni.c | 1 + 4 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index ca2b9aa78c81..35dd922727b9 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -326,6 +326,7 @@ 317 common seccomp sys_seccomp 318 common getrandom sys_getrandom 319 common memfd_create sys_memfd_create +320 common kexec_file_load sys_kexec_file_load # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 15a069425cbf..0f86d85a9ce4 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -317,6 +317,10 @@ asmlinkage long sys_restart_syscall(void); 
asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, unsigned long flags); +asmlinkage long sys_kexec_file_load(int kernel_fd, int initrd_fd, + unsigned long cmdline_len, + const char __user *cmdline_ptr, + unsigned long flags); asmlinkage long sys_exit(int error_code); asmlinkage long sys_exit_group(int error_code); diff --git a/kernel/kexec.c b/kernel/kexec.c index bfdda316697d..ec4386c1b94f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1058,6 +1058,13 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + return -ENOSYS; +} + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1f79e3714533..391d4ddb6f4b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_kexec_file_load); cond_syscall(sys_init_module); cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); -- cgit v1.2.3 From cb1052581e2bddd6096544f3f944f4e7fdad4c7f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:25:57 -0700 Subject: kexec: implementation of new syscall kexec_file_load Previous patch provided the interface definition and this patch provides implementation of new syscall. Previously segment list was prepared in user space. Now user space just passes kernel fd, initrd fd and command line and kernel will create a segment list internally. This patch contains generic part of the code. Actual segment preparation and loading is done by arch and image specific loader, which comes in next patch. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 45 ++++ include/linux/kexec.h | 53 ++++ include/uapi/linux/kexec.h | 11 + kernel/kexec.c | 483 ++++++++++++++++++++++++++++++++++++- 4 files changed, 587 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 679cef0791cd..c8875b5545e1 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -22,6 +22,10 @@ #include #include +static struct kexec_file_ops *kexec_file_loaders[] = { + NULL, +}; + static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.pud); @@ -283,3 +287,44 @@ void arch_crash_save_vmcoreinfo(void) (unsigned long)&_text - __START_KERNEL); } +/* arch-dependent functionality related to kexec file-based syscall */ + +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + int i, ret = -ENOEXEC; + struct kexec_file_ops *fops; + + for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { + fops = kexec_file_loaders[i]; + if (!fops || !fops->probe) + continue; + + ret = fops->probe(buf, buf_len); + if (!ret) { + image->fops = fops; + return ret; + } + } + + return ret; +} + +void *arch_kexec_kernel_image_load(struct kimage *image) +{ + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); +} + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 
0; + + return image->fops->cleanup(image); +} diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 66d56ac0f64c..8e80901e466f 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -121,13 +121,57 @@ struct kimage { #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; + /* If set, we are using file mode kexec syscall */ + unsigned int file_mode:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; #endif + + /* Additional fields for file based kexec syscall */ + void *kernel_buf; + unsigned long kernel_buf_len; + + void *initrd_buf; + unsigned long initrd_buf_len; + + char *cmdline_buf; + unsigned long cmdline_buf_len; + + /* File operations provided by image loader */ + struct kexec_file_ops *fops; + + /* Image loader handling the kernel can store a pointer here */ + void *image_loader_data; }; +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. + */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; +typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); +typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); +typedef int (kexec_cleanup_t)(struct kimage *image); + +struct kexec_file_ops { + kexec_probe_t *probe; + kexec_load_t *load; + kexec_cleanup_t *cleanup; +}; /* kexec interface functions */ extern void machine_kexec(struct kimage *image); @@ -138,6 +182,11 @@ extern asmlinkage long sys_kexec_load(unsigned long entry, struct kexec_segment __user *segments, unsigned long flags); extern int kernel_kexec(void); +extern int kexec_add_buffer(struct kimage *image, char *buffer, + unsigned long bufsz, 
unsigned long memsz, + unsigned long buf_align, unsigned long buf_min, + unsigned long buf_max, bool top_down, + unsigned long *load_addr); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); extern void crash_kexec(struct pt_regs *); @@ -188,6 +237,10 @@ extern int kexec_load_disabled; #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT) #endif +/* List of defined/legal kexec file flags */ +#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ + KEXEC_FILE_NO_INITRAMFS) + #define VMCOREINFO_BYTES (4096) #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index d6629d49a243..6925f5b42f89 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -13,6 +13,17 @@ #define KEXEC_PRESERVE_CONTEXT 0x00000002 #define KEXEC_ARCH_MASK 0xffff0000 +/* + * Kexec file load interface flags. + * KEXEC_FILE_UNLOAD : Unload already loaded kexec/kdump image. + * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. + * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd + * fd field. + */ +#define KEXEC_FILE_UNLOAD 0x00000001 +#define KEXEC_FILE_ON_CRASH 0x00000002 +#define KEXEC_FILE_NO_INITRAMFS 0x00000004 + /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. */ diff --git a/kernel/kexec.c b/kernel/kexec.c index ec4386c1b94f..9b46219254dd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. 
*/ +#define pr_fmt(fmt) "kexec: " fmt + #include #include #include @@ -327,6 +329,221 @@ out_free_image: return ret; } +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) +{ + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; + + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; + goto out; + } + + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; + goto out; + } + + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } + + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } + + if (bytes == 0) + break; + pos += bytes; + } + + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +/* + * Free up memory used by kernel, initrd, and comand line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. 
+ */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + 
+static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_PAGE_SIZE)); + if (!image->control_code_page) { + pr_err("Could not allocate control_code_buffer\n"); + goto out_free_post_load_bufs; + } + + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } + + *rimage = image; + return 0; +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); + kfree(image->image_loader_data); +out_free_image: + kfree(image); + return ret; +} + static int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) @@ -644,6 +861,16 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + kfree(image->image_loader_data); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. 
+ */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -772,10 +999,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -807,7 +1038,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -815,7 +1050,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -1062,7 +1300,72 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) { - return -ENOSYS; + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. 
Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; } void crash_kexec(struct pt_regs *regs) @@ -1620,6 +1923,176 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +static int __kexec_add_segment(struct kimage *image, char *buf, + unsigned long bufsz, unsigned long mem, + unsigned long memsz) +{ + struct kexec_segment *ksegment; + + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = buf; + ksegment->bufsz = bufsz; + ksegment->mem = mem; + ksegment->memsz = memsz; + image->nr_segments++; + + return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does 
not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. 
This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments - 1]; + *load_addr = ksegment->mem; + return 0; +} + + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. -- cgit v1.2.3 From 12db5562e0352986a265841638482b84f3a6899b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:04 -0700 Subject: kexec: load and relocate purgatory at kernel load time Load purgatory code in RAM and relocate it based on the location. 
Relocation code has been inspired by module relocation code and purgatory relocation code in kexec-tools. Also compute the checksums of loaded kexec segments and store them in purgatory. Arch independent code provides this functionality so that arch dependent bootloaders can make use of it. Helper functions are provided to get/set symbol values in purgatory which are used by bootloaders later to set things like stack and entry point of second kernel etc. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 2 + arch/ia64/Kconfig | 2 + arch/m68k/Kconfig | 2 + arch/mips/Kconfig | 2 + arch/powerpc/Kconfig | 2 + arch/s390/Kconfig | 2 + arch/sh/Kconfig | 2 + arch/tile/Kconfig | 2 + arch/x86/Kconfig | 2 + arch/x86/kernel/machine_kexec_64.c | 142 ++++++++++ include/linux/kexec.h | 33 +++ kernel/kexec.c | 544 ++++++++++++++++++++++++++++++++++++- 12 files changed, 736 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8e9dbcbcf5af..cacc8d5355b3 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2065,6 +2065,8 @@ config XIP_PHYS_ADDR config KEXEC bool "Kexec system call (EXPERIMENTAL)" depends on (!SMP || PM_SLEEP_SMP) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. 
It is like a reboot diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index c84c88bbbbd7..64aefb76bd69 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -549,6 +549,8 @@ source "drivers/sn/Kconfig" config KEXEC bool "kexec system call" depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 87b7c7581b1d..3ff8c9a25335 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -91,6 +91,8 @@ config MMU_SUN3 config KEXEC bool "kexec system call" depends on M68KCLASSIC + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 900c7e5333b6..df51e78a72cc 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2396,6 +2396,8 @@ source "kernel/Kconfig.preempt" config KEXEC bool "Kexec system call" + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4bc7b62fb4b6..a577609f8ed6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -399,6 +399,8 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config KEXEC bool "kexec system call" depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP)) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. 
It is like a reboot diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 05c78bb5f570..ab39ceb89ecf 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -48,6 +48,8 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config KEXEC def_bool y + select CRYPTO + select CRYPTO_SHA256 config AUDIT_ARCH def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index aa2df3eaeb29..453fa5c09550 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -595,6 +595,8 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" depends on SUPERH32 && MMU + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 7fcd492adbfc..a3ffe2dd4832 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -191,6 +191,8 @@ source "kernel/Kconfig.hz" config KEXEC bool "kexec system call" + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 98fe3df6df82..9558b9fcafbf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1583,6 +1583,8 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" select BUILD_BIN2C + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index c8875b5545e1..88404c440727 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. 
*/ +#define pr_fmt(fmt) "kexec: " fmt + #include #include #include @@ -328,3 +330,143 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return image->fops->cleanup(image); } + +/* + * Apply purgatory relocations. + * + * ehdr: Pointer to elf headers + * sechdrs: Pointer to section headers. + * relsec: section index of SHT_RELA section. + * + * TODO: Some of the code belongs to generic code. Move that in kexec.c. + */ +int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, + Elf64_Shdr *sechdrs, unsigned int relsec) +{ + unsigned int i; + Elf64_Rela *rel; + Elf64_Sym *sym; + void *location; + Elf64_Shdr *section, *symtabsec; + unsigned long address, sec_base, value; + const char *strtab, *name, *shstrtab; + + /* + * ->sh_offset has been modified to keep the pointer to section + * contents in memory + */ + rel = (void *)sechdrs[relsec].sh_offset; + + /* Section to which relocations apply */ + section = &sechdrs[sechdrs[relsec].sh_info]; + + pr_debug("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + + /* Associated symbol table */ + symtabsec = &sechdrs[sechdrs[relsec].sh_link]; + + /* String table */ + if (symtabsec->sh_link >= ehdr->e_shnum) { + /* Invalid strtab section number */ + pr_err("Invalid string table section index %d\n", + symtabsec->sh_link); + return -ENOEXEC; + } + + strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; + + /* section header string table */ + shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; + + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + + /* + * rel[i].r_offset contains byte offset from beginning + * of section to the storage unit affected. + * + * This is location to update (->sh_offset). This is temporary + * buffer where section is currently loaded. This will finally + * be loaded to a different address later, pointed to by + * ->sh_addr. kexec takes care of moving it + * (kexec_load_segment()). 
+ */ + location = (void *)(section->sh_offset + rel[i].r_offset); + + /* Final address of the location */ + address = section->sh_addr + rel[i].r_offset; + + /* + * rel[i].r_info contains information about symbol table index + * w.r.t which relocation must be made and type of relocation + * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get + * these respectively. + */ + sym = (Elf64_Sym *)symtabsec->sh_offset + + ELF64_R_SYM(rel[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", + name, sym->st_info, sym->st_shndx, sym->st_value, + sym->st_size); + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = sechdrs[sym->st_shndx].sh_addr; + + value = sym->st_value; + value += sec_base; + value += rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)location = value; + break; + case R_X86_64_32: + *(u32 *)location = value; + if (value != *(u32 *)location) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)location = value; + if ((s64)value != *(s32 *)location) + goto overflow; + break; + case R_X86_64_PC32: + value -= (u64)address; + *(u32 *)location = value; + break; + default: + pr_err("Unknown rela relocation: %llu\n", + ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + pr_err("Overflow in relocation type %d value 0x%lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), value); + return -ENOEXEC; +} diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 
8e80901e466f..84f09e9eca26 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -10,6 +10,7 @@ #include #include #include +#include #include /* Verify architecture specific macros are defined */ @@ -95,6 +96,27 @@ struct compat_kexec_segment { }; #endif +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +struct purgatory_info { + /* Pointer to elf header of read only purgatory */ + Elf_Ehdr *ehdr; + + /* Pointer to purgatory sechdrs which are modifiable */ + Elf_Shdr *sechdrs; + /* + * Temporary buffer location where purgatory is loaded and relocated + * This memory can be freed post image load + */ + void *purgatory_buf; + + /* Address where purgatory is finally loaded and is executed from */ + unsigned long purgatory_load_addr; +}; + struct kimage { kimage_entry_t head; kimage_entry_t *entry; @@ -143,6 +165,9 @@ struct kimage { /* Image loader handling the kernel can store a pointer here */ void *image_loader_data; + + /* Information for loading purgatory */ + struct purgatory_info purgatory_info; }; /* @@ -189,6 +214,14 @@ extern int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long *load_addr); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +extern int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr); +extern int kexec_purgatory_get_set_symbol(struct kimage *image, + const char *name, void *buf, + unsigned int size, bool get_value); +extern void *kexec_purgatory_get_symbol_addr(struct kimage *image, + const char *name); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); diff --git a/kernel/kexec.c b/kernel/kexec.c index 9b46219254dd..669e331aa9ec 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -42,6 +42,9 @@ #include #include +#include +#include + /* Per cpu memory for storing cpu states in case 
of system crash. */ note_buf_t __percpu *crash_notes; @@ -54,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -404,6 +416,24 @@ void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { } +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + /* * Free up memory used by kernel, initrd, and comand line. 
This is temporary * memory allocation which is not needed any more after these buffers have @@ -411,6 +441,8 @@ void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) */ static void kimage_file_post_load_cleanup(struct kimage *image) { + struct purgatory_info *pi = &image->purgatory_info; + vfree(image->kernel_buf); image->kernel_buf = NULL; @@ -420,6 +452,12 @@ static void kimage_file_post_load_cleanup(struct kimage *image) kfree(image->cmdline_buf); image->cmdline_buf = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + /* See if architecture has anything to cleanup post load */ arch_kimage_file_post_load_cleanup(image); } @@ -1105,7 +1143,7 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + buf += mchunk; mbytes -= mchunk; } out: @@ -1340,6 +1378,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + for (i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; @@ -2092,6 +2134,506 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, return 0; } +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = 
KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. 
Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. 
+ */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + 
+ for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. 
+ */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective archicture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr 
*sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. 
+ */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} /* * Move into place and start executing a preloaded standalone -- cgit v1.2.3 From 27f48d3e633be23656a097baa3be336e04a82d84 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:06 -0700 Subject: kexec-bzImage64: support for loading bzImage using 64bit entry This is loader specific code which can load bzImage and set it up for 64bit entry. This does not take care of 32bit entry or real mode entry. 32bit mode entry can be implemented if somebody needs it. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. 
Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kexec-bzimage64.h | 6 + arch/x86/include/asm/kexec.h | 21 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/kexec-bzimage64.c | 375 +++++++++++++++++++++++++++++++++ arch/x86/kernel/machine_kexec_64.c | 5 +- include/linux/kexec.h | 2 +- kernel/kexec.c | 11 +- 7 files changed, 415 insertions(+), 6 deletions(-) create mode 100644 arch/x86/include/asm/kexec-bzimage64.h create mode 100644 arch/x86/kernel/kexec-bzimage64.c (limited to 'kernel') diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h new file mode 100644 index 000000000000..d1b5d194e31d --- /dev/null +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -0,0 +1,6 @@ +#ifndef _ASM_KEXEC_BZIMAGE64_H +#define _ASM_KEXEC_BZIMAGE64_H + +extern struct kexec_file_ops kexec_bzImage64_ops; + +#endif /* _ASM_KEXE_BZIMAGE64_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 17483a492f18..0dfccced4edf 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -23,6 +23,7 @@ #include #include +#include /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. 
@@ -161,6 +162,26 @@ struct kimage_arch { pmd_t *pmd; pte_t *pte; }; + +struct kexec_entry64_regs { + uint64_t rax; + uint64_t rbx; + uint64_t rcx; + uint64_t rdx; + uint64_t rsi; + uint64_t rdi; + uint64_t rsp; + uint64_t rbp; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint64_t rip; +}; #endif typedef void crash_vmclear_fn(void); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bde3993624f1..b5ea75c4a4b4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -118,4 +118,5 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o + obj-$(CONFIG_KEXEC) += kexec-bzimage64.o endif diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c new file mode 100644 index 000000000000..bcedd100192f --- /dev/null +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -0,0 +1,375 @@ +/* + * Kexec bzImage loader + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#define pr_fmt(fmt) "kexec-bzImage64: " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Defines lowest physical address for various segments. Not sure where + * exactly these limits came from. Current bzimage64 loader in kexec-tools + * uses these so I am retaining it. It can be changed over time as we gain + * more insight. + */ +#define MIN_PURGATORY_ADDR 0x3000 +#define MIN_BOOTPARAM_ADDR 0x3000 +#define MIN_KERNEL_LOAD_ADDR 0x100000 +#define MIN_INITRD_LOAD_ADDR 0x1000000 + +/* + * This is a place holder for all boot loader specific data structure which + * gets allocated in one call but gets freed much later during cleanup + * time. Right now there is only one field but it can grow as need be. 
+ */ +struct bzimage64_data { + /* + * Temporary buffer to hold bootparams buffer. This should be + * freed once the bootparam segment has been loaded. + */ + void *bootparams_buf; +}; + +static int setup_initrd(struct boot_params *params, + unsigned long initrd_load_addr, unsigned long initrd_len) +{ + params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL; + params->hdr.ramdisk_size = initrd_len & 0xffffffffUL; + + params->ext_ramdisk_image = initrd_load_addr >> 32; + params->ext_ramdisk_size = initrd_len >> 32; + + return 0; +} + +static int setup_cmdline(struct boot_params *params, + unsigned long bootparams_load_addr, + unsigned long cmdline_offset, char *cmdline, + unsigned long cmdline_len) +{ + char *cmdline_ptr = ((char *)params) + cmdline_offset; + unsigned long cmdline_ptr_phys; + uint32_t cmdline_low_32, cmdline_ext_32; + + memcpy(cmdline_ptr, cmdline, cmdline_len); + cmdline_ptr[cmdline_len - 1] = '\0'; + + cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; + cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; + cmdline_ext_32 = cmdline_ptr_phys >> 32; + + params->hdr.cmd_line_ptr = cmdline_low_32; + if (cmdline_ext_32) + params->ext_cmd_line_ptr = cmdline_ext_32; + + return 0; +} + +static int setup_memory_map_entries(struct boot_params *params) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = e820_saved.nr_map; + + /* TODO: Pass entries more than E820MAX in bootparams setup data */ + if (nr_e820_entries > E820MAX) + nr_e820_entries = E820MAX; + + params->e820_entries = nr_e820_entries; + memcpy(&params->e820_map, &e820_saved.map, + nr_e820_entries * sizeof(struct e820entry)); + + return 0; +} + +static int setup_boot_parameters(struct boot_params *params) +{ + unsigned int nr_e820_entries; + unsigned long long mem_k, start, end; + int i; + + /* Get subarch from existing bootparams */ + params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; + + /* Copying screen_info will do? 
*/ + memcpy(&params->screen_info, &boot_params.screen_info, + sizeof(struct screen_info)); + + /* Fill in memsize later */ + params->screen_info.ext_mem_k = 0; + params->alt_mem_k = 0; + + /* Default APM info */ + memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info)); + + /* Default drive info */ + memset(&params->hd0_info, 0, sizeof(params->hd0_info)); + memset(&params->hd1_info, 0, sizeof(params->hd1_info)); + + /* Default sysdesc table */ + params->sys_desc_table.length = 0; + + setup_memory_map_entries(params); + nr_e820_entries = params->e820_entries; + + for (i = 0; i < nr_e820_entries; i++) { + if (params->e820_map[i].type != E820_RAM) + continue; + start = params->e820_map[i].addr; + end = params->e820_map[i].addr + params->e820_map[i].size - 1; + + if ((start <= 0x100000) && end > 0x100000) { + mem_k = (end >> 10) - (0x100000 >> 10); + params->screen_info.ext_mem_k = mem_k; + params->alt_mem_k = mem_k; + if (mem_k > 0xfc00) + params->screen_info.ext_mem_k = 0xfc00; /* 64M*/ + if (mem_k > 0xffffffff) + params->alt_mem_k = 0xffffffff; + } + } + + /* Setup EDD info */ + memcpy(params->eddbuf, boot_params.eddbuf, + EDDMAXNR * sizeof(struct edd_info)); + params->eddbuf_entries = boot_params.eddbuf_entries; + + memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, + EDD_MBR_SIG_MAX * sizeof(unsigned int)); + + return 0; +} + +int bzImage64_probe(const char *buf, unsigned long len) +{ + int ret = -ENOEXEC; + struct setup_header *header; + + /* kernel should be atleast two sectors long */ + if (len < 2 * 512) { + pr_err("File is too short to be a bzImage\n"); + return ret; + } + + header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr)); + if (memcmp((char *)&header->header, "HdrS", 4) != 0) { + pr_err("Not a bzImage\n"); + return ret; + } + + if (header->boot_flag != 0xAA55) { + pr_err("No x86 boot sector present\n"); + return ret; + } + + if (header->version < 0x020C) { + pr_err("Must be at least protocol version 2.12\n"); + return ret; 
+ } + + if (!(header->loadflags & LOADED_HIGH)) { + pr_err("zImage not a bzImage\n"); + return ret; + } + + if (!(header->xloadflags & XLF_KERNEL_64)) { + pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) { + pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n"); + return ret; + } + + /* I've got a bzImage */ + pr_debug("It's a relocatable bzImage64\n"); + ret = 0; + + return ret; +} + +void *bzImage64_load(struct kimage *image, char *kernel, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + + struct setup_header *header; + int setup_sects, kern16_size, ret = 0; + unsigned long setup_header_size, params_cmdline_sz; + struct boot_params *params; + unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; + unsigned long purgatory_load_addr; + unsigned long kernel_bufsz, kernel_memsz, kernel_align; + char *kernel_buf; + struct bzimage64_data *ldata; + struct kexec_entry64_regs regs64; + void *stack; + unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); + + header = (struct setup_header *)(kernel + setup_hdr_offset); + setup_sects = header->setup_sects; + if (setup_sects == 0) + setup_sects = 4; + + kern16_size = (setup_sects + 1) * 512; + if (kernel_len < kern16_size) { + pr_err("bzImage truncated\n"); + return ERR_PTR(-ENOEXEC); + } + + if (cmdline_len > header->cmdline_size) { + pr_err("Kernel command line too long\n"); + return ERR_PTR(-EINVAL); + } + + /* + * Load purgatory. For 64bit entry point, purgatory code can be + * anywhere. 
+ */ + ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1, + &purgatory_load_addr); + if (ret) { + pr_err("Loading purgatory failed\n"); + return ERR_PTR(ret); + } + + pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); + + /* Load Bootparams and cmdline */ + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len; + params = kzalloc(params_cmdline_sz, GFP_KERNEL); + if (!params) + return ERR_PTR(-ENOMEM); + + /* Copy setup header onto bootparams. Documentation/x86/boot.txt */ + setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; + + /* Is there a limit on setup header size? */ + memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size); + + ret = kexec_add_buffer(image, (char *)params, params_cmdline_sz, + params_cmdline_sz, 16, MIN_BOOTPARAM_ADDR, + ULONG_MAX, 1, &bootparam_load_addr); + if (ret) + goto out_free_params; + pr_debug("Loaded boot_param and command line at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + bootparam_load_addr, params_cmdline_sz, params_cmdline_sz); + + /* Load kernel */ + kernel_buf = kernel + kern16_size; + kernel_bufsz = kernel_len - kern16_size; + kernel_memsz = PAGE_ALIGN(header->init_size); + kernel_align = header->kernel_alignment; + + ret = kexec_add_buffer(image, kernel_buf, + kernel_bufsz, kernel_memsz, kernel_align, + MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, + &kernel_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_load_addr, kernel_memsz, kernel_memsz); + + /* Load initrd high */ + if (initrd) { + ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, + PAGE_SIZE, MIN_INITRD_LOAD_ADDR, + ULONG_MAX, 1, &initrd_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, initrd_len, initrd_len); + + setup_initrd(params, initrd_load_addr, initrd_len); + } + + setup_cmdline(params, bootparam_load_addr, sizeof(struct boot_params), + cmdline, 
cmdline_len); + + /* bootloader info. Do we need a separate ID for kexec kernel loader? */ + params->hdr.type_of_loader = 0x0D << 4; + params->hdr.loadflags = 0; + + /* Setup purgatory regs for entry */ + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64, + sizeof(regs64), 1); + if (ret) + goto out_free_params; + + regs64.rbx = 0; /* Bootstrap Processor */ + regs64.rsi = bootparam_load_addr; + regs64.rip = kernel_load_addr + 0x200; + stack = kexec_purgatory_get_symbol_addr(image, "stack_end"); + if (IS_ERR(stack)) { + pr_err("Could not find address of symbol stack_end\n"); + ret = -EINVAL; + goto out_free_params; + } + + regs64.rsp = (unsigned long)stack; + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64, + sizeof(regs64), 0); + if (ret) + goto out_free_params; + + setup_boot_parameters(params); + + /* Allocate loader specific data */ + ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); + if (!ldata) { + ret = -ENOMEM; + goto out_free_params; + } + + /* + * Store pointer to params so that it could be freed after loading + * params segment has been loaded and contents have been copied + * somewhere else. 
+ */ + ldata->bootparams_buf = params; + return ldata; + +out_free_params: + kfree(params); + return ERR_PTR(ret); +} + +/* This cleanup function is called after various segments have been loaded */ +int bzImage64_cleanup(void *loader_data) +{ + struct bzimage64_data *ldata = loader_data; + + if (!ldata) + return 0; + + kfree(ldata->bootparams_buf); + ldata->bootparams_buf = NULL; + + return 0; +} + +struct kexec_file_ops kexec_bzImage64_ops = { + .probe = bzImage64_probe, + .load = bzImage64_load, + .cleanup = bzImage64_cleanup, +}; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 88404c440727..18d0f9e0b6da 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -23,9 +23,10 @@ #include #include #include +#include static struct kexec_file_ops *kexec_file_loaders[] = { - NULL, + &kexec_bzImage64_ops, }; static void free_transition_pgtable(struct kimage *image) @@ -328,7 +329,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) if (!image->fops || !image->fops->cleanup) return 0; - return image->fops->cleanup(image); + return image->fops->cleanup(image->image_loader_data); } /* diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 84f09e9eca26..9481703b0e7a 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -190,7 +190,7 @@ typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, unsigned long kernel_len, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len); -typedef int (kexec_cleanup_t)(struct kimage *image); +typedef int (kexec_cleanup_t)(void *loader_data); struct kexec_file_ops { kexec_probe_t *probe; diff --git a/kernel/kexec.c b/kernel/kexec.c index 669e331aa9ec..0926f2a3ed03 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -460,6 +460,14 @@ static void kimage_file_post_load_cleanup(struct kimage *image) /* See if architecture has anything to cleanup post load */ 
arch_kimage_file_post_load_cleanup(image); + + /* + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. + */ + kfree(image->image_loader_data); + image->image_loader_data = NULL; } /* @@ -576,7 +584,6 @@ out_free_control_pages: kimage_free_page_list(&image->control_pages); out_free_post_load_bufs: kimage_file_post_load_cleanup(image); - kfree(image->image_loader_data); out_free_image: kfree(image); return ret; @@ -900,8 +907,6 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); - kfree(image->image_loader_data); - /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. -- cgit v1.2.3 From dd5f726076cc7639d9713b334c8c133f77c6757a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:09 -0700 Subject: kexec: support for kexec on panic using new system call This patch adds support for loading a kexec on panic (kdump) kernel using new system call. It prepares ELF headers for memory areas to be dumped and for saved cpu registers. Also prepares the memory map for second kernel and limits its boot to reserved areas only. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. 
Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/crash.h | 9 + arch/x86/include/asm/kexec.h | 30 +- arch/x86/kernel/crash.c | 563 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/kexec-bzimage64.c | 55 +++- arch/x86/kernel/machine_kexec_64.c | 40 +++ arch/x86/purgatory/entry64.S | 6 +- kernel/kexec.c | 46 ++- 7 files changed, 724 insertions(+), 25 deletions(-) create mode 100644 arch/x86/include/asm/crash.h (limited to 'kernel') diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h new file mode 100644 index 000000000000..f498411f2500 --- /dev/null +++ b/arch/x86/include/asm/crash.h @@ -0,0 +1,9 @@ +#ifndef _ASM_X86_CRASH_H +#define _ASM_X86_CRASH_H + +int crash_load_segments(struct kimage *image); +int crash_copy_backup_region(struct kimage *image); +int crash_setup_memmap_entries(struct kimage *image, + struct boot_params *params); + +#endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 0dfccced4edf..d2434c1cad05 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -25,6 +25,8 @@ #include #include +struct kimage; + /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. * I.e. Maximum page that is mapped directly into kernel memory, @@ -62,6 +64,10 @@ # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif +/* Memory to backup during crash kdump */ +#define KEXEC_BACKUP_SRC_START (0UL) +#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ + /* * CPU does not save ss and sp on stack if execution is already * running in kernel mode at the time of NMI occurrence. 
This code @@ -161,17 +167,35 @@ struct kimage_arch { pud_t *pud; pmd_t *pmd; pte_t *pte; + /* Details of backup region */ + unsigned long backup_src_start; + unsigned long backup_src_sz; + + /* Physical address of backup segment */ + unsigned long backup_load_addr; + + /* Core ELF header buffer */ + void *elf_headers; + unsigned long elf_headers_sz; + unsigned long elf_load_addr; }; +#endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_64 +/* + * Number of elements and order of elements in this structure should match + * with the ones in arch/x86/purgatory/entry64.S. If you make a change here + * make an appropriate change in purgatory too. + */ struct kexec_entry64_regs { uint64_t rax; - uint64_t rbx; uint64_t rcx; uint64_t rdx; - uint64_t rsi; - uint64_t rdi; + uint64_t rbx; uint64_t rsp; uint64_t rbp; + uint64_t rsi; + uint64_t rdi; uint64_t r8; uint64_t r9; uint64_t r10; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 507de8066594..0553a34fa0df 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -4,9 +4,14 @@ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) * * Copyright (C) IBM Corporation, 2004. All rights reserved. + * Copyright (C) Red Hat Inc., 2014. All rights reserved. 
+ * Authors: + * Vivek Goyal * */ +#define pr_fmt(fmt) "kexec: " fmt + #include #include #include @@ -16,6 +21,7 @@ #include #include #include +#include #include #include @@ -28,6 +34,45 @@ #include #include +/* Alignment required for elf header segment */ +#define ELF_CORE_HEADER_ALIGN 4096 + +/* This primarily represents number of split ranges due to exclusion */ +#define CRASH_MAX_RANGES 16 + +struct crash_mem_range { + u64 start, end; +}; + +struct crash_mem { + unsigned int nr_ranges; + struct crash_mem_range ranges[CRASH_MAX_RANGES]; +}; + +/* Misc data about ram ranges needed to prepare elf headers */ +struct crash_elf_data { + struct kimage *image; + /* + * Total number of ram ranges we have after various adjustments for + * GART, crash reserved region etc. + */ + unsigned int max_nr_ranges; + unsigned long gart_start, gart_end; + + /* Pointer to elf header */ + void *ehdr; + /* Pointer to next phdr */ + void *bufp; + struct crash_mem mem; +}; + +/* Used while preparing memory map entries for second kernel */ +struct crash_memmap_data { + struct boot_params *params; + /* Type of memory */ + unsigned int type; +}; + int in_crash_kexec; /* @@ -39,6 +84,7 @@ int in_crash_kexec; */ crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); +unsigned long crash_zero_bytes; static inline void cpu_crash_vmclear_loaded_vmcss(void) { @@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif crash_save_cpu(regs, safe_smp_processor_id()); } + +#ifdef CONFIG_X86_64 + +static int get_nr_ram_ranges_callback(unsigned long start_pfn, + unsigned long nr_pfn, void *arg) +{ + int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +static int get_gart_ranges_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + + ced->gart_start = start; + ced->gart_end = end; + + /* Not expecting more than 1 gart aperture */ + return 1; +} + + +/* Gather all the required information to 
prepare elf headers for ram regions */ +static void fill_up_crash_elf_data(struct crash_elf_data *ced, + struct kimage *image) +{ + unsigned int nr_ranges = 0; + + ced->image = image; + + walk_system_ram_range(0, -1, &nr_ranges, + get_nr_ram_ranges_callback); + + ced->max_nr_ranges = nr_ranges; + + /* + * We don't create ELF headers for GART aperture as an attempt + * to dump this memory in second kernel leads to hang/crash. + * If gart aperture is present, one needs to exclude that region + * and that could lead to need of extra phdr. + */ + walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, + ced, get_gart_ranges_callback); + + /* + * If we have gart region, excluding that could potentially split + * a memory range, resulting in extra header. Account for that. + */ + if (ced->gart_end) + ced->max_nr_ranges++; + + /* Exclusion of crash region could split memory ranges */ + ced->max_nr_ranges++; + + /* If crashk_low_res is not 0, another range split possible */ + if (crashk_low_res.end != 0) + ced->max_nr_ranges++; +} + +static int exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart 
- 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happend, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == CRASH_MAX_RANGES - 1) { + pr_err("Too many crash ranges after split\n"); + return -ENOMEM; + } + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +/* + * Look for any unwanted ranges between mstart, mend and remove them. This + * might lead to split and split ranges are put in ced->mem.ranges[] array + */ +static int elf_header_exclude_ranges(struct crash_elf_data *ced, + unsigned long long mstart, unsigned long long mend) +{ + struct crash_mem *cmem = &ced->mem; + int ret = 0; + + memset(cmem->ranges, 0, sizeof(cmem->ranges)); + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude crashkernel region */ + ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + return ret; + + ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + return ret; + + /* Exclude GART region */ + if (ced->gart_end) { + ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); + if (ret) + return ret; + } + + return ret; +} + +static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long mstart, mend; + struct kimage *image = ced->image; + struct crash_mem *cmem; + int ret, i; + + ehdr = ced->ehdr; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(ced, start, end); 
+ if (ret) + return ret; + + /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */ + cmem = &ced->mem; + + for (i = 0; i < cmem->nr_ranges; i++) { + mstart = cmem->ranges[i].start; + mend = cmem->ranges[i].end; + + phdr = ced->bufp; + ced->bufp += sizeof(Elf64_Phdr); + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + /* + * If a range matches backup region, adjust offset to backup + * segment. + */ + if (mstart == image->arch.backup_src_start && + (mend - mstart + 1) == image->arch.backup_src_sz) + phdr->p_offset = image->arch.backup_load_addr; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + return ret; +} + +static int prepare_elf64_headers(struct crash_elf_data *ced, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf, *bufp; + unsigned int cpu; + unsigned long long notes_addr; + int ret; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += ced->max_nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area on x86_64 (ffffffff80000000 - ffffffffa0000000). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. 
+ */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + bufp = buf; + ehdr = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + (ehdr->e_phnum)++; + +#ifdef CONFIG_X86_64 + /* Prepare PT_LOAD type program header for kernel text region */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + (ehdr->e_phnum)++; +#endif + + /* Prepare PT_LOAD headers for system ram chunks. 
*/ + ced->ehdr = ehdr; + ced->bufp = bufp; + ret = walk_system_ram_res(0, -1, ced, + prepare_elf64_ram_headers_callback); + if (ret < 0) + return ret; + + *addr = buf; + *sz = elf_sz; + return 0; +} + +/* Prepare elf headers. Return addr and size */ +static int prepare_elf_headers(struct kimage *image, void **addr, + unsigned long *sz) +{ + struct crash_elf_data *ced; + int ret; + + ced = kzalloc(sizeof(*ced), GFP_KERNEL); + if (!ced) + return -ENOMEM; + + fill_up_crash_elf_data(ced, image); + + /* By default prepare 64bit headers */ + ret = prepare_elf64_headers(ced, addr, sz); + kfree(ced); + return ret; +} + +static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = params->e820_entries; + if (nr_e820_entries >= E820MAX) + return 1; + + memcpy(&params->e820_map[nr_e820_entries], entry, + sizeof(struct e820entry)); + params->e820_entries++; + return 0; +} + +static int memmap_entry_callback(u64 start, u64 end, void *arg) +{ + struct crash_memmap_data *cmd = arg; + struct boot_params *params = cmd->params; + struct e820entry ei; + + ei.addr = start; + ei.size = end - start + 1; + ei.type = cmd->type; + add_e820_entry(params, &ei); + + return 0; +} + +static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, + unsigned long long mstart, + unsigned long long mend) +{ + unsigned long start, end; + int ret = 0; + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude Backup region */ + start = image->arch.backup_load_addr; + end = start + image->arch.backup_src_sz - 1; + ret = exclude_mem_range(cmem, start, end); + if (ret) + return ret; + + /* Exclude elf header region */ + start = image->arch.elf_load_addr; + end = start + image->arch.elf_headers_sz - 1; + return exclude_mem_range(cmem, start, end); +} + +/* Prepare memory map for crash dump kernel */ +int crash_setup_memmap_entries(struct kimage *image, struct boot_params 
*params) +{ + int i, ret = 0; + unsigned long flags; + struct e820entry ei; + struct crash_memmap_data cmd; + struct crash_mem *cmem; + + cmem = vzalloc(sizeof(struct crash_mem)); + if (!cmem) + return -ENOMEM; + + memset(&cmd, 0, sizeof(struct crash_memmap_data)); + cmd.params = params; + + /* Add first 640K segment */ + ei.addr = image->arch.backup_src_start; + ei.size = image->arch.backup_src_sz; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + + /* Add ACPI tables */ + cmd.type = E820_ACPI; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add ACPI Non-volatile Storage */ + cmd.type = E820_NVS; + walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add crashk_low_res region */ + if (crashk_low_res.end) { + ei.addr = crashk_low_res.start; + ei.size = crashk_low_res.end - crashk_low_res.start + 1; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + + /* Exclude some ranges from crashk_res and add rest to memmap */ + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, + crashk_res.end); + if (ret) + goto out; + + for (i = 0; i < cmem->nr_ranges; i++) { + ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; + + /* If entry is less than a page, skip it */ + if (ei.size < PAGE_SIZE) + continue; + ei.addr = cmem->ranges[i].start; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + +out: + vfree(cmem); + return ret; +} + +static int determine_backup_region(u64 start, u64 end, void *arg) +{ + struct kimage *image = arg; + + image->arch.backup_src_start = start; + image->arch.backup_src_sz = end - start + 1; + + /* Expecting only one range for backup region */ + return 1; +} + +int crash_load_segments(struct kimage *image) +{ + unsigned long src_start, src_sz, elf_sz; + void *elf_addr; + int ret; + + /* + * Determine and load a segment for backup area. 
First 640K RAM + * region is backup source + */ + + ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, + image, determine_backup_region); + + /* Zero or postive return values are ok */ + if (ret < 0) + return ret; + + src_start = image->arch.backup_src_start; + src_sz = image->arch.backup_src_sz; + + /* Add backup segment. */ + if (src_sz) { + /* + * Ideally there is no source for backup segment. This is + * copied in purgatory after crash. Just add a zero filled + * segment for now to make sure checksum logic works fine. + */ + ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, + sizeof(crash_zero_bytes), src_sz, + PAGE_SIZE, 0, -1, 0, + &image->arch.backup_load_addr); + if (ret) + return ret; + pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", + image->arch.backup_load_addr, src_start, src_sz); + } + + /* Prepare elf headers and add a segment */ + ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + if (ret) + return ret; + + image->arch.elf_headers = elf_addr; + image->arch.elf_headers_sz = elf_sz; + + ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, + ELF_CORE_HEADER_ALIGN, 0, -1, 0, + &image->arch.elf_load_addr); + if (ret) { + vfree((void *)image->arch.elf_headers); + return ret; + } + pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->arch.elf_load_addr, elf_sz, elf_sz); + + return ret; +} + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index bcedd100192f..a8e646458a10 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -21,6 +21,9 @@ #include #include +#include + +#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ /* * Defines lowest physical address for various segments. 
Not sure where @@ -58,18 +61,24 @@ static int setup_initrd(struct boot_params *params, return 0; } -static int setup_cmdline(struct boot_params *params, +static int setup_cmdline(struct kimage *image, struct boot_params *params, unsigned long bootparams_load_addr, unsigned long cmdline_offset, char *cmdline, unsigned long cmdline_len) { char *cmdline_ptr = ((char *)params) + cmdline_offset; - unsigned long cmdline_ptr_phys; + unsigned long cmdline_ptr_phys, len; uint32_t cmdline_low_32, cmdline_ext_32; memcpy(cmdline_ptr, cmdline, cmdline_len); + if (image->type == KEXEC_TYPE_CRASH) { + len = sprintf(cmdline_ptr + cmdline_len - 1, + " elfcorehdr=0x%lx", image->arch.elf_load_addr); + cmdline_len += len; + } cmdline_ptr[cmdline_len - 1] = '\0'; + pr_debug("Final command line is: %s\n", cmdline_ptr); cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; cmdline_ext_32 = cmdline_ptr_phys >> 32; @@ -98,11 +107,12 @@ static int setup_memory_map_entries(struct boot_params *params) return 0; } -static int setup_boot_parameters(struct boot_params *params) +static int setup_boot_parameters(struct kimage *image, + struct boot_params *params) { unsigned int nr_e820_entries; unsigned long long mem_k, start, end; - int i; + int i, ret = 0; /* Get subarch from existing bootparams */ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; @@ -125,7 +135,13 @@ static int setup_boot_parameters(struct boot_params *params) /* Default sysdesc table */ params->sys_desc_table.length = 0; - setup_memory_map_entries(params); + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_setup_memmap_entries(image, params); + if (ret) + return ret; + } else + setup_memory_map_entries(params); + nr_e820_entries = params->e820_entries; for (i = 0; i < nr_e820_entries; i++) { @@ -153,7 +169,7 @@ static int setup_boot_parameters(struct boot_params *params) memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, EDD_MBR_SIG_MAX * 
sizeof(unsigned int)); - return 0; + return ret; } int bzImage64_probe(const char *buf, unsigned long len) @@ -240,6 +256,22 @@ void *bzImage64_load(struct kimage *image, char *kernel, return ERR_PTR(-EINVAL); } + /* + * In case of crash dump, we will append elfcorehdr= to + * command line. Make sure it does not overflow + */ + if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) { + pr_debug("Appending elfcorehdr= to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } + + /* Allocate and load backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_load_segments(image); + if (ret) + return ERR_PTR(ret); + } + /* * Load purgatory. For 64bit entry point, purgatory code can be * anywhere. @@ -254,7 +286,8 @@ void *bzImage64_load(struct kimage *image, char *kernel, pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); /* Load Bootparams and cmdline */ - params_cmdline_sz = sizeof(struct boot_params) + cmdline_len; + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + + MAX_ELFCOREHDR_STR_LEN; params = kzalloc(params_cmdline_sz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); @@ -303,8 +336,8 @@ void *bzImage64_load(struct kimage *image, char *kernel, setup_initrd(params, initrd_load_addr, initrd_len); } - setup_cmdline(params, bootparam_load_addr, sizeof(struct boot_params), - cmdline, cmdline_len); + setup_cmdline(image, params, bootparam_load_addr, + sizeof(struct boot_params), cmdline, cmdline_len); /* bootloader info. Do we need a separate ID for kexec kernel loader? 
*/ params->hdr.type_of_loader = 0x0D << 4; @@ -332,7 +365,9 @@ void *bzImage64_load(struct kimage *image, char *kernel, if (ret) goto out_free_params; - setup_boot_parameters(params); + ret = setup_boot_parameters(image, params); + if (ret) + goto out_free_params; /* Allocate loader specific data */ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 18d0f9e0b6da..9330434da777 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -178,6 +178,38 @@ static void load_segments(void) ); } +/* Update purgatory as needed after various image segments have been prepared */ +static int arch_update_purgatory(struct kimage *image) +{ + int ret = 0; + + if (!image->file_mode) + return 0; + + /* Setup copying of backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + &image->arch.backup_load_addr, + sizeof(image->arch.backup_load_addr), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_src", + &image->arch.backup_src_start, + sizeof(image->arch.backup_src_start), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + &image->arch.backup_src_sz, + sizeof(image->arch.backup_src_sz), 0); + if (ret) + return ret; + } + + return ret; +} + int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -191,6 +223,11 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; + /* update purgatory as needed */ + result = arch_update_purgatory(image); + if (result) + return result; + return 0; } @@ -315,6 +352,9 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, void *arch_kexec_kernel_image_load(struct kimage *image) { + vfree(image->arch.elf_headers); + image->arch.elf_headers = NULL; + if (!image->fops || !image->fops->load) return ERR_PTR(-ENOEXEC); diff 
--git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S index be3249d7ed2d..d1a4291d3568 100644 --- a/arch/x86/purgatory/entry64.S +++ b/arch/x86/purgatory/entry64.S @@ -61,13 +61,13 @@ new_cs_exit: .balign 4 entry64_regs: rax: .quad 0x0 -rbx: .quad 0x0 rcx: .quad 0x0 rdx: .quad 0x0 -rsi: .quad 0x0 -rdi: .quad 0x0 +rbx: .quad 0x0 rsp: .quad 0x0 rbp: .quad 0x0 +rsi: .quad 0x0 +rdi: .quad 0x0 r8: .quad 0x0 r9: .quad 0x0 r10: .quad 0x0 diff --git a/kernel/kexec.c b/kernel/kexec.c index 0926f2a3ed03..f18c780f9716 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -548,6 +548,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, { int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; image = do_kimage_alloc_init(); if (!image) @@ -555,6 +556,12 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, image->file_mode = 1; + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, cmdline_ptr, cmdline_len, flags); if (ret) @@ -572,10 +579,12 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, goto out_free_post_load_bufs; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err(KERN_ERR "Could not allocate swap buffer\n"); - goto out_free_control_pages; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; @@ -1113,10 +1122,14 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else 
+ buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -1139,7 +1152,12 @@ static int kimage_load_crash_segment(struct kimage *image, /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } - result = copy_from_user(ptr, buf, uchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); if (result) { @@ -1148,7 +1166,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -2127,7 +2148,14 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, kbuf->top_down = top_down; /* Walk the RAM ranges and allocate a suitable range for the buffer */ - ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); if (ret != 1) { /* A suitable memory range could not be found for buffer */ return -EADDRNOTAVAIL; -- cgit v1.2.3 From 8e7d838103feac320baf9e68d73f954840ac1eea Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 14:26:13 -0700 Subject: kexec: verify the signature of signed PE bzImage This is the final piece of the puzzle of verifying kernel image signature during kexec_file_load() syscall. This patch calls into PE file routines to verify signature of bzImage. If signature are valid, kexec_file_load() succeeds otherwise it fails. Two new config options have been introduced. First one is CONFIG_KEXEC_VERIFY_SIG. 
This option enforces that kernel has to be validly signed otherwise kernel load will fail. If this option is not set, no signature verification will be done. Only exception will be when secureboot is enabled. In that case signature verification should be automatically enforced when secureboot is enabled. But that will happen when secureboot patches are merged. Second config option is CONFIG_KEXEC_BZIMAGE_VERIFY_SIG. This option enables signature verification support on bzImage. If this option is not set and previous one is set, kernel image loading will fail because kernel does not have support to verify signature of bzImage. I tested these patches with both "pesign" and "sbsign" signed bzImages. I used signing_key.priv key and signing_key.x509 cert for signing as generated during kernel build process (if module signing is enabled). Used following method to sign bzImage. pesign ====== - Convert DER format cert to PEM format cert openssl x509 -in signing_key.x509 -inform DER -out signing_key.x509.PEM -outform PEM - Generate a .p12 file from existing cert and private key file openssl pkcs12 -export -out kernel-key.p12 -inkey signing_key.priv -in signing_key.x509.PEM - Import .p12 file into pesign db pk12util -i /tmp/kernel-key.p12 -d /etc/pki/pesign - Sign bzImage pesign -i /boot/vmlinuz-3.16.0-rc3+ -o /boot/vmlinuz-3.16.0-rc3+.signed.pesign -c "Glacier signing key - Magrathea" -s sbsign ====== sbsign --key signing_key.priv --cert signing_key.x509.PEM --output /boot/vmlinuz-3.16.0-rc3+.signed.sbsign /boot/vmlinuz-3.16.0-rc3+ Patch details: Well all the hard work is done in previous patches. Now bzImage loader just has to call into that code and verify whether the bzImage signature is valid or not. Also create two config options. First one is CONFIG_KEXEC_VERIFY_SIG. This option enforces that kernel has to be validly signed otherwise kernel load will fail. If this option is not set, no signature verification will be done. Only exception will be when secureboot is enabled. 
In that case signature verification should be automatically enforced when secureboot is enabled. But that will happen when secureboot patches are merged. Second config option is CONFIG_KEXEC_BZIMAGE_VERIFY_SIG. This option enables signature verification support on bzImage. If this option is not set and previous one is set, kernel image loading will fail because kernel does not have support to verify signature of bzImage. Signed-off-by: Vivek Goyal Cc: Borislav Petkov Cc: Michael Kerrisk Cc: Yinghai Lu Cc: Eric Biederman Cc: H. Peter Anvin Cc: Matthew Garrett Cc: Greg Kroah-Hartman Cc: Dave Young Cc: WANG Chao Cc: Baoquan He Cc: Andy Lutomirski Cc: Matt Fleming Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 22 ++++++++++++++++++++++ arch/x86/kernel/kexec-bzimage64.c | 21 +++++++++++++++++++++ arch/x86/kernel/machine_kexec_64.c | 11 +++++++++++ include/linux/kexec.h | 3 +++ kernel/kexec.c | 15 +++++++++++++++ 5 files changed, 72 insertions(+) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9558b9fcafbf..4aafd322e21e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1599,6 +1599,28 @@ config KEXEC interface is strongly in flux, so no good recommendation can be made. +config KEXEC_VERIFY_SIG + bool "Verify kernel signature during kexec_file_load() syscall" + depends on KEXEC + ---help--- + This option makes kernel signature verification mandatory for + kexec_file_load() syscall. If kernel is signature can not be + verified, kexec_file_load() will fail. + + This option enforces signature verification at generic level. + One needs to enable signature verification for type of kernel + image being loaded to make sure it works. For example, enable + bzImage signature verification option to be able to load and + verify signatures of bzImage. Otherwise kernel loading will fail. 
+ +config KEXEC_BZIMAGE_VERIFY_SIG + bool "Enable bzImage signature verification support" + depends on KEXEC_VERIFY_SIG + depends on SIGNED_PE_FILE_VERIFICATION + select SYSTEM_TRUSTED_KEYRING + ---help--- + Enable bzImage signature verification support. + config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 623e6c58081f..9642b9b33655 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -525,8 +527,27 @@ int bzImage64_cleanup(void *loader_data) return 0; } +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG +int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) +{ + bool trusted; + int ret; + + ret = verify_pefile_signature(kernel, kernel_len, + system_trusted_keyring, &trusted); + if (ret < 0) + return ret; + if (!trusted) + return -EKEYREJECTED; + return 0; +} +#endif + struct kexec_file_ops kexec_bzImage64_ops = { .probe = bzImage64_probe, .load = bzImage64_load, .cleanup = bzImage64_cleanup, +#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG + .verify_sig = bzImage64_verify_sig, +#endif }; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9330434da777..8b04018e5d1f 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -372,6 +372,17 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) return image->fops->cleanup(image->image_loader_data); } +int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel, + unsigned long kernel_len) +{ + if (!image->fops || !image->fops->verify_sig) { + pr_debug("kernel loader does not support signature verification."); + return -EKEYREJECTED; + } + + return image->fops->verify_sig(kernel, kernel_len); +} + /* * Apply purgatory relocations. 
* diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 9481703b0e7a..4b2a0e11cc5b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -191,11 +191,14 @@ typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len); typedef int (kexec_cleanup_t)(void *loader_data); +typedef int (kexec_verify_sig_t)(const char *kernel_buf, + unsigned long kernel_len); struct kexec_file_ops { kexec_probe_t *probe; kexec_load_t *load; kexec_cleanup_t *cleanup; + kexec_verify_sig_t *verify_sig; }; /* kexec interface functions */ diff --git a/kernel/kexec.c b/kernel/kexec.c index f18c780f9716..0b49a0a58102 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -416,6 +416,12 @@ void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) { } +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -EKEYREJECTED; +} + /* Apply relocations of type RELA */ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, @@ -494,6 +500,15 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, if (ret) goto out; +#ifdef CONFIG_KEXEC_VERIFY_SIG + ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, + image->kernel_buf_len); + if (ret) { + pr_debug("kernel signature verification failed.\n"); + goto out; + } + pr_debug("kernel signature verification successful.\n"); +#endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, -- cgit v1.2.3 From 0da1d4a0b9516adb2acc4841e9f6da6618f47f4e Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Fri, 8 Aug 2014 16:25:47 -0700 Subject: x86: Add "make tinyconfig" to configure the tiniest possible kernel Since commit 5d2acfc7b974bbd3858b4dd3f2cdc6362dd8843a ("kconfig: make allnoconfig disable options behind EMBEDDED and EXPERT") in 3.15-rc1, 
"make allnoconfig" disables every possible config option. However, a few configuration options (CC_OPTIMIZE_FOR_SIZE, OPTIMIZE_INLINING) produce a smaller kernel when turned on, and a few choices exist (compression, highmem, allocator) for which a non-default option produces a smaller kernel. Add a "tinyconfig" option, which starts from allnoconfig and then sets these options to configure the tiniest possible kernel. This provides a better baseline for embedded systems or efforts to reduce kernel size. Signed-off-by: Josh Triplett --- arch/x86/configs/tiny.config | 1 + kernel/configs/tiny.config | 4 ++++ scripts/kconfig/Makefile | 5 +++++ 3 files changed, 10 insertions(+) create mode 100644 arch/x86/configs/tiny.config create mode 100644 kernel/configs/tiny.config (limited to 'kernel') diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config new file mode 100644 index 000000000000..4e2ecfa23c15 --- /dev/null +++ b/arch/x86/configs/tiny.config @@ -0,0 +1 @@ +CONFIG_NOHIGHMEM=y diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config new file mode 100644 index 000000000000..c2de56ab0fce --- /dev/null +++ b/kernel/configs/tiny.config @@ -0,0 +1,4 @@ +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_KERNEL_XZ=y +CONFIG_OPTIMIZE_INLINING=y +CONFIG_SLOB=y diff --git a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile index 8083b94b45ee..ebf40f6edb4d 100644 --- a/scripts/kconfig/Makefile +++ b/scripts/kconfig/Makefile @@ -117,6 +117,10 @@ PHONY += kvmconfig kvmconfig: $(call mergeconfig,kvm_guest) +PHONY += tinyconfig +tinyconfig: allnoconfig + $(call mergeconfig,tiny) + # Help text used by make help help: @echo ' config - Update current config utilising a line-oriented program' @@ -138,6 +142,7 @@ help: @echo ' listnewconfig - List new options' @echo ' olddefconfig - Same as silentoldconfig but sets new symbols to their default value' @echo ' kvmconfig - Enable additional options for guest kernel support' + @echo ' tinyconfig - Configure the tiniest 
possible kernel' # lxdialog stuff check-lxdialog := $(srctree)/$(src)/lxdialog/check-lxdialog.sh -- cgit v1.2.3 From 69f6a34bdeea4fec50bb90619bc9602973119572 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 10 Aug 2014 20:50:30 -0700 Subject: seccomp: Replace BUG(!spin_is_locked()) with assert_spin_lock Current upstream kernel hangs with mips and powerpc targets in uniprocessor mode if SECCOMP is configured. Bisect points to commit dbd952127d11 ("seccomp: introduce writer locking"). Turns out that code such as BUG_ON(!spin_is_locked(&list_lock)); can not be used in uniprocessor mode because spin_is_locked() always returns false in this configuration, and that assert_spin_locked() exists for that very purpose and must be used instead. Fixes: dbd952127d11 ("seccomp: introduce writer locking") Cc: Kees Cook Signed-off-by: Guenter Roeck Signed-off-by: Kees Cook --- kernel/fork.c | 2 +- kernel/seccomp.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1380d8ace334..0cf9cdb6e491 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1105,7 +1105,7 @@ static void copy_seccomp(struct task_struct *p) * needed because this new task is not yet running and cannot * be racing exec. */ - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); /* Ref-count the new filter user, and assign it. 
*/ get_seccomp_filter(current); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 25b0043f4755..44eb005c6695 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -203,7 +203,7 @@ static u32 seccomp_run_filters(int syscall) static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) { - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) return false; @@ -214,7 +214,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) static inline void seccomp_assign_mode(struct task_struct *task, unsigned long seccomp_mode) { - BUG_ON(!spin_is_locked(&task->sighand->siglock)); + assert_spin_locked(&task->sighand->siglock); task->seccomp.mode = seccomp_mode; /* @@ -253,7 +253,7 @@ static inline pid_t seccomp_can_sync_threads(void) struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); /* Validate all threads being eligible for synchronization. */ caller = current; @@ -294,7 +294,7 @@ static inline void seccomp_sync_threads(void) struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); /* Synchronize all threads. */ caller = current; @@ -464,7 +464,7 @@ static long seccomp_attach_filter(unsigned int flags, unsigned long total_insns; struct seccomp_filter *walker; - BUG_ON(!spin_is_locked(&current->sighand->siglock)); + assert_spin_locked(&current->sighand->siglock); /* Validate resulting filter length. 
*/ total_insns = filter->prog->len; -- cgit v1.2.3 From 743cb1ff191f00fee653212bdbcee1e56086d6ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Jul 2014 17:00:21 +0200 Subject: sched/fair: Make calculate_imbalance() independent Rik noticed that calculate_imbalance() relies on update_sd_pick_busiest() to guarantee that busiest->sum_nr_running > busiest->group_capacity_factor. Break this implicit assumption (with the intent of not providing it anymore) by having calculate_imbalance() verify it and not rely on others. Reported-by: Rik van Riel Signed-off-by: Peter Zijlstra Acked-by: Vincent Guittot Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20140729152631.GW12054@laptop.lan Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfa3c86d0d68..e9477e6193fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6248,7 +6248,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (!busiest->group_imb) { + if (busiest->sum_nr_running > busiest->group_capacity_factor) { /* * Don't want to pull so many tasks that a group would go idle. * Except of course for the group_imb case, since then we might -- cgit v1.2.3 From caeb178c60f4f93f1b45c0bc056b5cf6d217b67f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 28 Jul 2014 14:16:28 -0400 Subject: sched/fair: Make update_sd_pick_busiest() return 'true' on a busier sd Currently update_sd_pick_busiest only identifies the busiest sd that is either overloaded, or has a group imbalance. When no sd is imbalanced or overloaded, the load balancer fails to find the busiest domain. This breaks load balancing between domains that are not overloaded, in the !SD_ASYM_PACKING case. This patch makes update_sd_pick_busiest return true when the busiest sd yet is encountered. 
Groups are ranked in the order overloaded > imbalanced > other, with higher ranked groups getting priority even when their load is lower. This is necessary due to the possibility of unequal capacities and cpumasks between domains within a sched group. Behaviour for SD_ASYM_PACKING does not seem to match the comment, but I have no hardware to test that so I have left the behaviour of that code unchanged. Enum for group classification suggested by Peter Zijlstra. Signed-off-by: Rik van Riel [peterz: replaced sg_lb_stats::group_imb with the new enum group_type in an attempt to avoid endless recalculation] Signed-off-by: Peter Zijlstra Acked-by: Vincent Guittot Acked-by: Michael Neuling Cc: ktkhai@parallels.com Cc: tim.c.chen@linux.intel.com Cc: nicolas.pitre@linaro.org Cc: jhladky@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140729152743.GI3935@laptop Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 49 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9477e6193fc..94377254254e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5559,6 +5559,13 @@ static unsigned long task_h_load(struct task_struct *p) #endif /********** Helpers for find_busiest_group ************************/ + +enum group_type { + group_other = 0, + group_imbalanced, + group_overloaded, +}; + /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -5572,7 +5579,7 @@ struct sg_lb_stats { unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; - int group_imb; /* Is there an imbalance in the group ? 
*/ + enum group_type group_type; int group_has_free_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -5610,6 +5617,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .total_capacity = 0UL, .busiest_stat = { .avg_load = 0UL, + .sum_nr_running = 0, + .group_type = group_other, }, }; } @@ -5891,6 +5900,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro return capacity_factor; } +static enum group_type +group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running > sgs->group_capacity_factor) + return group_overloaded; + + if (sg_imbalanced(group)) + return group_imbalanced; + + return group_other; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -5942,9 +5963,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - - sgs->group_imb = sg_imbalanced(group); sgs->group_capacity_factor = sg_capacity_factor(env, group); + sgs->group_type = group_classify(group, sgs); if (sgs->group_capacity_factor > sgs->sum_nr_running) sgs->group_has_free_capacity = 1; @@ -5968,13 +5988,19 @@ static bool update_sd_pick_busiest(struct lb_env *env, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->busiest_stat.avg_load) - return false; + struct sg_lb_stats *busiest = &sds->busiest_stat; - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_type > busiest->group_type) return true; - if (sgs->group_imb) + if (sgs->group_type < busiest->group_type) + return false; + + if (sgs->avg_load <= busiest->avg_load) + return false; + + /* This is the busiest node in its class. 
*/ + if (!(env->sd->flags & SD_ASYM_PACKING)) return true; /* @@ -5982,8 +6008,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. */ - if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - env->dst_cpu < group_first_cpu(sg)) { + if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; @@ -6228,7 +6253,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local = &sds->local_stat; busiest = &sds->busiest_stat; - if (busiest->group_imb) { + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages * to ensure cpu-load equilibrium, look at wider averages. XXX @@ -6248,7 +6273,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (busiest->sum_nr_running > busiest->group_capacity_factor) { + if (busiest->group_type == group_overloaded) { /* * Don't want to pull so many tasks that a group would go idle. * Except of course for the group_imb case, since then we might @@ -6337,7 +6362,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ - if (busiest->group_imb) + if (busiest->group_type == group_imbalanced) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ -- cgit v1.2.3 From 9a5d9ba6a3631d55c358fe1bdbaa162a97471a05 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Jul 2014 17:15:11 +0200 Subject: sched/fair: Allow calculate_imbalance() to move idle cpus Allow calculate_imbalance() to 'create' idle cpus in the busiest group if there are idle cpus in the local group. 
Suggested-by: Rik van Riel Signed-off-by: Peter Zijlstra Acked-by: Vincent Guittot Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140729152705.GX12054@laptop.lan Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94377254254e..df1ed176c7b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6273,12 +6273,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } - if (busiest->group_type == group_overloaded) { - /* - * Don't want to pull so many tasks that a group would go idle. - * Except of course for the group_imb case, since then we might - * have to drop below capacity to reach cpu-load equilibrium. - */ + /* + * If there aren't any idle cpus, avoid creating some. + */ + if (busiest->group_type == group_overloaded && + local->group_type == group_overloaded) { load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity_factor); -- cgit v1.2.3 From aaecac4ad46b35ad308245384d019633fb9bc21b Mon Sep 17 00:00:00 2001 From: Zhihui Zhang Date: Fri, 1 Aug 2014 21:18:03 -0400 Subject: sched: Rename a misleading variable in build_overlap_sched_groups() The child variable in build_overlap_sched_groups() actually refers to the peer or sibling domain of the given CPU. Rename it to sibling to be consistent with the naming in build_group_mask(). 
Signed-off-by: Zhihui Zhang Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1406942283-18249-1-git-send-email-zzhsuny@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1211575a2208..7d1ec6e60535 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5739,7 +5739,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) const struct cpumask *span = sched_domain_span(sd); struct cpumask *covered = sched_domains_tmpmask; struct sd_data *sdd = sd->private; - struct sched_domain *child; + struct sched_domain *sibling; int i; cpumask_clear(covered); @@ -5750,10 +5750,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; - child = *per_cpu_ptr(sdd->sd, i); + sibling = *per_cpu_ptr(sdd->sd, i); /* See the comment near build_group_mask(). 
*/ - if (!cpumask_test_cpu(i, sched_domain_span(child))) + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), @@ -5763,10 +5763,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - if (child->child) { - child = child->child; - cpumask_copy(sg_span, sched_domain_span(child)); - } else + if (sibling->child) + cpumask_copy(sg_span, sched_domain_span(sibling->child)); + else cpumask_set_cpu(i, sg_span); cpumask_or(covered, covered, sg_span); -- cgit v1.2.3 From b932c03c34f3b03c7364c06aa8cae5b74609fc41 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 4 Aug 2014 13:23:27 -0400 Subject: sched/numa: Fix off-by-one in capacity check Commit a43455a1d572daf7b730fe12eb747d1e17411365 ensures that task_numa_migrate will call task_numa_compare on the preferred node all the time, even when the preferred node has no free capacity. This could lead to a performance regression if nr_running == capacity on both the source and the destination node. This can be avoided by also checking for nr_running == capacity on the source node, which is one stricter than checking .has_free_capacity. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: vincent.guittot@linaro.org Cc: Morten.Rasmussen@arm.com Cc: nicolas.pitre@linaro.org Cc: efault@gmx.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407173008-9334-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df1ed176c7b7..e1cf419c3c7f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1206,7 +1206,7 @@ static void task_numa_compare(struct task_numa_env *env, if (!cur) { /* Is there capacity at our destination? 
*/ - if (env->src_stats.has_free_capacity && + if (env->src_stats.nr_running <= env->src_stats.task_capacity && !env->dst_stats.has_free_capacity) goto unlock; -- cgit v1.2.3 From 83d7f2424741c9dc76c21377c9d00d47abaf88df Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 4 Aug 2014 13:23:28 -0400 Subject: sched/numa: Fix numa capacity computation Commit c61037e9 fixes the phenomenon of 'fantom' cores due to N*frac(smt_power) >= 1 by limiting the capacity to the actual number of cores in the load balancing code. This patch applies the same correction to the NUMA balancing code. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: vincent.guittot@linaro.org Cc: Morten.Rasmussen@arm.com Cc: nicolas.pitre@linaro.org Cc: efault@gmx.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407173008-9334-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e1cf419c3c7f..1413c44ce8a1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1038,7 +1038,8 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int cpu, cpus = 0; + int smt, cpu, cpus = 0; + unsigned long capacity; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1062,8 +1063,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid) if (!cpus) return; - ns->task_capacity = - DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); + /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */ + smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); + capacity = cpus / smt; /* cores */ + + ns->task_capacity = min_t(unsigned, capacity, + DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); ns->has_free_capacity = (ns->nr_running < ns->task_capacity); } -- cgit v1.2.3 From 
14c4000a88afaaa2d0877cc86d42a74fde0f35e0 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Sat, 9 Aug 2014 11:15:30 +0530 Subject: printk: Add function to return log buffer address and size Platforms like IBM Power Systems supports service processor assisted dump. It provides interface to add memory region to be captured when system is crashed. During initialization/running we can add kernel memory region to be collected. Presently we don't have a way to get the log buffer base address and size. This patch adds support to return log buffer address and size. Signed-off-by: Vasant Hegde Signed-off-by: Benjamin Herrenschmidt Acked-by: Andrew Morton --- include/linux/printk.h | 3 +++ kernel/printk/printk.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 0990997a5304..d78125f73ac4 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -10,6 +10,9 @@ extern const char linux_banner[]; extern const char linux_proc_banner[]; +extern char *log_buf_addr_get(void); +extern u32 log_buf_len_get(void); + static inline int printk_get_level(const char *buffer) { if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index de1a6bb6861d..e04c455a0e38 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -272,6 +272,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* Return log buffer address */ +char *log_buf_addr_get(void) +{ + return log_buf; +} + +/* Return log buffer size */ +u32 log_buf_len_get(void) +{ + return log_buf_len; +} + /* human readable text of the record */ static char *log_text(const struct printk_log *msg) { -- cgit v1.2.3 From f86977620ee4635f26befcf436700493a38ce002 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 1 Aug 2014 14:33:01 +0200 Subject: perf: Set owner pointer for kernel events Adding 
fake EVENT_OWNER_KERNEL owner pointer value for kernel perf events, so we could distinguish it from user events, which needs special care in following patch. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Mark Rutland Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Mark Rutland Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1406896382-18404-3-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cf24b3e42ec..bbb3ca22f07c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -119,6 +119,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) return data.ret; } +#define EVENT_OWNER_KERNEL ((void *) -1) + +static bool is_kernel_event(struct perf_event *event) +{ + return event->owner == EVENT_OWNER_KERNEL; +} + #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -3312,16 +3319,12 @@ static void free_event(struct perf_event *event) } /* - * Called when the last reference to the file is gone. + * Remove user event from the owner task. */ -static void put_event(struct perf_event *event) +static void perf_remove_from_owner(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; struct task_struct *owner; - if (!atomic_long_dec_and_test(&event->refcount)) - return; - rcu_read_lock(); owner = ACCESS_ONCE(event->owner); /* @@ -3354,6 +3357,20 @@ static void put_event(struct perf_event *event) mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } +} + +/* + * Called when the last reference to the file is gone. 
+ */ +static void put_event(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + + if (!atomic_long_dec_and_test(&event->refcount)) + return; + + if (!is_kernel_event(event)) + perf_remove_from_owner(event); WARN_ON_ONCE(ctx->parent_ctx); /* @@ -7366,6 +7383,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err; } + /* Mark owner so we could distinguish it from user events. */ + event->owner = EVENT_OWNER_KERNEL; + account_event(event); ctx = find_get_context(event->pmu, task, cpu); -- cgit v1.2.3 From fadfe7be6e50de7f03913833b33c56cd8fb66bac Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 1 Aug 2014 14:33:02 +0200 Subject: perf: Add queued work to remove orphaned child events In cases when the owner task exits before the workload and the workload made some forks, all the events stay in until the last workload process exits. That's because each child event holds parent reference. We want to release all children events once the parent is gone, because at that time there's no process to read them anyway, so they're just eating resources. This removal races with process exit, which removes all events and fork, which clone events. To be clear of those two, adding work queue to remove orphaned child for context in case such event is detected. Using delayed work queue (with delay == 1), because we queue this work under perf scheduler callbacks. Normal work queue tries to wake up the queue process, which deadlocks on rq->lock in this place. Also preventing clones from abandoned parent event. 
Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Mark Rutland Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Mark Rutland Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1406896382-18404-4-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 +++ kernel/events/core.c | 87 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 707617a8c0f6..ef5b62bdb103 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -52,6 +52,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -507,6 +508,9 @@ struct perf_event_context { int nr_cgroups; /* cgroup evts */ int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; + + struct delayed_work orphans_remove; + bool orphans_remove_sched; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index bbb3ca22f07c..a25460559b4f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -46,6 +46,8 @@ #include +static struct workqueue_struct *perf_wq; + struct remote_function_call { struct task_struct *p; int (*func)(void *info); @@ -1381,6 +1383,45 @@ out: perf_event__header_size(tmp); } +/* + * User event without the task. + */ +static bool is_orphaned_event(struct perf_event *event) +{ + return event && !is_kernel_event(event) && !event->owner; +} + +/* + * Event has a parent but parent's task finished and it's + * alive only because of children holding refference. 
+ */ +static bool is_orphaned_child(struct perf_event *event) +{ + return is_orphaned_event(event->parent); +} + +static void orphans_remove_work(struct work_struct *work); + +static void schedule_orphans_remove(struct perf_event_context *ctx) +{ + if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) + return; + + if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { + get_ctx(ctx); + ctx->orphans_remove_sched = true; + } +} + +static int __init perf_workqueue_init(void) +{ + perf_wq = create_singlethread_workqueue("perf"); + WARN(!perf_wq, "failed to create perf workqueue\n"); + return perf_wq ? 0 : -1; +} + +core_initcall(perf_workqueue_init); + static inline int event_filter_match(struct perf_event *event) { @@ -1430,6 +1471,9 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + if (is_orphaned_child(event)) + schedule_orphans_remove(ctx); + perf_pmu_enable(event->pmu); } @@ -1732,6 +1776,9 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; + if (is_orphaned_child(event)) + schedule_orphans_remove(ctx); + out: perf_pmu_enable(event->pmu); @@ -3074,6 +3121,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); + INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3405,6 +3453,42 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } +/* + * Remove all orphanes events from the context. 
+ */ +static void orphans_remove_work(struct work_struct *work) +{ + struct perf_event_context *ctx; + struct perf_event *event, *tmp; + + ctx = container_of(work, struct perf_event_context, + orphans_remove.work); + + mutex_lock(&ctx->mutex); + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { + struct perf_event *parent_event = event->parent; + + if (!is_orphaned_child(event)) + continue; + + perf_remove_from_context(event, true); + + mutex_lock(&parent_event->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent_event->child_mutex); + + free_event(event); + put_event(parent_event); + } + + raw_spin_lock_irq(&ctx->lock); + ctx->orphans_remove_sched = false; + raw_spin_unlock_irq(&ctx->lock); + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -7709,7 +7793,8 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; - if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + if (is_orphaned_event(parent_event) || + !atomic_long_inc_not_zero(&parent_event->refcount)) { free_event(child_event); return NULL; } -- cgit v1.2.3 From e708d7ad80737496870fd0b6794704d063fb0cdc Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 4 Aug 2014 15:31:08 +0200 Subject: perf: Do poll_wait() before checking condition in perf_poll() One should first enqueue to the waitqueue and then check for the condition. If the condition gets true after mutex_unlock() but before poll_wait() then we lose it and would have to wait for another wakeup. This has been like this since v2.6.31-rc1 commit c7138f37f9 ("perf_counter: fix perf_poll()"). Before that it was slightly worse. I guess we get enough wakeups so if we miss one here it doesn't really matter. It is still a bad example. 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1407159068-1478-1-git-send-email-bigeasy@linutronix.de Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a25460559b4f..2d7363adf678 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3629,6 +3629,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) struct ring_buffer *rb; unsigned int events = POLL_HUP; + poll_wait(file, &event->waitq, wait); /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. @@ -3638,9 +3639,6 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) if (rb) events = atomic_xchg(&rb->poll, 0); mutex_unlock(&event->mmap_mutex); - - poll_wait(file, &event->waitq, wait); - return events; } -- cgit v1.2.3 From 2e39465abc4b7856a0ea6fcf4f6b4668bb5db877 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Aug 2014 12:07:15 +0200 Subject: locking: Remove deprecated smp_mb__() barriers It's been a while and there are no in-tree users left, so remove the deprecated barriers. Signed-off-by: Peter Zijlstra Cc: Chen, Gong Cc: Jacob Pan Cc: Joe Perches Cc: John Sullivan Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Srinivas Pandruvada Cc: Theodore Ts'o Signed-off-by: Ingo Molnar --- include/linux/atomic.h | 36 ------------------------------------ include/linux/bitops.h | 20 -------------------- kernel/sched/core.c | 16 ---------------- 3 files changed, 72 deletions(-) (limited to 'kernel') diff --git a/include/linux/atomic.h b/include/linux/atomic.h index fef3a809e7cf..5b08a8540ecf 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -3,42 +3,6 @@ #define _LINUX_ATOMIC_H #include -/* - * Provide __deprecated wrappers for the new interface, avoid flag day changes. - * We need the ugly external functions to break header recursion hell. - */ -#ifndef smp_mb__before_atomic_inc -static inline void __deprecated smp_mb__before_atomic_inc(void) -{ - extern void __smp_mb__before_atomic(void); - __smp_mb__before_atomic(); -} -#endif - -#ifndef smp_mb__after_atomic_inc -static inline void __deprecated smp_mb__after_atomic_inc(void) -{ - extern void __smp_mb__after_atomic(void); - __smp_mb__after_atomic(); -} -#endif - -#ifndef smp_mb__before_atomic_dec -static inline void __deprecated smp_mb__before_atomic_dec(void) -{ - extern void __smp_mb__before_atomic(void); - __smp_mb__before_atomic(); -} -#endif - -#ifndef smp_mb__after_atomic_dec -static inline void __deprecated smp_mb__after_atomic_dec(void) -{ - extern void __smp_mb__after_atomic(void); - __smp_mb__after_atomic(); -} -#endif - /** * atomic_add_unless - add unless the number is already a given value * @v: pointer of type atomic_t diff --git a/include/linux/bitops.h b/include/linux/bitops.h index cbc5833fb221..be5fd38bd5a0 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -32,26 +32,6 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include -/* - * Provide __deprecated wrappers for the new interface, avoid flag day changes. - * We need the ugly external functions to break header recursion hell. 
- */ -#ifndef smp_mb__before_clear_bit -static inline void __deprecated smp_mb__before_clear_bit(void) -{ - extern void __smp_mb__before_atomic(void); - __smp_mb__before_atomic(); -} -#endif - -#ifndef smp_mb__after_clear_bit -static inline void __deprecated smp_mb__after_clear_bit(void) -{ - extern void __smp_mb__after_atomic(void); - __smp_mb__after_atomic(); -} -#endif - #define for_each_set_bit(bit, addr, size) \ for ((bit) = find_first_bit((addr), (size)); \ (bit) < (size); \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1211575a2208..76c518c9b3a7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,22 +90,6 @@ #define CREATE_TRACE_POINTS #include -#ifdef smp_mb__before_atomic -void __smp_mb__before_atomic(void) -{ - smp_mb__before_atomic(); -} -EXPORT_SYMBOL(__smp_mb__before_atomic); -#endif - -#ifdef smp_mb__after_atomic -void __smp_mb__after_atomic(void) -{ - smp_mb__after_atomic(); -} -EXPORT_SYMBOL(__smp_mb__after_atomic); -#endif - void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; -- cgit v1.2.3 From 242489cfe97d44290e7f88b12591fab6c0819045 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Jul 2014 13:41:50 -0700 Subject: locking/mutexes: Standardize arguments in lock/unlock slowpaths Just how the locking-end behaves, when unlocking, go ahead and obtain the proper data structure immediately after the previous (asm-end) call exits and there are (probably) pending waiters. This simplifies a bit some of the layering. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra Cc: jason.low2@hp.com Cc: aswin@hp.com Cc: mingo@kernel.org Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1406752916-3341-1-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index ae712b25e492..ad0e3335c481 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -679,9 +679,8 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); * Release the lock, slowpath: */ static inline void -__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) +__mutex_unlock_common_slowpath(struct mutex *lock, int nested) { - struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; /* @@ -716,7 +715,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) __visible void __mutex_unlock_slowpath(atomic_t *lock_count) { - __mutex_unlock_common_slowpath(lock_count, 1); + struct mutex *lock = container_of(lock_count, struct mutex, count); + + __mutex_unlock_common_slowpath(lock, 1); } #ifndef CONFIG_DEBUG_LOCK_ALLOC -- cgit v1.2.3 From 42fa566bd74aa7b95413fb00611ec983b488222d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Jul 2014 13:41:51 -0700 Subject: locking/mutexes: Document quick lock release when unlocking When unlocking, we always want to reach the slowpath with the lock's counter indicating it is unlocked. -- as returned by the asm fastpath call or by explicitly setting it. While doing so, at least in theory, we can optimize and allow faster lock stealing.
Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra Cc: jason.low2@hp.com Cc: aswin@hp.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1406752916-3341-2-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index ad0e3335c481..93bec48f09ed 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -684,9 +684,16 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) unsigned long flags; /* - * some architectures leave the lock unlocked in the fastpath failure + * As a performance measurement, release the lock before doing other + * wakeup related duties to follow. This allows other tasks to acquire + * the lock sooner, while still handling cleanups in past unlock calls. + * This can be done as we do not enforce strict equivalence between the + * mutex counter and wait_list. + * + * + * Some architectures leave the lock unlocked in the fastpath failure * case, others need to leave it locked. In the later case we have to - * unlock it here + * unlock it here - as the lock counter is currently 0 or negative. */ if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); -- cgit v1.2.3 From aa9fc0c19bee0cbc152e0e06488095fb69229236 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Jul 2014 13:41:52 -0700 Subject: locking/mcs: Remove obsolete comment ... as we clearly inline mcs_spin_lock() now. 
Signed-off-by: Davidlohr Bueso Acked-by: Jason Low Signed-off-by: Peter Zijlstra Cc: aswin@hp.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1406752916-3341-3-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mcs_spinlock.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 23e89c5930e9..4d60986fcbee 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -56,9 +56,6 @@ do { \ * If the lock has already been acquired, then this will proceed to spin * on this node->locked until the previous lock holder sets the node->locked * in mcs_spin_unlock(). - * - * We don't inline mcs_spin_lock() so that perf can correctly account for the - * time spent in this lock function. */ static inline void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) -- cgit v1.2.3 From 76916515d9d84e6552ee5e218e0ed566ad75e600 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Jul 2014 13:41:53 -0700 Subject: locking/mutexes: Refactor optimistic spinning code When we fail to acquire the mutex in the fastpath, we end up calling __mutex_lock_common(). A *lot* goes on in this function. Move out the optimistic spinning code into mutex_optimistic_spin() and simplify the former a bit. Furthermore, this is similar to what we have in rwsems. No logical changes. 
Signed-off-by: Davidlohr Bueso Acked-by: Jason Low Signed-off-by: Peter Zijlstra Cc: aswin@hp.com Cc: mingo@kernel.org Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1406752916-3341-4-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 396 ++++++++++++++++++++++++++----------------------- 1 file changed, 214 insertions(+), 182 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 93bec48f09ed..0d8b6ed93874 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -106,6 +106,92 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, + struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; +} + +/* + * after acquiring lock with fastpath or when we lost out in contested + * slowpath, set ctx and wake up any waiters so they can recheck. + * + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, + * as the fastpath and opportunistic spinning are disabled in that case. 
+ */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + unsigned long flags; + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + + lock->ctx = ctx; + + /* + * The lock->ctx update should be visible on all cores before + * the atomic read is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* ^^^ */ + + /* + * Check if lock is contended, if not there is nobody to wake up + */ + if (likely(atomic_read(&lock->base.count) == 0)) + return; + + /* + * Uh oh, we raced in fastpath, wake up everyone in this case, + * so they can see the new lock->ctx. + */ + spin_lock_mutex(&lock->base.wait_lock, flags); + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } + spin_unlock_mutex(&lock->base.wait_lock, flags); +} + + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* * In order to avoid a stampede of mutex spinners from acquiring the mutex @@ -180,6 +266,129 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) */ return retval; } + +/* + * Atomically try to take the lock when it is available + */ +static inline bool mutex_try_to_acquire(struct mutex *lock) +{ + return !mutex_is_locked(lock) && + (atomic_cmpxchg(&lock->count, 1, 0) == 1); +} + +/* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that the lock owner + * is currently running on a (different) CPU and while we don't + * need to reschedule. The rationale is that if the lock owner is + * running, it is likely to release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn't track the owner atomically in the lock field, we need to + * track it non-atomically. 
+ * + * We can't do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + * + * The mutex spinners are queued up using MCS lock so that only one + * spinner can compete for the mutex. However, if mutex spinning isn't + * going to happen, there is no point in going through the lock/unlock + * overhead. + * + * Returns true when the lock was taken, otherwise false, indicating + * that we need to jump to the slowpath and sleep. + */ +static bool mutex_optimistic_spin(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) +{ + struct task_struct *task = current; + + if (!mutex_can_spin_on_owner(lock)) + goto done; + + if (!osq_lock(&lock->osq)) + goto done; + + while (true) { + struct task_struct *owner; + + if (use_ww_ctx && ww_ctx->acquired > 0) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + */ + if (ACCESS_ONCE(ww->ctx)) + break; + } + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + + /* Try to acquire the mutex if it is unlocked. */ + if (mutex_try_to_acquire(lock)) { + lock_acquired(&lock->dep_map, ip); + + if (use_ww_ctx) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } + + mutex_set_owner(lock); + osq_unlock(&lock->osq); + return true; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. 
+ */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax_lowlatency(); + } + + osq_unlock(&lock->osq); +done: + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock the mutex. This avoids getting + * scheduled out right after we obtained the mutex. + */ + if (need_resched()) + schedule_preempt_disabled(); + + return false; +} +#else +static bool mutex_optimistic_spin(struct mutex *lock, + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) +{ + return false; +} #endif __visible __used noinline @@ -277,91 +486,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) return 0; } -static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, - struct ww_acquire_ctx *ww_ctx) -{ -#ifdef CONFIG_DEBUG_MUTEXES - /* - * If this WARN_ON triggers, you used ww_mutex_lock to acquire, - * but released with a normal mutex_unlock in this call. - * - * This should never happen, always use ww_mutex_unlock. - */ - DEBUG_LOCKS_WARN_ON(ww->ctx); - - /* - * Not quite done after calling ww_acquire_done() ? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); - - if (ww_ctx->contending_lock) { - /* - * After -EDEADLK you tried to - * acquire a different ww_mutex? Bad! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - - /* - * You called ww_mutex_lock after receiving -EDEADLK, - * but 'forgot' to unlock everything else first? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); - ww_ctx->contending_lock = NULL; - } - - /* - * Naughty, using a different class will lead to undefined behavior! 
- */ - DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); -#endif - ww_ctx->acquired++; -} - -/* - * after acquiring lock with fastpath or when we lost out in contested - * slowpath, set ctx and wake up any waiters so they can recheck. - * - * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, - * as the fastpath and opportunistic spinning are disabled in that case. - */ -static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) -{ - unsigned long flags; - struct mutex_waiter *cur; - - ww_mutex_lock_acquired(lock, ctx); - - lock->ctx = ctx; - - /* - * The lock->ctx update should be visible on all cores before - * the atomic read is done, otherwise contended waiters might be - * missed. The contended waiters will either see ww_ctx == NULL - * and keep spinning, or it will acquire wait_lock, add itself - * to waiter list and sleep. - */ - smp_mb(); /* ^^^ */ - - /* - * Check if lock is contended, if not there is nobody to wake up - */ - if (likely(atomic_read(&lock->base.count) == 0)) - return; - - /* - * Uh oh, we raced in fastpath, wake up everyone in this case, - * so they can see the new lock->ctx. - */ - spin_lock_mutex(&lock->base.wait_lock, flags); - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } - spin_unlock_mutex(&lock->base.wait_lock, flags); -} - /* * Lock a mutex (possibly interruptible), slowpath: */ @@ -378,104 +502,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - /* - * Optimistic spinning. - * - * We try to spin for acquisition when we find that the lock owner - * is currently running on a (different) CPU and while we don't - * need to reschedule. The rationale is that if the lock owner is - * running, it is likely to release the lock soon. 
- * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. - * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - * - * The mutex spinners are queued up using MCS lock so that only one - * spinner can compete for the mutex. However, if mutex spinning isn't - * going to happen, there is no point in going through the lock/unlock - * overhead. - */ - if (!mutex_can_spin_on_owner(lock)) - goto slowpath; - - if (!osq_lock(&lock->osq)) - goto slowpath; - - for (;;) { - struct task_struct *owner; - - if (use_ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - */ - if (ACCESS_ONCE(ww->ctx)) - break; - } - - /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. - */ - owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) - break; - - /* Try to acquire the mutex if it is unlocked. */ - if (!mutex_is_locked(lock) && - (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { - lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - - ww_mutex_set_context_fastpath(ww, ww_ctx); - } - - mutex_set_owner(lock); - osq_unlock(&lock->osq); - preempt_enable(); - return 0; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. 
- */ - if (!owner && (need_resched() || rt_task(task))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - cpu_relax_lowlatency(); + if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + /* got the lock, yay! */ + preempt_enable(); + return 0; } - osq_unlock(&lock->osq); -slowpath: - /* - * If we fell out of the spin path because of need_resched(), - * reschedule now, before we try-lock the mutex. This avoids getting - * scheduled out right after we obtained the mutex. - */ - if (need_resched()) - schedule_preempt_disabled(); -#endif + spin_lock_mutex(&lock->wait_lock, flags); /* -- cgit v1.2.3 From 7608a43d8f2e02f8b532f8e11481d7ecf8b5d3f9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Jul 2014 13:41:54 -0700 Subject: locking/mutexes: Use MUTEX_SPIN_ON_OWNER when appropriate 4badad35 ("locking/mutex: Disable optimistic spinning on some architectures") added a ARCH_SUPPORTS_ATOMIC_RMW flag to disable the mutex optimistic feature on specific archs. Because CONFIG_MUTEX_SPIN_ON_OWNER only depended on DEBUG and SMP, it was ok to have the ->owner field conditional a bit flexible. However by adding a new variable to the matter, we can waste space with the unused field, ie: CONFIG_SMP && (!CONFIG_MUTEX_SPIN_ON_OWNER && !CONFIG_DEBUG_MUTEX). Signed-off-by: Davidlohr Bueso Acked-by: Jason Low Signed-off-by: Peter Zijlstra Cc: aswin@hp.com Cc: Davidlohr Bueso Cc: Heiko Carstens Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Tim Chen Link: http://lkml.kernel.org/r/1406752916-3341-5-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 2 +- kernel/locking/mutex.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 8d5535c58cc2..e4c29418f407 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -52,7 +52,7 @@ struct mutex { atomic_t count; spinlock_t wait_lock; struct list_head wait_list; -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) +#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_MUTEX_SPIN_ON_OWNER) struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..5cda397607f2 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -16,7 +16,7 @@ #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#ifdef CONFIG_SMP +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline void mutex_set_owner(struct mutex *lock) { lock->owner = current; -- cgit v1.2.3 From 214e0aed639ef40987bf6159fad303171a6de31e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 30 Jul 2014 13:41:55 -0700 Subject: locking/Documentation: Move locking related docs into Documentation/locking/ Specifically: Documentation/locking/lockdep-design.txt Documentation/locking/lockstat.txt Documentation/locking/mutex-design.txt Documentation/locking/rt-mutex-design.txt Documentation/locking/rt-mutex.txt Documentation/locking/spinlocks.txt Documentation/locking/ww-mutex-design.txt Signed-off-by: Davidlohr Bueso Acked-by: Randy Dunlap Signed-off-by: Peter Zijlstra Cc: jason.low2@hp.com Cc: aswin@hp.com Cc: Alexei Starovoitov Cc: Al Viro Cc: Andrew Morton Cc: Chris Mason Cc: Dan Streetman Cc: David Airlie Cc: Davidlohr Bueso Cc: David S. 
Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Jason Low Cc: Josef Bacik Cc: Kees Cook Cc: Linus Torvalds Cc: Lubomir Rintel Cc: Masanari Iida Cc: Paul E. McKenney Cc: Randy Dunlap Cc: Tim Chen Cc: Vineet Gupta Cc: fengguang.wu@intel.com Link: http://lkml.kernel.org/r/1406752916-3341-6-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar --- Documentation/00-INDEX | 2 + Documentation/DocBook/kernel-locking.tmpl | 2 +- Documentation/lockdep-design.txt | 286 ----------- Documentation/locking/lockdep-design.txt | 286 +++++++++++ Documentation/locking/lockstat.txt | 178 +++++++ Documentation/locking/mutex-design.txt | 157 ++++++ Documentation/locking/rt-mutex-design.txt | 781 ++++++++++++++++++++++++++++++ Documentation/locking/rt-mutex.txt | 79 +++ Documentation/locking/spinlocks.txt | 167 +++++++ Documentation/locking/ww-mutex-design.txt | 344 +++++++++++++ Documentation/lockstat.txt | 178 ------- Documentation/mutex-design.txt | 157 ------ Documentation/rt-mutex-design.txt | 781 ------------------------------ Documentation/rt-mutex.txt | 79 --- Documentation/spinlocks.txt | 167 ------- Documentation/ww-mutex-design.txt | 344 ------------- MAINTAINERS | 4 +- drivers/gpu/drm/drm_modeset_lock.c | 2 +- include/linux/lockdep.h | 2 +- include/linux/mutex.h | 2 +- include/linux/rwsem.h | 2 +- kernel/locking/mutex.c | 2 +- kernel/locking/rtmutex.c | 2 +- lib/Kconfig.debug | 4 +- 24 files changed, 2005 insertions(+), 2003 deletions(-) delete mode 100644 Documentation/lockdep-design.txt create mode 100644 Documentation/locking/lockdep-design.txt create mode 100644 Documentation/locking/lockstat.txt create mode 100644 Documentation/locking/mutex-design.txt create mode 100644 Documentation/locking/rt-mutex-design.txt create mode 100644 Documentation/locking/rt-mutex.txt create mode 100644 Documentation/locking/spinlocks.txt create mode 100644 Documentation/locking/ww-mutex-design.txt delete mode 100644 Documentation/lockstat.txt delete mode 100644 
Documentation/mutex-design.txt delete mode 100644 Documentation/rt-mutex-design.txt delete mode 100644 Documentation/rt-mutex.txt delete mode 100644 Documentation/spinlocks.txt delete mode 100644 Documentation/ww-mutex-design.txt (limited to 'kernel') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 27e67a98b7be..1750fcef1ab4 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -287,6 +287,8 @@ local_ops.txt - semantics and behavior of local atomic operations. lockdep-design.txt - documentation on the runtime locking correctness validator. +locking/ + - directory with info about kernel locking primitives lockstat.txt - info on collecting statistics on locks (and contention). lockup-watchdogs.txt diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index e584ee12a1e7..7c9cc4846cb6 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -1972,7 +1972,7 @@ machines due to caching. - Documentation/spinlocks.txt: + Documentation/locking/spinlocks.txt: Linus Torvalds' spinlocking tutorial in the kernel sources. diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt deleted file mode 100644 index 5dbc99c04f6e..000000000000 --- a/Documentation/lockdep-design.txt +++ /dev/null @@ -1,286 +0,0 @@ -Runtime locking correctness validator -===================================== - -started by Ingo Molnar -additions by Arjan van de Ven - -Lock-class ----------- - -The basic object the validator operates upon is a 'class' of locks. - -A class of locks is a group of locks that are logically the same with -respect to locking rules, even if the locks may have multiple (possibly -tens of thousands of) instantiations. For example a lock in the inode -struct is one class, while each inode has its own instantiation of that -lock class. 
- -The validator tracks the 'state' of lock-classes, and it tracks -dependencies between different lock-classes. The validator maintains a -rolling proof that the state and the dependencies are correct. - -Unlike an lock instantiation, the lock-class itself never goes away: when -a lock-class is used for the first time after bootup it gets registered, -and all subsequent uses of that lock-class will be attached to this -lock-class. - -State ------ - -The validator tracks lock-class usage history into 4n + 1 separate state bits: - -- 'ever held in STATE context' -- 'ever held as readlock in STATE context' -- 'ever held with STATE enabled' -- 'ever held as readlock with STATE enabled' - -Where STATE can be either one of (kernel/lockdep_states.h) - - hardirq - - softirq - - reclaim_fs - -- 'ever used' [ == !unused ] - -When locking rules are violated, these state bits are presented in the -locking error messages, inside curlies. A contrived example: - - modprobe/2287 is trying to acquire lock: - (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 - - but task is already holding lock: - (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 - - -The bit position indicates STATE, STATE-read, for each of the states listed -above, and the character displayed in each indicates: - - '.' acquired while irqs disabled and not in irq context - '-' acquired in irq context - '+' acquired with irqs enabled - '?' acquired in irq context with irqs enabled. - -Unused mutexes cannot be part of the cause of an error. - - -Single-lock state rules: ------------------------- - -A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The -following states are exclusive, and only one of them is allowed to be -set for any lock-class: - - <hardirq-safe> and <hardirq-unsafe> - <softirq-safe> and <softirq-unsafe> - -The validator detects and reports lock usage that violate these -single-lock state rules.
- -Multi-lock dependency rules: ----------------------------- - -The same lock-class must not be acquired twice, because this could lead -to lock recursion deadlocks. - -Furthermore, two locks may not be taken in different order: - - <L1> -> <L2> - <L2> -> <L1> - -because this could lead to lock inversion deadlocks. (The validator -finds such dependencies in arbitrary complexity, i.e. there can be any -other locking sequence between the acquire-lock operations, the -validator will still track all dependencies between locks.) - -Furthermore, the following usage based lock dependencies are not allowed -between any two lock-classes: - - <hardirq-safe> -> <hardirq-unsafe> - <softirq-safe> -> <softirq-unsafe> - -The first rule comes from the fact the a hardirq-safe lock could be -taken by a hardirq context, interrupting a hardirq-unsafe lock - and -thus could result in a lock inversion deadlock. Likewise, a softirq-safe -lock could be taken by an softirq context, interrupting a softirq-unsafe -lock. - -The above rules are enforced for any locking sequence that occurs in the -kernel: when acquiring a new lock, the validator checks whether there is -any rule violation between the new lock and any of the held locks. - -When a lock-class changes its state, the following aspects of the above -dependency rules are enforced: - -- if a new hardirq-safe lock is discovered, we check whether it - took any hardirq-unsafe lock in the past. - -- if a new softirq-safe lock is discovered, we check whether it took - any softirq-unsafe lock in the past. - -- if a new hardirq-unsafe lock is discovered, we check whether any - hardirq-safe lock took it in the past. - -- if a new softirq-unsafe lock is discovered, we check whether any - softirq-safe lock took it in the past. - -(Again, we do these checks too on the basis that an interrupt context -could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which -could lead to a lock inversion deadlock - even if that lock scenario did -not trigger in practice yet.)
- -Exception: Nested data dependencies leading to nested locking -------------------------------------------------------------- - -There are a few cases where the Linux kernel acquires more than one -instance of the same lock-class. Such cases typically happen when there -is some sort of hierarchy within objects of the same type. In these -cases there is an inherent "natural" ordering between the two objects -(defined by the properties of the hierarchy), and the kernel grabs the -locks in this fixed order on each of the objects. - -An example of such an object hierarchy that results in "nested locking" -is that of a "whole disk" block-dev object and a "partition" block-dev -object; the partition is "part of" the whole device and as long as one -always takes the whole disk lock as a higher lock than the partition -lock, the lock ordering is fully correct. The validator does not -automatically detect this natural ordering, as the locking rule behind -the ordering is not static. - -In order to teach the validator about this correct usage model, new -versions of the various locking primitives were added that allow you to -specify a "nesting level". An example call, for the block device mutex, -looks like this: - -enum bdev_bd_mutex_lock_class -{ - BD_MUTEX_NORMAL, - BD_MUTEX_WHOLE, - BD_MUTEX_PARTITION -}; - - mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION); - -In this case the locking is done on a bdev object that is known to be a -partition. - -The validator treats a lock that is taken in such a nested fashion as a -separate (sub)class for the purposes of validation. - -Note: When changing code to use the _nested() primitives, be careful and -check really thoroughly that the hierarchy is correctly mapped; otherwise -you can get false positives or false negatives. 
- -Proof of 100% correctness: --------------------------- - -The validator achieves perfect, mathematical 'closure' (proof of locking -correctness) in the sense that for every simple, standalone single-task -locking sequence that occurred at least once during the lifetime of the -kernel, the validator proves it with a 100% certainty that no -combination and timing of these locking sequences can cause any class of -lock related deadlock. [*] - -I.e. complex multi-CPU and multi-task locking scenarios do not have to -occur in practice to prove a deadlock: only the simple 'component' -locking chains have to occur at least once (anytime, in any -task/context) for the validator to be able to prove correctness. (For -example, complex deadlocks that would normally need more than 3 CPUs and -a very unlikely constellation of tasks, irq-contexts and timings to -occur, can be detected on a plain, lightly loaded single-CPU system as -well!) - -This radically decreases the complexity of locking related QA of the -kernel: what has to be done during QA is to trigger as many "simple" -single-task locking dependencies in the kernel as possible, at least -once, to prove locking correctness - instead of having to trigger every -possible combination of locking interaction between CPUs, combined with -every possible hardirq and softirq nesting scenario (which is impossible -to do in practice). - -[*] assuming that the validator itself is 100% correct, and no other - part of the system corrupts the state of the validator in any way. - We also assume that all NMI/SMM paths [which could interrupt - even hardirq-disabled codepaths] are correct and do not interfere - with the validator. We also assume that the 64-bit 'chain hash' - value is unique for every lock-chain in the system. Also, lock - recursion must not be higher than 20. - -Performance: ------------- - -The above rules require _massive_ amounts of runtime checking. 
If we did -that for every lock taken and for every irqs-enable event, it would -render the system practically unusably slow. The complexity of checking -is O(N^2), so even with just a few hundred lock-classes we'd have to do -tens of thousands of checks for every event. - -This problem is solved by checking any given 'locking scenario' (unique -sequence of locks taken after each other) only once. A simple stack of -held locks is maintained, and a lightweight 64-bit hash value is -calculated, which hash is unique for every lock chain. The hash value, -when the chain is validated for the first time, is then put into a hash -table, which hash-table can be checked in a lockfree manner. If the -locking chain occurs again later on, the hash table tells us that we -dont have to validate the chain again. - -Troubleshooting: ----------------- - -The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes. -Exceeding this number will trigger the following lockdep warning: - - (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - -By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical -desktop systems have less than 1,000 lock classes, so this warning -normally results from lock-class leakage or failure to properly -initialize locks. These two problems are illustrated below: - -1. Repeated module loading and unloading while running the validator - will result in lock-class leakage. The issue here is that each - load of the module will create a new set of lock classes for - that module's locks, but module unloading does not remove old - classes (see below discussion of reuse of lock classes for why). - Therefore, if that module is loaded and unloaded repeatedly, - the number of lock classes will eventually reach the maximum. - -2. Using structures such as arrays that have large numbers of - locks that are not explicitly initialized. 
For example, - a hash table with 8192 buckets where each bucket has its own - spinlock_t will consume 8192 lock classes -unless- each spinlock - is explicitly initialized at runtime, for example, using the - run-time spin_lock_init() as opposed to compile-time initializers - such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize - the per-bucket spinlocks would guarantee lock-class overflow. - In contrast, a loop that called spin_lock_init() on each lock - would place all 8192 locks into a single lock class. - - The moral of this story is that you should always explicitly - initialize your locks. - -One might argue that the validator should be modified to allow -lock classes to be reused. However, if you are tempted to make this -argument, first review the code and think through the changes that would -be required, keeping in mind that the lock classes to be removed are -likely to be linked into the lock-dependency graph. This turns out to -be harder to do than to say. - -Of course, if you do run out of lock classes, the next thing to do is -to find the offending lock classes. First, the following command gives -you the number of lock classes currently in use along with the maximum: - - grep "lock-classes" /proc/lockdep_stats - -This command produces the following output on a modest system: - - lock-classes: 748 [max: 8191] - -If the number allocated (748 above) increases continually over time, -then there is likely a leak. The following command can be used to -identify the leaking lock classes: - - grep "BD" /proc/lockdep - -Run the command and save the output, then compare against the output from -a later run of this command to identify the leakers. This same output -can also help you find situations where runtime lock initialization has -been omitted. 
diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt new file mode 100644 index 000000000000..5dbc99c04f6e --- /dev/null +++ b/Documentation/locking/lockdep-design.txt @@ -0,0 +1,286 @@ +Runtime locking correctness validator +===================================== + +started by Ingo Molnar +additions by Arjan van de Ven + +Lock-class +---------- + +The basic object the validator operates upon is a 'class' of locks. + +A class of locks is a group of locks that are logically the same with +respect to locking rules, even if the locks may have multiple (possibly +tens of thousands of) instantiations. For example a lock in the inode +struct is one class, while each inode has its own instantiation of that +lock class. + +The validator tracks the 'state' of lock-classes, and it tracks +dependencies between different lock-classes. The validator maintains a +rolling proof that the state and the dependencies are correct. + +Unlike a lock instantiation, the lock-class itself never goes away: when +a lock-class is used for the first time after bootup it gets registered, +and all subsequent uses of that lock-class will be attached to this +lock-class. + +State +----- + +The validator tracks lock-class usage history into 4n + 1 separate state bits: + +- 'ever held in STATE context' +- 'ever held as readlock in STATE context' +- 'ever held with STATE enabled' +- 'ever held as readlock with STATE enabled' + +Where STATE can be either one of (kernel/lockdep_states.h) + - hardirq + - softirq + - reclaim_fs + +- 'ever used' [ == !unused ] + +When locking rules are violated, these state bits are presented in the +locking error messages, inside curlies.
A contrived example: + + modprobe/2287 is trying to acquire lock: + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 + + but task is already holding lock: + (&sio_locks[i].lock){-.-...}, at: [] mutex_lock+0x21/0x24 + + +The bit position indicates STATE, STATE-read, for each of the states listed +above, and the character displayed in each indicates: + + '.' acquired while irqs disabled and not in irq context + '-' acquired in irq context + '+' acquired with irqs enabled + '?' acquired in irq context with irqs enabled. + +Unused mutexes cannot be part of the cause of an error. + + +Single-lock state rules: +------------------------ + +A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The +following states are exclusive, and only one of them is allowed to be +set for any lock-class: + + <hardirq-safe> and <hardirq-unsafe> + <softirq-safe> and <softirq-unsafe> + +The validator detects and reports lock usage that violates these +single-lock state rules. + +Multi-lock dependency rules: +---------------------------- + +The same lock-class must not be acquired twice, because this could lead +to lock recursion deadlocks. + +Furthermore, two locks may not be taken in different order: + + <L1> -> <L2> + <L2> -> <L1> + +because this could lead to lock inversion deadlocks. (The validator +finds such dependencies in arbitrary complexity, i.e. there can be any +other locking sequence between the acquire-lock operations, the +validator will still track all dependencies between locks.) + +Furthermore, the following usage based lock dependencies are not allowed +between any two lock-classes: + + <hardirq-safe> -> <hardirq-unsafe> + <softirq-safe> -> <softirq-unsafe> + +The first rule comes from the fact that a hardirq-safe lock could be +taken by a hardirq context, interrupting a hardirq-unsafe lock - and +thus could result in a lock inversion deadlock. Likewise, a softirq-safe +lock could be taken by a softirq context, interrupting a softirq-unsafe +lock.
+ +The above rules are enforced for any locking sequence that occurs in the +kernel: when acquiring a new lock, the validator checks whether there is +any rule violation between the new lock and any of the held locks. + +When a lock-class changes its state, the following aspects of the above +dependency rules are enforced: + +- if a new hardirq-safe lock is discovered, we check whether it + took any hardirq-unsafe lock in the past. + +- if a new softirq-safe lock is discovered, we check whether it took + any softirq-unsafe lock in the past. + +- if a new hardirq-unsafe lock is discovered, we check whether any + hardirq-safe lock took it in the past. + +- if a new softirq-unsafe lock is discovered, we check whether any + softirq-safe lock took it in the past. + +(Again, we do these checks too on the basis that an interrupt context +could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which +could lead to a lock inversion deadlock - even if that lock scenario did +not trigger in practice yet.) + +Exception: Nested data dependencies leading to nested locking +------------------------------------------------------------- + +There are a few cases where the Linux kernel acquires more than one +instance of the same lock-class. Such cases typically happen when there +is some sort of hierarchy within objects of the same type. In these +cases there is an inherent "natural" ordering between the two objects +(defined by the properties of the hierarchy), and the kernel grabs the +locks in this fixed order on each of the objects. + +An example of such an object hierarchy that results in "nested locking" +is that of a "whole disk" block-dev object and a "partition" block-dev +object; the partition is "part of" the whole device and as long as one +always takes the whole disk lock as a higher lock than the partition +lock, the lock ordering is fully correct. 
The validator does not +automatically detect this natural ordering, as the locking rule behind +the ordering is not static. + +In order to teach the validator about this correct usage model, new +versions of the various locking primitives were added that allow you to +specify a "nesting level". An example call, for the block device mutex, +looks like this: + +enum bdev_bd_mutex_lock_class +{ + BD_MUTEX_NORMAL, + BD_MUTEX_WHOLE, + BD_MUTEX_PARTITION +}; + + mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION); + +In this case the locking is done on a bdev object that is known to be a +partition. + +The validator treats a lock that is taken in such a nested fashion as a +separate (sub)class for the purposes of validation. + +Note: When changing code to use the _nested() primitives, be careful and +check really thoroughly that the hierarchy is correctly mapped; otherwise +you can get false positives or false negatives. + +Proof of 100% correctness: +-------------------------- + +The validator achieves perfect, mathematical 'closure' (proof of locking +correctness) in the sense that for every simple, standalone single-task +locking sequence that occurred at least once during the lifetime of the +kernel, the validator proves it with a 100% certainty that no +combination and timing of these locking sequences can cause any class of +lock related deadlock. [*] + +I.e. complex multi-CPU and multi-task locking scenarios do not have to +occur in practice to prove a deadlock: only the simple 'component' +locking chains have to occur at least once (anytime, in any +task/context) for the validator to be able to prove correctness. (For +example, complex deadlocks that would normally need more than 3 CPUs and +a very unlikely constellation of tasks, irq-contexts and timings to +occur, can be detected on a plain, lightly loaded single-CPU system as +well!) 
+ +This radically decreases the complexity of locking related QA of the +kernel: what has to be done during QA is to trigger as many "simple" +single-task locking dependencies in the kernel as possible, at least +once, to prove locking correctness - instead of having to trigger every +possible combination of locking interaction between CPUs, combined with +every possible hardirq and softirq nesting scenario (which is impossible +to do in practice). + +[*] assuming that the validator itself is 100% correct, and no other + part of the system corrupts the state of the validator in any way. + We also assume that all NMI/SMM paths [which could interrupt + even hardirq-disabled codepaths] are correct and do not interfere + with the validator. We also assume that the 64-bit 'chain hash' + value is unique for every lock-chain in the system. Also, lock + recursion must not be higher than 20. + +Performance: +------------ + +The above rules require _massive_ amounts of runtime checking. If we did +that for every lock taken and for every irqs-enable event, it would +render the system practically unusably slow. The complexity of checking +is O(N^2), so even with just a few hundred lock-classes we'd have to do +tens of thousands of checks for every event. + +This problem is solved by checking any given 'locking scenario' (unique +sequence of locks taken after each other) only once. A simple stack of +held locks is maintained, and a lightweight 64-bit hash value is +calculated, which hash is unique for every lock chain. The hash value, +when the chain is validated for the first time, is then put into a hash +table, which hash-table can be checked in a lockfree manner. If the +locking chain occurs again later on, the hash table tells us that we +don't have to validate the chain again. + +Troubleshooting: +---------------- + +The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
+Exceeding this number will trigger the following lockdep warning: + + (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + +By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical +desktop systems have less than 1,000 lock classes, so this warning +normally results from lock-class leakage or failure to properly +initialize locks. These two problems are illustrated below: + +1. Repeated module loading and unloading while running the validator + will result in lock-class leakage. The issue here is that each + load of the module will create a new set of lock classes for + that module's locks, but module unloading does not remove old + classes (see below discussion of reuse of lock classes for why). + Therefore, if that module is loaded and unloaded repeatedly, + the number of lock classes will eventually reach the maximum. + +2. Using structures such as arrays that have large numbers of + locks that are not explicitly initialized. For example, + a hash table with 8192 buckets where each bucket has its own + spinlock_t will consume 8192 lock classes -unless- each spinlock + is explicitly initialized at runtime, for example, using the + run-time spin_lock_init() as opposed to compile-time initializers + such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize + the per-bucket spinlocks would guarantee lock-class overflow. + In contrast, a loop that called spin_lock_init() on each lock + would place all 8192 locks into a single lock class. + + The moral of this story is that you should always explicitly + initialize your locks. + +One might argue that the validator should be modified to allow +lock classes to be reused. However, if you are tempted to make this +argument, first review the code and think through the changes that would +be required, keeping in mind that the lock classes to be removed are +likely to be linked into the lock-dependency graph. This turns out to +be harder to do than to say. 
+ +Of course, if you do run out of lock classes, the next thing to do is +to find the offending lock classes. First, the following command gives +you the number of lock classes currently in use along with the maximum: + + grep "lock-classes" /proc/lockdep_stats + +This command produces the following output on a modest system: + + lock-classes: 748 [max: 8191] + +If the number allocated (748 above) increases continually over time, +then there is likely a leak. The following command can be used to +identify the leaking lock classes: + + grep "BD" /proc/lockdep + +Run the command and save the output, then compare against the output from +a later run of this command to identify the leakers. This same output +can also help you find situations where runtime lock initialization has +been omitted. diff --git a/Documentation/locking/lockstat.txt b/Documentation/locking/lockstat.txt new file mode 100644 index 000000000000..7428773a1e69 --- /dev/null +++ b/Documentation/locking/lockstat.txt @@ -0,0 +1,178 @@ + +LOCK STATISTICS + +- WHAT + +As the name suggests, it provides statistics on locks. + +- WHY + +Because things like lock contention can severely impact performance. + +- HOW + +Lockdep already has hooks in the lock functions and maps lock instances to +lock classes. We build on that (see Documentation/locking/lockdep-design.txt). +The graph below shows the relation between the lock functions and the various +hooks therein. + + __acquire + | + lock _____ + | \ + | __contended + | | + | <wait> + | _______/ + |/ + | + __acquired + | + . + <hold> + .
+ | + __release + | + unlock + +lock, unlock - the regular lock functions +__* - the hooks +<> - states + +With these hooks we provide the following statistics: + + con-bounces - number of lock contention that involved x-cpu data + contentions - number of lock acquisitions that had to wait + wait time min - shortest (non-0) time we ever had to wait for a lock + max - longest time we ever had to wait for a lock + total - total time we spend waiting on this lock + avg - average time spent waiting on this lock + acq-bounces - number of lock acquisitions that involved x-cpu data + acquisitions - number of times we took the lock + hold time min - shortest (non-0) time we ever held the lock + max - longest time we ever held the lock + total - total time this lock was held + avg - average time this lock was held + +These numbers are gathered per lock class, per read/write state (when +applicable). + +It also tracks 4 contention points per class. A contention point is a call site +that had to wait on lock acquisition. + + - CONFIGURATION + +Lock statistics are enabled via CONFIG_LOCK_STAT. 
+ + - USAGE + +Enable collection of statistics: + +# echo 1 >/proc/sys/kernel/lock_stat + +Disable collection of statistics: + +# echo 0 >/proc/sys/kernel/lock_stat + +Look at the current lock statistics: + +( line numbers not part of actual output, done for clarity in the explanation + below ) + +# less /proc/lock_stat + +01 lock_stat version 0.4 +02----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +03 class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg +04----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +05 +06 &mm->mmap_sem-W: 46 84 0.26 939.10 16371.53 194.90 47291 2922365 0.16 2220301.69 17464026916.32 5975.99 +07 &mm->mmap_sem-R: 37 100 1.31 299502.61 325629.52 3256.30 212344 34316685 0.10 7744.91 95016910.20 2.77 +08 --------------- +09 &mm->mmap_sem 1 [] khugepaged_scan_mm_slot+0x57/0x280 +19 &mm->mmap_sem 96 [] __do_page_fault+0x1d4/0x510 +11 &mm->mmap_sem 34 [] vm_mmap_pgoff+0x87/0xd0 +12 &mm->mmap_sem 17 [] vm_munmap+0x41/0x80 +13 --------------- +14 &mm->mmap_sem 1 [] dup_mmap+0x2a/0x3f0 +15 &mm->mmap_sem 60 [] SyS_mprotect+0xe9/0x250 +16 &mm->mmap_sem 41 [] __do_page_fault+0x1d4/0x510 +17 &mm->mmap_sem 68 [] vm_mmap_pgoff+0x87/0xd0 +18 +19............................................................................................................................................................................................................................. 
+20 +21 unix_table_lock: 110 112 0.21 49.24 163.91 1.46 21094 66312 0.12 624.42 31589.81 0.48 +22 --------------- +23 unix_table_lock 45 [] unix_create1+0x16e/0x1b0 +24 unix_table_lock 47 [] unix_release_sock+0x31/0x250 +25 unix_table_lock 15 [] unix_find_other+0x117/0x230 +26 unix_table_lock 5 [] unix_autobind+0x11f/0x1b0 +27 --------------- +28 unix_table_lock 39 [] unix_release_sock+0x31/0x250 +29 unix_table_lock 49 [] unix_create1+0x16e/0x1b0 +30 unix_table_lock 20 [] unix_find_other+0x117/0x230 +31 unix_table_lock 4 [] unix_autobind+0x11f/0x1b0 + + +This excerpt shows the first two lock class statistics. Line 01 shows the +output version - each time the format changes this will be updated. Line 02-04 +show the header with column descriptions. Lines 05-18 and 20-31 show the actual +statistics. These statistics come in two parts; the actual stats separated by a +short separator (line 08, 13) from the contention points. + +The first lock (05-18) is a read/write lock, and shows two lines above the +short separator. The contention points don't match the column descriptors, +they have two: contentions and [] symbol. The second set of contention +points are the points we're contending with. + +The integer part of the time values is in us. + +Dealing with nested locks, subclasses may appear: + +32........................................................................................................................................................................................................................... 
+33 +34 &rq->lock: 13128 13128 0.43 190.53 103881.26 7.91 97454 3453404 0.00 401.11 13224683.11 3.82 +35 --------- +36 &rq->lock 645 [] task_rq_lock+0x43/0x75 +37 &rq->lock 297 [] try_to_wake_up+0x127/0x25a +38 &rq->lock 360 [] select_task_rq_fair+0x1f0/0x74a +39 &rq->lock 428 [] scheduler_tick+0x46/0x1fb +40 --------- +41 &rq->lock 77 [] task_rq_lock+0x43/0x75 +42 &rq->lock 174 [] try_to_wake_up+0x127/0x25a +43 &rq->lock 4715 [] double_rq_lock+0x42/0x54 +44 &rq->lock 893 [] schedule+0x157/0x7b8 +45 +46........................................................................................................................................................................................................................... +47 +48 &rq->lock/1: 1526 11488 0.33 388.73 136294.31 11.86 21461 38404 0.00 37.93 109388.53 2.84 +49 ----------- +50 &rq->lock/1 11526 [] double_rq_lock+0x4f/0x54 +51 ----------- +52 &rq->lock/1 5645 [] double_rq_lock+0x42/0x54 +53 &rq->lock/1 1224 [] schedule+0x157/0x7b8 +54 &rq->lock/1 4336 [] double_rq_lock+0x4f/0x54 +55 &rq->lock/1 181 [] try_to_wake_up+0x127/0x25a + +Line 48 shows statistics for the second subclass (/1) of &rq->lock class +(subclass starts from 0), since in this case, as line 50 suggests, +double_rq_lock actually acquires a nested lock of two spinlocks. 
+ +View the top contending locks: + +# grep : /proc/lock_stat | head + clockevents_lock: 2926159 2947636 0.15 46882.81 1784540466.34 605.41 3381345 3879161 0.00 2260.97 53178395.68 13.71 + tick_broadcast_lock: 346460 346717 0.18 2257.43 39364622.71 113.54 3642919 4242696 0.00 2263.79 49173646.60 11.59 + &mapping->i_mmap_mutex: 203896 203899 3.36 645530.05 31767507988.39 155800.21 3361776 8893984 0.17 2254.15 14110121.02 1.59 + &rq->lock: 135014 136909 0.18 606.09 842160.68 6.15 1540728 10436146 0.00 728.72 17606683.41 1.69 + &(&zone->lru_lock)->rlock: 93000 94934 0.16 59.18 188253.78 1.98 1199912 3809894 0.15 391.40 3559518.81 0.93 + tasklist_lock-W: 40667 41130 0.23 1189.42 428980.51 10.43 270278 510106 0.16 653.51 3939674.91 7.72 + tasklist_lock-R: 21298 21305 0.20 1310.05 215511.12 10.12 186204 241258 0.14 1162.33 1179779.23 4.89 + rcu_node_1: 47656 49022 0.16 635.41 193616.41 3.95 844888 1865423 0.00 764.26 1656226.96 0.89 + &(&dentry->d_lockref.lock)->rlock: 39791 40179 0.15 1302.08 88851.96 2.21 2790851 12527025 0.10 1910.75 3379714.27 0.27 + rcu_node_0: 29203 30064 0.16 786.55 1555573.00 51.74 88963 244254 0.00 398.87 428872.51 1.76 + +Clear the statistics: + +# echo 0 > /proc/lock_stat diff --git a/Documentation/locking/mutex-design.txt b/Documentation/locking/mutex-design.txt new file mode 100644 index 000000000000..ee231ed09ec6 --- /dev/null +++ b/Documentation/locking/mutex-design.txt @@ -0,0 +1,157 @@ +Generic Mutex Subsystem + +started by Ingo Molnar +updated by Davidlohr Bueso + +What are mutexes? +----------------- + +In the Linux kernel, mutexes refer to a particular locking primitive +that enforces serialization on shared memory systems, and not only to +the generic term referring to 'mutual exclusion' found in academia +or similar theoretical text books. Mutexes are sleeping locks which +behave similarly to binary semaphores, and were introduced in 2006[1] +as an alternative to these. 
This new data structure provided a number +of advantages, including simpler interfaces, and at that time smaller +code (see Disadvantages). + +[1] http://lwn.net/Articles/164802/ + +Implementation +-------------- + +Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h +and implemented in kernel/locking/mutex.c. These locks use a three +state atomic counter (->count) to represent the different possible +transitions that can occur during the lifetime of a lock: + + 1: unlocked + 0: locked, no waiters + negative: locked, with potential waiters + +In its most basic form it also includes a wait-queue and a spinlock +that serializes access to it. CONFIG_SMP systems can also include +a pointer to the lock task owner (->owner) as well as a spinner MCS +lock (->osq), both described below in (ii). + +When acquiring a mutex, there are three possible paths that can be +taken, depending on the state of the lock: + +(i) fastpath: tries to atomically acquire the lock by decrementing the + counter. If it was already taken by another task it goes to the next + possible path. This logic is architecture specific. On x86-64, the + locking fastpath is 2 instructions: + + 0000000000000e10 : + e21: f0 ff 0b lock decl (%rbx) + e24: 79 08 jns e2e + + the unlocking fastpath is equally tight: + + 0000000000000bc0 : + bc8: f0 ff 07 lock incl (%rdi) + bcb: 7f 0a jg bd7 + + +(ii) midpath: aka optimistic spinning, tries to spin for acquisition + while the lock owner is running and there are no other tasks ready + to run that have higher priority (need_resched). The rationale is + that if the lock owner is running, it is likely to release the lock + soon. The mutex spinners are queued up using MCS lock so that only + one spinner can compete for the mutex. + + The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock + with the desirable properties of being fair and with each cpu trying + to acquire the lock spinning on a local variable. 
It avoids expensive + cacheline bouncing that common test-and-set spinlock implementations + incur. An MCS-like lock is specially tailored for optimistic spinning + for sleeping lock implementation. An important feature of the customized + MCS lock is that it has the extra property that spinners are able to exit + the MCS spinlock queue when they need to reschedule. This further helps + avoid situations where MCS spinners that need to reschedule would continue + waiting to spin on mutex owner, only to go directly to slowpath upon + obtaining the MCS lock. + + +(iii) slowpath: last resort, if the lock is still unable to be acquired, + the task is added to the wait-queue and sleeps until woken up by the + unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE. + +While formally kernel mutexes are sleepable locks, it is path (ii) that +makes them more practically a hybrid type. By simply not interrupting a +task and busy-waiting for a few cycles instead of immediately sleeping, +the performance of this lock has been seen to significantly improve a +number of workloads. Note that this technique is also used for rw-semaphores. + +Semantics +--------- + +The mutex subsystem checks and enforces the following rules: + + - Only one task can hold the mutex at a time. + - Only the owner can unlock the mutex. + - Multiple unlocks are not permitted. + - Recursive locking/unlocking is not permitted. + - A mutex must only be initialized via the API (see below). + - A task may not exit with a mutex held. + - Memory areas where held locks reside must not be freed. + - Held mutexes must not be reinitialized. + - Mutexes may not be used in hardware or software interrupt + contexts such as tasklets and timers. + +These semantics are fully enforced when CONFIG_DEBUG_MUTEXES is enabled.
+In addition, the mutex debugging code also implements a number of other +features that make lock debugging easier and faster: + + - Uses symbolic names of mutexes, whenever they are printed + in debug output. + - Point-of-acquire tracking, symbolic lookup of function names, + list of all locks held in the system, printout of them. + - Owner tracking. + - Detects self-recursing locks and prints out all relevant info. + - Detects multi-task circular deadlocks and prints out all affected + locks and tasks (and only those tasks). + + +Interfaces +---------- +Statically define the mutex: + DEFINE_MUTEX(name); + +Dynamically initialize the mutex: + mutex_init(mutex); + +Acquire the mutex, uninterruptible: + void mutex_lock(struct mutex *lock); + void mutex_lock_nested(struct mutex *lock, unsigned int subclass); + int mutex_trylock(struct mutex *lock); + +Acquire the mutex, interruptible: + int mutex_lock_interruptible_nested(struct mutex *lock, + unsigned int subclass); + int mutex_lock_interruptible(struct mutex *lock); + +Acquire the mutex, interruptible, if dec to 0: + int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); + +Unlock the mutex: + void mutex_unlock(struct mutex *lock); + +Test if the mutex is taken: + int mutex_is_locked(struct mutex *lock); + +Disadvantages +------------- + +Unlike its original design and purpose, 'struct mutex' is larger than +most locks in the kernel. E.g: on x86-64 it is 40 bytes, almost twice +as large as 'struct semaphore' (24 bytes) and 8 bytes shy of the +'struct rw_semaphore' variant. Larger structure sizes mean more CPU +cache and memory footprint. + +When to use mutexes +------------------- + +Unless the strict semantics of mutexes are unsuitable and/or the critical +region prevents the lock from being shared, always prefer them to any other +locking primitive. 
diff --git a/Documentation/locking/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt new file mode 100644 index 000000000000..8666070d3189 --- /dev/null +++ b/Documentation/locking/rt-mutex-design.txt @@ -0,0 +1,781 @@ +# +# Copyright (c) 2006 Steven Rostedt +# Licensed under the GNU Free Documentation License, Version 1.2 +# + +RT-mutex implementation design +------------------------------ + +This document tries to describe the design of the rtmutex.c implementation. +It doesn't describe the reasons why rtmutex.c exists. For that please see +Documentation/rt-mutex.txt. Although this document does explain problems +that happen without this code, that is only to convey the concepts needed +to understand what the code actually is doing. + +The goal of this document is to help others understand the priority +inheritance (PI) algorithm that is used, as well as reasons for the +decisions that were made to implement PI in the manner that was done. + + +Unbounded Priority Inversion +---------------------------- + +Priority inversion is when a lower priority process executes while a higher +priority process wants to run. This happens for several reasons, and +most of the time it can't be helped. Anytime a high priority process wants +to use a resource that a lower priority process has (a mutex for example), +the high priority process must wait until the lower priority process is done +with the resource. This is a priority inversion. What we want to prevent +is something called unbounded priority inversion. That is when the high +priority process is prevented from running by a lower priority process for +an undetermined amount of time. + +The classic example of unbounded priority inversion is where you have three +processes, let's call them processes A, B, and C, where A is the highest +priority process, C is the lowest, and B is in between. A tries to grab a lock +that C owns and must wait and lets C run to release the lock.
But in the +meantime, B executes, and since B is of a higher priority than C, it preempts C, +but by doing so, it is in fact preempting A which is a higher priority process. +Now there's no way of knowing how long A will be sleeping waiting for C +to release the lock, because for all we know, B is a CPU hog and will +never give C a chance to release the lock. This is called unbounded priority +inversion. + +Here's a little ASCII art to show the problem. + + grab lock L1 (owned by C) + | +A ---+ + C preempted by B + | +C +----+ + +B +--------> + B now keeps A from running. + + +Priority Inheritance (PI) +------------------------- + +There are several ways to solve this issue, but other ways are out of scope +for this document. Here we only discuss PI. + +PI is where a process inherits the priority of another process if the other +process blocks on a lock owned by the current process. To make this easier +to understand, let's use the previous example, with processes A, B, and C again. + +This time, when A blocks on the lock owned by C, C would inherit the priority +of A. So now if B becomes runnable, it would not preempt C, since C now has +the high priority of A. As soon as C releases the lock, it loses its +inherited priority, and A then can continue with the resource that C had. + +Terminology +----------- + +Here I explain some terminology that is used in this document to help describe +the design that is used to implement PI. + +PI chain - The PI chain is an ordered series of locks and processes that cause + processes to inherit priorities from a previous process that is + blocked on one of its locks. This is described in more detail + later in this document. + +mutex - In this document, to differentiate from locks that implement + PI and spin locks that are used in the PI code, from now on + the PI locks will be called a mutex. 
+ +lock - In this document from now on, I will use the term lock when + referring to spin locks that are used to protect parts of the PI + algorithm. These locks disable preemption for UP (when + CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from + entering critical sections simultaneously. + +spin lock - Same as lock above. + +waiter - A waiter is a struct that is stored on the stack of a blocked + process. Since the scope of the waiter is within the code for + a process being blocked on the mutex, it is fine to allocate + the waiter on the process's stack (local variable). This + structure holds a pointer to the task, as well as the mutex that + the task is blocked on. It also has the plist node structures to + place the task in the waiter_list of a mutex as well as the + pi_list of a mutex owner task (described below). + + waiter is sometimes used in reference to the task that is waiting + on a mutex. This is the same as waiter->task. + +waiters - A list of processes that are blocked on a mutex. + +top waiter - The highest priority process waiting on a specific mutex. + +top pi waiter - The highest priority process waiting on one of the mutexes + that a specific process owns. + +Note: task and process are used interchangeably in this document, mostly to + differentiate between two processes that are being described together. + + +PI chain +-------- + +The PI chain is a list of processes and mutexes that may cause priority +inheritance to take place. Multiple chains may converge, but a chain +would never diverge, since a process can't be blocked on more than one +mutex at a time. 
+ +Example: + + Process: A, B, C, D, E + Mutexes: L1, L2, L3, L4 + + A owns: L1 + B blocked on L1 + B owns L2 + C blocked on L2 + C owns L3 + D blocked on L3 + D owns L4 + E blocked on L4 + +The chain would be: + + E->L4->D->L3->C->L2->B->L1->A + +To show where two chains merge, we could add another process F and +another mutex L5 where B owns L5 and F is blocked on mutex L5. + +The chain for F would be: + + F->L5->B->L1->A + +Since a process may own more than one mutex, but never be blocked on more than +one, the chains merge. + +Here we show both chains: + + E->L4->D->L3->C->L2-+ + | + +->B->L1->A + | + F->L5-+ + +For PI to work, the processes at the right end of these chains (or we may +also call it the Top of the chain) must be equal to or higher in priority +than the processes to the left or below in the chain. + +Also since a mutex may have more than one process blocked on it, we can +have multiple chains merge at mutexes. If we add another process G that is +blocked on mutex L2: + + G->L2->B->L1->A + +And once again, to show how this can grow I will show the merging chains +again. + + E->L4->D->L3->C-+ + +->L2-+ + | | + G-+ +->B->L1->A + | + F->L5-+ + + +Plist +----- + +Before I go further and talk about how the PI chain is stored through lists +on both mutexes and processes, I'll explain the plist. This is similar to +the struct list_head functionality that is already in the kernel. +The implementation of plist is out of scope for this document, but it is +very important to understand what it does. + +There are a few differences between plist and list, the most important one +being that plist is a priority sorted linked list. This means that the +priorities of the plist are sorted, such that it takes O(1) to retrieve the +highest priority item in the list. Obviously this is useful to store processes +based on their priorities. 
+ +Another difference, which is important for implementation, is that, unlike +list, the head of the list is a different element than the nodes of a list. +So the head of the list is declared as struct plist_head and nodes that will +be added to the list are declared as struct plist_node. + + +Mutex Waiter List +----------------- + +Every mutex keeps track of all the waiters that are blocked on itself. The mutex +has a plist to store these waiters by priority. This list is protected by +a spin lock that is located in the struct of the mutex. This lock is called +wait_lock. Since the modification of the waiter list is never done in +interrupt context, the wait_lock can be taken without disabling interrupts. + + +Task PI List +------------ + +To keep track of the PI chains, each process has its own PI list. This is +a list of all top waiters of the mutexes that are owned by the process. +Note that this list only holds the top waiters and not all waiters that are +blocked on mutexes owned by the process. + +The top of the task's PI list is always the highest priority task that +is waiting on a mutex that is owned by the task. So if the task has +inherited a priority, it will always be the priority of the task that is +at the top of this list. + +This list is stored in the task structure of a process as a plist called +pi_list. This list is protected by a spin lock also in the task structure, +called pi_lock. This lock may also be taken in interrupt context, so when +locking the pi_lock, interrupts must be disabled. + + +Depth of the PI Chain +--------------------- + +The maximum depth of the PI chain is not dynamic, and could actually be +defined. But it is very complex to figure out, since it depends on all +the nesting of mutexes. Let's look at the example where we have 3 mutexes, +L1, L2, and L3, and four separate functions func1, func2, func3 and func4. +The following shows a locking order of L1->L2->L3, but may not actually +be directly nested that way. 
+ +void func1(void) +{ + mutex_lock(L1); + + /* do anything */ + + mutex_unlock(L1); +} + +void func2(void) +{ + mutex_lock(L1); + mutex_lock(L2); + + /* do something */ + + mutex_unlock(L2); + mutex_unlock(L1); +} + +void func3(void) +{ + mutex_lock(L2); + mutex_lock(L3); + + /* do something else */ + + mutex_unlock(L3); + mutex_unlock(L2); +} + +void func4(void) +{ + mutex_lock(L3); + + /* do something again */ + + mutex_unlock(L3); +} + +Now we add 4 processes that run each of these functions separately. +Processes A, B, C, and D which run functions func1, func2, func3 and func4 +respectively, and such that D runs first and A last. With D being preempted +in func4 in the "do something again" area, we have a locking that follows: + +D owns L3 + C blocked on L3 + C owns L2 + B blocked on L2 + B owns L1 + A blocked on L1 + +And thus we have the chain A->L1->B->L2->C->L3->D. + +This gives us a PI depth of 4 (four processes), but looking at any of the +functions individually, it seems as though they only have at most a locking +depth of two. So, although the locking depth is defined at compile time, +it still is very difficult to find the possibilities of that depth. + +Now since mutexes can be defined by user-land applications, we don't want a DOS +type of application that nests large amounts of mutexes to create a large +PI chain, and have the code holding spin locks while looking at a large +amount of data. So to prevent this, the implementation not only implements +a maximum lock depth, but also only holds at most two different locks at a +time, as it walks the PI chain. More about this below. + + +Mutex owner and flags +--------------------- + +The mutex structure contains a pointer to the owner of the mutex. If the +mutex is not owned, this owner is set to NULL. 
Since all architectures +have the task structure on at least a four byte alignment (and if this is +not true, the rtmutex.c code will be broken!), this allows for the two +least significant bits to be used as flags. This part is also described +in Documentation/locking/rt-mutex.txt, but will also be briefly described here. + +Bit 0 is used as the "Pending Owner" flag. This is described later. +Bit 1 is used as the "Has Waiters" flag. This is also described later + in more detail, but is set whenever there are waiters on a mutex. + + +cmpxchg Tricks +-------------- + +Some architectures implement an atomic cmpxchg (Compare and Exchange). This +is used (when applicable) to keep the fast path of grabbing and releasing +mutexes short. + +cmpxchg is basically the following function performed atomically: + +unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C) +{ + unsigned long T = *A; + if (*A == *B) { + *A = *C; + } + return T; +} +#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c) + +This is really nice to have, since it allows you to only update a variable +if the variable is what you expect it to be. You know if it succeeded if +the return value (the old value of A) is equal to B. + +The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If +the architecture does not support CMPXCHG, then this macro is simply set +to fail every time. But if CMPXCHG is supported, then this will +help greatly to keep the fast path short. + +The use of rt_mutex_cmpxchg with the flags in the owner field helps optimize +the system for architectures that support it. This will also be explained +later in this document. + + +Priority adjustments +-------------------- + +The implementation of the PI code in rtmutex.c has several places that a +process must adjust its priority. With the help of the pi_list of a +process it is rather easy to know what needs to be adjusted. 
+ +The functions implementing the task adjustments are rt_mutex_adjust_prio, +__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock +to already be taken), rt_mutex_getprio, and rt_mutex_setprio. + +rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio. + +rt_mutex_getprio returns the priority that the task should have. Either the +task's own normal priority, or if a process of a higher priority is waiting on +a mutex owned by the task, then that higher priority should be returned. +Since the pi_list of a task holds an order by priority list of all the top +waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs +to compare the top pi waiter to its own normal priority, and return the higher +priority back. + +(Note: if looking at the code, you will notice that the lower number of + prio is returned. This is because the prio field in the task structure + is an inverse order of the actual priority. So a "prio" of 5 is + of higher priority than a "prio" of 10.) + +__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the +result does not equal the task's current priority, then rt_mutex_setprio +is called to adjust the priority of the task to the new priority. +Note that rt_mutex_setprio is defined in kernel/sched/core.c to implement the +actual change in priority. + +It is interesting to note that __rt_mutex_adjust_prio can either increase +or decrease the priority of the task. In the case that a higher priority +process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio +would increase/boost the task's priority. But if a higher priority task +were for some reason to leave the mutex (timeout or signal), this same function +would decrease/unboost the priority of the task. 
That is because the pi_list +always contains the highest priority task that is waiting on a mutex owned +by the task, so we only need to compare the priority of that top pi waiter +to the normal priority of the given task. + + +High level overview of the PI chain walk +---------------------------------------- + +The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain. + +The implementation has gone through several iterations, and has ended up +with what we believe is the best. It walks the PI chain by only grabbing +at most two locks at a time, and is very efficient. + +The rt_mutex_adjust_prio_chain can be used either to boost or lower process +priorities. + +rt_mutex_adjust_prio_chain is called with a task to be checked for PI +(de)boosting (the owner of a mutex that a process is blocking on), a flag to +check for deadlocking, the mutex that the task owns, and a pointer to a waiter +that is the process's waiter struct that is blocked on the mutex (although this +parameter may be NULL for deboosting). + +For this explanation, I will not mention deadlock detection. This explanation +will try to stay at a high level. + +When this function is called, there are no locks held. That also means +that the state of the owner and lock can change when entered into this function. + +Before this function is called, the task has already had rt_mutex_adjust_prio +performed on it. This means that the task is set to the priority that it +should be at, but the plist nodes of the task's waiter have not been updated +with the new priorities, and that this task may not be in the proper locations +in the pi_lists and wait_lists that the task is blocked on. This function +solves all that. + +A loop is entered, where task is the owner to be checked for PI changes that +was passed by parameter (for the first iteration). The pi_lock of this task is +taken to prevent any more changes to the pi_list of the task. 
This also +prevents new tasks from completing the blocking on a mutex that is owned by this +task. + +If the task is not blocked on a mutex then the loop is exited. We are at +the top of the PI chain. + +A check is now done to see if the original waiter (the process that is blocked +on the current mutex) is the top pi waiter of the task. That is, is this +waiter on the top of the task's pi_list. If it is not, it either means that +there is another process higher in priority that is blocked on one of the +mutexes that the task owns, or that the waiter has just woken up via a signal +or timeout and has left the PI chain. In either case, the loop is exited, since +we don't need to do any more changes to the priority of the current task, or any +task that owns a mutex that this current task is waiting on. A priority chain +walk is only needed when a new top pi waiter is made to a task. + +The next check sees if the task's waiter plist node has the priority equal to +the priority the task is set at. If they are equal, then we are done with +the loop. Remember that the function started with the priority of the +task adjusted, but the plist nodes that hold the task in other processes +pi_lists have not been adjusted. + +Next, we look at the mutex that the task is blocked on. The mutex's wait_lock +is taken. This is done by a spin_trylock, because the locking order of the +pi_lock and wait_lock goes in the opposite direction. If we fail to grab the +lock, the pi_lock is released, and we restart the loop. + +Now that we have both the pi_lock of the task as well as the wait_lock of +the mutex the task is blocked on, we update the task's waiter's plist node +that is located on the mutex's wait_list. + +Now we release the pi_lock of the task. + +Next the owner of the mutex has its pi_lock taken, so we can update the +task's entry in the owner's pi_list. 
If the task is the highest priority +process on the mutex's wait_list, then we remove the previous top waiter +from the owner's pi_list, and replace it with the task. + +Note: It is possible that the task was the current top waiter on the mutex, + in which case the task is not yet on the pi_list of the waiter. This + is OK, since plist_del does nothing if the plist node is not on any + list. + +If the task was not the top waiter of the mutex, but it was before we +did the priority updates, that means we are deboosting/lowering the +task. In this case, the task is removed from the pi_list of the owner, +and the new top waiter is added. + +Lastly, we unlock both the pi_lock of the task, as well as the mutex's +wait_lock, and continue the loop again. On the next iteration of the +loop, the previous owner of the mutex will be the task that will be +processed. + +Note: One might think that the owner of this mutex might have changed + since we just grabbed the mutex's wait_lock. And one could be right. + The important thing to remember is that the owner could not have + become the task that is being processed in the PI chain, since + we have taken that task's pi_lock at the beginning of the loop. + So as long as there is an owner of this mutex that is not the same + process as the task being worked on, we are OK. + + Looking closely at the code, one might be confused. The check for the + end of the PI chain is when the task isn't blocked on anything or the + task's waiter structure "task" element is NULL. This check is + protected only by the task's pi_lock. But the code to unlock the mutex + sets the task's waiter structure "task" element to NULL with only + the protection of the mutex's wait_lock, which was not taken yet. + Isn't this a race condition if the task becomes the new owner? + + The answer is No! The trick is the spin_trylock of the mutex's + wait_lock. 
If we fail that lock, we release the pi_lock of the + task and continue the loop, doing the end of PI chain check again. + + In the code to release the lock, the wait_lock of the mutex is held + the entire time, and it is not let go when we grab the pi_lock of the + new owner of the mutex. So if the switch of a new owner were to happen + after the check for end of the PI chain and the grabbing of the + wait_lock, the unlocking code would spin on the new owner's pi_lock + but never give up the wait_lock. So the PI chain loop is guaranteed to + fail the spin_trylock on the wait_lock, release the pi_lock, and + try again. + + If you don't quite understand the above, that's OK. You don't have to, + unless you really want to make a proof out of it ;) + + +Pending Owners and Lock stealing +-------------------------------- + +One of the flags in the owner field of the mutex structure is "Pending Owner". +What this means is that an owner was chosen by the process releasing the +mutex, but that owner has yet to wake up and actually take the mutex. + +Why is this important? Why can't we just give the mutex to another process +and be done with it? + +The PI code is to help with real-time processes, and to let the highest +priority process run as long as possible with little latencies and delays. +If a high priority process owns a mutex that a lower priority process is +blocked on, when the mutex is released it would be given to the lower priority +process. What if the higher priority process wants to take that mutex again. +The high priority process would fail to take that mutex that it just gave up +and it would need to boost the lower priority process to run with full +latency of that critical section (since the low priority process just entered +it). + +There's no reason a high priority process that gives up a mutex should be +penalized if it tries to take that mutex again. 
If the new owner of the +mutex has not woken up yet, there's no reason that the higher priority process +could not take that mutex away. + +To solve this, we introduced Pending Ownership and Lock Stealing. When a +new process is given a mutex that it was blocked on, it is only given +pending ownership. This means that it's the new owner, unless a higher +priority process comes in and tries to grab that mutex. If a higher priority +process does come along and wants that mutex, we let the higher priority +process "steal" the mutex from the pending owner (only if it is still pending) +and continue with the mutex. + + +Taking of a mutex (The walk through) +------------------------------------ + +OK, now let's take a look at the detailed walk through of what happens when +taking a mutex. + +The first thing that is tried is the fast taking of the mutex. This is +done when we have CMPXCHG enabled (otherwise the fast taking automatically +fails). Only when the owner field of the mutex is NULL can the lock be +taken with the CMPXCHG and nothing else needs to be done. + +If there is contention on the lock, whether it is owned or pending owner +we go about the slow path (rt_mutex_slowlock). + +The slow path function is where the task's waiter structure is created on +the stack. This is because the waiter structure is only needed for the +scope of this function. The waiter structure holds the nodes to store +the task on the wait_list of the mutex, and if need be, the pi_list of +the owner. + +The wait_lock of the mutex is taken since the slow path of unlocking the +mutex also takes this lock. + +We then call try_to_take_rt_mutex. This is where the architecture that +does not implement CMPXCHG would always grab the lock (if there's no +contention). + +try_to_take_rt_mutex is used every time the task tries to grab a mutex in the +slow path. The first thing that is done here is an atomic setting of +the "Has Waiters" flag of the mutex's owner field. 
Yes, this could really +be false, because if the mutex has no owner, there are no waiters and +the current task also won't have any waiters. But we don't have the lock +yet, so we assume we are going to be a waiter. The reason for this is to +play nice for those architectures that do have CMPXCHG. By setting this flag +now, the owner of the mutex can't release the mutex without going into the +slow unlock path, and it would then need to grab the wait_lock, which this +code currently holds. So setting the "Has Waiters" flag forces the owner +to synchronize with this code. + +Now that we know that we can't have any races with the owner releasing the +mutex, we check to see if we can take the ownership. This is done if the +mutex doesn't have an owner, or if we can steal the mutex from a pending +owner. Let's look at the situations we have here. + + 1) Has owner that is pending + ---------------------------- + + The mutex has an owner, but it hasn't woken up and the mutex flag + "Pending Owner" is set. The first check is to see if the owner isn't the + current task. This is because this function is also used for the pending + owner to grab the mutex. When a pending owner wakes up, it checks to see + if it can take the mutex, and this is done if the owner is already set to + itself. If so, we succeed and leave the function, clearing the "Pending + Owner" bit. + + If the pending owner is not current, we check to see if the current priority is + higher than the pending owner. If not, we fail the function and return. + + There's also something special about a pending owner. That is a pending owner + is never blocked on a mutex. So there is no PI chain to worry about. It also + means that if the mutex doesn't have any waiters, there's no accounting needed + to update the pending owner's pi_list, since we only worry about processes + blocked on the current mutex. 
+ + If there are waiters on this mutex, and we just stole the ownership, we need + to take the top waiter, remove it from the pi_list of the pending owner, and + add it to the current pi_list. Note that at this moment, the pending owner + is no longer on the list of waiters. This is fine, since the pending owner + would add itself back when it realizes that it had the ownership stolen + from itself. When the pending owner tries to grab the mutex, it will fail + in try_to_take_rt_mutex if the owner field points to another process. + + 2) No owner + ----------- + + If there is no owner (or we successfully stole the lock), we set the owner + of the mutex to current, and set the flag of "Has Waiters" if the current + mutex actually has waiters, or we clear the flag if it doesn't. See, it was + OK that we set that flag early, since now it is cleared. + + 3) Failed to grab ownership + --------------------------- + + The most interesting case is when we fail to take ownership. This means that + there exists an owner, or there's a pending owner with equal or higher + priority than the current task. + +We'll continue on the failed case. + +If the mutex has a timeout, we set up a timer to go off to break us out +of this mutex if we failed to get it after a specified amount of time. + +Now we enter a loop that will continue to try to take ownership of the mutex, or +fail from a timeout or signal. + +Once again we try to take the mutex. This will usually fail the first time +in the loop, since it had just failed to get the mutex. But the second time +in the loop, this would likely succeed, since the task would likely be +the pending owner. + +If the mutex is TASK_INTERRUPTIBLE a check for signals and timeout is done +here. + +The waiter structure has a "task" field that points to the task that is blocked +on the mutex. This field can be NULL the first time it goes through the loop +or if the task is a pending owner and had its mutex stolen. 
If the "task" +field is NULL then we need to set up the accounting for it. + +Task blocks on mutex +-------------------- + +The accounting of a mutex and process is done with the waiter structure of +the process. The "task" field is set to the process, and the "lock" field +to the mutex. The plist nodes are initialized to the process's current +priority. + +Since the wait_lock was taken at the entry of the slow lock, we can safely +add the waiter to the wait_list. If the current process is the highest +priority process currently waiting on this mutex, then we remove the +previous top waiter process (if it exists) from the pi_list of the owner, +and add the current process to that list. Since the pi_list of the owner +has changed, we call rt_mutex_adjust_prio on the owner to see if the owner +should adjust its priority accordingly. + +If the owner is also blocked on a lock, and had its pi_list changed +(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead +and run rt_mutex_adjust_prio_chain on the owner, as described earlier. + +Now all locks are released, and if the current process is still blocked on a +mutex (waiter "task" field is not NULL), then we go to sleep (call schedule). + +Waking up in the loop +--------------------- + +The schedule can then wake up for a few reasons. + 1) we were given pending ownership of the mutex. + 2) we received a signal and were TASK_INTERRUPTIBLE + 3) we had a timeout and were TASK_INTERRUPTIBLE + +In any of these cases, we continue the loop and once again try to grab the +ownership of the mutex. If we succeed, we exit the loop, otherwise we continue +and on signal and timeout, will exit the loop, or if we had the mutex stolen +we just simply add ourselves back on the lists and go back to sleep. + +Note: For various reasons, because of timeout and signals, the steal mutex + algorithm needs to be careful. This is because the current process is + still on the wait_list. 
And because of dynamic changing of priorities, + especially on SCHED_OTHER tasks, the current process can be the + highest priority task on the wait_list. + +Failed to get mutex on Timeout or Signal +---------------------------------------- + +If a timeout or signal occurred, the waiter's "task" field would not be +NULL and the task needs to be taken off the wait_list of the mutex and perhaps +pi_list of the owner. If this process was a high priority process, then +the rt_mutex_adjust_prio_chain needs to be executed again on the owner, +but this time it will be lowering the priorities. + + +Unlocking the Mutex +------------------- + +The unlocking of a mutex also has a fast path for those architectures with +CMPXCHG. Since the taking of a mutex on contention always sets the +"Has Waiters" flag of the mutex's owner, we use this to know if we need to +take the slow path when unlocking the mutex. If the mutex doesn't have any +waiters, the owner field of the mutex would equal the current process and +the mutex can be unlocked by just replacing the owner field with NULL. + +If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available), +the slow unlock path is taken. + +The first thing done in the slow unlock path is to take the wait_lock of the +mutex. This synchronizes the locking and unlocking of the mutex. + +A check is made to see if the mutex has waiters or not. On architectures that +do not have CMPXCHG, this is the location that the owner of the mutex will +determine if a waiter needs to be awoken or not. On architectures that +do have CMPXCHG, that check is done in the fast path, but it is still needed +in the slow path too. If a waiter of a mutex woke up because of a signal +or timeout between the time the owner failed the fast path CMPXCHG check and +the grabbing of the wait_lock, the mutex may not have any waiters, thus the +owner still needs to make this check. 
If there are no waiters then the mutex +owner field is set to NULL, the wait_lock is released and nothing more is +needed. + +If there are waiters, then we need to wake one up and give that waiter +pending ownership. + +On the wake up code, the pi_lock of the current owner is taken. The top +waiter of the lock is found and removed from the wait_list of the mutex +as well as the pi_list of the current owner. The task field of the new +pending owner's waiter structure is set to NULL, and the owner field of the +mutex is set to the new owner with the "Pending Owner" bit set, as well +as the "Has Waiters" bit if there still are other processes blocked on the +mutex. + +The pi_lock of the previous owner is released, and the new pending owner's +pi_lock is taken. Remember that this is the trick to prevent the race +condition in rt_mutex_adjust_prio_chain from adding itself as a waiter +on the mutex. + +We now clear the "pi_blocked_on" field of the new pending owner, and if +the mutex still has waiters pending, we add the new top waiter to the pi_list +of the pending owner. + +Finally we unlock the pi_lock of the pending owner and wake it up. + + +Contact +------- + +For updates on this document, please email Steven Rostedt + + +Credits +------- + +Author: Steven Rostedt + +Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap + +Updates +------- + +This document was originally written for 2.6.17-rc3-mm1 diff --git a/Documentation/locking/rt-mutex.txt b/Documentation/locking/rt-mutex.txt new file mode 100644 index 000000000000..243393d882ee --- /dev/null +++ b/Documentation/locking/rt-mutex.txt @@ -0,0 +1,79 @@ +RT-mutex subsystem with PI support +---------------------------------- + +RT-mutexes with priority inheritance are used to support PI-futexes, +which enable pthread_mutex_t priority inheritance attributes +(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details +about PI-futexes.] 
+ +This technology was developed in the -rt tree and streamlined for +pthread_mutex support. + +Basic principles: +----------------- + +RT-mutexes extend the semantics of simple mutexes by the priority +inheritance protocol. + +A low priority owner of a rt-mutex inherits the priority of a higher +priority waiter until the rt-mutex is released. If the temporarily +boosted owner blocks on a rt-mutex itself it propagates the priority +boosting to the owner of the other rt_mutex it gets blocked on. The +priority boosting is immediately removed once the rt_mutex has been +unlocked. + +This approach allows us to shorten the block of high-prio tasks on +mutexes which protect shared resources. Priority inheritance is not a +magic bullet for poorly designed applications, but it allows +well-designed applications to use userspace locks in critical parts of +a high priority thread, without losing determinism. + +The enqueueing of the waiters into the rtmutex waiter list is done in +priority order. For same priorities FIFO order is chosen. For each +rtmutex, only the top priority waiter is enqueued into the owner's +priority waiters list. This list too queues in priority order. Whenever +the top priority waiter of a task changes (for example it timed out or +got a signal), the priority of the owner task is readjusted. [The +priority enqueueing is handled by "plists", see include/linux/plist.h +for more details.] + +RT-mutexes are optimized for fastpath operations and have no internal +locking overhead when locking an uncontended mutex or unlocking a mutex +without waiters. The optimized fastpath operations require cmpxchg +support. [If that is not available then the rt-mutex internal spinlock +is used.] + +The state of the rt-mutex is tracked via the owner field of the rt-mutex +structure: + +rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1 +are used to keep track of the "owner is pending" and "rtmutex has +waiters" state. 
+ + owner bit1 bit0 + NULL 0 0 mutex is free (fast acquire possible) + NULL 0 1 invalid state + NULL 1 0 Transitional state* + NULL 1 1 invalid state + taskpointer 0 0 mutex is held (fast release possible) + taskpointer 0 1 task is pending owner + taskpointer 1 0 mutex is held and has waiters + taskpointer 1 1 task is pending owner and mutex has waiters + +Pending-ownership handling is a performance optimization: +pending-ownership is assigned to the first (highest priority) waiter of +the mutex, when the mutex is released. The thread is woken up and once +it starts executing it can acquire the mutex. Until the mutex is taken +by it (bit 0 is cleared) a competing higher priority thread can "steal" +the mutex which puts the woken up thread back on the waiters list. + +The pending-ownership optimization is especially important for the +uninterrupted workflow of high-prio tasks which repeatedly +takes/releases locks that have lower-prio waiters. Without this +optimization the higher-prio thread would ping-pong to the lower-prio +task [because at unlock time we always assign a new owner]. + +(*) The "mutex has waiters" bit gets set to take the lock. If the lock +doesn't already have an owner, this bit is quickly cleared if there are +no waiters. So this is a transitional state to synchronize with looking +at the owner field of the mutex and the mutex owner releasing the lock. diff --git a/Documentation/locking/spinlocks.txt b/Documentation/locking/spinlocks.txt new file mode 100644 index 000000000000..ff35e40bdf5b --- /dev/null +++ b/Documentation/locking/spinlocks.txt @@ -0,0 +1,167 @@ +Lesson 1: Spin locks + +The most basic primitive for locking is spinlock. + +static DEFINE_SPINLOCK(xxx_lock); + + unsigned long flags; + + spin_lock_irqsave(&xxx_lock, flags); + ... critical section here .. + spin_unlock_irqrestore(&xxx_lock, flags); + +The above is always safe. 
It will disable interrupts _locally_, but the +spinlock itself will guarantee the global lock, so it will guarantee that +there is only one thread-of-control within the region(s) protected by that +lock. This works well even under UP also, so the code does _not_ need to +worry about UP vs SMP issues: the spinlocks work correctly under both. + + NOTE! Implications of spin_locks for memory are further described in: + + Documentation/memory-barriers.txt + (5) LOCK operations. + (6) UNLOCK operations. + +The above is usually pretty simple (you usually need and want only one +spinlock for most things - using more than one spinlock can make things a +lot more complex and even slower and is usually worth it only for +sequences that you _know_ need to be split up: avoid it at all cost if you +aren't sure). + +This is really the only really hard part about spinlocks: once you start +using spinlocks they tend to expand to areas you might not have noticed +before, because you have to make sure the spinlocks correctly protect the +shared data structures _everywhere_ they are used. The spinlocks are most +easily added to places that are completely independent of other code (for +example, internal driver data structures that nobody else ever touches). + + NOTE! The spin-lock is safe only when you _also_ use the lock itself + to do locking across CPU's, which implies that EVERYTHING that + touches a shared variable has to agree about the spinlock they want + to use. + +---- + +Lesson 2: reader-writer spinlocks. + +If your data accesses have a very natural pattern where you usually tend +to mostly read from the shared variables, the reader-writer locks +(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple +readers to be in the same critical region at once, but if somebody wants +to change the variables it has to get an exclusive write lock. + + NOTE! reader-writer locks require more atomic memory operations than + simple spinlocks. 
Unless the reader critical section is long, you + are better off just using spinlocks. + +The routines look the same as above: + + rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock); + + unsigned long flags; + + read_lock_irqsave(&xxx_lock, flags); + .. critical section that only reads the info ... + read_unlock_irqrestore(&xxx_lock, flags); + + write_lock_irqsave(&xxx_lock, flags); + .. read and write exclusive access to the info ... + write_unlock_irqrestore(&xxx_lock, flags); + +The above kind of lock may be useful for complex data structures like +linked lists, especially searching for entries without changing the list +itself. The read lock allows many concurrent readers. Anything that +_changes_ the list will have to get the write lock. + + NOTE! RCU is better for list traversal, but requires careful + attention to design detail (see Documentation/RCU/listRCU.txt). + +Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_ +time need to do any changes (even if you don't do it every time), you have +to get the write-lock at the very beginning. + + NOTE! We are working hard to remove reader-writer spinlocks in most + cases, so please don't add a new one without consensus. (Instead, see + Documentation/RCU/rcu.txt for complete information.) + +---- + +Lesson 3: spinlocks revisited. + +The single spin-lock primitives above are by no means the only ones. They +are the most safe ones, and the ones that work under all circumstances, +but partly _because_ they are safe they are also fairly slow. They are slower +than they'd need to be, because they do have to disable interrupts +(which is just a single instruction on a x86, but it's an expensive one - +and on other architectures it can be worse). + +If you have a case where you have to protect a data structure across +several CPU's and you want to use spinlocks you can potentially use +cheaper versions of the spinlocks. 
IFF you know that the spinlocks are +never used in interrupt handlers, you can use the non-irq versions: + + spin_lock(&lock); + ... + spin_unlock(&lock); + +(and the equivalent read-write versions too, of course). The spinlock will +guarantee the same kind of exclusive access, and it will be much faster. +This is useful if you know that the data in question is only ever +manipulated from a "process context", ie no interrupts involved. + +The reason you mustn't use these versions if you have interrupts that +play with the spinlock is that you can get deadlocks: + + spin_lock(&lock); + ... + <- interrupt comes in: + spin_lock(&lock); + +where an interrupt tries to lock an already locked variable. This is ok if +the other interrupt happens on another CPU, but it is _not_ ok if the +interrupt happens on the same CPU that already holds the lock, because the +lock will obviously never be released (because the interrupt is waiting +for the lock, and the lock-holder is interrupted by the interrupt and will +not continue until the interrupt has been processed). + +(This is also the reason why the irq-versions of the spinlocks only need +to disable the _local_ interrupts - it's ok to use spinlocks in interrupts +on other CPU's, because an interrupt on another CPU doesn't interrupt the +CPU that holds the lock, so the lock-holder can continue and eventually +releases the lock). + +Note that you can be clever with read-write locks and interrupts. For +example, if you know that the interrupt only ever gets a read-lock, then +you can use a non-irq version of read locks everywhere - because they +don't block on each other (and thus there is no dead-lock wrt interrupts). +But when you do the write-lock, you have to use the irq-safe version. + +For an example of being clever with rw-locks, see the "waitqueue_lock" +handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from +within an interrupt, they only read the queue in order to know whom to +wake up. 
So read-locks are safe (which is good: they are very common +indeed), while write-locks need to protect themselves against interrupts. + + Linus + +---- + +Reference information: + +For dynamic initialization, use spin_lock_init() or rwlock_init() as +appropriate: + + spinlock_t xxx_lock; + rwlock_t xxx_rw_lock; + + static int __init xxx_init(void) + { + spin_lock_init(&xxx_lock); + rwlock_init(&xxx_rw_lock); + ... + } + + module_init(xxx_init); + +For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or +__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate. diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt new file mode 100644 index 000000000000..8a112dc304c3 --- /dev/null +++ b/Documentation/locking/ww-mutex-design.txt @@ -0,0 +1,344 @@ +Wait/Wound Deadlock-Proof Mutex Design +====================================== + +Please read mutex-design.txt first, as it applies to wait/wound mutexes too. + +Motivation for WW-Mutexes +------------------------- + +GPU's do operations that commonly involve many buffers. Those buffers +can be shared across contexts/processes, exist in different memory +domains (for example VRAM vs system memory), and so on. And with +PRIME / dmabuf, they can even be shared across devices. So there are +a handful of situations where the driver needs to wait for buffers to +become ready. If you think about this in terms of waiting on a buffer +mutex for it to become available, this presents a problem because +there is no way to guarantee that buffers appear in a execbuf/batch in +the same order in all contexts. That is directly under control of +userspace, and a result of the sequence of GL calls that an application +makes. Which results in the potential for deadlock. 
The problem gets +more complex when you consider that the kernel may need to migrate the +buffer(s) into VRAM before the GPU operates on the buffer(s), which +may in turn require evicting some other buffers (and you don't want to +evict other buffers which are already queued up to the GPU), but for a +simplified understanding of the problem you can ignore this. + +The algorithm that the TTM graphics subsystem came up with for dealing with +this problem is quite simple. For each group of buffers (execbuf) that need +to be locked, the caller would be assigned a unique reservation id/ticket, +from a global counter. In case of deadlock while locking all the buffers +associated with an execbuf, the one with the lowest reservation ticket (i.e. +the oldest task) wins, and the one with the higher reservation id (i.e. the +younger task) unlocks all of the buffers that it has already locked, and then +tries again. + +In the RDBMS literature this deadlock handling approach is called wait/wound: +The older task waits until it can acquire the contended lock. The younger task +needs to back off and drop all the locks it is currently holding, i.e. the +younger task is wounded. + +Concepts +-------- + +Compared to normal mutexes two additional concepts/objects show up in the lock +interface for w/w mutexes: + +Acquire context: To ensure eventual forward progress it is important that a task +trying to acquire locks doesn't grab a new reservation id, but keeps the one it +acquired when starting the lock acquisition. This ticket is stored in the +acquire context. Furthermore the acquire context keeps track of debugging state +to catch w/w mutex interface abuse. + +W/w class: In contrast to normal mutexes the lock class needs to be explicit for +w/w mutexes, since it is required to initialize the acquire context. + +Furthermore there are three different classes of w/w lock acquire functions: + +* Normal lock acquisition with a context, using ww_mutex_lock. 
+ +* Slowpath lock acquisition on the contending lock, used by the wounded task + after having dropped all already acquired locks. These functions have the + _slow postfix. + + From a simple semantics point-of-view the _slow functions are not strictly + required, since simply calling the normal ww_mutex_lock functions on the + contending lock (after having dropped all other already acquired locks) will + work correctly. After all if no other ww mutex has been acquired yet there's + no deadlock potential and hence the ww_mutex_lock call will block and not + prematurely return -EDEADLK. The advantage of the _slow functions is in + interface safety: + - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow + has a void return type. Note that since ww mutex code needs loops/retries + anyway the __must_check doesn't result in spurious warnings, even though the + very first lock operation can never fail. + - When full debugging is enabled ww_mutex_lock_slow checks that all acquired + ww mutex have been released (preventing deadlocks) and makes sure that we + block on the contending lock (preventing spinning through the -EDEADLK + slowpath until the contended lock can be acquired). + +* Functions to only acquire a single w/w mutex, which results in the exact same + semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL + context. + + Again this is not strictly required. But often you only want to acquire a + single lock in which case it's pointless to set up an acquire context (and so + better to avoid grabbing a deadlock avoidance ticket). + +Of course, all the usual variants for handling wake-ups due to signals are also +provided. + +Usage +----- + +Three different ways to acquire locks within the same w/w class. 
Common +definitions for methods #1 and #2: + +static DEFINE_WW_CLASS(ww_class); + +struct obj { + struct ww_mutex lock; + /* obj data */ +}; + +struct obj_entry { + struct list_head head; + struct obj *obj; +}; + +Method 1, using a list in execbuf->buffers that's not allowed to be reordered. +This is useful if a list of required objects is already tracked somewhere. +Furthermore the lock helper can propagate the -EALREADY return code back to +the caller as a signal that an object is twice on the list. This is useful if +the list is constructed from userspace input and the ABI requires userspace to +not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl). + +int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj *res_obj = NULL; + struct obj_entry *contended_entry = NULL; + struct obj_entry *entry; + + ww_acquire_init(ctx, &ww_class); + +retry: + list_for_each_entry (entry, list, head) { + if (entry->obj == res_obj) { + res_obj = NULL; + continue; + } + ret = ww_mutex_lock(&entry->obj->lock, ctx); + if (ret < 0) { + contended_entry = entry; + goto err; + } + } + + ww_acquire_done(ctx); + return 0; + +err: + list_for_each_entry_continue_reverse (entry, list, head) + ww_mutex_unlock(&entry->obj->lock); + + if (res_obj) + ww_mutex_unlock(&res_obj->lock); + + if (ret == -EDEADLK) { + /* we lost out in a seqno race, lock and retry.. */ + ww_mutex_lock_slow(&contended_entry->obj->lock, ctx); + res_obj = contended_entry->obj; + goto retry; + } + ww_acquire_fini(ctx); + + return ret; +} + +Method 2, using a list in execbuf->buffers that can be reordered. Same semantics +of duplicate entry detection using -EALREADY as method 1 above. But the +list-reordering allows for a bit more idiomatic code. 
+ +int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj_entry *entry, *entry2; + + ww_acquire_init(ctx, &ww_class); + + list_for_each_entry (entry, list, head) { + ret = ww_mutex_lock(&entry->obj->lock, ctx); + if (ret < 0) { + entry2 = entry; + + list_for_each_entry_continue_reverse (entry2, list, head) + ww_mutex_unlock(&entry2->obj->lock); + + if (ret != -EDEADLK) { + ww_acquire_fini(ctx); + return ret; + } + + /* we lost out in a seqno race, lock and retry.. */ + ww_mutex_lock_slow(&entry->obj->lock, ctx); + + /* + * Move buf to head of the list, this will point + * buf->next to the first unlocked entry, + * restarting the for loop. + */ + list_del(&entry->head); + list_add(&entry->head, list); + } + } + + ww_acquire_done(ctx); + return 0; +} + +Unlocking works the same way for both methods #1 and #2: + +void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj_entry *entry; + + list_for_each_entry (entry, list, head) + ww_mutex_unlock(&entry->obj->lock); + + ww_acquire_fini(ctx); +} + +Method 3 is useful if the list of objects is constructed ad-hoc and not upfront, +e.g. when adjusting edges in a graph where each node has its own ww_mutex lock, +and edges can only be changed when holding the locks of all involved nodes. w/w +mutexes are a natural fit for such a case for two reasons: +- They can handle lock-acquisition in any order which allows us to start walking + a graph from a starting point and then iteratively discovering new edges and + locking down the nodes those edges connect to. +- Due to the -EALREADY return code signalling that a given object is already + held there's no need for additional book-keeping to break cycles in the graph + or keep track of which locks are already held (when using more than one node + as a starting point). 
+ +Note that this approach differs in two important ways from the above methods: +- Since the list of objects is dynamically constructed (and might very well be + different when retrying due to hitting the -EDEADLK wound condition) there's + no need to keep any object on a persistent list when it's not locked. We can + therefore move the list_head into the object itself. +- On the other hand the dynamic object list construction also means that the -EALREADY return + code can't be propagated. + +Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a +list of starting nodes (passed in from userspace) using one of the above +methods. And then lock any additional objects affected by the operations using +method #3 below. The backoff/retry procedure will be a bit more involved, since +when the dynamic locking step hits -EDEADLK we also need to unlock all the +objects acquired with the fixed list. But the w/w mutex debug checks will catch +any interface misuse for these cases. + +Also, method 3 can't fail the lock acquisition step since it doesn't return +-EALREADY. Of course this would be different when using the _interruptible +variants, but that's outside of the scope of these examples here. 
+ +struct obj { + struct ww_mutex ww_mutex; + struct list_head locked_list; +}; + +static DEFINE_WW_CLASS(ww_class); + +void __unlock_objs(struct list_head *list) +{ + struct obj *entry, *temp; + + list_for_each_entry_safe (entry, temp, list, locked_list) { + /* need to do that before unlocking, since only the current lock holder is + allowed to use object */ + list_del(&entry->locked_list); + ww_mutex_unlock(entry->ww_mutex) + } +} + +void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + struct obj *obj; + + ww_acquire_init(ctx, &ww_class); + +retry: + /* re-init loop start state */ + loop { + /* magic code which walks over a graph and decides which objects + * to lock */ + + ret = ww_mutex_lock(obj->ww_mutex, ctx); + if (ret == -EALREADY) { + /* we have that one already, get to the next object */ + continue; + } + if (ret == -EDEADLK) { + __unlock_objs(list); + + ww_mutex_lock_slow(obj, ctx); + list_add(&entry->locked_list, list); + goto retry; + } + + /* locked a new object, add it to the list */ + list_add_tail(&entry->locked_list, list); + } + + ww_acquire_done(ctx); + return 0; +} + +void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) +{ + __unlock_objs(list); + ww_acquire_fini(ctx); +} + +Method 4: Only lock one single object. In that case deadlock detection and +prevention is obviously overkill, since with grabbing just one lock you can't +produce a deadlock within just one class. To simplify this case the w/w mutex +api can be used with a NULL context. + +Implementation Details +---------------------- + +Design: + ww_mutex currently encapsulates a struct mutex, this means no extra overhead for + normal mutex locks, which are far more common. As such there is only a small + increase in code size if wait/wound mutexes are not used. + + In general, not much contention is expected. The locks are typically used to + serialize access to resources for devices. 
The only way to make wakeups + smarter would be at the cost of adding a field to struct mutex_waiter. This + would add overhead to all cases where normal mutexes are used, and + ww_mutexes are generally less performance sensitive. + +Lockdep: + Special care has been taken to warn for as many cases of api abuse + as possible. Some common api abuses will be caught with + CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended. + + Some of the errors which will be warned about: + - Forgetting to call ww_acquire_fini or ww_acquire_init. + - Attempting to lock more mutexes after ww_acquire_done. + - Attempting to lock the wrong mutex after -EDEADLK and + unlocking all mutexes. + - Attempting to lock the right mutex after -EDEADLK, + before unlocking all mutexes. + + - Calling ww_mutex_lock_slow before -EDEADLK was returned. + + - Unlocking mutexes with the wrong unlock function. + - Calling one of the ww_acquire_* twice on the same context. + - Using a different ww_class for the mutex than for the ww_acquire_ctx. + - Normal lockdep errors that can result in deadlocks. + + Some of the lockdep errors that can result in deadlocks: + - Calling ww_acquire_init to initialize a second ww_acquire_ctx before + having called ww_acquire_fini on the first. + - 'normal' deadlocks that can occur. + +FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic +implemented. diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt deleted file mode 100644 index 72d010689751..000000000000 --- a/Documentation/lockstat.txt +++ /dev/null @@ -1,178 +0,0 @@ - -LOCK STATISTICS - -- WHAT - -As the name suggests, it provides statistics on locks. - -- WHY - -Because things like lock contention can severely impact performance. - -- HOW - -Lockdep already has hooks in the lock functions and maps lock instances to -lock classes. We build on that (see Documentation/lockdep-design.txt). 
-The graph below shows the relation between the lock functions and the various -hooks therein. - - __acquire - | - lock _____ - | \ - | __contended - | | - | - | _______/ - |/ - | - __acquired - | - . - - . - | - __release - | - unlock - -lock, unlock - the regular lock functions -__* - the hooks -<> - states - -With these hooks we provide the following statistics: - - con-bounces - number of lock contention that involved x-cpu data - contentions - number of lock acquisitions that had to wait - wait time min - shortest (non-0) time we ever had to wait for a lock - max - longest time we ever had to wait for a lock - total - total time we spend waiting on this lock - avg - average time spent waiting on this lock - acq-bounces - number of lock acquisitions that involved x-cpu data - acquisitions - number of times we took the lock - hold time min - shortest (non-0) time we ever held the lock - max - longest time we ever held the lock - total - total time this lock was held - avg - average time this lock was held - -These numbers are gathered per lock class, per read/write state (when -applicable). - -It also tracks 4 contention points per class. A contention point is a call site -that had to wait on lock acquisition. - - - CONFIGURATION - -Lock statistics are enabled via CONFIG_LOCK_STAT. 
- - - USAGE - -Enable collection of statistics: - -# echo 1 >/proc/sys/kernel/lock_stat - -Disable collection of statistics: - -# echo 0 >/proc/sys/kernel/lock_stat - -Look at the current lock statistics: - -( line numbers not part of actual output, done for clarity in the explanation - below ) - -# less /proc/lock_stat - -01 lock_stat version 0.4 -02----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -03 class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg -04----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -05 -06 &mm->mmap_sem-W: 46 84 0.26 939.10 16371.53 194.90 47291 2922365 0.16 2220301.69 17464026916.32 5975.99 -07 &mm->mmap_sem-R: 37 100 1.31 299502.61 325629.52 3256.30 212344 34316685 0.10 7744.91 95016910.20 2.77 -08 --------------- -09 &mm->mmap_sem 1 [] khugepaged_scan_mm_slot+0x57/0x280 -19 &mm->mmap_sem 96 [] __do_page_fault+0x1d4/0x510 -11 &mm->mmap_sem 34 [] vm_mmap_pgoff+0x87/0xd0 -12 &mm->mmap_sem 17 [] vm_munmap+0x41/0x80 -13 --------------- -14 &mm->mmap_sem 1 [] dup_mmap+0x2a/0x3f0 -15 &mm->mmap_sem 60 [] SyS_mprotect+0xe9/0x250 -16 &mm->mmap_sem 41 [] __do_page_fault+0x1d4/0x510 -17 &mm->mmap_sem 68 [] vm_mmap_pgoff+0x87/0xd0 -18 -19............................................................................................................................................................................................................................. 
-20 -21 unix_table_lock: 110 112 0.21 49.24 163.91 1.46 21094 66312 0.12 624.42 31589.81 0.48 -22 --------------- -23 unix_table_lock 45 [] unix_create1+0x16e/0x1b0 -24 unix_table_lock 47 [] unix_release_sock+0x31/0x250 -25 unix_table_lock 15 [] unix_find_other+0x117/0x230 -26 unix_table_lock 5 [] unix_autobind+0x11f/0x1b0 -27 --------------- -28 unix_table_lock 39 [] unix_release_sock+0x31/0x250 -29 unix_table_lock 49 [] unix_create1+0x16e/0x1b0 -30 unix_table_lock 20 [] unix_find_other+0x117/0x230 -31 unix_table_lock 4 [] unix_autobind+0x11f/0x1b0 - - -This excerpt shows the first two lock class statistics. Line 01 shows the -output version - each time the format changes this will be updated. Line 02-04 -show the header with column descriptions. Lines 05-18 and 20-31 show the actual -statistics. These statistics come in two parts; the actual stats separated by a -short separator (line 08, 13) from the contention points. - -The first lock (05-18) is a read/write lock, and shows two lines above the -short separator. The contention points don't match the column descriptors, -they have two: contentions and [] symbol. The second set of contention -points are the points we're contending with. - -The integer part of the time values is in us. - -Dealing with nested locks, subclasses may appear: - -32........................................................................................................................................................................................................................... 
-33 -34 &rq->lock: 13128 13128 0.43 190.53 103881.26 7.91 97454 3453404 0.00 401.11 13224683.11 3.82 -35 --------- -36 &rq->lock 645 [] task_rq_lock+0x43/0x75 -37 &rq->lock 297 [] try_to_wake_up+0x127/0x25a -38 &rq->lock 360 [] select_task_rq_fair+0x1f0/0x74a -39 &rq->lock 428 [] scheduler_tick+0x46/0x1fb -40 --------- -41 &rq->lock 77 [] task_rq_lock+0x43/0x75 -42 &rq->lock 174 [] try_to_wake_up+0x127/0x25a -43 &rq->lock 4715 [] double_rq_lock+0x42/0x54 -44 &rq->lock 893 [] schedule+0x157/0x7b8 -45 -46........................................................................................................................................................................................................................... -47 -48 &rq->lock/1: 1526 11488 0.33 388.73 136294.31 11.86 21461 38404 0.00 37.93 109388.53 2.84 -49 ----------- -50 &rq->lock/1 11526 [] double_rq_lock+0x4f/0x54 -51 ----------- -52 &rq->lock/1 5645 [] double_rq_lock+0x42/0x54 -53 &rq->lock/1 1224 [] schedule+0x157/0x7b8 -54 &rq->lock/1 4336 [] double_rq_lock+0x4f/0x54 -55 &rq->lock/1 181 [] try_to_wake_up+0x127/0x25a - -Line 48 shows statistics for the second subclass (/1) of &rq->lock class -(subclass starts from 0), since in this case, as line 50 suggests, -double_rq_lock actually acquires a nested lock of two spinlocks. 
- -View the top contending locks: - -# grep : /proc/lock_stat | head - clockevents_lock: 2926159 2947636 0.15 46882.81 1784540466.34 605.41 3381345 3879161 0.00 2260.97 53178395.68 13.71 - tick_broadcast_lock: 346460 346717 0.18 2257.43 39364622.71 113.54 3642919 4242696 0.00 2263.79 49173646.60 11.59 - &mapping->i_mmap_mutex: 203896 203899 3.36 645530.05 31767507988.39 155800.21 3361776 8893984 0.17 2254.15 14110121.02 1.59 - &rq->lock: 135014 136909 0.18 606.09 842160.68 6.15 1540728 10436146 0.00 728.72 17606683.41 1.69 - &(&zone->lru_lock)->rlock: 93000 94934 0.16 59.18 188253.78 1.98 1199912 3809894 0.15 391.40 3559518.81 0.93 - tasklist_lock-W: 40667 41130 0.23 1189.42 428980.51 10.43 270278 510106 0.16 653.51 3939674.91 7.72 - tasklist_lock-R: 21298 21305 0.20 1310.05 215511.12 10.12 186204 241258 0.14 1162.33 1179779.23 4.89 - rcu_node_1: 47656 49022 0.16 635.41 193616.41 3.95 844888 1865423 0.00 764.26 1656226.96 0.89 - &(&dentry->d_lockref.lock)->rlock: 39791 40179 0.15 1302.08 88851.96 2.21 2790851 12527025 0.10 1910.75 3379714.27 0.27 - rcu_node_0: 29203 30064 0.16 786.55 1555573.00 51.74 88963 244254 0.00 398.87 428872.51 1.76 - -Clear the statistics: - -# echo 0 > /proc/lock_stat diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt deleted file mode 100644 index ee231ed09ec6..000000000000 --- a/Documentation/mutex-design.txt +++ /dev/null @@ -1,157 +0,0 @@ -Generic Mutex Subsystem - -started by Ingo Molnar -updated by Davidlohr Bueso - -What are mutexes? ------------------ - -In the Linux kernel, mutexes refer to a particular locking primitive -that enforces serialization on shared memory systems, and not only to -the generic term referring to 'mutual exclusion' found in academia -or similar theoretical text books. Mutexes are sleeping locks which -behave similarly to binary semaphores, and were introduced in 2006[1] -as an alternative to these. 
This new data structure provided a number -of advantages, including simpler interfaces, and at that time smaller -code (see Disadvantages). - -[1] http://lwn.net/Articles/164802/ - -Implementation --------------- - -Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h -and implemented in kernel/locking/mutex.c. These locks use a three -state atomic counter (->count) to represent the different possible -transitions that can occur during the lifetime of a lock: - - 1: unlocked - 0: locked, no waiters - negative: locked, with potential waiters - -In its most basic form it also includes a wait-queue and a spinlock -that serializes access to it. CONFIG_SMP systems can also include -a pointer to the lock task owner (->owner) as well as a spinner MCS -lock (->osq), both described below in (ii). - -When acquiring a mutex, there are three possible paths that can be -taken, depending on the state of the lock: - -(i) fastpath: tries to atomically acquire the lock by decrementing the - counter. If it was already taken by another task it goes to the next - possible path. This logic is architecture specific. On x86-64, the - locking fastpath is 2 instructions: - - 0000000000000e10 : - e21: f0 ff 0b lock decl (%rbx) - e24: 79 08 jns e2e - - the unlocking fastpath is equally tight: - - 0000000000000bc0 : - bc8: f0 ff 07 lock incl (%rdi) - bcb: 7f 0a jg bd7 - - -(ii) midpath: aka optimistic spinning, tries to spin for acquisition - while the lock owner is running and there are no other tasks ready - to run that have higher priority (need_resched). The rationale is - that if the lock owner is running, it is likely to release the lock - soon. The mutex spinners are queued up using MCS lock so that only - one spinner can compete for the mutex. - - The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock - with the desirable properties of being fair and with each cpu trying - to acquire the lock spinning on a local variable. 
It avoids expensive - cacheline bouncing that common test-and-set spinlock implementations - incur. An MCS-like lock is specially tailored for optimistic spinning - for sleeping lock implementation. An important feature of the customized - MCS lock is that it has the extra property that spinners are able to exit - the MCS spinlock queue when they need to reschedule. This further helps - avoid situations where MCS spinners that need to reschedule would continue - waiting to spin on mutex owner, only to go directly to slowpath upon - obtaining the MCS lock. - - -(iii) slowpath: last resort, if the lock is still unable to be acquired, - the task is added to the wait-queue and sleeps until woken up by the - unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE. - -While formally kernel mutexes are sleepable locks, it is path (ii) that -makes them more practically a hybrid type. By simply not interrupting a -task and busy-waiting for a few cycles instead of immediately sleeping, -the performance of this lock has been seen to significantly improve a -number of workloads. Note that this technique is also used for rw-semaphores. - -Semantics ---------- - -The mutex subsystem checks and enforces the following rules: - - - Only one task can hold the mutex at a time. - - Only the owner can unlock the mutex. - - Multiple unlocks are not permitted. - - Recursive locking/unlocking is not permitted. - - A mutex must only be initialized via the API (see below). - - A task may not exit with a mutex held. - - Memory areas where held locks reside must not be freed. - - Held mutexes must not be reinitialized. - - Mutexes may not be used in hardware or software interrupt - contexts such as tasklets and timers. - -These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled. 
-In addition, the mutex debugging code also implements a number of other -features that make lock debugging easier and faster: - - - Uses symbolic names of mutexes, whenever they are printed - in debug output. - - Point-of-acquire tracking, symbolic lookup of function names, - list of all locks held in the system, printout of them. - - Owner tracking. - - Detects self-recursing locks and prints out all relevant info. - - Detects multi-task circular deadlocks and prints out all affected - locks and tasks (and only those tasks). - - -Interfaces ----------- -Statically define the mutex: - DEFINE_MUTEX(name); - -Dynamically initialize the mutex: - mutex_init(mutex); - -Acquire the mutex, uninterruptible: - void mutex_lock(struct mutex *lock); - void mutex_lock_nested(struct mutex *lock, unsigned int subclass); - int mutex_trylock(struct mutex *lock); - -Acquire the mutex, interruptible: - int mutex_lock_interruptible_nested(struct mutex *lock, - unsigned int subclass); - int mutex_lock_interruptible(struct mutex *lock); - -Acquire the mutex, interruptible, if dec to 0: - int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); - -Unlock the mutex: - void mutex_unlock(struct mutex *lock); - -Test if the mutex is taken: - int mutex_is_locked(struct mutex *lock); - -Disadvantages -------------- - -Unlike its original design and purpose, 'struct mutex' is larger than -most locks in the kernel. E.g: on x86-64 it is 40 bytes, almost twice -as large as 'struct semaphore' (24 bytes) and 8 bytes shy of the -'struct rw_semaphore' variant. Larger structure sizes mean more CPU -cache and memory footprint. - -When to use mutexes -------------------- - -Unless the strict semantics of mutexes are unsuitable and/or the critical -region prevents the lock from being shared, always prefer them to any other -locking primitive. 
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt deleted file mode 100644 index 8666070d3189..000000000000 --- a/Documentation/rt-mutex-design.txt +++ /dev/null @@ -1,781 +0,0 @@ -# -# Copyright (c) 2006 Steven Rostedt -# Licensed under the GNU Free Documentation License, Version 1.2 -# - -RT-mutex implementation design ------------------------------- - -This document tries to describe the design of the rtmutex.c implementation. -It doesn't describe the reasons why rtmutex.c exists. For that please see -Documentation/rt-mutex.txt. Although this document does explain problems -that happen without this code, that is only to provide the concepts needed to understand -what the code actually is doing. - -The goal of this document is to help others understand the priority -inheritance (PI) algorithm that is used, as well as reasons for the -decisions that were made to implement PI in the manner that was done. - - -Unbounded Priority Inversion ----------------------------- - -Priority inversion is when a lower priority process executes while a higher -priority process wants to run. This happens for several reasons, and -most of the time it can't be helped. Anytime a high priority process wants -to use a resource that a lower priority process has (a mutex for example), -the high priority process must wait until the lower priority process is done -with the resource. This is a priority inversion. What we want to prevent -is something called unbounded priority inversion. That is when the high -priority process is prevented from running by a lower priority process for -an undetermined amount of time. - -The classic example of unbounded priority inversion is where you have three -processes, let's call them processes A, B, and C, where A is the highest -priority process, C is the lowest, and B is in between. A tries to grab a lock -that C owns and must wait and lets C run to release the lock.
But in the -meantime, B executes, and since B is of a higher priority than C, it preempts C, -but by doing so, it is in fact preempting A which is a higher priority process. -Now there's no way of knowing how long A will be sleeping waiting for C -to release the lock, because for all we know, B is a CPU hog and will -never give C a chance to release the lock. This is called unbounded priority -inversion. - -Here's a little ASCII art to show the problem. - - grab lock L1 (owned by C) - | -A ---+ - C preempted by B - | -C +----+ - -B +--------> - B now keeps A from running. - - -Priority Inheritance (PI) -------------------------- - -There are several ways to solve this issue, but other ways are out of scope -for this document. Here we only discuss PI. - -PI is where a process inherits the priority of another process if the other -process blocks on a lock owned by the current process. To make this easier -to understand, let's use the previous example, with processes A, B, and C again. - -This time, when A blocks on the lock owned by C, C would inherit the priority -of A. So now if B becomes runnable, it would not preempt C, since C now has -the high priority of A. As soon as C releases the lock, it loses its -inherited priority, and A then can continue with the resource that C had. - -Terminology ------------ - -Here I explain some terminology that is used in this document to help describe -the design that is used to implement PI. - -PI chain - The PI chain is an ordered series of locks and processes that cause - processes to inherit priorities from a previous process that is - blocked on one of its locks. This is described in more detail - later in this document. - -mutex - In this document, to differentiate from locks that implement - PI and spin locks that are used in the PI code, from now on - the PI locks will be called a mutex. 
- -lock - In this document from now on, I will use the term lock when - referring to spin locks that are used to protect parts of the PI - algorithm. These locks disable preemption for UP (when - CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from - entering critical sections simultaneously. - -spin lock - Same as lock above. - -waiter - A waiter is a struct that is stored on the stack of a blocked - process. Since the scope of the waiter is within the code for - a process being blocked on the mutex, it is fine to allocate - the waiter on the process's stack (local variable). This - structure holds a pointer to the task, as well as the mutex that - the task is blocked on. It also has the plist node structures to - place the task in the waiter_list of a mutex as well as the - pi_list of a mutex owner task (described below). - - waiter is sometimes used in reference to the task that is waiting - on a mutex. This is the same as waiter->task. - -waiters - A list of processes that are blocked on a mutex. - -top waiter - The highest priority process waiting on a specific mutex. - -top pi waiter - The highest priority process waiting on one of the mutexes - that a specific process owns. - -Note: task and process are used interchangeably in this document, mostly to - differentiate between two processes that are being described together. - - -PI chain --------- - -The PI chain is a list of processes and mutexes that may cause priority -inheritance to take place. Multiple chains may converge, but a chain -would never diverge, since a process can't be blocked on more than one -mutex at a time. 
- -Example: - - Process: A, B, C, D, E - Mutexes: L1, L2, L3, L4 - - A owns: L1 - B blocked on L1 - B owns L2 - C blocked on L2 - C owns L3 - D blocked on L3 - D owns L4 - E blocked on L4 - -The chain would be: - - E->L4->D->L3->C->L2->B->L1->A - -To show where two chains merge, we could add another process F and -another mutex L5 where B owns L5 and F is blocked on mutex L5. - -The chain for F would be: - - F->L5->B->L1->A - -Since a process may own more than one mutex, but never be blocked on more than -one, the chains merge. - -Here we show both chains: - - E->L4->D->L3->C->L2-+ - | - +->B->L1->A - | - F->L5-+ - -For PI to work, the processes at the right end of these chains (or we may -also call it the Top of the chain) must be equal to or higher in priority -than the processes to the left or below in the chain. - -Also since a mutex may have more than one process blocked on it, we can -have multiple chains merge at mutexes. If we add another process G that is -blocked on mutex L2: - - G->L2->B->L1->A - -And once again, to show how this can grow I will show the merging chains -again. - - E->L4->D->L3->C-+ - +->L2-+ - | | - G-+ +->B->L1->A - | - F->L5-+ - - -Plist ------ - -Before I go further and talk about how the PI chain is stored through lists -on both mutexes and processes, I'll explain the plist. This is similar to -the struct list_head functionality that is already in the kernel. -The implementation of plist is out of scope for this document, but it is -very important to understand what it does. - -There are a few differences between plist and list, the most important one -being that plist is a priority sorted linked list. This means that the -priorities of the plist are sorted, such that it takes O(1) to retrieve the -highest priority item in the list. Obviously this is useful to store processes -based on their priorities. 
- -Another difference, which is important for implementation, is that, unlike -list, the head of the list is a different element than the nodes of a list. -So the head of the list is declared as struct plist_head and nodes that will -be added to the list are declared as struct plist_node. - - -Mutex Waiter List ------------------ - -Every mutex keeps track of all the waiters that are blocked on itself. The mutex -has a plist to store these waiters by priority. This list is protected by -a spin lock that is located in the struct of the mutex. This lock is called -wait_lock. Since the modification of the waiter list is never done in -interrupt context, the wait_lock can be taken without disabling interrupts. - - -Task PI List ------------- - -To keep track of the PI chains, each process has its own PI list. This is -a list of all top waiters of the mutexes that are owned by the process. -Note that this list only holds the top waiters and not all waiters that are -blocked on mutexes owned by the process. - -The top of the task's PI list is always the highest priority task that -is waiting on a mutex that is owned by the task. So if the task has -inherited a priority, it will always be the priority of the task that is -at the top of this list. - -This list is stored in the task structure of a process as a plist called -pi_list. This list is protected by a spin lock also in the task structure, -called pi_lock. This lock may also be taken in interrupt context, so when -locking the pi_lock, interrupts must be disabled. - - -Depth of the PI Chain ---------------------- - -The maximum depth of the PI chain is not dynamic, and could actually be -defined. But it is very complex to figure out, since it depends on all -the nesting of mutexes. Let's look at the example where we have 3 mutexes, -L1, L2, and L3, and four separate functions func1, func2, func3 and func4. -The following shows a locking order of L1->L2->L3, but may not actually -be directly nested that way.
- -void func1(void) -{ - mutex_lock(L1); - - /* do anything */ - - mutex_unlock(L1); -} - -void func2(void) -{ - mutex_lock(L1); - mutex_lock(L2); - - /* do something */ - - mutex_unlock(L2); - mutex_unlock(L1); -} - -void func3(void) -{ - mutex_lock(L2); - mutex_lock(L3); - - /* do something else */ - - mutex_unlock(L3); - mutex_unlock(L2); -} - -void func4(void) -{ - mutex_lock(L3); - - /* do something again */ - - mutex_unlock(L3); -} - -Now we add 4 processes that run each of these functions separately. -Processes A, B, C, and D which run functions func1, func2, func3 and func4 -respectively, and such that D runs first and A last. With D being preempted -in func4 in the "do something again" area, we have a locking that follows: - -D owns L3 - C blocked on L3 - C owns L2 - B blocked on L2 - B owns L1 - A blocked on L1 - -And thus we have the chain A->L1->B->L2->C->L3->D. - -This gives us a PI depth of 4 (four processes), but looking at any of the -functions individually, it seems as though they only have at most a locking -depth of two. So, although the locking depth is defined at compile time, -it still is very difficult to find the possibilities of that depth. - -Now since mutexes can be defined by user-land applications, we don't want a DOS -type of application that nests large amounts of mutexes to create a large -PI chain, and have the code holding spin locks while looking at a large -amount of data. So to prevent this, the implementation not only implements -a maximum lock depth, but also only holds at most two different locks at a -time, as it walks the PI chain. More about this below. - - -Mutex owner and flags ---------------------- - -The mutex structure contains a pointer to the owner of the mutex. If the -mutex is not owned, this owner is set to NULL. 
Since all architectures -have the task structure on at least a four byte alignment (and if this is -not true, the rtmutex.c code will be broken!), this allows for the two -least significant bits to be used as flags. This part is also described -in Documentation/rt-mutex.txt, but will also be briefly described here. - -Bit 0 is used as the "Pending Owner" flag. This is described later. -Bit 1 is used as the "Has Waiters" flag. This is also described later - in more detail, but is set whenever there are waiters on a mutex. - - -cmpxchg Tricks --------------- - -Some architectures implement an atomic cmpxchg (Compare and Exchange). This -is used (when applicable) to keep the fast path of grabbing and releasing -mutexes short. - -cmpxchg is basically the following function performed atomically: - -unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C) -{ - unsigned long T = *A; - if (*A == *B) { - *A = *C; - } - return T; -} -#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c) - -This is really nice to have, since it allows you to only update a variable -if the variable is what you expect it to be. You know if it succeeded if -the return value (the old value of A) is equal to B. - -The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If -the architecture does not support CMPXCHG, then this macro is simply set -to fail every time. But if CMPXCHG is supported, then this will -help out extremely to keep the fast path short. - -The use of rt_mutex_cmpxchg with the flags in the owner field helps optimize -the system for architectures that support it. This will also be explained -later in this document. - - -Priority adjustments --------------------- - -The implementation of the PI code in rtmutex.c has several places that a -process must adjust its priority. With the help of the pi_list of a -process it is rather easy to know what needs to be adjusted.
- -The functions implementing the task adjustments are rt_mutex_adjust_prio, -__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock -to already be taken), rt_mutex_getprio, and rt_mutex_setprio. - -rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio. - -rt_mutex_getprio returns the priority that the task should have. Either the -task's own normal priority, or if a process of a higher priority is waiting on -a mutex owned by the task, then that higher priority should be returned. -Since the pi_list of a task holds an order by priority list of all the top -waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs -to compare the top pi waiter to its own normal priority, and return the higher -priority back. - -(Note: if looking at the code, you will notice that the lower number of - prio is returned. This is because the prio field in the task structure - is an inverse order of the actual priority. So a "prio" of 5 is - of higher priority than a "prio" of 10.) - -__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the -result does not equal the task's current priority, then rt_mutex_setprio -is called to adjust the priority of the task to the new priority. -Note that rt_mutex_setprio is defined in kernel/sched/core.c to implement the -actual change in priority. - -It is interesting to note that __rt_mutex_adjust_prio can either increase -or decrease the priority of the task. In the case that a higher priority -process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio -would increase/boost the task's priority. But if a higher priority task -were for some reason to leave the mutex (timeout or signal), this same function -would decrease/unboost the priority of the task. 
That is because the pi_list -always contains the highest priority task that is waiting on a mutex owned -by the task, so we only need to compare the priority of that top pi waiter -to the normal priority of the given task. - - -High level overview of the PI chain walk ----------------------------------------- - -The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain. - -The implementation has gone through several iterations, and has ended up -with what we believe is the best. It walks the PI chain by only grabbing -at most two locks at a time, and is very efficient. - -The rt_mutex_adjust_prio_chain can be used either to boost or lower process -priorities. - -rt_mutex_adjust_prio_chain is called with a task to be checked for PI -(de)boosting (the owner of a mutex that a process is blocking on), a flag to -check for deadlocking, the mutex that the task owns, and a pointer to a waiter -that is the process's waiter struct that is blocked on the mutex (although this -parameter may be NULL for deboosting). - -For this explanation, I will not mention deadlock detection. This explanation -will try to stay at a high level. - -When this function is called, there are no locks held. That also means -that the state of the owner and lock can change when entered into this function. - -Before this function is called, the task has already had rt_mutex_adjust_prio -performed on it. This means that the task is set to the priority that it -should be at, but the plist nodes of the task's waiter have not been updated -with the new priorities, and that this task may not be in the proper locations -in the pi_lists and wait_lists that the task is blocked on. This function -solves all that. - -A loop is entered, where task is the owner to be checked for PI changes that -was passed by parameter (for the first iteration). The pi_lock of this task is -taken to prevent any more changes to the pi_list of the task. 
This also -prevents new tasks from completing the blocking on a mutex that is owned by this -task. - -If the task is not blocked on a mutex then the loop is exited. We are at -the top of the PI chain. - -A check is now done to see if the original waiter (the process that is blocked -on the current mutex) is the top pi waiter of the task. That is, is this -waiter on the top of the task's pi_list. If it is not, it either means that -there is another process higher in priority that is blocked on one of the -mutexes that the task owns, or that the waiter has just woken up via a signal -or timeout and has left the PI chain. In either case, the loop is exited, since -we don't need to do any more changes to the priority of the current task, or any -task that owns a mutex that this current task is waiting on. A priority chain -walk is only needed when a new top pi waiter is made to a task. - -The next check sees if the task's waiter plist node has the priority equal to -the priority the task is set at. If they are equal, then we are done with -the loop. Remember that the function started with the priority of the -task adjusted, but the plist nodes that hold the task in other processes -pi_lists have not been adjusted. - -Next, we look at the mutex that the task is blocked on. The mutex's wait_lock -is taken. This is done by a spin_trylock, because the locking order of the -pi_lock and wait_lock goes in the opposite direction. If we fail to grab the -lock, the pi_lock is released, and we restart the loop. - -Now that we have both the pi_lock of the task as well as the wait_lock of -the mutex the task is blocked on, we update the task's waiter's plist node -that is located on the mutex's wait_list. - -Now we release the pi_lock of the task. - -Next the owner of the mutex has its pi_lock taken, so we can update the -task's entry in the owner's pi_list. 
If the task is the highest priority -process on the mutex's wait_list, then we remove the previous top waiter -from the owner's pi_list, and replace it with the task. - -Note: It is possible that the task was the current top waiter on the mutex, - in which case the task is not yet on the pi_list of the waiter. This - is OK, since plist_del does nothing if the plist node is not on any - list. - -If the task was not the top waiter of the mutex, but it was before we -did the priority updates, that means we are deboosting/lowering the -task. In this case, the task is removed from the pi_list of the owner, -and the new top waiter is added. - -Lastly, we unlock both the pi_lock of the task, as well as the mutex's -wait_lock, and continue the loop again. On the next iteration of the -loop, the previous owner of the mutex will be the task that will be -processed. - -Note: One might think that the owner of this mutex might have changed - since we just grabbed the mutex's wait_lock. And one could be right. - The important thing to remember is that the owner could not have - become the task that is being processed in the PI chain, since - we have taken that task's pi_lock at the beginning of the loop. - So as long as there is an owner of this mutex that is not the same - process as the task being worked on, we are OK. - - Looking closely at the code, one might be confused. The check for the - end of the PI chain is when the task isn't blocked on anything or the - task's waiter structure "task" element is NULL. This check is - protected only by the task's pi_lock. But the code to unlock the mutex - sets the task's waiter structure "task" element to NULL with only - the protection of the mutex's wait_lock, which was not taken yet. - Isn't this a race condition if the task becomes the new owner? - - The answer is No! The trick is the spin_trylock of the mutex's - wait_lock.
If we fail that lock, we release the pi_lock of the - task and continue the loop, doing the end of PI chain check again. - - In the code to release the lock, the wait_lock of the mutex is held - the entire time, and it is not let go when we grab the pi_lock of the - new owner of the mutex. So if the switch of a new owner were to happen - after the check for end of the PI chain and the grabbing of the - wait_lock, the unlocking code would spin on the new owner's pi_lock - but never give up the wait_lock. So the PI chain loop is guaranteed to - fail the spin_trylock on the wait_lock, release the pi_lock, and - try again. - - If you don't quite understand the above, that's OK. You don't have to, - unless you really want to make a proof out of it ;) - - -Pending Owners and Lock stealing --------------------------------- - -One of the flags in the owner field of the mutex structure is "Pending Owner". -What this means is that an owner was chosen by the process releasing the -mutex, but that owner has yet to wake up and actually take the mutex. - -Why is this important? Why can't we just give the mutex to another process -and be done with it? - -The PI code is to help with real-time processes, and to let the highest -priority process run as long as possible with little latencies and delays. -If a high priority process owns a mutex that a lower priority process is -blocked on, when the mutex is released it would be given to the lower priority -process. What if the higher priority process wants to take that mutex again? -The high priority process would fail to take that mutex that it just gave up -and it would need to boost the lower priority process to run with full -latency of that critical section (since the low priority process just entered -it). - -There's no reason a high priority process that gives up a mutex should be -penalized if it tries to take that mutex again.
If the new owner of the -mutex has not woken up yet, there's no reason that the higher priority process -could not take that mutex away. - -To solve this, we introduced Pending Ownership and Lock Stealing. When a -new process is given a mutex that it was blocked on, it is only given -pending ownership. This means that it's the new owner, unless a higher -priority process comes in and tries to grab that mutex. If a higher priority -process does come along and wants that mutex, we let the higher priority -process "steal" the mutex from the pending owner (only if it is still pending) -and continue with the mutex. - - -Taking of a mutex (The walk through) ------------------------------------- - -OK, now let's take a look at the detailed walk through of what happens when -taking a mutex. - -The first thing that is tried is the fast taking of the mutex. This is -done when we have CMPXCHG enabled (otherwise the fast taking automatically -fails). Only when the owner field of the mutex is NULL can the lock be -taken with the CMPXCHG and nothing else needs to be done. - -If there is contention on the lock, whether it is owned or pending owner -we go about the slow path (rt_mutex_slowlock). - -The slow path function is where the task's waiter structure is created on -the stack. This is because the waiter structure is only needed for the -scope of this function. The waiter structure holds the nodes to store -the task on the wait_list of the mutex, and if need be, the pi_list of -the owner. - -The wait_lock of the mutex is taken since the slow path of unlocking the -mutex also takes this lock. - -We then call try_to_take_rt_mutex. This is where the architecture that -does not implement CMPXCHG would always grab the lock (if there's no -contention). - -try_to_take_rt_mutex is used every time the task tries to grab a mutex in the -slow path. The first thing that is done here is an atomic setting of -the "Has Waiters" flag of the mutex's owner field. 
Yes, this could really -be false, because if the mutex has no owner, there are no waiters and -the current task also won't have any waiters. But we don't have the lock -yet, so we assume we are going to be a waiter. The reason for this is to -play nice for those architectures that do have CMPXCHG. By setting this flag -now, the owner of the mutex can't release the mutex without going into the -slow unlock path, and it would then need to grab the wait_lock, which this -code currently holds. So setting the "Has Waiters" flag forces the owner -to synchronize with this code. - -Now that we know that we can't have any races with the owner releasing the -mutex, we check to see if we can take the ownership. This is done if the -mutex doesn't have an owner, or if we can steal the mutex from a pending -owner. Let's look at the situations we have here. - - 1) Has owner that is pending - ---------------------------- - - The mutex has an owner, but it hasn't woken up and the mutex flag - "Pending Owner" is set. The first check is to see if the owner isn't the - current task. This is because this function is also used for the pending - owner to grab the mutex. When a pending owner wakes up, it checks to see - if it can take the mutex, and this is done if the owner is already set to - itself. If so, we succeed and leave the function, clearing the "Pending - Owner" bit. - - If the pending owner is not current, we check to see if the current priority is - higher than the pending owner. If not, we fail the function and return. - - There's also something special about a pending owner. That is a pending owner - is never blocked on a mutex. So there is no PI chain to worry about. It also - means that if the mutex doesn't have any waiters, there's no accounting needed - to update the pending owner's pi_list, since we only worry about processes - blocked on the current mutex.
- - If there are waiters on this mutex, and we just stole the ownership, we need - to take the top waiter, remove it from the pi_list of the pending owner, and - add it to the current pi_list. Note that at this moment, the pending owner - is no longer on the list of waiters. This is fine, since the pending owner - would add itself back when it realizes that it had the ownership stolen - from itself. When the pending owner tries to grab the mutex, it will fail - in try_to_take_rt_mutex if the owner field points to another process. - - 2) No owner - ----------- - - If there is no owner (or we successfully stole the lock), we set the owner - of the mutex to current, and set the flag of "Has Waiters" if the current - mutex actually has waiters, or we clear the flag if it doesn't. See, it was - OK that we set that flag early, since now it is cleared. - - 3) Failed to grab ownership - --------------------------- - - The most interesting case is when we fail to take ownership. This means that - there exists an owner, or there's a pending owner with equal or higher - priority than the current task. - -We'll continue on the failed case. - -If the mutex has a timeout, we set up a timer to go off to break us out -of this mutex if we failed to get it after a specified amount of time. - -Now we enter a loop that will continue to try to take ownership of the mutex, or -fail from a timeout or signal. - -Once again we try to take the mutex. This will usually fail the first time -in the loop, since it had just failed to get the mutex. But the second time -in the loop, this would likely succeed, since the task would likely be -the pending owner. - -If the mutex is TASK_INTERRUPTIBLE a check for signals and timeout is done -here. - -The waiter structure has a "task" field that points to the task that is blocked -on the mutex. This field can be NULL the first time it goes through the loop -or if the task is a pending owner and had its mutex stolen. 
If the "task" -field is NULL then we need to set up the accounting for it. - -Task blocks on mutex --------------------- - -The accounting of a mutex and process is done with the waiter structure of -the process. The "task" field is set to the process, and the "lock" field -to the mutex. The plist nodes are initialized to the process's current -priority. - -Since the wait_lock was taken at the entry of the slow lock, we can safely -add the waiter to the wait_list. If the current process is the highest -priority process currently waiting on this mutex, then we remove the -previous top waiter process (if it exists) from the pi_list of the owner, -and add the current process to that list. Since the pi_list of the owner -has changed, we call rt_mutex_adjust_prio on the owner to see if the owner -should adjust its priority accordingly. - -If the owner is also blocked on a lock, and had its pi_list changed -(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead -and run rt_mutex_adjust_prio_chain on the owner, as described earlier. - -Now all locks are released, and if the current process is still blocked on a -mutex (waiter "task" field is not NULL), then we go to sleep (call schedule). - -Waking up in the loop ---------------------- - -The schedule can then wake up for a few reasons. - 1) we were given pending ownership of the mutex. - 2) we received a signal and were TASK_INTERRUPTIBLE - 3) we had a timeout and were TASK_INTERRUPTIBLE - -In any of these cases, we continue the loop and once again try to grab the -ownership of the mutex. If we succeed, we exit the loop, otherwise we continue -and on signal and timeout, will exit the loop, or if we had the mutex stolen -we just simply add ourselves back on the lists and go back to sleep. - -Note: For various reasons, because of timeout and signals, the steal mutex - algorithm needs to be careful. This is because the current process is - still on the wait_list.
And because of dynamic changing of priorities, - especially on SCHED_OTHER tasks, the current process can be the - highest priority task on the wait_list. - -Failed to get mutex on Timeout or Signal ----------------------------------------- - -If a timeout or signal occurred, the waiter's "task" field would not be -NULL and the task needs to be taken off the wait_list of the mutex and perhaps -pi_list of the owner. If this process was a high priority process, then -the rt_mutex_adjust_prio_chain needs to be executed again on the owner, -but this time it will be lowering the priorities. - - -Unlocking the Mutex -------------------- - -The unlocking of a mutex also has a fast path for those architectures with -CMPXCHG. Since the taking of a mutex on contention always sets the -"Has Waiters" flag of the mutex's owner, we use this to know if we need to -take the slow path when unlocking the mutex. If the mutex doesn't have any -waiters, the owner field of the mutex would equal the current process and -the mutex can be unlocked by just replacing the owner field with NULL. - -If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available), -the slow unlock path is taken. - -The first thing done in the slow unlock path is to take the wait_lock of the -mutex. This synchronizes the locking and unlocking of the mutex. - -A check is made to see if the mutex has waiters or not. On architectures that -do not have CMPXCHG, this is the location that the owner of the mutex will -determine if a waiter needs to be awoken or not. On architectures that -do have CMPXCHG, that check is done in the fast path, but it is still needed -in the slow path too. If a waiter of a mutex woke up because of a signal -or timeout between the time the owner failed the fast path CMPXCHG check and -the grabbing of the wait_lock, the mutex may not have any waiters, thus the -owner still needs to make this check. 
If there are no waiters then the mutex -owner field is set to NULL, the wait_lock is released and nothing more is -needed. - -If there are waiters, then we need to wake one up and give that waiter -pending ownership. - -On the wake up code, the pi_lock of the current owner is taken. The top -waiter of the lock is found and removed from the wait_list of the mutex -as well as the pi_list of the current owner. The task field of the new -pending owner's waiter structure is set to NULL, and the owner field of the -mutex is set to the new owner with the "Pending Owner" bit set, as well -as the "Has Waiters" bit if there still are other processes blocked on the -mutex. - -The pi_lock of the previous owner is released, and the new pending owner's -pi_lock is taken. Remember that this is the trick to prevent the race -condition in rt_mutex_adjust_prio_chain from adding itself as a waiter -on the mutex. - -We now clear the "pi_blocked_on" field of the new pending owner, and if -the mutex still has waiters pending, we add the new top waiter to the pi_list -of the pending owner. - -Finally we unlock the pi_lock of the pending owner and wake it up. - - -Contact -------- - -For updates on this document, please email Steven Rostedt - - -Credits -------- - -Author: Steven Rostedt - -Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap - -Updates -------- - -This document was originally written for 2.6.17-rc3-mm1 diff --git a/Documentation/rt-mutex.txt b/Documentation/rt-mutex.txt deleted file mode 100644 index 243393d882ee..000000000000 --- a/Documentation/rt-mutex.txt +++ /dev/null @@ -1,79 +0,0 @@ -RT-mutex subsystem with PI support ----------------------------------- - -RT-mutexes with priority inheritance are used to support PI-futexes, -which enable pthread_mutex_t priority inheritance attributes -(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details -about PI-futexes.] 
- -This technology was developed in the -rt tree and streamlined for -pthread_mutex support. - -Basic principles: ------------------ - -RT-mutexes extend the semantics of simple mutexes by the priority -inheritance protocol. - -A low priority owner of a rt-mutex inherits the priority of a higher -priority waiter until the rt-mutex is released. If the temporarily -boosted owner blocks on a rt-mutex itself it propagates the priority -boosting to the owner of the other rt_mutex it gets blocked on. The -priority boosting is immediately removed once the rt_mutex has been -unlocked. - -This approach allows us to shorten the block of high-prio tasks on -mutexes which protect shared resources. Priority inheritance is not a -magic bullet for poorly designed applications, but it allows -well-designed applications to use userspace locks in critical parts of -a high priority thread, without losing determinism. - -The enqueueing of the waiters into the rtmutex waiter list is done in -priority order. For same priorities FIFO order is chosen. For each -rtmutex, only the top priority waiter is enqueued into the owner's -priority waiters list. This list too queues in priority order. Whenever -the top priority waiter of a task changes (for example it timed out or -got a signal), the priority of the owner task is readjusted. [The -priority enqueueing is handled by "plists", see include/linux/plist.h -for more details.] - -RT-mutexes are optimized for fastpath operations and have no internal -locking overhead when locking an uncontended mutex or unlocking a mutex -without waiters. The optimized fastpath operations require cmpxchg -support. [If that is not available then the rt-mutex internal spinlock -is used] - -The state of the rt-mutex is tracked via the owner field of the rt-mutex -structure: - -rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1 -are used to keep track of the "owner is pending" and "rtmutex has -waiters" state. 
- - owner bit1 bit0 - NULL 0 0 mutex is free (fast acquire possible) - NULL 0 1 invalid state - NULL 1 0 Transitional state* - NULL 1 1 invalid state - taskpointer 0 0 mutex is held (fast release possible) - taskpointer 0 1 task is pending owner - taskpointer 1 0 mutex is held and has waiters - taskpointer 1 1 task is pending owner and mutex has waiters - -Pending-ownership handling is a performance optimization: -pending-ownership is assigned to the first (highest priority) waiter of -the mutex, when the mutex is released. The thread is woken up and once -it starts executing it can acquire the mutex. Until the mutex is taken -by it (bit 0 is cleared) a competing higher priority thread can "steal" -the mutex which puts the woken up thread back on the waiters list. - -The pending-ownership optimization is especially important for the -uninterrupted workflow of high-prio tasks which repeatedly -takes/releases locks that have lower-prio waiters. Without this -optimization the higher-prio thread would ping-pong to the lower-prio -task [because at unlock time we always assign a new owner]. - -(*) The "mutex has waiters" bit gets set to take the lock. If the lock -doesn't already have an owner, this bit is quickly cleared if there are -no waiters. So this is a transitional state to synchronize with looking -at the owner field of the mutex and the mutex owner releasing the lock. diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt deleted file mode 100644 index 97eaf5727178..000000000000 --- a/Documentation/spinlocks.txt +++ /dev/null @@ -1,167 +0,0 @@ -Lesson 1: Spin locks - -The most basic primitive for locking is spinlock. - -static DEFINE_SPINLOCK(xxx_lock); - - unsigned long flags; - - spin_lock_irqsave(&xxx_lock, flags); - ... critical section here .. - spin_unlock_irqrestore(&xxx_lock, flags); - -The above is always safe. 
It will disable interrupts _locally_, but the -spinlock itself will guarantee the global lock, so it will guarantee that -there is only one thread-of-control within the region(s) protected by that -lock. This works well even under UP also, so the code does _not_ need to -worry about UP vs SMP issues: the spinlocks work correctly under both. - - NOTE! Implications of spin_locks for memory are further described in: - - Documentation/memory-barriers.txt - (5) LOCK operations. - (6) UNLOCK operations. - -The above is usually pretty simple (you usually need and want only one -spinlock for most things - using more than one spinlock can make things a -lot more complex and even slower and is usually worth it only for -sequences that you _know_ need to be split up: avoid it at all cost if you -aren't sure). - -This is really the only really hard part about spinlocks: once you start -using spinlocks they tend to expand to areas you might not have noticed -before, because you have to make sure the spinlocks correctly protect the -shared data structures _everywhere_ they are used. The spinlocks are most -easily added to places that are completely independent of other code (for -example, internal driver data structures that nobody else ever touches). - - NOTE! The spin-lock is safe only when you _also_ use the lock itself - to do locking across CPU's, which implies that EVERYTHING that - touches a shared variable has to agree about the spinlock they want - to use. - ----- - -Lesson 2: reader-writer spinlocks. - -If your data accesses have a very natural pattern where you usually tend -to mostly read from the shared variables, the reader-writer locks -(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple -readers to be in the same critical region at once, but if somebody wants -to change the variables it has to get an exclusive write lock. - - NOTE! reader-writer locks require more atomic memory operations than - simple spinlocks. 
Unless the reader critical section is long, you - are better off just using spinlocks. - -The routines look the same as above: - - rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock); - - unsigned long flags; - - read_lock_irqsave(&xxx_lock, flags); - .. critical section that only reads the info ... - read_unlock_irqrestore(&xxx_lock, flags); - - write_lock_irqsave(&xxx_lock, flags); - .. read and write exclusive access to the info ... - write_unlock_irqrestore(&xxx_lock, flags); - -The above kind of lock may be useful for complex data structures like -linked lists, especially searching for entries without changing the list -itself. The read lock allows many concurrent readers. Anything that -_changes_ the list will have to get the write lock. - - NOTE! RCU is better for list traversal, but requires careful - attention to design detail (see Documentation/RCU/listRCU.txt). - -Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_ -time need to do any changes (even if you don't do it every time), you have -to get the write-lock at the very beginning. - - NOTE! We are working hard to remove reader-writer spinlocks in most - cases, so please don't add a new one without consensus. (Instead, see - Documentation/RCU/rcu.txt for complete information.) - ----- - -Lesson 3: spinlocks revisited. - -The single spin-lock primitives above are by no means the only ones. They -are the most safe ones, and the ones that work under all circumstances, -but partly _because_ they are safe they are also fairly slow. They are slower -than they'd need to be, because they do have to disable interrupts -(which is just a single instruction on a x86, but it's an expensive one - -and on other architectures it can be worse). - -If you have a case where you have to protect a data structure across -several CPU's and you want to use spinlocks you can potentially use -cheaper versions of the spinlocks. 
IFF you know that the spinlocks are -never used in interrupt handlers, you can use the non-irq versions: - - spin_lock(&lock); - ... - spin_unlock(&lock); - -(and the equivalent read-write versions too, of course). The spinlock will -guarantee the same kind of exclusive access, and it will be much faster. -This is useful if you know that the data in question is only ever -manipulated from a "process context", ie no interrupts involved. - -The reason you mustn't use these versions if you have interrupts that -play with the spinlock is that you can get deadlocks: - - spin_lock(&lock); - ... - <- interrupt comes in: - spin_lock(&lock); - -where an interrupt tries to lock an already locked variable. This is ok if -the other interrupt happens on another CPU, but it is _not_ ok if the -interrupt happens on the same CPU that already holds the lock, because the -lock will obviously never be released (because the interrupt is waiting -for the lock, and the lock-holder is interrupted by the interrupt and will -not continue until the interrupt has been processed). - -(This is also the reason why the irq-versions of the spinlocks only need -to disable the _local_ interrupts - it's ok to use spinlocks in interrupts -on other CPU's, because an interrupt on another CPU doesn't interrupt the -CPU that holds the lock, so the lock-holder can continue and eventually -releases the lock). - -Note that you can be clever with read-write locks and interrupts. For -example, if you know that the interrupt only ever gets a read-lock, then -you can use a non-irq version of read locks everywhere - because they -don't block on each other (and thus there is no dead-lock wrt interrupts). -But when you do the write-lock, you have to use the irq-safe version. - -For an example of being clever with rw-locks, see the "waitqueue_lock" -handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from -within an interrupt, they only read the queue in order to know whom to -wake up. 
So read-locks are safe (which is good: they are very common -indeed), while write-locks need to protect themselves against interrupts. - - Linus - ----- - -Reference information: - -For dynamic initialization, use spin_lock_init() or rwlock_init() as -appropriate: - - spinlock_t xxx_lock; - rwlock_t xxx_rw_lock; - - static int __init xxx_init(void) - { - spin_lock_init(&xxx_lock); - rwlock_init(&xxx_rw_lock); - ... - } - - module_init(xxx_init); - -For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or -__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate. diff --git a/Documentation/ww-mutex-design.txt b/Documentation/ww-mutex-design.txt deleted file mode 100644 index 8a112dc304c3..000000000000 --- a/Documentation/ww-mutex-design.txt +++ /dev/null @@ -1,344 +0,0 @@ -Wait/Wound Deadlock-Proof Mutex Design -====================================== - -Please read mutex-design.txt first, as it applies to wait/wound mutexes too. - -Motivation for WW-Mutexes -------------------------- - -GPU's do operations that commonly involve many buffers. Those buffers -can be shared across contexts/processes, exist in different memory -domains (for example VRAM vs system memory), and so on. And with -PRIME / dmabuf, they can even be shared across devices. So there are -a handful of situations where the driver needs to wait for buffers to -become ready. If you think about this in terms of waiting on a buffer -mutex for it to become available, this presents a problem because -there is no way to guarantee that buffers appear in a execbuf/batch in -the same order in all contexts. That is directly under control of -userspace, and a result of the sequence of GL calls that an application -makes. Which results in the potential for deadlock. 
The problem gets -more complex when you consider that the kernel may need to migrate the -buffer(s) into VRAM before the GPU operates on the buffer(s), which -may in turn require evicting some other buffers (and you don't want to -evict other buffers which are already queued up to the GPU), but for a -simplified understanding of the problem you can ignore this. - -The algorithm that the TTM graphics subsystem came up with for dealing with -this problem is quite simple. For each group of buffers (execbuf) that need -to be locked, the caller would be assigned a unique reservation id/ticket, -from a global counter. In case of deadlock while locking all the buffers -associated with a execbuf, the one with the lowest reservation ticket (i.e. -the oldest task) wins, and the one with the higher reservation id (i.e. the -younger task) unlocks all of the buffers that it has already locked, and then -tries again. - -In the RDBMS literature this deadlock handling approach is called wait/wound: -The older task waits until it can acquire the contended lock. The younger task -needs to back off and drop all the locks it is currently holding, i.e. the -younger task is wounded. - -Concepts --------- - -Compared to normal mutexes two additional concepts/objects show up in the lock -interface for w/w mutexes: - -Acquire context: To ensure eventual forward progress it is important that a task -trying to acquire locks doesn't grab a new reservation id, but keeps the one it -acquired when starting the lock acquisition. This ticket is stored in the -acquire context. Furthermore the acquire context keeps track of debugging state -to catch w/w mutex interface abuse. - -W/w class: In contrast to normal mutexes the lock class needs to be explicit for -w/w mutexes, since it is required to initialize the acquire context. - -Furthermore there are three different classes of w/w lock acquire functions: - -* Normal lock acquisition with a context, using ww_mutex_lock. 
- -* Slowpath lock acquisition on the contending lock, used by the wounded task - after having dropped all already acquired locks. These functions have the - _slow postfix. - - From a simple semantics point-of-view the _slow functions are not strictly - required, since simply calling the normal ww_mutex_lock functions on the - contending lock (after having dropped all other already acquired locks) will - work correctly. After all if no other ww mutex has been acquired yet there's - no deadlock potential and hence the ww_mutex_lock call will block and not - prematurely return -EDEADLK. The advantage of the _slow functions is in - interface safety: - - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow - has a void return type. Note that since ww mutex code needs loops/retries - anyway the __must_check doesn't result in spurious warnings, even though the - very first lock operation can never fail. - - When full debugging is enabled ww_mutex_lock_slow checks that all acquired - ww mutex have been released (preventing deadlocks) and makes sure that we - block on the contending lock (preventing spinning through the -EDEADLK - slowpath until the contended lock can be acquired). - -* Functions to only acquire a single w/w mutex, which results in the exact same - semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL - context. - - Again this is not strictly required. But often you only want to acquire a - single lock in which case it's pointless to set up an acquire context (and so - better to avoid grabbing a deadlock avoidance ticket). - -Of course, all the usual variants for handling wake-ups due to signals are also -provided. - -Usage ------ - -Three different ways to acquire locks within the same w/w class. 
Common -definitions for methods #1 and #2: - -static DEFINE_WW_CLASS(ww_class); - -struct obj { - struct ww_mutex lock; - /* obj data */ -}; - -struct obj_entry { - struct list_head head; - struct obj *obj; -}; - -Method 1, using a list in execbuf->buffers that's not allowed to be reordered. -This is useful if a list of required objects is already tracked somewhere. -Furthermore the lock helper can propagate the -EALREADY return code back to -the caller as a signal that an object is twice on the list. This is useful if -the list is constructed from userspace input and the ABI requires userspace to -not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl). - -int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj *res_obj = NULL; - struct obj_entry *contended_entry = NULL; - struct obj_entry *entry; - - ww_acquire_init(ctx, &ww_class); - -retry: - list_for_each_entry (entry, list, head) { - if (entry->obj == res_obj) { - res_obj = NULL; - continue; - } - ret = ww_mutex_lock(&entry->obj->lock, ctx); - if (ret < 0) { - contended_entry = entry; - goto err; - } - } - - ww_acquire_done(ctx); - return 0; - -err: - list_for_each_entry_continue_reverse (entry, list, head) - ww_mutex_unlock(&entry->obj->lock); - - if (res_obj) - ww_mutex_unlock(&res_obj->lock); - - if (ret == -EDEADLK) { - /* we lost out in a seqno race, lock and retry.. */ - ww_mutex_lock_slow(&contended_entry->obj->lock, ctx); - res_obj = contended_entry->obj; - goto retry; - } - ww_acquire_fini(ctx); - - return ret; -} - -Method 2, using a list in execbuf->buffers that can be reordered. Same semantics -of duplicate entry detection using -EALREADY as method 1 above. But the -list-reordering allows for a bit more idiomatic code. 
- -int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj_entry *entry, *entry2; - - ww_acquire_init(ctx, &ww_class); - - list_for_each_entry (entry, list, head) { - ret = ww_mutex_lock(&entry->obj->lock, ctx); - if (ret < 0) { - entry2 = entry; - - list_for_each_entry_continue_reverse (entry2, list, head) - ww_mutex_unlock(&entry2->obj->lock); - - if (ret != -EDEADLK) { - ww_acquire_fini(ctx); - return ret; - } - - /* we lost out in a seqno race, lock and retry.. */ - ww_mutex_lock_slow(&entry->obj->lock, ctx); - - /* - * Move buf to head of the list, this will point - * buf->next to the first unlocked entry, - * restarting the for loop. - */ - list_del(&entry->head); - list_add(&entry->head, list); - } - } - - ww_acquire_done(ctx); - return 0; -} - -Unlocking works the same way for both methods #1 and #2: - -void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj_entry *entry; - - list_for_each_entry (entry, list, head) - ww_mutex_unlock(&entry->obj->lock); - - ww_acquire_fini(ctx); -} - -Method 3 is useful if the list of objects is constructed ad-hoc and not upfront, -e.g. when adjusting edges in a graph where each node has its own ww_mutex lock, -and edges can only be changed when holding the locks of all involved nodes. w/w -mutexes are a natural fit for such a case for two reasons: -- They can handle lock-acquisition in any order which allows us to start walking - a graph from a starting point and then iteratively discovering new edges and - locking down the nodes those edges connect to. -- Due to the -EALREADY return code signalling that a given object is already - held there's no need for additional book-keeping to break cycles in the graph - or keep track of which locks are already held (when using more than one node - as a starting point). 
- -Note that this approach differs in two important ways from the above methods: -- Since the list of objects is dynamically constructed (and might very well be - different when retrying due to hitting the -EDEADLK wound condition) there's - no need to keep any object on a persistent list when it's not locked. We can - therefore move the list_head into the object itself. -- On the other hand the dynamic object list construction also means that the -EALREADY return - code can't be propagated. - -Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a -list of starting nodes (passed in from userspace) using one of the above -methods. And then lock any additional objects affected by the operations using -method #3 below. The backoff/retry procedure will be a bit more involved, since -when the dynamic locking step hits -EDEADLK we also need to unlock all the -objects acquired with the fixed list. But the w/w mutex debug checks will catch -any interface misuse for these cases. - -Also, method 3 can't fail the lock acquisition step since it doesn't return --EALREADY. Of course this would be different when using the _interruptible -variants, but that's outside of the scope of these examples here. 
- -struct obj { - struct ww_mutex ww_mutex; - struct list_head locked_list; -}; - -static DEFINE_WW_CLASS(ww_class); - -void __unlock_objs(struct list_head *list) -{ - struct obj *entry, *temp; - - list_for_each_entry_safe (entry, temp, list, locked_list) { - /* need to do that before unlocking, since only the current lock holder is - allowed to use object */ - list_del(&entry->locked_list); - ww_mutex_unlock(entry->ww_mutex) - } -} - -void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - struct obj *obj; - - ww_acquire_init(ctx, &ww_class); - -retry: - /* re-init loop start state */ - loop { - /* magic code which walks over a graph and decides which objects - * to lock */ - - ret = ww_mutex_lock(obj->ww_mutex, ctx); - if (ret == -EALREADY) { - /* we have that one already, get to the next object */ - continue; - } - if (ret == -EDEADLK) { - __unlock_objs(list); - - ww_mutex_lock_slow(obj, ctx); - list_add(&entry->locked_list, list); - goto retry; - } - - /* locked a new object, add it to the list */ - list_add_tail(&entry->locked_list, list); - } - - ww_acquire_done(ctx); - return 0; -} - -void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx) -{ - __unlock_objs(list); - ww_acquire_fini(ctx); -} - -Method 4: Only lock one single object. In that case deadlock detection and -prevention is obviously overkill, since with grabbing just one lock you can't -produce a deadlock within just one class. To simplify this case the w/w mutex -api can be used with a NULL context. - -Implementation Details ----------------------- - -Design: - ww_mutex currently encapsulates a struct mutex, this means no extra overhead for - normal mutex locks, which are far more common. As such there is only a small - increase in code size if wait/wound mutexes are not used. - - In general, not much contention is expected. The locks are typically used to - serialize access to resources for devices. 
The only way to make wakeups - smarter would be at the cost of adding a field to struct mutex_waiter. This - would add overhead to all cases where normal mutexes are used, and - ww_mutexes are generally less performance sensitive. - -Lockdep: - Special care has been taken to warn for as many cases of api abuse - as possible. Some common api abuses will be caught with - CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended. - - Some of the errors which will be warned about: - - Forgetting to call ww_acquire_fini or ww_acquire_init. - - Attempting to lock more mutexes after ww_acquire_done. - - Attempting to lock the wrong mutex after -EDEADLK and - unlocking all mutexes. - - Attempting to lock the right mutex after -EDEADLK, - before unlocking all mutexes. - - - Calling ww_mutex_lock_slow before -EDEADLK was returned. - - - Unlocking mutexes with the wrong unlock function. - - Calling one of the ww_acquire_* twice on the same context. - - Using a different ww_class for the mutex than for the ww_acquire_ctx. - - Normal lockdep errors that can result in deadlocks. - - Some of the lockdep errors that can result in deadlocks: - - Calling ww_acquire_init to initialize a second ww_acquire_ctx before - having called ww_acquire_fini on the first. - - 'normal' deadlocks that can occur. - -FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic -implemented. 
diff --git a/MAINTAINERS b/MAINTAINERS index 1acc624ecfd7..aac481fcbf5c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5523,8 +5523,8 @@ M: Ingo Molnar L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/locking S: Maintained -F: Documentation/lockdep*.txt -F: Documentation/lockstat.txt +F: Documentation/locking/lockdep*.txt +F: Documentation/locking/lockstat.txt F: include/linux/lockdep.h F: kernel/locking/ diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c index 0dc57d5ecd10..3a02e5e3e9f3 100644 --- a/drivers/gpu/drm/drm_modeset_lock.c +++ b/drivers/gpu/drm/drm_modeset_lock.c @@ -35,7 +35,7 @@ * of extra utility/tracking out of our acquire-ctx. This is provided * by drm_modeset_lock / drm_modeset_acquire_ctx. * - * For basic principles of ww_mutex, see: Documentation/ww-mutex-design.txt + * For basic principles of ww_mutex, see: Documentation/locking/ww-mutex-design.txt * * The basic usage pattern is to: * diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 008388f920d7..f388481201cd 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -4,7 +4,7 @@ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * - * see Documentation/lockdep-design.txt for more details. + * see Documentation/locking/lockdep-design.txt for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H diff --git a/include/linux/mutex.h b/include/linux/mutex.h index e4c29418f407..cc31498fc526 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -133,7 +133,7 @@ static inline int mutex_is_locked(struct mutex *lock) /* * See kernel/locking/mutex.c for detailed documentation of these APIs. - * Also see Documentation/mutex-design.txt. + * Also see Documentation/locking/mutex-design.txt. 
*/ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 035d3c57fc8a..8f498cdde280 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -149,7 +149,7 @@ extern void downgrade_write(struct rw_semaphore *sem); * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. - * See Documentation/lockdep-design.txt for more details.) + * See Documentation/locking/lockdep-design.txt for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0d8b6ed93874..dadbf88c22c4 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -15,7 +15,7 @@ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale * and Sven Dietrich. * - * Also see Documentation/mutex-design.txt. + * Also see Documentation/locking/mutex-design.txt. */ #include #include diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index a0ea2a141b3b..7c98873a3077 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,7 +8,7 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * - * See Documentation/rt-mutex-design.txt for details. + * See Documentation/locking/rt-mutex-design.txt for details. */ #include #include diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 901096d31c66..9b94a063e26c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -924,7 +924,7 @@ config PROVE_LOCKING the proof of observed correctness is also maintained for an arbitrary combination of these separate locking variants. - For more details, see Documentation/lockdep-design.txt. 
+ For more details, see Documentation/locking/lockdep-design.txt. config LOCKDEP bool @@ -945,7 +945,7 @@ config LOCK_STAT help This feature enables tracking lock contention points - For more details, see Documentation/lockstat.txt + For more details, see Documentation/locking/lockstat.txt This also enables lock events required by "perf lock", subcommand of perf. -- cgit v1.2.3 From f0bab73cb539fb803c4d419951e8d28aa4964f8f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2014 13:22:01 -0400 Subject: locking/lockdep: Restrict the use of recursive read_lock() with qrwlock Unlike the original unfair rwlock implementation, queued rwlock will grant lock according to the chronological sequence of the lock requests except when the lock requester is in the interrupt context. Consequently, recursive read_lock calls will now hang the process if there is a write_lock call somewhere in between the read_lock calls. This patch updates the lockdep implementation to look for recursive read_lock calls. A new read state (3) is used to mark those read_lock call that cannot be recursively called except in the interrupt context. The new read state does exhaust the 2 bits available in held_lock:read bit field. The addition of any new read state in the future may require a redesign of how all those bits are squeezed together in the held_lock structure. 
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra Cc: Maarten Lankhorst Cc: Rik van Riel Cc: Scott J Norton Cc: Fengguang Wu Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407345722-61615-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 10 +++++++++- kernel/locking/lockdep.c | 6 ++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f388481201cd..b5a84b62fb84 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -478,16 +478,24 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ +/* + * Read states in the 2-bit held_lock:read field: + * 0: Exclusive lock + * 1: Shareable lock, cannot be recursively called + * 2: Shareable lock, can be recursively called + * 3: Shareable lock, cannot be recursively called except in interrupt context + */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) +#define lock_acquire_shared_irecursive(l, s, t, n, i) lock_acquire(l, s, t, 3, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, n, i) lock_release(l, n, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) -#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) +#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_irecursive(l, s, t, NULL, i) #define rwlock_release(l, n, i) lock_release(l, n, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..420ba685c4e5 100644 
--- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3597,6 +3597,12 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, raw_local_irq_save(flags); check_flags(flags); + /* + * An interrupt recursive read in interrupt context can be considered + * to be the same as a recursive read from checking perspective. + */ + if ((read == 3) && in_interrupt()) + read = 2; current->lockdep_recursion = 1; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, -- cgit v1.2.3 From 0680eb1f485ba5aac2ee02c9f0622239c9a4b16c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 13 Aug 2014 12:47:14 -0700 Subject: timekeeping: Another fix to the VSYSCALL_OLD update_vsyscall Benjamin Herrenschmidt pointed out that I further missed modifying update_vsyscall after the wall_to_mono value was changed to a timespec64. This causes issues on powerpc32, which expects a 32bit timespec. This patch fixes the problem by properly converting from a timespec64 to a timespec before passing the value on to the arch-specific vsyscall logic. [ Thomas is currently on vacation, but reviewed it and wanted me to send this fix on to you directly. 
] Cc: LKML Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Benjamin Herrenschmidt Reported-by: Benjamin Herrenschmidt Reviewed-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f36b02838a47..fb4a9c2cf8d9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); static inline void update_vsyscall(struct timekeeper *tk) { - struct timespec xt; + struct timespec xt, wm; xt = timespec64_to_timespec(tk_xtime(tk)); - update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, + wm = timespec64_to_timespec(tk->wall_to_monotonic); + update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, tk->tkr.cycle_last); } -- cgit v1.2.3 From ff7e0055bb5ddbbb320cdd8dfd3e18672bddd2ad Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 16 Aug 2014 04:13:37 +0930 Subject: module: Clean up ro/nx after early module load failures The commit 4982223e51e8 module: set nx before marking module MODULE_STATE_COMING. introduced a regression: if a module fails to parse its arguments or if mod_sysfs_setup fails, then the module's memory will be freed while still read-only. Anything that reuses that memory will crash as soon as it tries to write to it. 
Cc: stable@vger.kernel.org # v3.16 Cc: Rusty Russell Signed-off-by: Andy Lutomirski Signed-off-by: Rusty Russell --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6f69463f0066..03214bd288e9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3304,6 +3304,11 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); + + /* we can't deallocate the module until we clear memory protection */ + unset_module_init_ro_nx(mod); + unset_module_core_ro_nx(mod); + ddebug_cleanup: dynamic_debug_remove(info->debug); synchronize_sched(); -- cgit v1.2.3 From d3ac21cacc24790eb45d735769f35753f5b56ceb Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 17 Aug 2014 19:41:09 -0500 Subject: mm: Support compiling out madvise and fadvise Many embedded systems will not need these syscalls, and omitting them saves space. Add a new EXPERT config option CONFIG_ADVISE_SYSCALLS (default y) to support compiling them out. bloat-o-meter: add/remove: 0/3 grow/shrink: 0/0 up/down: 0/-2250 (-2250) function old new delta sys_fadvise64 57 - -57 sys_fadvise64_64 691 - -691 sys_madvise 1502 - -1502 Signed-off-by: Josh Triplett --- init/Kconfig | 10 ++++++++++ kernel/sys_ni.c | 3 +++ mm/Makefile | 7 +++++-- 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index e84c6423a2e5..782a65bf76ea 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1537,6 +1537,16 @@ config AIO by some high performance threaded applications. Disabling this option saves about 7k. +config ADVISE_SYSCALLS + bool "Enable madvise/fadvise syscalls" if EXPERT + default y + help + This option enables the madvise and fadvise syscalls, used by + applications to advise the kernel about their future memory or file + usage, improving performance. 
If building an embedded system where no + applications use these syscalls, you can disable this option to save + space. + config PCI_QUIRKS default y bool "Enable PCI quirk workarounds" if EXPERT diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 391d4ddb6f4b..d4709d481053 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -156,6 +156,9 @@ cond_syscall(sys_process_vm_writev); cond_syscall(compat_sys_process_vm_readv); cond_syscall(compat_sys_process_vm_writev); cond_syscall(sys_uselib); +cond_syscall(sys_fadvise64); +cond_syscall(sys_fadvise64_64); +cond_syscall(sys_madvise); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/mm/Makefile b/mm/Makefile index 632ae77e6070..fe7a053c0f45 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o @@ -11,7 +11,7 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o endif -obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := filemap.o mempool.o oom_kill.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ @@ -28,6 +28,9 @@ else obj-y += bootmem.o endif +ifdef CONFIG_MMU + obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o madvise.o +endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o -- cgit v1.2.3 From f530504a063cfa028971e4b26ea8e0c32908de25 Mon Sep 17 00:00:00 2001 From: chai wen Date: Mon, 11 Aug 2014 10:49:23 -0400 Subject: watchdog: Remove unnecessary header files Signed-off-by: chai wen Signed-off-by: Don Zickus Cc: pbonzini@redhat.com Link: http://lkml.kernel.org/r/1407768567-171794-2-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c 
| 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd1b040..4c2e11ce5425 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -15,11 +15,6 @@ #include #include #include -#include -#include -#include -#include -#include #include #include #include -- cgit v1.2.3 From df577149594cefacd62740e86de080c6336d699e Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Mon, 11 Aug 2014 10:49:25 -0400 Subject: watchdog: Fix print-once on enable This patch avoids printing the message 'enabled on all CPUs, ...' multiple times. For example, the issue can occur in the following scenario: 1) watchdog_nmi_enable() fails to enable PMU counters and sets cpu0_err. 2) 'echo [0|1] > /proc/sys/kernel/nmi_watchdog' is executed to disable and re-enable the watchdog mechanism 'on the fly'. 3) If watchdog_nmi_enable() succeeds to enable PMU counters, each CPU will print the message because step1 left behind a non-zero cpu0_err. if (!IS_ERR(event)) { if (cpu == 0 || cpu0_err) pr_info("enabled on all CPUs, ...") The patch avoids this by clearing cpu0_err in watchdog_nmi_disable(). Signed-off-by: Ulrich Obergfell Signed-off-by: Andrew Jones Signed-off-by: Don Zickus Cc: pbonzini@redhat.com Link: http://lkml.kernel.org/r/1407768567-171794-4-git-send-email-dzickus@redhat.com [ Applied small cleanups. ] Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4c2e11ce5425..df5494edf694 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -506,7 +506,10 @@ static void watchdog_nmi_disable(unsigned int cpu) /* should be in cleanup, but blocks oprofile */ perf_event_release_kernel(event); } - return; + if (cpu == 0) { + /* watchdog_nmi_enable() expects this to be zero initially. 
*/ + cpu0_err = 0; + } } #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } -- cgit v1.2.3 From 71b1fb5c4473a5b1e601d41b109bdfe001ec82e0 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Mon, 18 Aug 2014 12:20:20 +0100 Subject: cgroup: reject cgroup names with '\n' /proc/<pid>/cgroup contains one cgroup path on each line. If cgroup names are allowed to contain "\n", applications cannot parse /proc/<pid>/cgroup safely. Signed-off-by: Alban Crequy Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788cfd52..c3d1802a9b30 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4543,6 +4543,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, struct cftype *base_files; int ssid, ret; + /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. + */ + if (strchr(name, '\n')) + return -EINVAL; + parent = cgroup_kn_lock_live(parent_kn); if (!parent) return -ENODEV; -- cgit v1.2.3 From b3f207855f57b9c8f43a547a801340bb5cbc59e5 Mon Sep 17 00:00:00 2001 From: Pawel Moll Date: Fri, 13 Jun 2014 16:03:32 +0100 Subject: perf: Handle compat ioctl When running a 32-bit userspace on a 64-bit kernel (eg. i386 application on x86_64 kernel or 32-bit arm userspace on arm64 kernel) some of the perf ioctls must be treated with special care, as they have a pointer size encoded in the command. For example, PERF_EVENT_IOC_ID in 32-bit world will be encoded as 0x80042407, but 64-bit kernel will expect 0x80082407. In result the ioctl will fail returning -ENOTTY. This patch solves the problem by adding code fixing up the size as compat_ioctl file operation.
Reported-by: Drew Richardson Signed-off-by: Pawel Moll Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1402671812-9078-1-git-send-email-pawel.moll@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cf24b3e42ec..f9c1ed002dbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "internal.h" @@ -3717,6 +3718,26 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +#ifdef CONFIG_COMPAT +static long perf_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): + case _IOC_NR(PERF_EVENT_IOC_ID): + /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ + if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { + cmd &= ~IOCSIZE_MASK; + cmd |= sizeof(void *) << IOCSIZE_SHIFT; + } + break; + } + return perf_ioctl(file, cmd, arg); +} +#else +# define perf_compat_ioctl NULL +#endif + int perf_event_task_enable(void) { struct perf_event *event; @@ -4222,7 +4243,7 @@ static const struct file_operations perf_fops = { .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, + .compat_ioctl = perf_compat_ioctl, .mmap = perf_mmap, .fasync = perf_fasync, }; -- cgit v1.2.3 From 5d07f4202c5d63b73ba1734ed38e08461a689313 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Aug 2014 21:19:53 +0200 Subject: sched: s/do_each_thread/for_each_process_thread/ in core.c Change kernel/sched/core.c to use for_each_process_thread(). 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Hidetoshi Seto Cc: Frank Mayhar Cc: Frederic Weisbecker Cc: Andrew Morton Cc: Sanjay Rao Cc: Larry Woodman Cc: Rik van Riel Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140813191953.GA19315@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d1ec6e60535..4f2826f46e95 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4505,7 +4505,7 @@ void show_state_filter(unsigned long state_filter) " task PC stack pid father\n"); #endif rcu_read_lock(); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: @@ -4513,7 +4513,7 @@ void show_state_filter(unsigned long state_filter) touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); - } while_each_thread(g, p); + } touch_all_softlockup_watchdogs(); @@ -7137,7 +7137,7 @@ void normalize_rt_tasks(void) struct rq *rq; read_lock_irqsave(&tasklist_lock, flags); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* * Only normalize user tasks: */ @@ -7168,8 +7168,7 @@ void normalize_rt_tasks(void) __task_rq_unlock(rq); raw_spin_unlock(&p->pi_lock); - } while_each_thread(g, p); - + } read_unlock_irqrestore(&tasklist_lock, flags); } @@ -7357,10 +7356,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg) { struct task_struct *g, *p; - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (rt_task(p) && task_rq(p)->rt.tg == tg) return 1; - } while_each_thread(g, p); + } return 0; } -- cgit v1.2.3 From d38e83c715270cc2e137bbf6f25206c8c023896b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Aug 2014 21:19:56 +0200 Subject: sched: s/do_each_thread/for_each_process_thread/ in debug.c Change kernel/sched/debug.c to use 
for_each_process_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Hidetoshi Seto Cc: Frank Mayhar Cc: Frederic Weisbecker Cc: Andrew Morton Cc: Sanjay Rao Cc: Larry Woodman Cc: Rik van Riel Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140813191956.GA19324@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 627b3c34b821..c7fe1ea0e8ab 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -160,14 +160,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "----------------------------------------------------\n"); read_lock_irqsave(&tasklist_lock, flags); - - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); - } while_each_thread(g, p); - + } read_unlock_irqrestore(&tasklist_lock, flags); } -- cgit v1.2.3 From 1e4dda08b4c39b3d8f4a3ee7269d49e0200c8af8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Aug 2014 21:20:00 +0200 Subject: sched: Change thread_group_cputime() to use for_each_thread() Change thread_group_cputime() to use for_each_thread() instead of buggy while_each_thread(). This also makes the pid_alive() check unnecessary. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Hidetoshi Seto Cc: Frank Mayhar Cc: Frederic Weisbecker Cc: Andrew Morton Cc: Sanjay Rao Cc: Larry Woodman Cc: Rik van Riel Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140813192000.GA19327@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 72fdf06ef865..3e52836359ba 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -294,18 +294,12 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) times->sum_exec_runtime = sig->sum_sched_runtime; rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; - do { + for_each_thread(tsk, t) { task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: + } rcu_read_unlock(); } -- cgit v1.2.3 From 5aface53d1a0ef7823215c4078fca8445995d006 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 13 Aug 2014 21:20:03 +0200 Subject: sched: Change autogroup_move_group() to use for_each_thread() Change autogroup_move_group() to use for_each_thread() instead of buggy while_each_thread(). 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Hidetoshi Seto Cc: Frank Mayhar Cc: Frederic Weisbecker Cc: Andrew Morton Cc: Sanjay Rao Cc: Larry Woodman Cc: Rik van Riel Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140813192003.GA19334@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e73efba98301..8a2e230fb86a 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) goto out; - t = p; - do { + for_each_thread(p, t) sched_move_task(t); - } while_each_thread(p, t); - out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); -- cgit v1.2.3 From 8b06c55bdb8b402cb4814e83dc4b1cb245fcc9f5 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 13 Aug 2014 13:28:12 -0400 Subject: sched: Match declaration with definition Match the declaration of runqueues with the definition. 
Signed-off-by: Pranith Kumar Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407950893-32731-1-git-send-email-bobby.prani@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f4e9d5..4c2b87fd5f52 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -647,7 +647,7 @@ static inline int cpu_of(struct rq *rq) #endif } -DECLARE_PER_CPU(struct rq, runqueues); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) -- cgit v1.2.3 From f36c019c79edb3a89920afae1b2b45987af1a112 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 6 Aug 2014 12:06:01 +0400 Subject: sched/fair: Fix reschedule which is generated on throttled cfs_rq (sched_entity::on_rq == 1) does not guarantee the task is pickable; changes on throttled cfs_rq must not lead to reschedule. Check for task_struct::on_rq instead. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1407312361.8424.35.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1413c44ce8a1..bc37bb97159f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7494,7 +7494,7 @@ static void task_fork_fair(struct task_struct *p) static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!p->on_rq) return; /* @@ -7550,15 +7550,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *se = &p->se; /* * Since the real-depth could have been changed (only FAIR * class maintain depth value), reset depth properly. */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!se->on_rq) + if (!p->on_rq) return; /* -- cgit v1.2.3 From da0c1e65b51a289540159663aa4b90ba2366bc21 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:47:32 +0400 Subject: sched: Add wrapper for checking task_struct::on_rq Implement task_on_rq_queued() and use it everywhere instead of on_rq check. No functional changes. The only exception is we do not use the wrapper in check_for_tasks(), because it requires to export task_on_rq_queued() in global header files. Next patch in series would return it back, so we do not twist it from here to there. 
Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528052.23412.87.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 82 ++++++++++++++++++++++++------------------------ kernel/sched/deadline.c | 15 ++++----- kernel/sched/fair.c | 22 ++++++------- kernel/sched/rt.c | 16 +++++----- kernel/sched/sched.h | 7 +++++ kernel/sched/stop_task.c | 2 +- 6 files changed, 76 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4f2826f46e95..a02b624fee6c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1043,7 +1043,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) + if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -1088,7 +1088,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) static void __migrate_swap_task(struct task_struct *p, int cpu) { - if (p->on_rq) { + if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; src_rq = task_rq(p); @@ -1214,7 +1214,7 @@ static int migration_cpu_stop(void *data); unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; - int running, on_rq; + int running, queued; unsigned long ncsw; struct rq *rq; @@ -1252,7 +1252,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -1284,7 +1284,7 @@ unsigned long 
wait_task_inactive(struct task_struct *p, long match_state) * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(on_rq)) { + if (unlikely(queued)) { ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); set_current_state(TASK_UNINTERRUPTIBLE); @@ -1478,7 +1478,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -1537,7 +1537,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) int ret = 0; rq = __task_rq_lock(p); - if (p->on_rq) { + if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); ttwu_do_wakeup(rq, p, wake_flags); @@ -1678,7 +1678,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (task_on_rq_queued(p) && ttwu_remote(p, wake_flags)) goto stat; #ifdef CONFIG_SMP @@ -1742,7 +1742,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) goto out; - if (!p->on_rq) + if (!task_on_rq_queued(p)) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); @@ -2095,7 +2095,7 @@ void wake_up_new_task(struct task_struct *p) init_task_runnable_average(p); rq = __task_rq_lock(p); activate_task(rq, p, 0); - p->on_rq = 1; + p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -2444,7 +2444,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). 
*/ - if (task_current(rq, p) && p->on_rq) { + if (task_current(rq, p) && task_on_rq_queued(p)) { update_rq_clock(rq); ns = rq_clock_task(rq) - p->se.exec_start; if ((s64)ns < 0) @@ -2490,7 +2490,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we see ->on_cpu without ->on_rq, the task is leaving, and has * been accounted, so we're correct here as well. */ - if (!p->on_cpu || !p->on_rq) + if (!p->on_cpu || !task_on_rq_queued(p)) return p->se.sum_exec_runtime; #endif @@ -2794,7 +2794,7 @@ need_resched: switch_count = &prev->nvcsw; } - if (prev->on_rq || rq->skip_clock_update < 0) + if (task_on_rq_queued(prev) || rq->skip_clock_update < 0) update_rq_clock(rq); next = pick_next_task(rq, prev); @@ -2959,7 +2959,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running, enqueue_flag = 0; + int oldprio, queued, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; @@ -2988,9 +2988,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -3030,7 +3030,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); @@ -3041,7 +3041,7 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, on_rq; + int old_prio, delta, queued; unsigned long flags; struct rq *rq; @@ -3062,8 +3062,8 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); 
p->static_prio = NICE_TO_PRIO(nice); @@ -3072,7 +3072,7 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); delta = p->prio - old_prio; - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); /* * If the task increased its priority or is running and @@ -3344,7 +3344,7 @@ static int __sched_setscheduler(struct task_struct *p, { int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : MAX_RT_PRIO - 1 - attr->sched_priority; - int retval, oldprio, oldpolicy = -1, on_rq, running; + int retval, oldprio, oldpolicy = -1, queued, running; int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; @@ -3541,9 +3541,9 @@ change: return 0; } - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -3553,7 +3553,7 @@ change: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (queued) { /* * We enqueue to tail when the priority of a task is * increased (user space view). @@ -4568,7 +4568,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; - idle->on_rq = 1; + idle->on_rq = TASK_ON_RQ_QUEUED; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4645,7 +4645,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (p->on_rq) { + if (task_on_rq_queued(p)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); @@ -4695,7 +4695,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. 
*/ - if (p->on_rq) { + if (task_on_rq_queued(p)) { dequeue_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); enqueue_task(rq_dest, p, 0); @@ -4736,13 +4736,13 @@ void sched_setnuma(struct task_struct *p, int nid) { struct rq *rq; unsigned long flags; - bool on_rq, running; + bool queued, running; rq = task_rq_lock(p, &flags); - on_rq = p->on_rq; + queued = task_on_rq_queued(p); running = task_current(rq, p); - if (on_rq) + if (queued) dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -4751,7 +4751,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, p, 0); task_rq_unlock(rq, p, &flags); } @@ -7116,13 +7116,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p) .sched_policy = SCHED_NORMAL, }; int old_prio = p->prio; - int on_rq; + int queued; - on_rq = p->on_rq; - if (on_rq) + queued = task_on_rq_queued(p); + if (queued) dequeue_task(rq, p, 0); __setscheduler(rq, p, &attr); - if (on_rq) { + if (queued) { enqueue_task(rq, p, 0); resched_curr(rq); } @@ -7309,16 +7309,16 @@ void sched_offline_group(struct task_group *tg) void sched_move_task(struct task_struct *tsk) { struct task_group *tg; - int on_rq, running; + int queued, running; unsigned long flags; struct rq *rq; rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->on_rq; + queued = task_on_rq_queued(tsk); - if (on_rq) + if (queued) dequeue_task(rq, tsk, 0); if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); @@ -7331,14 +7331,14 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, on_rq); + tsk->sched_class->task_move_group(tsk, queued); else #endif set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); - if (on_rq) + if (queued) enqueue_task(rq, tsk, 0); task_rq_unlock(rq, tsk, &flags); diff 
--git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce138b652..d21a8e0259d2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -530,7 +530,7 @@ again: update_rq_clock(rq); dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; - if (p->on_rq) { + if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); @@ -1030,7 +1030,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) * means a stop task can slip in, in which case we need to * re-start task selection. */ - if (rq->stop && rq->stop->on_rq) + if (rq->stop && task_on_rq_queued(rq->stop)) return RETRY_TASK; } @@ -1257,7 +1257,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || - task_running(rq, task) || !task->on_rq)) { + task_running(rq, task) || + !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -1296,7 +1297,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!dl_task(p)); return p; @@ -1443,7 +1444,7 @@ static int pull_dl_task(struct rq *this_rq) dl_time_before(p->dl.deadline, this_rq->dl.earliest_dl.curr))) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * Then we pull iff p has actually an earlier @@ -1596,7 +1597,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (unlikely(p->dl.dl_throttled)) return; - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ @@ -1614,7 +1615,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void 
prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (p->on_rq || rq->curr == p) { + if (task_on_rq_queued(p) || rq->curr == p) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc37bb97159f..9e6ca0d88f51 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7494,7 +7494,7 @@ static void task_fork_fair(struct task_struct *p) static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7519,11 +7519,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it's on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !on_rq, then only when + * If it's queued, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !queued, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!p->on_rq && p->state != TASK_RUNNING) { + if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7558,7 +7558,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) */ se->depth = se->parent ? se->parent->depth + 1 : 0; #endif - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; /* @@ -7604,7 +7604,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int queued) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; @@ -7623,7 +7623,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * fair sleeper stuff for the first placement, but who cares. 
*/ /* - * When !on_rq, vruntime of the task has usually NOT been normalized. + * When !queued, vruntime of the task has usually NOT been normalized. * But there are some cases where it has already been normalized: * * - Moving a forked child which is waiting for being woken up by @@ -7634,14 +7634,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - on_rq = 1; + if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) + queued = 1; - if (!on_rq) + if (!queued) se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!on_rq) { + if (!queued) { cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca4fafd..4feac8fcb47f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) * means a dl or stop task can slip in, in which case we need * to re-start task selection. 
*/ - if (unlikely((rq->stop && rq->stop->on_rq) || + if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) || rq->dl.dl_nr_running)) return RETRY_TASK; } @@ -1624,7 +1624,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, tsk_cpus_allowed(task)) || task_running(rq, task) || - !task->on_rq)) { + !task_on_rq_queued(task))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1658,7 +1658,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->nr_cpus_allowed <= 1); - BUG_ON(!p->on_rq); + BUG_ON(!task_on_rq_queued(p)); BUG_ON(!rt_task(p)); return p; @@ -1809,7 +1809,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->on_rq); + WARN_ON(!task_on_rq_queued(p)); /* * There's a chance that p is higher in priority @@ -1870,7 +1870,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, BUG_ON(!rt_task(p)); - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; weight = cpumask_weight(new_mask); @@ -1936,7 +1936,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (!p->on_rq || rq->rt.rt_nr_running) + if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) return; if (pull_rt_task(rq)) @@ -1970,7 +1970,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. 
*/ - if (p->on_rq && rq->curr != p) { + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && /* Don't resched if we changed runqueues */ @@ -1989,7 +1989,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->on_rq) + if (!task_on_rq_queued(p)) return; if (rq->curr == p) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4c2b87fd5f52..26566d0c67ac 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -15,6 +15,9 @@ struct rq; +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 + extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; @@ -942,6 +945,10 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #endif } +static inline int task_on_rq_queued(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_QUEUED; +} #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index bfe0edadbfbb..67426e529f59 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (!stop || !stop->on_rq) + if (!stop || !task_on_rq_queued(stop)) return NULL; put_prev_task(rq, prev); -- cgit v1.2.3 From cca26e8009d1939a6a5bf0200d276fa26f03e536 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:47:42 +0400 Subject: sched: Teach scheduler to understand TASK_ON_RQ_MIGRATING state This is a new p->on_rq state which will be used to indicate that a task is in a process of migrating between two RQs. It allows to get rid of double_rq_lock(), which we used to use to change a rq of a queued task before. Let's consider an example. 
To move a task between src_rq and dst_rq we will do the following: raw_spin_lock(&src_rq->lock); /* p is a task which is queued on src_rq */ p = ...; dequeue_task(src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, dst_cpu); raw_spin_unlock(&src_rq->lock); /* * Both RQs are unlocked here. * Task p is dequeued from src_rq * but its on_rq value is not zero. */ raw_spin_lock(&dst_rq->lock); p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(dst_rq, p, 0); raw_spin_unlock(&dst_rq->lock); While p->on_rq is TASK_ON_RQ_MIGRATING, the task is considered to be "migrating", and other parallel scheduler actions with it are not available to parallel callers. The parallel caller spins until the migration is completed. The unavailable actions are changing of cpu affinity, changing of priority, etc.; in other words, all the functionality which used to require task_rq(p)->lock before (and related to the task). To implement TASK_ON_RQ_MIGRATING support we primarily rely on the following fact. Most scheduler users (from which we are protecting a migrating task) use task_rq_lock() and __task_rq_lock() to get the lock of task_rq(p). These primitives know that the task's cpu may change, and they spin while the lock of the right RQ is not held. We add one more condition to them, so they will also spin until the migration is finished.
Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528062.23412.88.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 +++++++++--- kernel/sched/sched.h | 6 ++++++ 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a02b624fee6c..71b836034912 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -333,9 +333,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) return rq; raw_spin_unlock(&rq->lock); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); } } @@ -352,10 +355,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); raw_spin_lock(&rq->lock); - if (likely(rq == task_rq(p))) + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) return rq; raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + + while (unlikely(task_on_rq_migrating(p))) + cpu_relax(); } } @@ -1678,7 +1684,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); - if (task_on_rq_queued(p) && ttwu_remote(p, wake_flags)) + if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 26566d0c67ac..aa0f73ba3777 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -17,6 +17,7 @@ struct rq; /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 extern __read_mostly int scheduler_running; @@ -950,6 +951,11 @@ static inline int 
task_on_rq_queued(struct task_struct *p) return p->on_rq == TASK_ON_RQ_QUEUED; } +static inline int task_on_rq_migrating(struct task_struct *p) +{ + return p->on_rq == TASK_ON_RQ_MIGRATING; +} + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -- cgit v1.2.3 From a1e01829796aa7a993e28ffd7fee5c8d525be175 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:47:50 +0400 Subject: sched: Remove double_rq_lock() from __migrate_task() Avoid double_rq_lock() and use TASK_ON_RQ_MIGRATING for __migrate_task(). The advantage is (obviously) not holding two rq->lock's at the same time and thereby increasing parallelism. The important point to note is that because we acquire dst->lock immediately after releasing src->lock the potential wait time of task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer than it would have been in the double rq lock scenario. Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528070.23412.89.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 71b836034912..a773c919d88d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4679,20 +4679,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); */ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { - struct rq *rq_dest, *rq_src; + struct rq *rq; int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; - rq_src = cpu_rq(src_cpu); - rq_dest = cpu_rq(dest_cpu); + rq = cpu_rq(src_cpu); raw_spin_lock(&p->pi_lock); - double_rq_lock(rq_src, rq_dest); + raw_spin_lock(&rq->lock); /* Already moved. */ if (task_cpu(p) != src_cpu) goto done; + /* Affinity changed (again). 
*/ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) goto fail; @@ -4702,15 +4702,22 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * placed properly. */ if (task_on_rq_queued(p)) { - dequeue_task(rq_src, p, 0); + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, dest_cpu); - enqueue_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p, 0); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(dest_cpu); + raw_spin_lock(&rq->lock); + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); } done: ret = 1; fail: - double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); return ret; } -- cgit v1.2.3 From e5673f280501298dbb56efa46e333cf64ee5080a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:48:01 +0400 Subject: sched/fair: Remove double_lock_balance() from active_load_balance_cpu_stop() Avoid double_rq_lock() and use the TASK_ON_RQ_MIGRATING state for active_load_balance_cpu_stop(). The advantage is (obviously) not holding two 'rq->lock's at the same time and thereby increasing parallelism. Further note that if there was no task to migrate we will not have acquired the second rq->lock at all. The important point to note is that because we acquire dst->lock immediately after releasing src->lock the potential wait time of task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer than it would have been in the double rq lock scenario. 
Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528081.23412.92.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 60 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9e6ca0d88f51..7e5cf051c144 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5138,6 +5138,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env) { s64 delta; + lockdep_assert_held(&env->src_rq->lock); + if (p->sched_class != &fair_sched_class) return 0; @@ -5257,6 +5259,9 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; + + lockdep_assert_held(&env->src_rq->lock); + /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or @@ -5341,30 +5346,49 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) } /* - * move_one_task tries to move exactly one task from busiest to this_rq, as + * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing operations within "domain". - * Returns 1 if successful and 0 otherwise. * - * Called with both runqueues locked. + * Returns a task if successful and NULL otherwise. 
*/ -static int move_one_task(struct lb_env *env) +static struct task_struct *detach_one_task(struct lb_env *env) { struct task_struct *p, *n; + lockdep_assert_held(&env->src_rq->lock); + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { if (!can_migrate_task(p, env)) continue; - move_task(p, env); + deactivate_task(env->src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, env->dst_cpu); + /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). + * Right now, this is only the second place where + * lb_gained[env->idle] is updated (other is move_tasks) + * so we can safely collect stats here rather than + * inside move_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; + return p; } - return 0; + return NULL; +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. + */ +static void attach_one_task(struct rq *rq, struct task_struct *p) +{ + raw_spin_lock(&rq->lock); + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + raw_spin_unlock(&rq->lock); } static const unsigned int sched_nr_migrate_break = 32; @@ -6943,6 +6967,7 @@ static int active_load_balance_cpu_stop(void *data) int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; + struct task_struct *p = NULL; raw_spin_lock_irq(&busiest_rq->lock); @@ -6962,9 +6987,6 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - /* Search for an sd spanning us and the target CPU. 
*/ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -6985,16 +7007,22 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); - if (move_one_task(&env)) + p = detach_one_task(&env); + if (p) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); } rcu_read_unlock(); - double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock_irq(&busiest_rq->lock); + raw_spin_unlock(&busiest_rq->lock); + + if (p) + attach_one_task(target_rq, p); + + local_irq_enable(); + return 0; } -- cgit v1.2.3 From 163122b7fcfa28c0e4a838fcc8043c616746802e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Aug 2014 13:48:29 +0400 Subject: sched/fair: Remove double_lock_balance() from load_balance() Avoid double_rq_lock() and use TASK_ON_RQ_MIGRATING for load_balance(). The advantage is (obviously) not holding two rq->lock's at the same time and thereby increasing parallelism. Further note that if there was no task to migrate we will not have acquired the second rq->lock at all. The important point to note is that because we acquire dst->lock immediately after releasing src->lock the potential wait time of task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer than it would have been in the double rq lock scenario. 
Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Paul Turner Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Mike Galbraith Cc: Kirill Tkhai Cc: Tim Chen Cc: Nicolas Pitre Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1408528109.23412.94.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 151 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 99 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e5cf051c144..d3427a8f254b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as move_task(), in which we + * This is possible from callers such as attach_tasks(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -5117,20 +5117,9 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + struct list_head tasks; }; -/* - * move_task - move a task from one runqueue to another runqueue. - * Both runqueues must be locked. 
- */ -static void move_task(struct task_struct *p, struct lb_env *env) -{ - deactivate_task(env->src_rq, p, 0); - set_task_cpu(p, env->dst_cpu); - activate_task(env->dst_rq, p, 0); - check_preempt_curr(env->dst_rq, p, 0); -} - /* * Is this task likely cache-hot: */ @@ -5345,6 +5334,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; } +/* + * detach_task() -- detach the task for the migration specified in env + */ +static void detach_task(struct task_struct *p, struct lb_env *env) +{ + lockdep_assert_held(&env->src_rq->lock); + + deactivate_task(env->src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, env->dst_cpu); +} + /* * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing operations within "domain". @@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env) if (!can_migrate_task(p, env)) continue; - deactivate_task(env->src_rq, p, 0); - p->on_rq = TASK_ON_RQ_MIGRATING; - set_task_cpu(p, env->dst_cpu); + detach_task(p, env); /* * Right now, this is only the second place where - * lb_gained[env->idle] is updated (other is move_tasks) + * lb_gained[env->idle] is updated (other is detach_tasks) * so we can safely collect stats here rather than - * inside move_tasks(). + * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); return p; @@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env) return NULL; } -/* - * attach_one_task() -- attaches the task returned from detach_one_task() to - * its new rq. 
- */ -static void attach_one_task(struct rq *rq, struct task_struct *p) -{ - raw_spin_lock(&rq->lock); - BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; - activate_task(rq, p, 0); - check_preempt_curr(rq, p, 0); - raw_spin_unlock(&rq->lock); -} - static const unsigned int sched_nr_migrate_break = 32; /* - * move_tasks tries to move up to imbalance weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. + * detach_tasks() -- tries to detach up to imbalance weighted load from + * busiest_rq, as part of a balancing operation within domain "sd". * - * Called with both runqueues locked. + * Returns number of detached tasks if successful and 0 otherwise. */ -static int move_tasks(struct lb_env *env) +static int detach_tasks(struct lb_env *env) { struct list_head *tasks = &env->src_rq->cfs_tasks; struct task_struct *p; unsigned long load; - int pulled = 0; + int detached = 0; + + lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; @@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - move_task(p, env); - pulled++; + detach_task(p, env); + list_add(&p->se.group_node, &env->tasks); + + detached++; env->imbalance -= load; #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize + * kernels will stop after the first task is detached to minimize * the critical section. */ if (env->idle == CPU_NEWLY_IDLE) @@ -5463,13 +5451,58 @@ next: } /* - * Right now, this is one of only two places move_task() is called, - * so we can safely collect move_task() stats here rather than - * inside move_task(). + * Right now, this is one of only two places we collect this stat + * so we can safely collect detach_one_task() stats here rather + * than inside detach_one_task(). 
*/ - schedstat_add(env->sd, lb_gained[env->idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], detached); - return pulled; + return detached; +} + +/* + * attach_task() -- attach the task detached by detach_task() to its new rq. + */ +static void attach_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_held(&rq->lock); + + BUG_ON(task_rq(p) != rq); + p->on_rq = TASK_ON_RQ_QUEUED; + activate_task(rq, p, 0); + check_preempt_curr(rq, p, 0); +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. + */ +static void attach_one_task(struct rq *rq, struct task_struct *p) +{ + raw_spin_lock(&rq->lock); + attach_task(rq, p); + raw_spin_unlock(&rq->lock); +} + +/* + * attach_tasks() -- attaches all tasks detached by detach_tasks() to their + * new rq. + */ +static void attach_tasks(struct lb_env *env) +{ + struct list_head *tasks = &env->tasks; + struct task_struct *p; + + raw_spin_lock(&env->dst_rq->lock); + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); + list_del_init(&p->se.group_node); + + attach_task(env->dst_rq, p); + } + + raw_spin_unlock(&env->dst_rq->lock); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .loop_break = sched_nr_migrate_break, .cpus = cpus, .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), }; /* @@ -6652,16 +6686,29 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - local_irq_save(flags); - double_rq_lock(env.dst_rq, busiest); + raw_spin_lock_irqsave(&busiest->lock, flags); /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ - cur_ld_moved = move_tasks(&env); - ld_moved += cur_ld_moved; - double_rq_unlock(env.dst_rq, busiest); + cur_ld_moved = detach_tasks(&env); + + /* + * We've detached some tasks from busiest_rq. 
Every + * task is masked "TASK_ON_RQ_MIGRATING", so we can safely + * unlock busiest->lock, and we are able to be sure + * that nobody can manipulate the tasks in parallel. + * See task_rq_lock() family for the details. + */ + + raw_spin_unlock(&busiest->lock); + + if (cur_ld_moved) { + attach_tasks(&env); + ld_moved += cur_ld_moved; + } + local_irq_restore(flags); /* @@ -6797,7 +6844,7 @@ more_balance: * If we've begun active balancing, start to back off. This * case may not be covered by the all_pinned logic if there * is only 1 task on the busy runqueue (because we don't call - * move_tasks). + * detach_tasks). */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; -- cgit v1.2.3 From b5e995e671d8e4d7a75b339ce78ecc586014b0eb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 12 Jun 2014 16:24:41 +0530 Subject: nohz: Fix spurious periodic tick behaviour in low-res dynticks mode When we reach the end of the tick handler, we unconditionally reschedule the next tick to the next jiffy. Then on irq exit, the nohz code overrides that setting if needed and defers the next tick as far away in the future as possible. Now in the best dynticks case, when we actually don't need any tick in the future (ie: expires == KTIME_MAX), low-res and high-res behave differently. What we want in this case is to cancel the next tick programmed by the previous one. That's what we do in high-res mode. OTOH we lack a low-res mode equivalent of hrtimer_cancel() so we simply don't do anything in this case and the next tick remains scheduled to jiffies + 1. As a result, in low-res mode, when the dynticks code determines that no tick is needed in the future, we can recursively get a spurious tick every jiffy because then the next tick is always reprogrammed from the tick handler and is never cancelled. 
And this can happen indefinitely until some subsystem actually needs a precise tick in the future and only then we eventually overwrite the previous tick handler setting to defer the next tick. We are fixing this by introducing the ONESHOT_STOPPED mode which will let us pause a clockevent when no further interrupt is needed. Meanwhile we can't expect all drivers to support this new mode. So let's reduce much of the symptoms by skipping the nohz-blind tick rescheduling from the tick-handler when the CPU is in dynticks mode. That tick rescheduling wrongly assumed periodicity and the low-res dynticks code can't cancel such a decision. This breaks the recursive (and thus the worst) part of the problem. In the worst case now, we'll get only one extra tick due to an uncancelled tick scheduled before we entered dynticks mode. This also removes a needless clockevent write on idle ticks. Since those clock writes are usually considered to be slow, it's a general win. Reviewed-by: Preeti U Murthy Signed-off-by: Viresh Kumar Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee3908f..153870a91350 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -968,6 +968,10 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(now); tick_sched_handle(ts, regs); + /* No need to reprogram if we are running tickless */ + if (unlikely(ts->tick_stopped)) + return; + while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); -- cgit v1.2.3 From 2a16fc93d2c9568e16d45db77c7b5f15e1921cf1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 12 Jun 2014 16:24:41 +0530 Subject: nohz: Avoid tick's double reprogramming in highres mode In highres mode, the tick reschedules itself unconditionally to the next jiffies.
However, while this clock reprogramming is relevant when the tick is in periodic mode, it's not that interesting when we run in dynticks mode because irq exit is likely going to overwrite the next tick to some randomly deferred future. So let's just get rid of this tick self-rescheduling in dynticks mode. This way we can avoid some double clockevent writes in favourable scenarios like when we stop the tick completely in idle while no other hrtimer is pending. Suggested-by: Frederic Weisbecker Signed-off-by: Viresh Kumar Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 153870a91350..cc0a5b6f741b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1099,6 +1099,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) if (regs) tick_sched_handle(ts, regs); + /* No need to reprogram if we are in idle or full dynticks mode */ + if (unlikely(ts->tick_stopped)) + return HRTIMER_NORESTART; + hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; -- cgit v1.2.3 From 33b7f99cf003ca6c1d31c42b50e1100ad71aaec0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 15 Aug 2014 17:23:02 -0400 Subject: ftrace: Allow ftrace_ops to use the hashes from other ops Currently the top level debug file system function tracer shares its ftrace_ops with the function graph tracer. This was thought to be fine because the tracers are not used together, as one can only enable function or function_graph tracer in the current_tracer file. But that assumption proved to be incorrect. The function profiler can use the function graph tracer when function tracing is enabled. Since all function graph users use the function tracing ftrace_ops this causes a conflict and when a user enables both function profiling as well as the function tracer it will crash ftrace and disable it.
The quick solution so far is to move them as separate ftrace_ops like it was earlier. The problem though is to synchronize the functions that are traced because both function and function_graph tracer are limited by the selections made in the set_ftrace_filter and set_ftrace_notrace files. To handle this, a new structure is made called ftrace_ops_hash. This structure will now hold the filter_hash and notrace_hash, and the ftrace_ops will point to this structure. That will allow two ftrace_ops to share the same hashes. Since most ftrace_ops do not share the hashes, and to keep allocation simple, the ftrace_ops structure will include both a pointer to the ftrace_ops_hash called func_hash, as well as the structure itself, called local_hash. When the ops are registered, the func_hash pointer will be initialized to point to the local_hash within the ftrace_ops structure. Some of the ftrace internal ftrace_ops will be initialized statically. This will allow for the function and function_graph tracer to have separate ops but still share the same hash tables that determine what functions they trace. Cc: stable@vger.kernel.org # 3.16 (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 14 +++++-- kernel/trace/ftrace.c | 100 +++++++++++++++++++++++++------------------------ 2 files changed, 63 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6bb5e3f2a3b4..f0b0edbf55a9 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -102,6 +102,15 @@ enum { FTRACE_OPS_FL_DELETED = 1 << 8, }; +#ifdef CONFIG_DYNAMIC_FTRACE +/* The hash used to know what functions callbacks trace */ +struct ftrace_ops_hash { + struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash; + struct mutex regex_lock; +}; +#endif + /* * Note, ftrace_ops can be referenced outside of RCU protection. * (Although, for perf, the control ops prevent that). 
If ftrace_ops is @@ -121,10 +130,9 @@ struct ftrace_ops { int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE int nr_trampolines; - struct ftrace_hash *notrace_hash; - struct ftrace_hash *filter_hash; + struct ftrace_ops_hash local_hash; + struct ftrace_ops_hash *func_hash; struct ftrace_hash *tramp_hash; - struct mutex regex_lock; unsigned long trampoline; #endif }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1654b12c891a..c92757adba79 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -65,15 +65,17 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) #ifdef CONFIG_DYNAMIC_FTRACE -#define INIT_REGEX_LOCK(opsname) \ - .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), +#define INIT_OPS_HASH(opsname) \ + .func_hash = &opsname.local_hash, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else -#define INIT_REGEX_LOCK(opsname) +#define INIT_OPS_HASH(opsname) #endif static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + INIT_OPS_HASH(ftrace_list_end) }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -140,7 +142,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) { #ifdef CONFIG_DYNAMIC_FTRACE if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { - mutex_init(&ops->regex_lock); + mutex_init(&ops->local_hash.regex_lock); + ops->func_hash = &ops->local_hash; ops->flags |= FTRACE_OPS_FL_INITIALIZED; } #endif @@ -899,7 +902,7 @@ static void unregister_ftrace_profiler(void) static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(ftrace_profile_ops) + INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) @@ -1081,11 +1084,12 @@ static const struct ftrace_hash empty_hash = { #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) static struct 
ftrace_ops global_ops = { - .func = ftrace_stub, - .notrace_hash = EMPTY_HASH, - .filter_hash = EMPTY_HASH, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) + .func = ftrace_stub, + .local_hash.notrace_hash = EMPTY_HASH, + .local_hash.filter_hash = EMPTY_HASH, + INIT_OPS_HASH(global_ops) + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED, }; struct ftrace_page { @@ -1226,8 +1230,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) void ftrace_free_filter(struct ftrace_ops *ops) { ftrace_ops_init(ops); - free_ftrace_hash(ops->filter_hash); - free_ftrace_hash(ops->notrace_hash); + free_ftrace_hash(ops->func_hash->filter_hash); + free_ftrace_hash(ops->func_hash->notrace_hash); } static struct ftrace_hash *alloc_ftrace_hash(int size_bits) @@ -1382,8 +1386,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); - notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); + filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); + notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); if ((ftrace_hash_empty(filter_hash) || ftrace_lookup_ip(filter_hash, ip)) && @@ -1554,14 +1558,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * gets inversed. */ if (filter_hash) { - hash = ops->filter_hash; - other_hash = ops->notrace_hash; + hash = ops->func_hash->filter_hash; + other_hash = ops->func_hash->notrace_hash; if (ftrace_hash_empty(hash)) all = 1; } else { inc = !inc; - hash = ops->notrace_hash; - other_hash = ops->filter_hash; + hash = ops->func_hash->notrace_hash; + other_hash = ops->func_hash->filter_hash; /* * If the notrace hash has no items, * then there's nothing to do. @@ -2436,8 +2440,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops) * Filter_hash being empty will default to trace module. 
* But notrace hash requires a test of individual module functions. */ - return ftrace_hash_empty(ops->filter_hash) && - ftrace_hash_empty(ops->notrace_hash); + return ftrace_hash_empty(ops->func_hash->filter_hash) && + ftrace_hash_empty(ops->func_hash->notrace_hash); } /* @@ -2459,12 +2463,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 0; /* The function must be in the filter */ - if (!ftrace_hash_empty(ops->filter_hash) && - !ftrace_lookup_ip(ops->filter_hash, rec->ip)) + if (!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) return 0; /* If in notrace hash, we ignore it too */ - if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) + if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) return 0; return 1; @@ -2785,10 +2789,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } else { rec = &iter->pg->records[iter->idx++]; if (((iter->flags & FTRACE_ITER_FILTER) && - !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || + !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) || ((iter->flags & FTRACE_ITER_NOTRACE) && - !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || + !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) || ((iter->flags & FTRACE_ITER_ENABLED) && !(rec->flags & FTRACE_FL_ENABLED))) { @@ -2837,9 +2841,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) * functions are enabled. 
*/ if ((iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->filter_hash)) || + ftrace_hash_empty(ops->func_hash->filter_hash)) || (iter->flags & FTRACE_ITER_NOTRACE && - ftrace_hash_empty(ops->notrace_hash))) { + ftrace_hash_empty(ops->func_hash->notrace_hash))) { if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; @@ -3001,12 +3005,12 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, iter->ops = ops; iter->flags = flag; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (flag & FTRACE_ITER_NOTRACE) - hash = ops->notrace_hash; + hash = ops->func_hash->notrace_hash; else - hash = ops->filter_hash; + hash = ops->func_hash->filter_hash; if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; @@ -3041,7 +3045,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, file->private_data = iter; out_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); return ret; } @@ -3279,7 +3283,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = { .func = function_trace_probe_call, .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(trace_probe_ops) + INIT_OPS_HASH(trace_probe_ops) }; static int ftrace_probe_registered; @@ -3342,7 +3346,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_probe *entry; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -3359,7 +3363,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (WARN_ON(not)) return -EINVAL; - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) { @@ -3428,7 +3432,7 @@ register_ftrace_function_probe(char *glob, struct 
ftrace_probe_ops *ops, out_unlock: mutex_unlock(&ftrace_lock); out: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); return count; @@ -3446,7 +3450,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; struct ftrace_func_probe *p; - struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; @@ -3468,7 +3472,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, return; } - mutex_lock(&trace_probe_ops.regex_lock); + mutex_lock(&trace_probe_ops.func_hash->regex_lock); hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); if (!hash) @@ -3521,7 +3525,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&trace_probe_ops.regex_lock); + mutex_unlock(&trace_probe_ops.func_hash->regex_lock); free_ftrace_hash(hash); } @@ -3717,12 +3721,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, if (unlikely(ftrace_disabled)) return -ENODEV; - mutex_lock(&ops->regex_lock); + mutex_lock(&ops->func_hash->regex_lock); if (enable) - orig_hash = &ops->filter_hash; + orig_hash = &ops->func_hash->filter_hash; else - orig_hash = &ops->notrace_hash; + orig_hash = &ops->func_hash->notrace_hash; if (reset) hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); @@ -3752,7 +3756,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_unlock(&ftrace_lock); out_regex_unlock: - mutex_unlock(&ops->regex_lock); + mutex_unlock(&ops->func_hash->regex_lock); free_ftrace_hash(hash); return ret; @@ -3975,15 +3979,15 @@ int ftrace_regex_release(struct inode *inode, struct file *file) trace_parser_put(parser); - 
mutex_lock(&iter->ops->regex_lock); + mutex_lock(&iter->ops->func_hash->regex_lock); if (file->f_mode & FMODE_WRITE) { filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); if (filter_hash) - orig_hash = &iter->ops->filter_hash; + orig_hash = &iter->ops->func_hash->filter_hash; else - orig_hash = &iter->ops->notrace_hash; + orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); ret = ftrace_hash_move(iter->ops, filter_hash, @@ -3994,7 +3998,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_lock); } - mutex_unlock(&iter->ops->regex_lock); + mutex_unlock(&iter->ops->func_hash->regex_lock); free_ftrace_hash(iter->hash); kfree(iter); @@ -4611,7 +4615,7 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(global_ops) + INIT_OPS_HASH(global_ops) }; static int __init ftrace_nodyn_init(void) @@ -4713,7 +4717,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_REGEX_LOCK(control_ops) + INIT_OPS_HASH(control_ops) }; static inline void -- cgit v1.2.3 From fa8137be6ba632041e725e4623258ba27a2cf9be Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 8 Aug 2014 11:44:03 -0400 Subject: cgroup: Display legacy cgroup files on default hierarchy Kernel command line parameter cgroup__DEVEL__legacy_files_on_dfl forces legacy cgroup files to show up on default hierarchy if subsystem does not have any files defined for default hierarchy. But this seems to be working only if legacy files are defined in ss->legacy_cftypes. If one adds some cftypes later using cgroup_add_legacy_cftypes(), these files don't show up on default hierarchy. 
Update the function accordingly so that the dynamically added legacy files also show up in the default hierarchy if the target subsystem is also using the base legacy files for the default hierarchy. tj: Patch description and comment updates. Signed-off-by: Vivek Goyal Signed-off-by: Tejun Heo --- kernel/cgroup.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c3d1802a9b30..50b94113f4f7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3271,8 +3271,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft && cft->name[0] != '\0'; cft++) - cft->flags |= __CFTYPE_NOT_ON_DFL; + /* + * If legacy_flies_on_dfl, we want to show the legacy files on the + * dfl hierarchy but iff the target subsystem hasn't been updated + * for the dfl hierarchy yet. + */ + if (!cgroup_legacy_files_on_dfl || + ss->dfl_cftypes != ss->legacy_cftypes) { + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) + cft->flags |= __CFTYPE_NOT_ON_DFL; + } + return cgroup_add_cftypes(ss, cfts); } -- cgit v1.2.3 From 84261912ebee41269004e8a9f3614ba38ef6b206 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 18 Aug 2014 13:21:08 -0400 Subject: ftrace: Update all ftrace_ops for a ftrace_hash_ops update When updating what an ftrace_ops traces, if it is registered (that is, actively tracing), and that ftrace_ops uses the shared global_ops local_hash, then we need to update all tracers that are active and also share the global_ops' ftrace_hash_ops. 
Cc: stable@vger.kernel.org # 3.16 (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c92757adba79..37f9e90d241c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1292,9 +1292,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) } static void -ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); static void -ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); +ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_move(struct ftrace_ops *ops, int enable, @@ -1346,13 +1346,13 @@ update: * Remove the current set, update the hash and add * them back. */ - ftrace_hash_rec_disable(ops, enable); + ftrace_hash_rec_disable_modify(ops, enable); old_hash = *dst; rcu_assign_pointer(*dst, new_hash); free_ftrace_hash_rcu(old_hash); - ftrace_hash_rec_enable(ops, enable); + ftrace_hash_rec_enable_modify(ops, enable); return 0; } @@ -1686,6 +1686,41 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } +static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, + int filter_hash, int inc) +{ + struct ftrace_ops *op; + + __ftrace_hash_rec_update(ops, filter_hash, inc); + + if (ops->func_hash != &global_ops.local_hash) + return; + + /* + * If the ops shares the global_ops hash, then we need to update + * all ops that are enabled and use this hash. 
+ */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Already done */ + if (op == ops) + continue; + if (op->func_hash == &global_ops.local_hash) + __ftrace_hash_rec_update(op, filter_hash, inc); + } while_for_each_ftrace_op(op); +} + +static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, + int filter_hash) +{ + ftrace_hash_rec_update_modify(ops, filter_hash, 1); +} + static void print_ip_ins(const char *fmt, unsigned char *p) { int i; -- cgit v1.2.3 From bce0b6c51ac76fc0e763262a6c2a9d05e486f0d8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 20 Aug 2014 23:57:04 -0400 Subject: ftrace: Fix up trampoline accounting with looping on hash ops Now that a ftrace_hash can be shared by multiple ftrace_ops, they can dec the rec->flags by more than once (one per those that share the ftrace_hash). This means that the tramp_hash may not have a hash item when it was added. For example, if two ftrace_ops share a hash for a ftrace record, and the first ops has a trampoline, when it adds itself it will set the rec->flags TRAMP flag and increments its nr_trampolines counter. When the second ops is added, it must clear that tramp flag but also decrement the other ops that shares its hash. As the update to the function callbacks has not yet been performed, the other ops will not have the tramp hash set yet and it can not be used to know to decrement its nr_trampolines. Luckily, the tramp_hash does not need to be used. As the ftrace_mutex is held, a ops with a trampoline to a record during an update of another ops that shares the record will have its func_hash pointing to it. Since a trampoline can only be set for a record if only one ops is attached to it, we can just check if the record has a trampoline (the FTRACE_FL_TRAMP flag is set) and then find the ops that has this record in its hashes. 
Also added some output to help debug when things go wrong. Cc: stable@vger.kernel.org # 3.16+ (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37f9e90d241c..92376aeac4a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1507,25 +1507,38 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static void ftrace_remove_tramp(struct ftrace_ops *ops, struct dyn_ftrace *rec) { - struct ftrace_func_entry *entry; - - entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip); - if (!entry) + /* If TRAMP is not set, no ops should have a trampoline for this */ + if (!(rec->flags & FTRACE_FL_TRAMP)) return; + rec->flags &= ~FTRACE_FL_TRAMP; + + if ((!ftrace_hash_empty(ops->func_hash->filter_hash) && + !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) || + ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) + return; /* * The tramp_hash entry will be removed at time * of update. */ ops->nr_trampolines--; - rec->flags &= ~FTRACE_FL_TRAMP; } -static void ftrace_clear_tramps(struct dyn_ftrace *rec) +static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops) { struct ftrace_ops *op; + /* If TRAMP is not set, no ops should have a trampoline for this */ + if (!(rec->flags & FTRACE_FL_TRAMP)) + return; + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* + * This function is called to clear other tramps + * not the one that is being updated. + */ + if (op == ops) + continue; if (op->nr_trampolines) ftrace_remove_tramp(op, rec); } while_for_each_ftrace_op(op); @@ -1626,13 +1639,10 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, /* * If we are adding another function callback * to this function, and the previous had a - * trampoline used, then we need to go back to - * the default trampoline. 
+ * custom trampoline in use, then we need to go + * back to the default trampoline. */ - rec->flags &= ~FTRACE_FL_TRAMP; - - /* remove trampolines from any ops for this rec */ - ftrace_clear_tramps(rec); + ftrace_clear_tramps(rec, ops); } /* @@ -1935,8 +1945,8 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP) { ops = ftrace_find_tramp_ops_new(rec); if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n", + (void *)rec->ip, (void *)rec->ip, rec->flags); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } @@ -2266,7 +2276,10 @@ static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) } while_for_each_ftrace_rec(); /* The number of recs in the hash must match nr_trampolines */ - FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines); + if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines)) + pr_warn("count=%ld trampolines=%d\n", + ops->tramp_hash->count, + ops->nr_trampolines); return 0; } -- cgit v1.2.3 From 5f151b240192a1557119d5375af71efc26825bc8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 15 Aug 2014 17:18:46 -0400 Subject: ftrace: Fix function_profiler and function tracer together The latest rewrite of ftrace removed the separate ftrace_ops of the function tracer and the function graph tracer and had them share the same ftrace_ops. This simplified the accounting by removing the multiple layers of functions called, where the global_ops func would call a special list that would iterate over the other ops that were registered within it (like function and function graph), which itself was registered to the ftrace ops list of all functions currently active. If that sounds confusing, the code that implemented it was also confusing and its removal is a good thing. 
The problem with this change was that it assumed that the function and function graph tracer can never be used at the same time. This is mostly true, but there is an exception. That is when the function profiler uses the function graph tracer to profile. The function profiler can be activated the same time as the function tracer, and this breaks the assumption and the result is that ftrace will crash (it detects the error and shuts itself down, it does not cause a kernel oops). To solve this issue, a previous change allowed the hash tables for the functions traced by a ftrace_ops to be a pointer and let multiple ftrace_ops share the same hash. This allows the function and function_graph tracer to have separate ftrace_ops, but still share the hash, which is what is done. Now the function and function graph tracers have separate ftrace_ops again, and the function tracer can be run while the function_profile is active. Cc: stable@vger.kernel.org # 3.16 (apply after 3.17-rc4 is out) Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 60 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 92376aeac4a7..08aca65d709a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -68,8 +68,12 @@ #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), +#define ASSIGN_OPS_HASH(opsname, val) \ + .func_hash = val, \ + .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else #define INIT_OPS_HASH(opsname) +#define ASSIGN_OPS_HASH(opsname, val) #endif static struct ftrace_ops ftrace_list_end __read_mostly = { @@ -4663,7 +4667,6 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(global_ops) }; 
static int __init ftrace_nodyn_init(void) @@ -5197,6 +5200,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER +static struct ftrace_ops graph_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_STUB, +#ifdef FTRACE_GRAPH_TRAMP_ADDR + .trampoline = FTRACE_GRAPH_TRAMP_ADDR, +#endif + ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) +}; + static int ftrace_graph_active; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -5359,12 +5373,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) */ static void update_function_graph_func(void) { - if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list == &global_ops && - global_ops.next == &ftrace_list_end)) - ftrace_graph_entry = __ftrace_graph_entry; - else + struct ftrace_ops *op; + bool do_test = false; + + /* + * The graph and global ops share the same set of functions + * to test. If any other ops is on the list, then + * the graph tracing needs to test if its the function + * it should call. 
+ */ + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op != &global_ops && op != &graph_ops && + op != &ftrace_list_end) { + do_test = true; + /* in double loop, break out with goto */ + goto out; + } + } while_for_each_ftrace_op(op); + out: + if (do_test) ftrace_graph_entry = ftrace_graph_entry_test; + else + ftrace_graph_entry = __ftrace_graph_entry; } static struct notifier_block ftrace_suspend_notifier = { @@ -5405,16 +5435,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_entry = ftrace_graph_entry_test; update_function_graph_func(); - /* Function graph doesn't use the .func field of global_ops */ - global_ops.flags |= FTRACE_OPS_FL_STUB; - -#ifdef CONFIG_DYNAMIC_FTRACE - /* Optimize function graph calling (if implemented by arch) */ - if (FTRACE_GRAPH_TRAMP_ADDR != 0) - global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR; -#endif - - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -5432,12 +5453,7 @@ void unregister_ftrace_graph(void) ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); - global_ops.flags &= ~FTRACE_OPS_FL_STUB; -#ifdef CONFIG_DYNAMIC_FTRACE - if (FTRACE_GRAPH_TRAMP_ADDR != 0) - global_ops.trampoline = 0; -#endif + ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -- cgit v1.2.3 From 39b5552cd5090d4c210d278cd2732f493075f033 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Sun, 17 Aug 2014 20:59:10 -0400 Subject: ftrace: Use current addr when converting to nop in __ftrace_replace_code() In __ftrace_replace_code(), when converting the call to a nop in a function it needs to compare against the "curr" (current) value of the ftrace 
ops, and not the "new" one. It currently does not affect x86 which is the only arch to do the trampolines with function graph tracer, but when other archs that do depend on this code implement the function graph trampoline, it can crash. Here's an example when ARM uses the trampolines (in the future): ------------[ cut here ]------------ WARNING: CPU: 0 PID: 9 at kernel/trace/ftrace.c:1716 ftrace_bug+0x17c/0x1f4() Modules linked in: omap_rng rng_core ipv6 CPU: 0 PID: 9 Comm: migration/0 Not tainted 3.16.0-test-10959-gf0094b28f303-dirty #52 [] (unwind_backtrace) from [] (show_stack+0x20/0x24) [] (show_stack) from [] (dump_stack+0x78/0x94) [] (dump_stack) from [] (warn_slowpath_common+0x7c/0x9c) [] (warn_slowpath_common) from [] (warn_slowpath_null+0x2c/0x34) [] (warn_slowpath_null) from [] (ftrace_bug+0x17c/0x1f4) [] (ftrace_bug) from [] (ftrace_replace_code+0x80/0x9c) [] (ftrace_replace_code) from [] (ftrace_modify_all_code+0xb8/0x164) [] (ftrace_modify_all_code) from [] (__ftrace_modify_code+0x14/0x1c) [] (__ftrace_modify_code) from [] (multi_cpu_stop+0xf4/0x134) [] (multi_cpu_stop) from [] (cpu_stopper_thread+0x54/0x130) [] (cpu_stopper_thread) from [] (smpboot_thread_fn+0x1ac/0x1bc) [] (smpboot_thread_fn) from [] (kthread+0xe0/0xfc) [] (kthread) from [] (ret_from_fork+0x14/0x20) ---[ end trace dc9ce72c5b617d8f ]--- [ 65.047264] ftrace failed to modify [] asm_do_IRQ+0x10/0x1c [ 65.054070] actual: 85:1b:00:eb Fixes: 7413af1fb70e7 "ftrace: Make get_ftrace_addr() and get_ftrace_addr_old() global" Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08aca65d709a..5916a8e59e87 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2017,7 +2017,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: - return ftrace_make_nop(NULL, rec, 
ftrace_addr); + return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); -- cgit v1.2.3 From 61b67684c4a4d04b30d9ed67aa2eadfa0089c590 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 13 Aug 2014 19:39:56 +0200 Subject: perf: Fix perf_poll to return proper POLLHUP value Currently perf_poll returns POLL_HUP in case of error, which is wrong, because poll syscall expects POLLHUP. The POLL_HUP is meant to be used for SIGIO state. Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140811120102.GY9918@twins.programming.kicks-ass.net Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jean Pihet Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-0ywfthh4lh65swe15f6w2x2q@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2d7363adf678..4575dd6e59ea 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3627,7 +3627,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct ring_buffer *rb; - unsigned int events = POLL_HUP; + unsigned int events = POLLHUP; poll_wait(file, &event->waitq, wait); /* -- cgit v1.2.3 From 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Aug 2014 11:48:26 -0400 Subject: perf: Add PERF_EVENT_STATE_EXIT state for events with exited task Adding new perf event state to indicate that the monitored task has exited. In this case the event stays alive until the owner task exits or closes the event fd while providing the last data through the read syscall and ring buffer. 
Instead it needs to propagate the error info (monitored task has died) via poll and read syscalls by returning POLLHUP and 0 respectively. Signed-off-by: Jiri Olsa Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140811120102.GY9918@twins.programming.kicks-ass.net Cc: Adrian Hunter Cc: Arnaldo Carvalho de Melo Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jean Pihet Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-t5y3w8jjx6tfo5w8y6oajsjq@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 1 + kernel/events/core.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f0a1036b1911..893a0d07986f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -269,6 +269,7 @@ struct pmu { * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { + PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, diff --git a/kernel/events/core.c b/kernel/events/core.c index 4575dd6e59ea..d8cb4d21a346 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3600,7 +3600,8 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). 
*/ - if (event->state == PERF_EVENT_STATE_ERROR) + if ((event->state == PERF_EVENT_STATE_ERROR) || + (event->state == PERF_EVENT_STATE_EXIT)) return 0; if (count < event->read_size) @@ -3630,6 +3631,10 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) unsigned int events = POLLHUP; poll_wait(file, &event->waitq, wait); + + if (event->state == PERF_EVENT_STATE_EXIT) + return events; + /* * Pin the event->rb by taking event->mmap_mutex; otherwise * perf_event_set_output() can swizzle our rb and make us miss wakeups. @@ -7588,6 +7593,9 @@ __perf_event_exit_task(struct perf_event *child_event, if (child_event->parent) { sync_child_event(child_event, child); free_event(child_event); + } else { + child_event->state = PERF_EVENT_STATE_EXIT; + perf_event_wakeup(child_event); } } -- cgit v1.2.3 From 2ee507c472939db4b146d545352b8a7c79ef47f8 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Thu, 31 Jul 2014 10:29:48 -0700 Subject: sched: Add function single_task_running to let a task check if it is the only task running on a cpu This function will help an async task processing batched jobs from workqueue decide if it wants to keep processing on more chunks of batched work that can be delayed, or to accumulate more work for more efficient batched processing later. If no other tasks are running on the cpu, the batching process can take advantage of the available cpu cycles to make a decision to continue processing the existing accumulated work to minimize delay, otherwise it will yield. 
Signed-off-by: Tim Chen Signed-off-by: Herbert Xu --- include/linux/sched.h | 1 + kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885ee52b..e6d2c056d8e0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -167,6 +167,7 @@ extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); +extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec1a286684a5..59965ec0b7de 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2366,6 +2366,18 @@ unsigned long nr_running(void) return sum; } +/* + * Check if only the current task is running on the cpu. + */ +bool single_task_running(void) +{ + if (cpu_rq(smp_processor_id())->nr_running == 1) + return true; + else + return false; +} +EXPORT_SYMBOL(single_task_running); + unsigned long long nr_context_switches(void) { int i; -- cgit v1.2.3 From 251f8c0364f99fc21fcc7b07e4ec6b4f3250d841 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 25 Aug 2014 19:27:52 +0800 Subject: cgroup: fix a typo in comment. There is no function named cgroup_enable_task_cg_links(). Instead, the correct function name in this comment should be cgroup_enable_task_cg_lists(). Signed-off-by: Dongsheng Yang Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788cfd52..64bbb56496c2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5161,7 +5161,7 @@ void cgroup_post_fork(struct task_struct *child) int i; /* - * This may race against cgroup_enable_task_cg_links(). 
As that + * This may race against cgroup_enable_task_cg_lists(). As that * function sets use_task_css_set_links before grabbing * tasklist_lock and we just went through tasklist_lock to add * @child, it's guaranteed that either we see the set @@ -5176,7 +5176,7 @@ void cgroup_post_fork(struct task_struct *child) * when implementing operations which need to migrate all tasks of * a cgroup to another. * - * Note that if we lose to cgroup_enable_task_cg_links(), @child + * Note that if we lose to cgroup_enable_task_cg_lists(), @child * will remain in init_css_set. This is safe because all tasks are * in the init_css_set before cg_links is enabled and there's no * operation which transfers all tasks out of init_css_set. -- cgit v1.2.3 From 7cad45eea3849faeb34591b60d16b50d13a38d77 Mon Sep 17 00:00:00 2001 From: Vincent Stehlé Date: Fri, 22 Aug 2014 01:31:20 +0200 Subject: irq: Export handle_fasteoi_irq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export handle_fasteoi_irq to be able to use it in e.g. the Zynq gpio driver since commit 6dd859508336 ("gpio: zynq: Fix IRQ handlers"). This fixes the following link issue: ERROR: "handle_fasteoi_irq" [drivers/gpio/gpio-zynq.ko] undefined! 
Signed-off-by: Vincent Stehlé Acked-by: Arnd Bergmann Cc: linux-arm-kernel@lists.infradead.org Cc: Vincent Stehle Cc: Lars-Peter Clausen Cc: Linus Walleij Link: http://lkml.kernel.org/r/1408663880-29179-1-git-send-email-vincent.stehle@laposte.net Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b28a2fd7b1..6223fab9a9d2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -517,6 +517,7 @@ out: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } +EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler -- cgit v1.2.3 From b3292e88e33677a90c18236fd895ace2d80efa3c Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Mon, 25 Aug 2014 16:06:52 +0300 Subject: crash_dump: Make is_kdump_kernel() accessible from modules In order to make is_kdump_kernel() accessible from modules, need to make elfcorehdr_addr exported. This was rejected in the past [1] because reset_devices was preferred in that context (resetting the device in kdump kernel), but now there are some network drivers that need to reduce memory usage when loaded from a kdump kernel. And in that context, is_kdump_kernel() suits better. [1] - https://lkml.org/lkml/2011/1/27/341 CC: Vivek Goyal Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- kernel/crash_dump.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index c766ee54c0b1..b64e238b553b 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -18,6 +18,7 @@ unsigned long saved_max_pfn; * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. 
*/ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +EXPORT_SYMBOL_GPL(elfcorehdr_addr); /* * stores the size of elf header of crash image -- cgit v1.2.3 From 4ce97dbf50245227add17c83d87dc838e7ca79d0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Aug 2014 13:59:41 -0400 Subject: trace: Fix epoll hang when we race with new entries Epoll on trace_pipe can sometimes hang in a weird case. If the ring buffer is empty when we set waiters_pending but an event shows up exactly at that moment we can miss being woken up by the ring buffer's irq work. Since ring_buffer_empty() is inherently racy we will sometimes think that the buffer is not empty. So we don't get woken up and we don't think there are any events even though there were some ready when we added the watch, which makes us hang. This patch fixes this by making sure that we are actually on the wait list before we set waiters_pending, and adds a memory barrier to make sure ring_buffer_empty() is going to be correct. Link: http://lkml.kernel.org/p/1408989581-23727-1-git-send-email-jbacik@fb.com Cc: stable@vger.kernel.org # 3.10+ Cc: Martin Lau Signed-off-by: Josef Bacik Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index afb04b9b818a..b38fb2b9e237 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -626,8 +626,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, work = &cpu_buffer->irq_work; } - work->waiters_pending = true; poll_wait(filp, &work->waiters, poll_table); + work->waiters_pending = true; + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit + * is set, the next event will wake the task up, but we can get stuck + * if there's only a single event in. 
+ * + * FIXME: Ideally, we need a memory barrier on the writer side as well, + * but adding a memory barrier to all events will cause too much of a + * performance hit in the fast path. We only need a memory barrier when + * the buffer goes from empty to having content. But as this race is + * extremely small, and it's not a problem if another event comes in, we + * will fix it later. + */ + smp_mb(); if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) -- cgit v1.2.3 From bb964a92ce70ac2039115edd019aa5eef8faa6bb Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:24 -0500 Subject: kernel misc: Replace __get_cpu_var uses Replace uses of __get_cpu_var for address calculation with this_cpu_ptr. Cc: akpm@linux-foundation.org Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/printk/printk.c | 4 ++-- kernel/smp.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e04c455a0e38..960fbfc6cd0a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2628,7 +2628,7 @@ void wake_up_klogd(void) preempt_disable(); if (waitqueue_active(&log_wait)) { this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } @@ -2644,7 +2644,7 @@ int printk_deferred(const char *fmt, ...) 
va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); preempt_enable(); return r; diff --git a/kernel/smp.c b/kernel/smp.c index aff8aa14f547..af24183fe6bb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -164,7 +164,7 @@ static int generic_exec_single(int cpu, struct call_single_data *csd, if (!csd) { csd = &csd_stack; if (!wait) - csd = &__get_cpu_var(csd_data); + csd = this_cpu_ptr(&csd_data); } csd_lock(csd); @@ -229,7 +229,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) WARN_ON(!irqs_disabled()); - head = &__get_cpu_var(call_single_queue); + head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); @@ -419,7 +419,7 @@ void smp_call_function_many(const struct cpumask *mask, return; } - cfd = &__get_cpu_var(cfd_data); + cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, cfd->cpumask); -- cgit v1.2.3 From 22127e93c587afa01e4f7225d2d1cf1d26ae7dfe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:25 -0500 Subject: time: Replace __get_cpu_var uses Convert uses of __get_cpu_var for creating a address from a percpu offset to this_cpu_ptr. The two cases where get_cpu_var is used to actually access a percpu variable are changed to use this_cpu_read/raw_cpu_read. 
Reviewed-by: Thomas Gleixner Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- drivers/clocksource/dummy_timer.c | 2 +- kernel/irq_work.c | 12 ++++++------ kernel/sched/clock.c | 2 +- kernel/softirq.c | 4 ++-- kernel/time/hrtimer.c | 6 +++--- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 6 +++--- kernel/time/tick-oneshot.c | 2 +- kernel/time/tick-sched.c | 20 ++++++++++---------- kernel/time/timer.c | 2 +- 10 files changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/drivers/clocksource/dummy_timer.c b/drivers/clocksource/dummy_timer.c index ad3572541728..31990600fcff 100644 --- a/drivers/clocksource/dummy_timer.c +++ b/drivers/clocksource/dummy_timer.c @@ -28,7 +28,7 @@ static void dummy_timer_set_mode(enum clock_event_mode mode, static void dummy_timer_setup(void) { int cpu = smp_processor_id(); - struct clock_event_device *evt = __this_cpu_ptr(&dummy_timer_evt); + struct clock_event_device *evt = raw_cpu_ptr(&dummy_timer_evt); evt->name = "dummy_timer"; evt->features = CLOCK_EVT_FEAT_PERIODIC | diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e6bcbe756663..345d19edcdae 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -95,11 +95,11 @@ bool irq_work_queue(struct irq_work *work) /* If the work is "lazy", handle it from next tick if any */ if (work->flags & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); } else { - if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) + if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) arch_irq_work_raise(); } @@ -113,8 +113,8 @@ bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; - raised = &__get_cpu_var(raised_list); - lazy = &__get_cpu_var(lazy_list); + raised = this_cpu_ptr(&raised_list); + lazy = this_cpu_ptr(&lazy_list); if (llist_empty(raised) && llist_empty(lazy)) return false; @@ -166,8 
+166,8 @@ static void irq_work_run_list(struct llist_head *list) */ void irq_work_run(void) { - irq_work_run_list(&__get_cpu_var(raised_list)); - irq_work_run_list(&__get_cpu_var(lazy_list)); + irq_work_run_list(this_cpu_ptr(&raised_list)); + irq_work_run_list(this_cpu_ptr(&lazy_list)); } EXPORT_SYMBOL_GPL(irq_work_run); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 3ef6451e972e..c27e4f8f4879 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); static inline struct sched_clock_data *this_scd(void) { - return &__get_cpu_var(sched_clock_data); + return this_cpu_ptr(&sched_clock_data); } static inline struct sched_clock_data *cpu_sdc(int cpu) diff --git a/kernel/softirq.c b/kernel/softirq.c index 5918d227730f..2d44b5714fe6 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -485,7 +485,7 @@ static void tasklet_action(struct softirq_action *a) local_irq_disable(); list = __this_cpu_read(tasklet_vec.head); __this_cpu_write(tasklet_vec.head, NULL); - __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); + __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); local_irq_enable(); while (list) { @@ -521,7 +521,7 @@ static void tasklet_hi_action(struct softirq_action *a) local_irq_disable(); list = __this_cpu_read(tasklet_hi_vec.head); __this_cpu_write(tasklet_hi_vec.head, NULL); - __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); + __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); local_irq_enable(); while (list) { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..5f2229ba53d6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1144,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - cpu_base = &__raw_get_cpu_var(hrtimer_bases); + cpu_base = 
raw_cpu_ptr(&hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; @@ -1187,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) struct hrtimer_cpu_base *cpu_base; int base = hrtimer_clockid_to_base(which_clock); - cpu_base = &__raw_get_cpu_var(hrtimer_bases); + cpu_base = raw_cpu_ptr(&hrtimer_bases); *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; @@ -1376,7 +1376,7 @@ static void __hrtimer_peek_ahead_timers(void) if (!hrtimer_hres_active()) return; - td = &__get_cpu_var(tick_cpu_device); + td = this_cpu_ptr(&tick_cpu_device); if (td && td->evtdev) hrtimer_interrupt(td->evtdev); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 64c5990fd500..066f0ec05e48 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -554,7 +554,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) void tick_check_oneshot_broadcast_this_cpu(void) { if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); /* * We might be in the middle of switching over from diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0a0608edeb26..decfb5f6edb0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td, void tick_install_replacement(struct clock_event_device *newdev) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int cpu = smp_processor_id(); clockevents_exchange_device(td->evtdev, newdev); @@ -374,14 +374,14 @@ void tick_shutdown(unsigned int *cpup) void tick_suspend(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); 
clockevents_shutdown(td->evtdev); } void tick_resume(void) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int broadcast = tick_resume_broadcast(); clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 824109060a33..7ce740e78e1b 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { - struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee3908f..73f90932282b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -205,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); */ void __tick_nohz_full_check(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_nohz_full_cpu(smp_processor_id())) { if (ts->tick_stopped && !is_idle_task(current)) { @@ -545,7 +545,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; ktime_t last_update, expires, ret = { .tv64 = 0 }; unsigned long rcu_delta_jiffies; - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 time_delta; time_delta = timekeeping_max_deferment(); @@ -813,7 +813,7 @@ void tick_nohz_idle_enter(void) local_irq_disable(); - ts = &__get_cpu_var(tick_cpu_sched); + ts = this_cpu_ptr(&tick_cpu_sched); ts->inidle = 1; __tick_nohz_idle_enter(ts); @@ -831,7 +831,7 @@ 
EXPORT_SYMBOL_GPL(tick_nohz_idle_enter); */ void tick_nohz_irq_exit(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) __tick_nohz_idle_enter(ts); @@ -846,7 +846,7 @@ void tick_nohz_irq_exit(void) */ ktime_t tick_nohz_get_sleep_length(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->sleep_length; } @@ -959,7 +959,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) */ static void tick_nohz_handler(struct clock_event_device *dev) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); @@ -979,7 +979,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) */ static void tick_nohz_switch_to_nohz(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t next; if (!tick_nohz_enabled) @@ -1115,7 +1115,7 @@ early_param("skew_tick", skew_tick); */ void tick_setup_sched_timer(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); /* @@ -1184,7 +1184,7 @@ void tick_clock_notify(void) */ void tick_oneshot_notify(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } @@ -1199,7 +1199,7 @@ void tick_oneshot_notify(void) */ int tick_check_oneshot_change(int allow_nohz) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index aca5dfe2fa3d..04d8ed8399b0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ 
-655,7 +655,7 @@ static inline void debug_assert_init(struct timer_list *timer) static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key) { - struct tvec_base *base = __raw_get_cpu_var(tvec_bases); + struct tvec_base *base = raw_cpu_read(tvec_bases); timer->entry.next = NULL; timer->base = (void *)((unsigned long)base | flags); -- cgit v1.2.3 From dc5df73b3afffc8d042dadffc1c959008b2c1163 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:26 -0500 Subject: time: Convert a bunch of &__get_cpu_var introduced in the 3.16 merge period Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/time/hrtimer.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5f2229ba53d6..a50600d87fb7 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -558,7 +558,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) static int hrtimer_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); int res; @@ -629,7 +629,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) */ static void retrigger_next_event(void *arg) { - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (!hrtimer_hres_active()) return; @@ -903,7 +903,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) */ debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); - reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); + reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * We must preserve the CALLBACK state flag 
here, * otherwise we could move the timer base in @@ -963,7 +963,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * on dynticks target. */ wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && + } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && hrtimer_reprogram(timer, new_base)) { /* * Only allow reprogramming if the new base is on this CPU. @@ -1103,7 +1103,7 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); */ ktime_t hrtimer_get_next_event(void) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; @@ -1242,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) */ void hrtimer_interrupt(struct clock_event_device *dev) { - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; int i, retries = 0; @@ -1440,7 +1440,7 @@ void hrtimer_run_pending(void) void hrtimer_run_queues(void) { struct timerqueue_node *node; - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base; int index, gettime = 1; @@ -1679,7 +1679,7 @@ static void migrate_hrtimers(int scpu) local_irq_disable(); old_base = &per_cpu(hrtimer_bases, scpu); - new_base = &__get_cpu_var(hrtimer_bases); + new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. 
-- cgit v1.2.3 From 4a32fea9d78f2d2315c0072757b197d5a304dc8b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:27 -0500 Subject: scheduler: Replace __get_cpu_var with this_cpu_ptr Convert all uses of __get_cpu_var for address calculation to use this_cpu_ptr instead. [Uses of __get_cpu_var with cpumask_var_t are no longer handled by this patch] Cc: Peter Zijlstra Acked-by: Ingo Molnar Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- include/linux/kernel_stat.h | 4 ++-- kernel/events/callchain.c | 4 ++-- kernel/events/core.c | 24 ++++++++++++------------ kernel/sched/sched.h | 4 ++-- kernel/taskstats.c | 2 +- kernel/time/tick-sched.c | 4 ++-- kernel/user-return-notifier.c | 4 ++-- 7 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index ecbc52f9ff77..8422b4ed6882 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -44,8 +44,8 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat); /* Must have preemption disabled for this to be meaningful. 
*/ -#define kstat_this_cpu (&__get_cpu_var(kstat)) -#define kcpustat_this_cpu (&__get_cpu_var(kernel_cpustat)) +#define kstat_this_cpu this_cpu_ptr(&kstat) +#define kcpustat_this_cpu this_cpu_ptr(&kernel_cpustat) #define kstat_cpu(cpu) per_cpu(kstat, cpu) #define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu) diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 97b67df8fbfe..c4f63e68a35c 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) int cpu; struct callchain_cpus_entries *entries; - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion)); if (*rctx == -1) return NULL; @@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx) static void put_callchain_entry(int rctx) { - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); + put_recursion_context(this_cpu_ptr(callchain_recursion), rctx); } struct perf_callchain_entry * diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cf24b3e42ec..4d44e40a0483 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -239,7 +239,7 @@ static void perf_duration_warn(struct irq_work *w) u64 avg_local_sample_len; u64 local_samples_len; - local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len = __this_cpu_read(running_sample_length); avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; printk_ratelimited(KERN_WARNING @@ -261,10 +261,10 @@ void perf_sample_event_took(u64 sample_len_ns) return; /* decay the counter by 1 average sample */ - local_samples_len = __get_cpu_var(running_sample_length); + local_samples_len = __this_cpu_read(running_sample_length); local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; local_samples_len += sample_len_ns; - __get_cpu_var(running_sample_length) = local_samples_len; + 
__this_cpu_write(running_sample_length, local_samples_len); /* * note: this will be biased artifically low until we have @@ -877,7 +877,7 @@ static DEFINE_PER_CPU(struct list_head, rotation_list); static void perf_pmu_rotate_start(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = &__get_cpu_var(rotation_list); + struct list_head *head = this_cpu_ptr(&rotation_list); WARN_ON(!irqs_disabled()); @@ -2389,7 +2389,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_out(task, next); } @@ -2632,11 +2632,11 @@ void __perf_event_task_sched_in(struct task_struct *prev, * to check if we have to switch in PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); /* check for system-wide branch_stack events */ - if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) perf_branch_stack_sched_in(prev, task); } @@ -2891,7 +2891,7 @@ bool perf_event_can_stop_tick(void) void perf_event_task_tick(void) { - struct list_head *head = &__get_cpu_var(rotation_list); + struct list_head *head = this_cpu_ptr(&rotation_list); struct perf_cpu_context *cpuctx, *tmp; struct perf_event_context *ctx; int throttled; @@ -5671,7 +5671,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct perf_event *event; struct hlist_head *head; @@ -5690,7 +5690,7 @@ end: int perf_swevent_get_recursion_context(void) { 
- struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); return get_recursion_context(swhash->recursion); } @@ -5698,7 +5698,7 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); inline void perf_swevent_put_recursion_context(int rctx) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); put_recursion_context(swhash->recursion, rctx); } @@ -5727,7 +5727,7 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_add(struct perf_event *event, int flags) { - struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 579712f4e9d5..77d92f8130e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -650,10 +650,10 @@ static inline int cpu_of(struct rq *rq) DECLARE_PER_CPU(struct rq, runqueues); #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) +#define this_rq() this_cpu_ptr(&runqueues) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() (&__raw_get_cpu_var(runqueues)) +#define raw_rq() raw_cpu_ptr(&runqueues) static inline u64 rq_clock(struct rq *rq) { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 13d2f7cd65db..b312fcc73024 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -638,7 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) fill_tgid_exit(tsk); } - listeners = __this_cpu_ptr(&listener_array); + listeners = raw_cpu_ptr(&listener_array); if (list_empty(&listeners->list)) return; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 73f90932282b..3cadc112519f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ 
-924,7 +924,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) */ void tick_nohz_idle_exit(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; local_irq_disable(); @@ -1041,7 +1041,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) static inline void tick_nohz_irq_enter(void) { - struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 394f70b17162..9586b670a5b2 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); void user_return_notifier_register(struct user_return_notifier *urn) { set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); - hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); + hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list)); } EXPORT_SYMBOL_GPL(user_return_notifier_register); @@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); void user_return_notifier_unregister(struct user_return_notifier *urn) { hlist_del(&urn->link); - if (hlist_empty(&__get_cpu_var(return_notifier_list))) + if (hlist_empty(this_cpu_ptr(&return_notifier_list))) clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); } EXPORT_SYMBOL_GPL(user_return_notifier_unregister); -- cgit v1.2.3 From f7f66b05aa2ac2632c5441a3f129f3be827fe7e7 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sun, 17 Aug 2014 12:30:34 -0500 Subject: watchdog: Replace __raw_get_cpu_var uses Most of these are the uses of &__raw_get_cpu_var for address calculation. touch_softlockup_watchdog_sync() uses __raw_get_cpu_var to write to per cpu variables. Use __this_cpu_write instead. 
Cc: Wim Van Sebroeck Cc: linux-watchdog@vger.kernel.org Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/watchdog.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a8d6914030fe..dca8cae7e55d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -185,7 +185,7 @@ void touch_nmi_watchdog(void) * case we shouldn't have to worry about the watchdog * going off. */ - __raw_get_cpu_var(watchdog_nmi_touch) = true; + raw_cpu_write(watchdog_nmi_touch, true); touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); @@ -194,8 +194,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog); void touch_softlockup_watchdog_sync(void) { - __raw_get_cpu_var(softlockup_touch_sync) = true; - __raw_get_cpu_var(watchdog_touch_ts) = 0; + __this_cpu_write(softlockup_touch_sync, true); + __this_cpu_write(watchdog_touch_ts, 0); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -387,7 +387,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) static void watchdog_enable(unsigned int cpu) { - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); /* kick off the timer for the hardlockup detector */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -407,7 +407,7 @@ static void watchdog_enable(unsigned int cpu) static void watchdog_disable(unsigned int cpu) { - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); watchdog_set_prio(SCHED_NORMAL, 0); hrtimer_cancel(hrtimer); @@ -534,7 +534,7 @@ static struct smp_hotplug_thread watchdog_threads = { static void restart_watchdog_hrtimer(void *info) { - struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); int ret; /* -- cgit v1.2.3 From 532d0d0690d1532dcc5a190162ad820b636bcd4d Mon Sep 17 00:00:00 2001 From: 
Christoph Lameter Date: Sun, 17 Aug 2014 12:30:39 -0500 Subject: irqchips: Replace __this_cpu_ptr uses [ARM specific] These are generally replaced with raw_cpu_ptr. However, in gic_get_percpu_base() we immediately dereference the pointer. This is equivalent to a raw_cpu_read. So use that operation there. Cc: nicolas.pitre@linaro.org Cc: Russell King Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- drivers/irqchip/irq-gic.c | 10 +++++----- kernel/irq/chip.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 4b959e606fe8..399a707ec51e 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -102,7 +102,7 @@ static struct gic_chip_data gic_data[MAX_GIC_NR] __read_mostly; #ifdef CONFIG_GIC_NON_BANKED static void __iomem *gic_get_percpu_base(union gic_base *base) { - return *__this_cpu_ptr(base->percpu_base); + return raw_cpu_read(*base->percpu_base); } static void __iomem *gic_get_common_base(union gic_base *base) @@ -504,11 +504,11 @@ static void gic_cpu_save(unsigned int gic_nr) if (!dist_base || !cpu_base) return; - ptr = __this_cpu_ptr(gic_data[gic_nr].saved_ppi_enable); + ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_enable); for (i = 0; i < DIV_ROUND_UP(32, 32); i++) ptr[i] = readl_relaxed(dist_base + GIC_DIST_ENABLE_SET + i * 4); - ptr = __this_cpu_ptr(gic_data[gic_nr].saved_ppi_conf); + ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_conf); for (i = 0; i < DIV_ROUND_UP(32, 16); i++) ptr[i] = readl_relaxed(dist_base + GIC_DIST_CONFIG + i * 4); @@ -530,11 +530,11 @@ static void gic_cpu_restore(unsigned int gic_nr) if (!dist_base || !cpu_base) return; - ptr = __this_cpu_ptr(gic_data[gic_nr].saved_ppi_enable); + ptr = raw_cpu_ptr(gic_data[gic_nr].saved_ppi_enable); for (i = 0; i < DIV_ROUND_UP(32, 32); i++) writel_relaxed(ptr[i], dist_base + GIC_DIST_ENABLE_SET + i * 4); - ptr = __this_cpu_ptr(gic_data[gic_nr].saved_ppi_conf); + ptr =
raw_cpu_ptr(gic_data[gic_nr].saved_ppi_conf); for (i = 0; i < DIV_ROUND_UP(32, 16); i++) writel_relaxed(ptr[i], dist_base + GIC_DIST_CONFIG + i * 4); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b28a2fd7b1..cca7292fc61e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -669,7 +669,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; - void *dev_id = __this_cpu_ptr(action->percpu_dev_id); + void *dev_id = raw_cpu_ptr(action->percpu_dev_id); irqreturn_t res; kstat_incr_irqs_this_cpu(irq, desc); -- cgit v1.2.3 From 6a4c264313c4ae32dc53821a9c57e0dc9696fb81 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 27 Aug 2014 06:21:23 +0930 Subject: module: rename KERNEL_PARAM_FL_NOARG to avoid confusion Make it clear this is about kernel_param_ops, not kernel_param (which will soon have a flags field of its own). No functional changes. Cc: Rusty Russell Cc: Jean Delvare Cc: Andrew Morton Cc: Li Zhong Cc: Jon Mason Cc: Daniel Vetter Signed-off-by: Jani Nikula Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 2 +- kernel/module.c | 2 +- kernel/params.c | 6 +++--- security/apparmor/lsm.c | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 494f99e852da..16fdddab856a 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -42,7 +42,7 @@ struct kernel_param; * NOARG - the parameter allows for no argument (foo instead of foo=1) */ enum { - KERNEL_PARAM_FL_NOARG = (1 << 0) + KERNEL_PARAM_OPS_FL_NOARG = (1 << 0) }; struct kernel_param_ops { diff --git a/kernel/module.c b/kernel/module.c index 03214bd288e9..8a0dc91eddbc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -135,7 +135,7 @@ static int param_set_bool_enable_only(const char *val, } static const struct kernel_param_ops param_ops_bool_enable_only = { 
- .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; diff --git a/kernel/params.c b/kernel/params.c index 34f527023794..8a484fc8bde8 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -104,7 +104,7 @@ static int parse_one(char *param, return 0; /* No one handled NULL, so do it here. */ if (!val && - !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) + !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); @@ -318,7 +318,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) EXPORT_SYMBOL(param_get_bool); struct kernel_param_ops param_ops_bool = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; @@ -369,7 +369,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) EXPORT_SYMBOL(param_set_bint); struct kernel_param_ops param_ops_bint = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 998100093332..65ca451a764d 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -668,7 +668,7 @@ static int param_set_aabool(const char *val, const struct kernel_param *kp); static int param_get_aabool(char *buffer, const struct kernel_param *kp); #define param_check_aabool param_check_bool static struct kernel_param_ops param_ops_aabool = { - .flags = KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aabool, .get = param_get_aabool }; @@ -685,7 +685,7 @@ static int param_set_aalockpolicy(const char *val, const struct kernel_param *kp static int param_get_aalockpolicy(char *buffer, const struct kernel_param *kp); #define param_check_aalockpolicy param_check_bool static struct kernel_param_ops param_ops_aalockpolicy = { - .flags = 
KERNEL_PARAM_FL_NOARG, + .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = param_set_aalockpolicy, .get = param_get_aalockpolicy }; -- cgit v1.2.3 From 91f9d330cc14932084c37751997213cb0e7ea882 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 27 Aug 2014 06:22:23 +0930 Subject: module: make it possible to have unsafe, tainting module params Add flags field to struct kernel_param, and add the first flag: unsafe parameter. Modifying a kernel parameter with the unsafe flag set, either via the kernel command line or sysfs, will issue a warning and taint the kernel. Cc: Rusty Russell Cc: Jean Delvare Cc: Andrew Morton Cc: Li Zhong Cc: Jon Mason Cc: Daniel Vetter Signed-off-by: Jani Nikula Signed-off-by: Rusty Russell --- drivers/tty/serial/8250/8250_core.c | 2 +- include/linux/moduleparam.h | 44 +++++++++++++++++++++++++++++-------- kernel/params.c | 11 ++++++++++ 3 files changed, 47 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 1d42dba6121d..bd672948f2f1 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -3587,7 +3587,7 @@ static void __used s8250_options(void) #ifdef CONFIG_SERIAL_8250_RSA __module_param_call(MODULE_PARAM_PREFIX, probe_rsa, &param_array_ops, .arr = &__param_arr_probe_rsa, - 0444, -1); + 0444, -1, 0); #endif } #else diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 16fdddab856a..1e3ffb839daa 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -56,11 +56,21 @@ struct kernel_param_ops { void (*free)(void *arg); }; +/* + * Flags available for kernel_param + * + * UNSAFE - the parameter is dangerous and setting it will taint the kernel + */ +enum { + KERNEL_PARAM_FL_UNSAFE = (1 << 0) +}; + struct kernel_param { const char *name; const struct kernel_param_ops *ops; u16 perm; - s16 level; + s8 level; + u8 flags; union { void *arg; const struct kparam_string *str; @@
-137,7 +147,7 @@ struct kparam_array * The ops can have NULL set or get functions. */ #define module_param_cb(name, ops, arg, perm) \ - __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, -1) + __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, -1, 0) /** * _param_cb - general callback for a module/cmdline parameter @@ -149,7 +159,7 @@ struct kparam_array * The ops can have NULL set or get functions. */ #define __level_param_cb(name, ops, arg, perm, level) \ - __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, level) + __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, level, 0) #define core_param_cb(name, ops, arg, perm) \ __level_param_cb(name, ops, arg, perm, 1) @@ -184,14 +194,14 @@ struct kparam_array /* This is the fundamental function for registering boot/module parameters. */ -#define __module_param_call(prefix, name, ops, arg, perm, level) \ +#define __module_param_call(prefix, name, ops, arg, perm, level, flags) \ /* Default value instead of permissions? */ \ static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ __used \ __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ = { __param_str_##name, ops, VERIFY_OCTAL_PERMISSIONS(perm), \ - level, { arg } } + level, flags, { arg } } /* Obsolete - use module_param_cb() */ #define module_param_call(name, set, get, arg, perm) \ @@ -199,7 +209,7 @@ struct kparam_array { 0, (void *)set, (void *)get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ name, &__param_ops_##name, arg, \ - (perm) + sizeof(__check_old_set_param(set))*0, -1) + (perm) + sizeof(__check_old_set_param(set))*0, -1, 0) /* We don't get oldget: it's often a new-style param_get_uint, etc. 
*/ static inline int @@ -279,7 +289,7 @@ static inline void __kernel_param_unlock(void) */ #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ - __module_param_call("", name, &param_ops_##type, &var, perm, -1) + __module_param_call("", name, &param_ops_##type, &var, perm, -1, 0) #endif /* !MODULE */ /** @@ -297,7 +307,7 @@ static inline void __kernel_param_unlock(void) = { len, string }; \ __module_param_call(MODULE_PARAM_PREFIX, name, \ &param_ops_string, \ - .str = &__param_string_##name, perm, -1); \ + .str = &__param_string_##name, perm, -1, 0);\ __MODULE_PARM_TYPE(name, "string") /** @@ -346,6 +356,22 @@ static inline void destroy_params(const struct kernel_param *params, #define __param_check(name, p, type) \ static inline type __always_unused *__check_##name(void) { return(p); } +/** + * param_check_unsafe - Warn and taint the kernel if setting dangerous options. + * + * This gets called from all the standard param setters, but can be used from + * custom setters as well.
+ */ +static inline void +param_check_unsafe(const struct kernel_param *kp) +{ + if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { + pr_warn("Setting dangerous option %s - tainting kernel\n", + kp->name); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } +} + extern struct kernel_param_ops param_ops_byte; extern int param_set_byte(const char *val, const struct kernel_param *kp); extern int param_get_byte(char *buffer, const struct kernel_param *kp); @@ -444,7 +470,7 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); __module_param_call(MODULE_PARAM_PREFIX, name, \ &param_array_ops, \ .arr = &__param_arr_##name, \ - perm, -1); \ + perm, -1, 0); \ __MODULE_PARM_TYPE(name, "array of " #type) extern struct kernel_param_ops param_array_ops; diff --git a/kernel/params.c b/kernel/params.c index 8a484fc8bde8..ad8d04563c3a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -233,6 +233,7 @@ char *parse_args(const char *doing, #define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ + param_check_unsafe(kp); \ return strtolfn(val, 0, (type *)kp->arg); \ } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ @@ -265,6 +266,8 @@ int param_set_charp(const char *val, const struct kernel_param *kp) return -ENOSPC; } + param_check_unsafe(kp); + maybe_kfree_parameter(*(char **)kp->arg); /* This is a hack. We can't kmalloc in early boot, and we @@ -302,6 +305,8 @@ EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, const struct kernel_param *kp) { + param_check_unsafe(kp); + /* No equals means "set"...
*/ if (!val) val = "1"; @@ -331,6 +336,8 @@ int param_set_invbool(const char *val, const struct kernel_param *kp) bool boolval; struct kernel_param dummy; + param_check_unsafe(kp); + dummy.arg = &boolval; ret = param_set_bool(val, &dummy); if (ret == 0) @@ -357,6 +364,8 @@ int param_set_bint(const char *val, const struct kernel_param *kp) bool v; int ret; + param_check_unsafe(kp); + /* Match bool exactly, by re-using it. */ boolkp = *kp; boolkp.arg = &v; @@ -476,6 +485,8 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; + param_check_unsafe(kp); + if (strlen(val)+1 > kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); -- cgit v1.2.3 From 7a486d3781295b5298cbf9556928a76d26896863 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 27 Aug 2014 06:25:23 +0930 Subject: param: check for tainting before calling set op. This means every set op doesn't need to call it, and it can move into params.c. Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 16 ---------------- kernel/params.c | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 9531f9f9729e..593501996574 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -374,22 +374,6 @@ static inline void destroy_params(const struct kernel_param *params, #define __param_check(name, p, type) \ static inline type __always_unused *__check_##name(void) { return(p); } -/** - * param_check_unsafe - Warn and taint the kernel if setting dangerous options. - * - * This gets called from all the standard param setters, but can be used from - * custom setters as well. 
- */ -static inline void -param_check_unsafe(const struct kernel_param *kp) -{ - if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { - pr_warn("Setting dangerous option %s - tainting kernel\n", - kp->name); - add_taint(TAINT_USER, LOCKDEP_STILL_OK); - } -} - extern struct kernel_param_ops param_ops_byte; extern int param_set_byte(const char *val, const struct kernel_param *kp); extern int param_get_byte(char *buffer, const struct kernel_param *kp); diff --git a/kernel/params.c b/kernel/params.c index ad8d04563c3a..041b5899d5e2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -83,6 +83,15 @@ bool parameq(const char *a, const char *b) return parameqn(a, b, strlen(a)+1); } +static void param_check_unsafe(const struct kernel_param *kp) +{ + if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { + pr_warn("Setting dangerous option %s - tainting kernel\n", + kp->name); + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + } +} + static int parse_one(char *param, char *val, const char *doing, @@ -109,6 +118,7 @@ static int parse_one(char *param, pr_debug("handling %s with %p\n", param, params[i].ops->set); mutex_lock(&param_lock); + param_check_unsafe(&params[i]); err = params[i].ops->set(val, &params[i]); mutex_unlock(&param_lock); return err; @@ -233,7 +243,6 @@ char *parse_args(const char *doing, #define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ - param_check_unsafe(kp); \ return strtolfn(val, 0, (type *)kp->arg); \ } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ @@ -266,8 +275,6 @@ int param_set_charp(const char *val, const struct kernel_param *kp) return -ENOSPC; } - param_check_unsafe(kp); - maybe_kfree_parameter(*(char **)kp->arg); /* This is a hack. We can't kmalloc in early boot, and we @@ -305,8 +312,6 @@ EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons.
*/ int param_set_bool(const char *val, const struct kernel_param *kp) { - param_check_unsafe(kp); - /* No equals means "set"... */ if (!val) val = "1"; @@ -336,8 +341,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp) bool boolval; struct kernel_param dummy; - param_check_unsafe(kp); - dummy.arg = &boolval; ret = param_set_bool(val, &dummy); if (ret == 0) @@ -364,8 +367,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp) bool v; int ret; - param_check_unsafe(kp); - /* Match bool exactly, by re-using it. */ boolkp = *kp; boolkp.arg = &v; @@ -485,8 +486,6 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - param_check_unsafe(kp); - if (strlen(val)+1 > kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); @@ -563,6 +562,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return -EPERM; mutex_lock(&param_lock); + param_check_unsafe(attribute->param); err = attribute->param->ops->set(buf, attribute->param); mutex_unlock(&param_lock); if (!err) -- cgit v1.2.3 From 4ba2968420fa9d0604b6a6a5c61bfa8d0fa84ae0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 26 Aug 2014 19:12:21 -0500 Subject: percpu: Resolve ambiguities in __get_cpu_var/cpumask_var_t __get_cpu_var can paper over differences in the definitions of cpumask_var_t and either use the address of the cpumask variable directly or perform a fetch of the address of the struct cpumask allocated elsewhere. This is important particularly when using per cpu cpumask_var_t declarations because in one case we have an offset into a per cpu area to handle and in the other case we need to fetch a pointer from the offset. This patch introduces a new macro this_cpu_cpumask_var_ptr() that is defined where cpumask_var_t is defined and performs the proper actions.
All use cases where __get_cpu_var is used with cpumask_var_t are converted to the use of this_cpu_cpumask_var_ptr(). Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/include/asm/perf_event_p4.h | 2 +- arch/x86/kernel/apic/x2apic_cluster.c | 3 +-- arch/x86/oprofile/op_model_p4.c | 2 +- include/linux/cpumask.h | 11 +++++++++++ kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- 7 files changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index 85e13ccf15c4..d725382c2ae0 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -189,7 +189,7 @@ static inline int p4_ht_thread(int cpu) { #ifdef CONFIG_SMP if (smp_num_siblings == 2) - return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map)); + return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 6ce600f9bc78..1f5d5f2ffae6 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -42,8 +42,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) * We are to modify mask, so we need an own copy * and be sure it's manipulated with irq off. */ - ipi_mask_ptr = __raw_get_cpu_var(ipi_mask); - cpumask_copy(ipi_mask_ptr, mask); + ipi_mask_ptr = this_cpu_cpumask_var_ptr(ipi_mask); /* * The idea is to send one IPI per cluster. 
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 98ab13058f89..ad1d91f475ab 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -372,7 +372,7 @@ static unsigned int get_stagger(void) { #ifdef CONFIG_SMP int cpu = smp_processor_id(); - return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map)); + return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map)); #endif return 0; } diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 2997af6d2ccd..0a9a6da21e74 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -666,10 +666,19 @@ static inline size_t cpumask_size(void) * * This code makes NR_CPUS length memcopy and brings to a memory corruption. * cpumask_copy() provide safe copy functionality. + * + * Note that there is another evil here: If you define a cpumask_var_t + * as a percpu variable then the way to obtain the address of the cpumask + * structure differently influences what this_cpu_* operation needs to be + * used. Please use this_cpu_cpumask_var_ptr() in those cases. The direct use + * of this_cpu_ptr() or this_cpu_read() will lead to failures when the + * other type of cpumask_var_t implementation is configured.
*/ #ifdef CONFIG_CPUMASK_OFFSTACK typedef struct cpumask *cpumask_var_t; +#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) + bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); @@ -681,6 +690,8 @@ void free_bootmem_cpumask_var(cpumask_var_t mask); #else typedef struct cpumask cpumask_var_t[1]; +#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) + static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 255ce138b652..4a608cfaecbd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1158,7 +1158,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); static int find_later_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl); int this_cpu = smp_processor_id(); int best_cpu, cpu = task_cpu(task); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfa3c86d0d68..197d659c144c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6539,7 +6539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_mask); + struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { .sd = sd, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5f6edca4fafd..a4c50fce9b90 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1526,7 +1526,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); static int find_lowest_rq(struct task_struct *task) { struct sched_domain *sd; - struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + struct cpumask *lowest_mask = 
this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); -- cgit v1.2.3 From 11ed7f934cb807f26da09547b5946c2e534d1dac Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 27 Aug 2014 16:43:40 -0400 Subject: rcu: Make nocb leader kthreads process pending callbacks after spawning The nocb callbacks generated before the nocb kthreads are spawned are enqueued in the nocb queue for later processing. Commit fbce7497ee5af ("rcu: Parallelize and economize NOCB kthread wakeups") introduced nocb leader kthreads which checked the nocb_leader_wake flag to see if there were any such pending callbacks. A case was reported in which newly spawned leader kthreads were not processing the pending callbacks as this flag was not set, which led to a boot hang. The following commit ensures that the newly spawned nocb kthreads process the pending callbacks by allowing the kthreads to run immediately after spawning instead of waiting. This is done by inverting the logic of nocb_leader_wake tests to nocb_leader_sleep which allows us to use the default initialization of this flag to 0 to let the kthreads run. Reported-by: Amit Shah Signed-off-by: Pranith Kumar Link: http://www.spinics.net/lists/kernel/msg1802899.html [ paulmck: Backported to v3.17-rc2. ] Signed-off-by: Paul E. McKenney Tested-by: Amit Shah --- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 71e64c718f75..6a86eb7bac45 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -358,7 +358,7 @@ struct rcu_data { struct rcu_head **nocb_gp_tail; long nocb_gp_count; long nocb_gp_count_lazy; - bool nocb_leader_wake; /* Is the nocb leader thread awake? */ + bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ struct rcu_data *nocb_next_follower; /* Next follower in wakeup chain. 
*/ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 00dc411e9676..a7997e272564 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2074,9 +2074,9 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) return; - if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { + if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior xchg orders against prior callback enqueue. */ - ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; + ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; wake_up(&rdp_leader->nocb_wq); } } @@ -2253,7 +2253,7 @@ wait_again: if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); wait_event_interruptible(my_rdp->nocb_wq, - ACCESS_ONCE(my_rdp->nocb_leader_wake)); + !ACCESS_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ @@ -2292,12 +2292,12 @@ wait_again: schedule_timeout_interruptible(1); /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_wake = false; - smp_mb(); /* Ensure _wake false before scan. */ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) if (ACCESS_ONCE(rdp->nocb_head)) { /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_wake = true; + my_rdp->nocb_leader_sleep = false; break; } goto wait_again; @@ -2307,17 +2307,17 @@ wait_again: rcu_nocb_wait_gp(my_rdp); /* - * We left ->nocb_leader_wake set to reduce cache thrashing. - * We clear it now, but recheck for new callbacks while + * We left ->nocb_leader_sleep unset to reduce cache thrashing. + * We set it now, but recheck for new callbacks while * traversing our follower list. */ - my_rdp->nocb_leader_wake = false; - smp_mb(); /* Ensure _wake false before scan of ->nocb_head. 
*/ + my_rdp->nocb_leader_sleep = true; + smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { if (ACCESS_ONCE(rdp->nocb_head)) - my_rdp->nocb_leader_wake = true; /* No need to wait. */ + my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ -- cgit v1.2.3 From 800df627e2eabaf4a921d342a1d5162c843b7fc2 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:29 -0700 Subject: resource: fix the case of null pointer access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard and Daniel reported that UML is broken due to changes to resource traversal functions. Problem is that iomem_resource.child can be null and new code does not consider that possibility. Old code used a for loop and that loop will not even execute if p was null. Revert back to for() loop logic and bail out if p is null. I also moved sibling_only check out of resource_lock. There is no reason to keep it inside the lock. Following is backtrace of the UML crash. RIP: 0033:[<0000000060039b9f>] RSP: 0000000081459da0 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 00000000219b3fff RCX: 000000006010d1d9 RDX: 0000000000000001 RSI: 00000000602dfb94 RDI: 0000000081459df8 RBP: 0000000081459de0 R08: 00000000601b59f4 R09: ffffffff0000ff00 R10: ffffffff0000ff00 R11: 0000000081459e88 R12: 0000000081459df8 R13: 00000000219b3fff R14: 00000000602dfb94 R15: 0000000000000000 Kernel panic - not syncing: Segfault with no mm CPU: 0 PID: 1 Comm: swapper Not tainted 3.16.0-10454-g58d08e3 #13 Stack: 00000000 000080d0 81459df0 219b3fff 81459e70 6010d1d9 ffffffff 6033e010 81459e50 6003a269 81459e30 00000000 Call Trace: [<6010d1d9>] ? kclist_add_private+0x0/0xe7 [<6003a269>] walk_system_ram_range+0x61/0xb7 [<6000e859>] ? 
proc_kcore_init+0x0/0xf1 [<6010d574>] kcore_update_ram+0x4c/0x168 [<6010d72e>] ? kclist_add+0x0/0x2e [<6000e943>] proc_kcore_init+0xea/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<6000e859>] ? proc_kcore_init+0x0/0xf1 [<600189f0>] do_one_initcall+0x13c/0x204 [<6004ca46>] ? parse_args+0x1df/0x2e0 [<6004c82d>] ? parameq+0x0/0x3a [<601b5990>] ? strcpy+0x0/0x18 [<60001e1a>] kernel_init_freeable+0x240/0x31e [<6026f1c0>] kernel_init+0x12/0x148 [<60019fad>] new_thread_handler+0x81/0xa3 Fixes 8c86e70acead629aacb4a ("resource: provide new functions to walk through resources"). Reported-by: Daniel Walter Tested-by: Richard Weinberger Tested-by: Toralf Förster Tested-by: Daniel Walter Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index da14b8d09296..60c5a3856ab7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -351,15 +351,12 @@ static int find_next_iomem_res(struct resource *res, char *name, end = res->end; BUG_ON(start >= end); - read_lock(&resource_lock); - - if (first_level_children_only) { - p = iomem_resource.child; + if (first_level_children_only) sibling_only = true; - } else - p = &iomem_resource; - while ((p = next_resource(p, sibling_only))) { + read_lock(&resource_lock); + + for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) -- cgit v1.2.3 From 74ca317c26a3f8543203b61d262c0ab2e30c384e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 29 Aug 2014 15:18:46 -0700 Subject: kexec: create a new config option CONFIG_KEXEC_FILE for new syscall Currently new system call kexec_file_load() and all the associated code compiles if CONFIG_KEXEC=y. But new syscall also compiles purgatory code which currently uses gcc option -mcmodel=large. 
This option seems to be available only from gcc 4.4 onwards. Hiding new functionality behind a new config option will not break existing users of old gcc. Those who wish to enable new functionality will require new gcc. Having said that, I am trying to figure out how I can move away from using -mcmodel=large but that can take a while. I think there are other advantages of introducing this new config option. As this option will be enabled only on x86_64, other arches don't have to compile generic kexec code which will never be used. This new code selects CRYPTO=y and CRYPTO_SHA256=y. And all other arches had to do this for CONFIG_KEXEC. Now with introduction of new config option, we can remove crypto dependency from other arches. Now CONFIG_KEXEC_FILE is available only on x86_64. So wherever I had CONFIG_X86_64 defined, I got rid of that. For CONFIG_KEXEC_FILE, instead of doing select CRYPTO=y, I changed it to "depends on CRYPTO=y". This should be safer as "select" is not recursive. Signed-off-by: Vivek Goyal Cc: Eric Biederman Cc: H. 
Peter Anvin Tested-by: Shaun Ruffell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kbuild | 4 +--- arch/x86/Kconfig | 18 ++++++++++++++---- arch/x86/Makefile | 5 +---- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/crash.c | 6 ++---- arch/x86/kernel/machine_kexec_64.c | 11 +++++++++++ arch/x86/purgatory/Makefile | 5 +---- kernel/kexec.c | 11 +++++++++++ 8 files changed, 42 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 61b6d51866f8..3942f74c92d7 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -17,6 +17,4 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ obj-y += net/ -ifeq ($(CONFIG_X86_64),y) -obj-$(CONFIG_KEXEC) += purgatory/ -endif +obj-$(CONFIG_KEXEC_FILE) += purgatory/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5d0bf1aa9dcb..778178f4c7d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1585,9 +1585,6 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" - select BUILD_BIN2C - select CRYPTO - select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1602,9 +1599,22 @@ config KEXEC interface is strongly in flux, so no good recommendation can be made. +config KEXEC_FILE + bool "kexec file based system call" + select BUILD_BIN2C + depends on KEXEC + depends on X86_64 + depends on CRYPTO=y + depends on CRYPTO_SHA256=y + ---help--- + This is new version of kexec system call. This system call is + file based and takes file descriptors as system call argument + for kernel and initramfs as opposed to list of segments as + accepted by previous system call. + config KEXEC_VERIFY_SIG bool "Verify kernel signature during kexec_file_load() syscall" - depends on KEXEC + depends on KEXEC_FILE ---help--- This option makes kernel signature verification mandatory for kexec_file_load() syscall. 
If kernel is signature can not be diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c1aa36887843..c96bcec544fc 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -184,11 +184,8 @@ archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all archprepare: -ifeq ($(CONFIG_KEXEC),y) -# Build only for 64bit. No loaders for 32bit yet. - ifeq ($(CONFIG_X86_64),y) +ifeq ($(CONFIG_KEXEC_FILE),y) $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c - endif endif ### diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b5ea75c4a4b4..ada2e2d6be3e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o @@ -118,5 +119,4 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o - obj-$(CONFIG_KEXEC) += kexec-bzimage64.o endif diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 0553a34fa0df..a618fcd2c07d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -182,8 +182,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) crash_save_cpu(regs, safe_smp_processor_id()); } -#ifdef CONFIG_X86_64 - +#ifdef CONFIG_KEXEC_FILE static int get_nr_ram_ranges_callback(unsigned long start_pfn, unsigned long nr_pfn, void *arg) { @@ -696,5 +695,4 @@ int crash_load_segments(struct kimage *image) return ret; } - -#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 8b04018e5d1f..485981059a40 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -25,9 +25,11 @@ #include #include 
+#ifdef CONFIG_KEXEC_FILE static struct kexec_file_ops *kexec_file_loaders[] = { &kexec_bzImage64_ops, }; +#endif static void free_transition_pgtable(struct kimage *image) { @@ -178,6 +180,7 @@ static void load_segments(void) ); } +#ifdef CONFIG_KEXEC_FILE /* Update purgatory as needed after various image segments have been prepared */ static int arch_update_purgatory(struct kimage *image) { @@ -209,6 +212,12 @@ static int arch_update_purgatory(struct kimage *image) return ret; } +#else /* !CONFIG_KEXEC_FILE */ +static inline int arch_update_purgatory(struct kimage *image) +{ + return 0; +} +#endif /* CONFIG_KEXEC_FILE */ int machine_kexec_prepare(struct kimage *image) { @@ -329,6 +338,7 @@ void arch_crash_save_vmcoreinfo(void) /* arch-dependent functionality related to kexec file-based syscall */ +#ifdef CONFIG_KEXEC_FILE int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { @@ -522,3 +532,4 @@ overflow: (int)ELF64_R_TYPE(rel[i].r_info), value); return -ENOEXEC; } +#endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 7fde9ee438a4..c4ae06e4ae74 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -24,7 +24,4 @@ $(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE $(call if_changed,bin2c) -# No loaders for 32bits yet. 
-ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_KEXEC) += kexec-purgatory.o -endif +obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o diff --git a/kernel/kexec.c b/kernel/kexec.c index 0b49a0a58102..2bee072268d9 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -64,7 +64,9 @@ bool kexec_in_progress = false; char __weak kexec_purgatory[0]; size_t __weak kexec_purgatory_size = 0; +#ifdef CONFIG_KEXEC_FILE static int kexec_calculate_store_digests(struct kimage *image); +#endif /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { @@ -341,6 +343,7 @@ out_free_image: return ret; } +#ifdef CONFIG_KEXEC_FILE static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { struct fd f = fdget(fd); @@ -612,6 +615,9 @@ out_free_image: kfree(image); return ret; } +#else /* CONFIG_KEXEC_FILE */ +static inline void kimage_file_post_load_cleanup(struct kimage *image) { } +#endif /* CONFIG_KEXEC_FILE */ static int kimage_is_destination_range(struct kimage *image, unsigned long start, @@ -1375,6 +1381,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +#ifdef CONFIG_KEXEC_FILE SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, unsigned long, cmdline_len, const char __user *, cmdline_ptr, unsigned long, flags) @@ -1451,6 +1458,8 @@ out: return ret; } +#endif /* CONFIG_KEXEC_FILE */ + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -2006,6 +2015,7 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +#ifdef CONFIG_KEXEC_FILE static int __kexec_add_segment(struct kimage *image, char *buf, unsigned long bufsz, unsigned long mem, unsigned long memsz) @@ -2682,6 +2692,7 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, return 0; } +#endif /* CONFIG_KEXEC_FILE */ /* * Move into place and start executing a preloaded standalone -- cgit v1.2.3 From 068765ba7987e73d4381edfe47b70aa121c7155c Mon Sep 
17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 1 Sep 2014 13:47:49 +0200 Subject: PM / sleep: Mechanism for aborting system suspends unconditionally It sometimes may be necessary to abort a system suspend in progress or wake up the system from suspend-to-idle even if the pm_wakeup_event()/pm_stay_awake() mechanism is not enabled. For this purpose, introduce a new global variable pm_abort_suspend and make pm_wakeup_pending() check its value. Also add routines for manipulating that variable. Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 16 +++++++++++++++- include/linux/suspend.h | 4 ++++ kernel/power/process.c | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index eb1bd2ecad8b..c2744b30d5d9 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -24,6 +24,9 @@ */ bool events_check_enabled __read_mostly; +/* If set and the system is suspending, terminate the suspend. */ +static bool pm_abort_suspend __read_mostly; + /* * Combined counters of registered wakeup events and wakeup events in progress. 
* They need to be modified together atomically, so it's better to use one @@ -719,7 +722,18 @@ bool pm_wakeup_pending(void) pm_print_active_wakeup_sources(); } - return ret; + return ret || pm_abort_suspend; +} + +void pm_system_wakeup(void) +{ + pm_abort_suspend = true; + freeze_wake(); +} + +void pm_wakeup_clear(void) +{ + pm_abort_suspend = false; } /** diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 519064e0c943..06a9910827c2 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -371,6 +371,8 @@ extern int unregister_pm_notifier(struct notifier_block *nb); extern bool events_check_enabled; extern bool pm_wakeup_pending(void); +extern void pm_system_wakeup(void); +extern void pm_wakeup_clear(void); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); extern void pm_wakep_autosleep_enabled(bool set); @@ -418,6 +420,8 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) #define pm_notifier(fn, pri) do { (void)(fn); } while (0) static inline bool pm_wakeup_pending(void) { return false; } +static inline void pm_system_wakeup(void) {} +static inline void pm_wakeup_clear(void) {} static inline void lock_system_sleep(void) {} static inline void unlock_system_sleep(void) {} diff --git a/kernel/power/process.c b/kernel/power/process.c index 4ee194eb524b..7b323221b9ee 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -129,6 +129,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); + pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); -- cgit v1.2.3 From 8df2e02c5c4de9e65ee60153dd9c442356534ad9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Aug 2014 11:49:28 +0200 Subject: genirq: Move suspend/resume logic into irq/pm code No functional change. Preparatory patch for cleaning up the suspend abort functionality. 
Update the comments while at it. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- kernel/irq/internals.h | 4 ++-- kernel/irq/manage.c | 28 +++++----------------------- kernel/irq/pm.c | 44 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 45 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 099ea2e0eb88..af2821178900 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -63,8 +63,8 @@ enum { extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); -extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); -extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); +extern void __disable_irq(struct irq_desc *desc, unsigned int irq); +extern void __enable_irq(struct irq_desc *desc, unsigned int irq); extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3dc6a61bf06a..fa564e8db996 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -382,14 +382,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) } #endif -void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) +void __disable_irq(struct irq_desc *desc, unsigned int irq) { - if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) - return; - desc->istate |= IRQS_SUSPENDED; - } - if (!desc->depth++) irq_disable(desc); } @@ -401,7 +395,7 @@ static int __disable_irq_nosync(unsigned int irq) if (!desc) return -EINVAL; - __disable_irq(desc, irq, false); + __disable_irq(desc, irq); irq_put_desc_busunlock(desc, flags); return 0; } @@ -442,20 +436,8 @@ void disable_irq(unsigned int irq) } EXPORT_SYMBOL(disable_irq); -void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) +void __enable_irq(struct irq_desc *desc, unsigned int 
irq) { - if (resume) { - if (!(desc->istate & IRQS_SUSPENDED)) { - if (!desc->action) - return; - if (!(desc->action->flags & IRQF_FORCE_RESUME)) - return; - /* Pretend that it got disabled ! */ - desc->depth++; - } - desc->istate &= ~IRQS_SUSPENDED; - } - switch (desc->depth) { case 0: err_out: @@ -497,7 +479,7 @@ void enable_irq(unsigned int irq) KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) goto out; - __enable_irq(desc, irq, false); + __enable_irq(desc, irq); out: irq_put_desc_busunlock(desc, flags); } @@ -1228,7 +1210,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { desc->istate &= ~IRQS_SPURIOUS_DISABLED; - __enable_irq(desc, irq, false); + __enable_irq(desc, irq); } raw_spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index abcd6ca86cb7..b84141dcee5e 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -13,13 +13,26 @@ #include "internals.h" +static void suspend_device_irq(struct irq_desc *desc, int irq) +{ + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) + return; + + desc->istate |= IRQS_SUSPENDED; + __disable_irq(desc, irq); +} + /** * suspend_device_irqs - disable all currently enabled interrupt lines * - * During system-wide suspend or hibernation device drivers need to be prevented - * from receiving interrupts and this function is provided for this purpose. - * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQS_SUSPENDED flag for each of them. + * During system-wide suspend or hibernation device drivers need to be + * prevented from receiving interrupts and this function is provided + * for this purpose. + * + * So we disable all interrupts and mark them IRQS_SUSPENDED except + * for those which are unused and those which are marked as not + * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND + * set. 
*/ void suspend_device_irqs(void) { @@ -30,7 +43,7 @@ void suspend_device_irqs(void) unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, true); + suspend_device_irq(desc, irq); raw_spin_unlock_irqrestore(&desc->lock, flags); } @@ -40,6 +53,25 @@ void suspend_device_irqs(void) } EXPORT_SYMBOL_GPL(suspend_device_irqs); +static void resume_irq(struct irq_desc *desc, int irq) +{ + if (desc->istate & IRQS_SUSPENDED) + goto resume; + + if (!desc->action) + return; + + /* Interrupts marked with that flag are force reenabled */ + if (!(desc->action->flags & IRQF_FORCE_RESUME)) + return; + + /* Pretend that it got disabled ! */ + desc->depth++; +resume: + desc->istate &= ~IRQS_SUSPENDED; + __enable_irq(desc, irq); +} + static void resume_irqs(bool want_early) { struct irq_desc *desc; @@ -54,7 +86,7 @@ static void resume_irqs(bool want_early) continue; raw_spin_lock_irqsave(&desc->lock, flags); - __enable_irq(desc, irq, true); + resume_irq(desc, irq); raw_spin_unlock_irqrestore(&desc->lock, flags); } } -- cgit v1.2.3 From cab303be91dc47942bc25de33dc1140123540800 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Aug 2014 11:44:31 +0200 Subject: genirq: Add sanity checks for PM options on shared interrupt lines Account the IRQF_NO_SUSPEND and IRQF_RESUME_EARLY actions on shared interrupt lines and yell loudly if there is a mismatch. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki --- include/linux/irqdesc.h | 10 ++++++++++ kernel/irq/internals.h | 10 ++++++++++ kernel/irq/manage.c | 4 ++++ kernel/irq/pm.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 472c021a2d4f..cb1a31e448ae 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -36,6 +36,11 @@ struct irq_desc; * @threads_oneshot: bitfield to handle shared oneshot threads * @threads_active: number of irqaction threads currently running * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers + * @nr_actions: number of installed actions on this descriptor + * @no_suspend_depth: number of irqactions on a irq descriptor with + * IRQF_NO_SUSPEND set + * @force_resume_depth: number of irqactions on a irq descriptor with + * IRQF_FORCE_RESUME set * @dir: /proc/irq/ procfs entry * @name: flow handler name for /proc/interrupts output */ @@ -68,6 +73,11 @@ struct irq_desc { unsigned long threads_oneshot; atomic_t threads_active; wait_queue_head_t wait_for_threads; +#ifdef CONFIG_PM_SLEEP + unsigned int nr_actions; + unsigned int no_suspend_depth; + unsigned int force_resume_depth; +#endif #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index af2821178900..c402502a5111 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -194,3 +194,13 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } + +#ifdef CONFIG_PM_SLEEP +void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); +void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); +#else +static inline void +irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } +static inline void +irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { 
} +#endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fa564e8db996..0a9104b4608b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1200,6 +1200,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->irq = irq; *old_ptr = new; + irq_pm_install_action(desc, new); + /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -1318,6 +1320,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; + irq_pm_remove_action(desc, action); + /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_shutdown(desc); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index b84141dcee5e..1b1b67a73218 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -13,6 +13,42 @@ #include "internals.h" +/* + * Called from __setup_irq() with desc->lock held after @action has + * been installed in the action chain. + */ +void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) +{ + desc->nr_actions++; + + if (action->flags & IRQF_FORCE_RESUME) + desc->force_resume_depth++; + + WARN_ON_ONCE(desc->force_resume_depth && + desc->force_resume_depth != desc->nr_actions); + + if (action->flags & IRQF_NO_SUSPEND) + desc->no_suspend_depth++; + + WARN_ON_ONCE(desc->no_suspend_depth && + desc->no_suspend_depth != desc->nr_actions); +} + +/* + * Called from __free_irq() with desc->lock held after @action has + * been removed from the action chain. 
+ */ +void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) +{ + desc->nr_actions--; + + if (action->flags & IRQF_FORCE_RESUME) + desc->force_resume_depth--; + + if (action->flags & IRQF_NO_SUSPEND) + desc->no_suspend_depth--; +} + static void suspend_device_irq(struct irq_desc *desc, int irq) { if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) -- cgit v1.2.3 From 5417de222393164b87b2d142b6ec332be40a2564 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Aug 2014 15:48:59 +0200 Subject: genirq: Make use of pm misfeature accounting Use the accounting fields which got introduced for sanity checking for the various PM options. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- kernel/irq/pm.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 1b1b67a73218..74ca6bb541d5 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -51,7 +51,7 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) static void suspend_device_irq(struct irq_desc *desc, int irq) { - if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) + if (!desc->action || desc->no_suspend_depth) return; desc->istate |= IRQS_SUSPENDED; @@ -94,11 +94,8 @@ static void resume_irq(struct irq_desc *desc, int irq) if (desc->istate & IRQS_SUSPENDED) goto resume; - if (!desc->action) - return; - - /* Interrupts marked with that flag are force reenabled */ - if (!(desc->action->flags & IRQF_FORCE_RESUME)) + /* Force resume the interrupt? */ + if (!desc->force_resume_depth) return; /* Pretend that it got disabled ! 
*/ -- cgit v1.2.3 From 092fadd59b50208f6859f89dd7ea84e03955b544 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Aug 2014 16:49:43 +0200 Subject: genirq: Move MASK_ON_SUSPEND handling into suspend_device_irqs() There is no reason why we should delay the masking of interrupts whose interrupt chip requests MASK_ON_SUSPEND to the point where we check the wakeup interrupts. We can do it right at the point where we mark the interrupt as suspended. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- kernel/irq/pm.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 74ca6bb541d5..a21b3dc9825a 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -56,6 +56,15 @@ static void suspend_device_irq(struct irq_desc *desc, int irq) desc->istate |= IRQS_SUSPENDED; __disable_irq(desc, irq); + + /* + * Hardware which has no wakeup source configuration facility + * requires that the non wakeup interrupts are masked at the + * chip level. The chip implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); } /** @@ -176,19 +185,7 @@ int check_wakeup_irqs(void) if (irqd_is_wakeup_set(&desc->irq_data)) { if (desc->depth == 1 && desc->istate & IRQS_PENDING) return -EBUSY; - continue; } - /* - * Check the non wakeup interrupts whether they need - * to be masked before finally going into suspend - * state. That's for hardware which has no wakeup - * source configuration facility. The chip - * implementation indicates that with - * IRQCHIP_MASK_ON_SUSPEND. 
- */ - if (desc->istate & IRQS_SUSPENDED && - irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) - mask_irq(desc); } return 0; -- cgit v1.2.3 From c4df606c40c3ac8ba76ad11fdbb10139f7fbb261 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 Aug 2014 22:50:43 +0200 Subject: genirq: Avoid double loop on suspend We can synchronize the suspended interrupts right away. No need for an extra loop. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- kernel/irq/pm.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index a21b3dc9825a..cf0ce0163db9 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -49,10 +49,10 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) desc->no_suspend_depth--; } -static void suspend_device_irq(struct irq_desc *desc, int irq) +static bool suspend_device_irq(struct irq_desc *desc, int irq) { if (!desc->action || desc->no_suspend_depth) - return; + return false; desc->istate |= IRQS_SUSPENDED; __disable_irq(desc, irq); @@ -65,6 +65,7 @@ static void suspend_device_irq(struct irq_desc *desc, int irq) */ if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) mask_irq(desc); + return true; } /** @@ -86,15 +87,15 @@ void suspend_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; + bool sync; raw_spin_lock_irqsave(&desc->lock, flags); - suspend_device_irq(desc, irq); + sync = suspend_device_irq(desc, irq); raw_spin_unlock_irqrestore(&desc->lock, flags); - } - for_each_irq_desc(irq, desc) - if (desc->istate & IRQS_SUSPENDED) + if (sync) synchronize_irq(irq); + } } EXPORT_SYMBOL_GPL(suspend_device_irqs); -- cgit v1.2.3 From c3d7acd0273edf0ee50ccf85167acd7ae0759eda Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Aug 2014 13:46:08 +0200 Subject: genirq: Distangle edge handler entry If the interrupt is disabled or has no action, then we should not call the poll check. 
Separate the checks. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- kernel/irq/chip.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a2b28a2fd7b1..f10c2e58a786 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -540,19 +540,29 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. Mark it pending, handle - * the necessary masking and go out + * If the handler is currently running, mark it pending, + * handle the necessary masking and go out */ - if (unlikely(irqd_irq_disabled(&desc->irq_data) || - irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { if (!irq_check_poll(desc)) { desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; } } + + /* + * If its disabled or no action available then mask it and get + * out of here. + */ + if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; + } + kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ @@ -601,18 +611,27 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* - * If we're currently running this IRQ, or its disabled, - * we shouldn't process the IRQ. 
Mark it pending, handle - * the necessary masking and go out + * If the handler is currently running, mark it pending, + * handle the necessary masking and go out */ - if (unlikely(irqd_irq_disabled(&desc->irq_data) || - irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { if (!irq_check_poll(desc)) { desc->istate |= IRQS_PENDING; goto out_eoi; } } + + /* + * If its disabled or no action available then mask it and get + * out of here. + */ + if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + kstat_incr_irqs_this_cpu(irq, desc); do { -- cgit v1.2.3 From c7bd3ec0531aa636ad57ed9f27e637cbd247e64a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Aug 2014 13:39:37 +0200 Subject: genirq: Create helper for flow handler entry check All flow handlers - except the per cpu ones - check for an interrupt in progress and an eventual concurrent polling on another cpu. Create a helper function for the repeated code pattern. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- kernel/irq/chip.c | 48 ++++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f10c2e58a786..6baf86085571 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -342,6 +342,13 @@ static bool irq_check_poll(struct irq_desc *desc) return irq_wait_for_poll(desc); } +static bool irq_may_run(struct irq_desc *desc) +{ + if (!irqd_irq_inprogress(&desc->irq_data)) + return true; + return irq_check_poll(desc); +} + /** * handle_simple_irq - Simple and software-decoded IRQs. 
* @irq: the interrupt number @@ -359,9 +366,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out_unlock; + if (!irq_may_run(desc)) + goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -412,9 +418,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out_unlock; + if (!irq_may_run(desc)) + goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -485,9 +490,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) - if (!irq_check_poll(desc)) - goto out; + if (!irq_may_run(desc)) + goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -541,16 +545,10 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - /* - * If the handler is currently running, mark it pending, - * handle the necessary masking and go out - */ - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { - if (!irq_check_poll(desc)) { - desc->istate |= IRQS_PENDING; - mask_ack_irq(desc); - goto out_unlock; - } + if (!irq_may_run(desc)) { + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; } /* @@ -612,15 +610,9 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - /* - * If the handler is currently running, mark it pending, - * handle the necessary masking and go out - */ - if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { - if (!irq_check_poll(desc)) { - desc->istate |= IRQS_PENDING; - goto out_eoi; - } + if (!irq_may_run(desc)) { + desc->istate |= 
IRQS_PENDING; + goto out_eoi; } /* -- cgit v1.2.3 From b76f16748fa61801b1a1fd3ffb6f25ee228a35e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Aug 2014 13:54:09 +0200 Subject: genirq: Mark wakeup sources as armed on suspend This allows us to utilize this information in the irq_may_run() check without adding another conditional to the fast path. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- include/linux/irq.h | 8 ++++++++ kernel/irq/pm.c | 5 +++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 62af59242ddc..03f48d936f66 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -173,6 +173,7 @@ struct irq_data { * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt * IRQD_IRQ_INPROGRESS - In progress state of the interrupt + * IRQD_WAKEUP_ARMED - Wakeup mode armed */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -186,6 +187,7 @@ enum { IRQD_IRQ_DISABLED = (1 << 16), IRQD_IRQ_MASKED = (1 << 17), IRQD_IRQ_INPROGRESS = (1 << 18), + IRQD_WAKEUP_ARMED = (1 << 19), }; static inline bool irqd_is_setaffinity_pending(struct irq_data *d) @@ -257,6 +259,12 @@ static inline bool irqd_irq_inprogress(struct irq_data *d) return d->state_use_accessors & IRQD_IRQ_INPROGRESS; } +static inline bool irqd_is_wakeup_armed(struct irq_data *d) +{ + return d->state_use_accessors & IRQD_WAKEUP_ARMED; +} + + /* * Functions for chained handlers which can be enabled/disabled by the * standard disable_irq/enable_irq calls. 
Must be called with diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index cf0ce0163db9..766930eaeed9 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -54,6 +54,9 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) if (!desc->action || desc->no_suspend_depth) return false; + if (irqd_is_wakeup_set(&desc->irq_data)) + irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + desc->istate |= IRQS_SUSPENDED; __disable_irq(desc, irq); @@ -101,6 +104,8 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); static void resume_irq(struct irq_desc *desc, int irq) { + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + if (desc->istate & IRQS_SUSPENDED) goto resume; -- cgit v1.2.3 From 9ce7a25849e80cfb264f4995f832b932c1987e1a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Aug 2014 14:00:16 +0200 Subject: genirq: Simplify wakeup mechanism Currently we suspend wakeup interrupts by lazy disabling them and check later whether the interrupt has fired, but that's not sufficient for suspend to idle as there is no way to check that once we transitioned into the CPU idle state. So we change the mechanism in the following way: 1) Leave the wakeup interrupts enabled across suspend 2) Add a check to irq_may_run() which is called at the beginning of each flow handler whether the interrupt is an armed wakeup source. This check is basically free as it just extends the existing check for IRQD_IRQ_INPROGRESS. So no new conditional in the hot path. If the IRQD_WAKEUP_ARMED flag is set, then the interrupt is disabled, marked as pending/suspended and the pm core is notified about the wakeup event. Signed-off-by: Thomas Gleixner [ rjw: syscore.c and put irq_pm_check_wakeup() into pm.c ] Signed-off-by: Rafael J. 
Wysocki --- drivers/base/syscore.c | 7 +++--- include/linux/interrupt.h | 5 ----- kernel/irq/chip.c | 20 ++++++++++++++++- kernel/irq/internals.h | 2 ++ kernel/irq/pm.c | 55 +++++++++++++++++++++++++---------------------- 5 files changed, 53 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/drivers/base/syscore.c b/drivers/base/syscore.c index dbb8350ea8dc..8d98a329f6ea 100644 --- a/drivers/base/syscore.c +++ b/drivers/base/syscore.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include static LIST_HEAD(syscore_ops_list); @@ -54,9 +54,8 @@ int syscore_suspend(void) pr_debug("Checking wakeup interrupts\n"); /* Return error code if there are any wakeup interrupts pending. */ - ret = check_wakeup_irqs(); - if (ret) - return ret; + if (pm_wakeup_pending()) + return -EBUSY; WARN_ONCE(!irqs_disabled(), "Interrupts enabled before system core suspend.\n"); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 698ad053d064..69517a24bc50 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -193,11 +193,6 @@ extern void irq_wake_thread(unsigned int irq, void *dev_id); /* The following three functions are for the core kernel use only. */ extern void suspend_device_irqs(void); extern void resume_device_irqs(void); -#ifdef CONFIG_PM_SLEEP -extern int check_wakeup_irqs(void); -#else -static inline int check_wakeup_irqs(void) { return 0; } -#endif /** * struct irq_affinity_notify - context for notification of IRQ affinity changes diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6baf86085571..e7917ff8a486 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -344,8 +344,26 @@ static bool irq_check_poll(struct irq_desc *desc) static bool irq_may_run(struct irq_desc *desc) { - if (!irqd_irq_inprogress(&desc->irq_data)) + unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; + + /* + * If the interrupt is not in progress and is not an armed + * wakeup interrupt, proceed. 
+ */ + if (!irqd_has_set(&desc->irq_data, mask)) return true; + + /* + * If the interrupt is an armed wakeup source, mark it pending + * and suspended, disable it and notify the pm core about the + * event. + */ + if (irq_pm_check_wakeup(desc)) + return false; + + /* + * Handle a potential concurrent poll on a different core. + */ return irq_check_poll(desc); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c402502a5111..4332d766619d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -196,9 +196,11 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d } #ifdef CONFIG_PM_SLEEP +bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); #else +static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } static inline void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } static inline void diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 766930eaeed9..3ca532592704 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -9,10 +9,24 @@ #include #include #include +#include #include #include "internals.h" +bool irq_pm_check_wakeup(struct irq_desc *desc) +{ + if (irqd_is_wakeup_armed(&desc->irq_data)) { + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; + desc->depth++; + irq_disable(desc); + pm_system_wakeup(); + return true; + } + return false; +} + /* * Called from __setup_irq() with desc->lock held after @action has * been installed in the action chain. 
@@ -54,8 +68,16 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) if (!desc->action || desc->no_suspend_depth) return false; - if (irqd_is_wakeup_set(&desc->irq_data)) + if (irqd_is_wakeup_set(&desc->irq_data)) { irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED); + /* + * We return true here to force the caller to issue + * synchronize_irq(). We need to make sure that the + * IRQD_WAKEUP_ARMED is visible before we return from + * suspend_device_irqs(). + */ + return true; + } desc->istate |= IRQS_SUSPENDED; __disable_irq(desc, irq); @@ -79,9 +101,13 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq) * for this purpose. * * So we disable all interrupts and mark them IRQS_SUSPENDED except - * for those which are unused and those which are marked as not + * for those which are unused, those which are marked as not * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND - * set. + * set and those which are marked as active wakeup sources. + * + * The active wakeup sources are handled by the flow handler entry + * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the + * interrupt and notifies the pm core about the wakeup. */ void suspend_device_irqs(void) { @@ -173,26 +199,3 @@ void resume_device_irqs(void) resume_irqs(false); } EXPORT_SYMBOL_GPL(resume_device_irqs); - -/** - * check_wakeup_irqs - check if any wake-up interrupts are pending - */ -int check_wakeup_irqs(void) -{ - struct irq_desc *desc; - int irq; - - for_each_irq_desc(irq, desc) { - /* - * Only interrupts which are marked as wakeup source - * and have not been disabled before the suspend check - * can abort suspend. - */ - if (irqd_is_wakeup_set(&desc->irq_data)) { - if (desc->depth == 1 && desc->istate & IRQS_PENDING) - return -EBUSY; - } - } - - return 0; -} -- cgit v1.2.3 From 62109b43176b87e78b2b6d91bcfe16128c30229b Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 3 Sep 2014 01:21:03 +0200 Subject: PM / sleep: Fix test_suspend= command line option After commit d431cbc53cb7 (PM / sleep: Simplify sleep states sysfs interface code) the pm_states[] array is not populated initially, which causes setup_test_suspend() to always fail and the suspend testing during boot doesn't work any more. Fix the problem by using pm_labels[] instead of pm_states[] in setup_test_suspend() and storing a pointer to the label of the sleep state to test rather than the number representing it, because the connection between the state numbers and labels is only established by suspend_set_ops(). Fixes: d431cbc53cb7 (PM / sleep: Simplify sleep states sysfs interface code) Reported-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 1 + kernel/power/suspend.c | 2 +- kernel/power/suspend_test.c | 31 +++++++++++++++++++------------ 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 5d49dcac2537..2df883a9d3cb 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -179,6 +179,7 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *, #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ +extern const char *pm_labels[]; extern const char *pm_states[]; extern int suspend_devices_and_enter(suspend_state_t state); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6dadb25cb0d8..18c62195660f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,7 +31,7 @@ #include "power.h" -static const char *pm_labels[] = { "mem", "standby", "freeze", }; +const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; const char *pm_states[PM_SUSPEND_MAX]; static const struct platform_suspend_ops *suspend_ops; diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 2f524928b6aa..bd91bc177c93 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ 
-129,20 +129,20 @@ static int __init has_wakealarm(struct device *dev, const void *data) * at startup time. They're normally disabled, for faster boot and because * we can't know which states really work on this particular system. */ -static suspend_state_t test_state __initdata = PM_SUSPEND_ON; +static const char *test_state_label __initdata; static char warn_bad_state[] __initdata = KERN_WARNING "PM: can't test '%s' suspend state\n"; static int __init setup_test_suspend(char *value) { - suspend_state_t i; + int i; /* "=mem" ==> "mem" */ value++; - for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) - if (!strcmp(pm_states[i], value)) { - test_state = i; + for (i = 0; pm_labels[i]; i++) + if (!strcmp(pm_labels[i], value)) { + test_state_label = pm_labels[i]; return 0; } @@ -158,13 +158,21 @@ static int __init test_suspend(void) struct rtc_device *rtc = NULL; struct device *dev; + suspend_state_t test_state; /* PM is initialized by now; is that state testable? */ - if (test_state == PM_SUSPEND_ON) - goto done; - if (!pm_states[test_state]) { - printk(warn_bad_state, pm_states[test_state]); - goto done; + if (!test_state_label) + return 0; + + for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) { + const char *state_label = pm_states[test_state]; + + if (state_label && !strcmp(test_state_label, state_label)) + break; + } + if (test_state == PM_SUSPEND_MAX) { + printk(warn_bad_state, test_state_label); + return 0; } /* RTCs have initialized by now too ... can we use one? 
*/ @@ -173,13 +181,12 @@ static int __init test_suspend(void) rtc = rtc_class_open(dev_name(dev)); if (!rtc) { printk(warn_no_rtc); - goto done; + return 0; } /* go for it */ test_wakealarm(rtc, test_state); rtc_class_close(rtc); -done: return 0; } late_initcall(test_suspend); -- cgit v1.2.3 From 76ba59f8366f2d9282cb5bda9de75b4b68cbe55f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Aug 2014 11:03:16 +0100 Subject: genirq: Add irq_domain-aware core IRQ handler Calling irq_find_mapping from outside an irq_{enter,exit} section is unsafe and produces ugly messages if CONFIG_PROVE_RCU is enabled: If coming from the idle state, the rcu_read_lock call in irq_find_mapping will generate an unpleasant warning: =============================== [ INFO: suspicious RCU usage. ] 3.16.0-rc1+ #135 Not tainted ------------------------------- include/linux/rcupdate.h:871 rcu_read_lock() used illegally while idle! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0 RCU used illegally from extended quiescent state! 1 lock held by swapper/0/0: #0: (rcu_read_lock){......}, at: [] irq_find_mapping+0x4c/0x198 As this issue is fairly widespread and involves at least three different architectures, a possible solution is to add a new handle_domain_irq entry point into the generic IRQ code that the interrupt controller code can call. This new function takes an irq_domain, and calls into irq_find_mapping inside the irq_{enter,exit} block. An additional "lookup" parameter is used to allow non-domain architecture code to be replaced by this as well. Interrupt controllers can then be updated to use the new mechanism. This code is sitting behind a new CONFIG_HANDLE_DOMAIN_IRQ, as not all architectures implement set_irq_regs (yes, mn10300, I'm looking at you...). 
Reported-by: Vladimir Murzin Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/1409047421-27649-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Jason Cooper --- include/linux/irqdesc.h | 19 +++++++++++++++++++ kernel/irq/Kconfig | 3 +++ kernel/irq/irqdesc.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 472c021a2d4f..ff24667cd86c 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -12,6 +12,8 @@ struct irq_affinity_notify; struct proc_dir_entry; struct module; struct irq_desc; +struct irq_domain; +struct pt_regs; /** * struct irq_desc - interrupt descriptor @@ -118,6 +120,23 @@ static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *de int generic_handle_irq(unsigned int irq); +#ifdef CONFIG_HANDLE_DOMAIN_IRQ +/* + * Convert a HW interrupt number to a logical one using a IRQ domain, + * and handle the result interrupt number. Return -EINVAL if + * conversion failed. Providing a NULL domain indicates that the + * conversion has already been done. 
+ */ +int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, + bool lookup, struct pt_regs *regs); + +static inline int handle_domain_irq(struct irq_domain *domain, + unsigned int hwirq, struct pt_regs *regs) +{ + return __handle_domain_irq(domain, hwirq, true, regs); +} +#endif + /* Test to see if a driver has successfully requested an irq */ static inline int irq_has_action(unsigned int irq) { diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d269cecdfbf0..225086b2652e 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -55,6 +55,9 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +config HANDLE_DOMAIN_IRQ + bool + config IRQ_DOMAIN_DEBUG bool "Expose hardware/virtual IRQ mapping via debugfs" depends on IRQ_DOMAIN && DEBUG_FS diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1487a123db5c..a1782f88f0af 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internals.h" @@ -336,6 +337,47 @@ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +#ifdef CONFIG_HANDLE_DOMAIN_IRQ +/** + * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * @lookup: Whether to perform the domain lookup or not + * @regs: Register file coming from the low-level handling code + * + * Returns: 0 on success, or -EINVAL if conversion has failed + */ +int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, + bool lookup, struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + unsigned int irq = hwirq; + int ret = 0; + + irq_enter(); + +#ifdef CONFIG_IRQ_DOMAIN + if (lookup) + irq = irq_find_mapping(domain, hwirq); +#endif + + /* + * Some hardware gives randomly wrong interrupts. Rather + * than crashing, do something sensible. 
+ */ + if (unlikely(!irq || irq >= nr_irqs)) { + ack_bad_irq(irq); + ret = -EINVAL; + } else { + generic_handle_irq(irq); + } + + irq_exit(); + set_irq_regs(old_regs); + return ret; +} +#endif + /* Dynamic interrupt handling */ /** -- cgit v1.2.3 From a4412fc9486ec85686c6c7929e7e829f62ae377e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 21 Jul 2014 18:49:14 -0700 Subject: seccomp,x86,arm,mips,s390: Remove nr parameter from secure_computing The secure_computing function took a syscall number parameter, but it only paid any attention to that parameter if seccomp mode 1 was enabled. Rather than coming up with a kludge to get the parameter to work in mode 2, just remove the parameter. To avoid churn in arches that don't have seccomp filters (and may not even support syscall_get_nr right now), this leaves the parameter in secure_computing_strict, which is now a real function. For ARM, this is a bit ugly due to the fact that ARM conditionally supports seccomp filters. Fixing that would probably only be a couple of lines of code, but it should be coordinated with the audit maintainers. This will be a slight slowdown on some arches. The right fix is to pass in all of seccomp_data instead of trying to make just the syscall nr part be fast. This is a prerequisite for making two-phase seccomp work cleanly. 
Cc: Russell King Cc: linux-arm-kernel@lists.infradead.org Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux-s390@vger.kernel.org Cc: x86@kernel.org Cc: Kees Cook Signed-off-by: Andy Lutomirski Signed-off-by: Kees Cook --- arch/arm/kernel/ptrace.c | 7 ++++- arch/mips/kernel/ptrace.c | 2 +- arch/s390/kernel/ptrace.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- include/linux/seccomp.h | 21 +++++++------- kernel/seccomp.c | 64 ++++++++++++++++++++++++++++++------------- 7 files changed, 66 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 0c27ed6f3f23..5e772a21ab97 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -933,8 +933,13 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno) current_thread_info()->syscall = scno; /* Do the secure computing check first; failures should be fast. */ - if (secure_computing(scno) == -1) +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER + if (secure_computing() == -1) return -1; +#else + /* XXX: remove this once OABI gets fixed */ + secure_computing_strict(scno); +#endif if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 645b3c4fcfba..f7aac5b57b4b 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -770,7 +770,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) long ret = 0; user_exit(); - if (secure_computing(syscall) == -1) + if (secure_computing() == -1) return -1; if (test_thread_flag(TIF_SYSCALL_TRACE) && diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 5dc7ad9e2fbf..bebacad48305 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -803,7 +803,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) long ret = 0; /* Do the secure 
computing check first. */ - if (secure_computing(regs->gprs[2])) { + if (secure_computing()) { /* seccomp failures shouldn't expose any additional code. */ ret = -1; goto out; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 678c0ada3b3c..93c182a00506 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1471,7 +1471,7 @@ long syscall_trace_enter(struct pt_regs *regs) regs->flags |= X86_EFLAGS_TF; /* do the secure computing check first */ - if (secure_computing(regs->orig_ax)) { + if (secure_computing()) { /* seccomp failures shouldn't expose any additional code. */ ret = -1L; goto out; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index e1e1e80fc6a6..957779f4eb40 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -216,7 +216,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) */ regs->orig_ax = syscall_nr; regs->ax = -ENOSYS; - tmp = secure_computing(syscall_nr); + tmp = secure_computing(); if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 5d586a45a319..aa3c040230be 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -27,19 +27,17 @@ struct seccomp { struct seccomp_filter *filter; }; -extern int __secure_computing(int); -static inline int secure_computing(int this_syscall) +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER +extern int __secure_computing(void); +static inline int secure_computing(void) { if (unlikely(test_thread_flag(TIF_SECCOMP))) - return __secure_computing(this_syscall); + return __secure_computing(); return 0; } - -/* A wrapper for architectures supporting only SECCOMP_MODE_STRICT. 
*/ -static inline void secure_computing_strict(int this_syscall) -{ - BUG_ON(secure_computing(this_syscall) != 0); -} +#else +extern void secure_computing_strict(int this_syscall); +#endif extern long prctl_get_seccomp(void); extern long prctl_set_seccomp(unsigned long, char __user *); @@ -56,8 +54,11 @@ static inline int seccomp_mode(struct seccomp *s) struct seccomp { }; struct seccomp_filter { }; -static inline int secure_computing(int this_syscall) { return 0; } +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER +static inline int secure_computing(void) { return 0; } +#else static inline void secure_computing_strict(int this_syscall) { return; } +#endif static inline long prctl_get_seccomp(void) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44eb005c6695..5e738e0dd2e9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -23,8 +23,11 @@ /* #define SECCOMP_DEBUG 1 */ -#ifdef CONFIG_SECCOMP_FILTER +#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include +#endif + +#ifdef CONFIG_SECCOMP_FILTER #include #include #include @@ -172,7 +175,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. 
*/ -static u32 seccomp_run_filters(int syscall) +static u32 seccomp_run_filters(void) { struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); struct seccomp_data sd; @@ -564,10 +567,43 @@ static int mode1_syscalls_32[] = { }; #endif -int __secure_computing(int this_syscall) +static void __secure_computing_strict(int this_syscall) +{ + int *syscall_whitelist = mode1_syscalls; +#ifdef CONFIG_COMPAT + if (is_compat_task()) + syscall_whitelist = mode1_syscalls_32; +#endif + do { + if (*syscall_whitelist == this_syscall) + return; + } while (*++syscall_whitelist); + +#ifdef SECCOMP_DEBUG + dump_stack(); +#endif + audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL); + do_exit(SIGKILL); +} + +#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER +void secure_computing_strict(int this_syscall) +{ + int mode = current->seccomp.mode; + + if (mode == 0) + return; + else if (mode == SECCOMP_MODE_STRICT) + __secure_computing_strict(this_syscall); + else + BUG(); +} +#else +int __secure_computing(void) { + struct pt_regs *regs = task_pt_regs(current); + int this_syscall = syscall_get_nr(current, regs); int exit_sig = 0; - int *syscall; u32 ret; /* @@ -578,23 +614,12 @@ int __secure_computing(int this_syscall) switch (current->seccomp.mode) { case SECCOMP_MODE_STRICT: - syscall = mode1_syscalls; -#ifdef CONFIG_COMPAT - if (is_compat_task()) - syscall = mode1_syscalls_32; -#endif - do { - if (*syscall == this_syscall) - return 0; - } while (*++syscall); - exit_sig = SIGKILL; - ret = SECCOMP_RET_KILL; - break; + __secure_computing_strict(this_syscall); + return 0; #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; - struct pt_regs *regs = task_pt_regs(current); - ret = seccomp_run_filters(this_syscall); + ret = seccomp_run_filters(); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { @@ -652,9 +677,10 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER skip: audit_seccomp(this_syscall, exit_sig, ret); -#endif return -1; 
+#endif } +#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ long prctl_get_seccomp(void) { -- cgit v1.2.3 From 13aa72f0fd0a9f98a41cefb662487269e2f1ad65 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 21 Jul 2014 18:49:15 -0700 Subject: seccomp: Refactor the filter callback and the API The reason I did this is to add a seccomp API that will be usable for an x86 fast path. The x86 entry code needs to use a rather expensive slow path for a syscall that might be visible to things like ptrace. By splitting seccomp into two phases, we can check whether we need the slow path and then use the fast path if the filter allows the syscall or just returns some errno. As a side effect, I think the new code is much easier to understand than the old code. This has one user-visible effect: the audit record written for SECCOMP_RET_TRACE is now a simple indication that SECCOMP_RET_TRACE happened. It used to depend in a complicated way on what the tracer did. I couldn't make much sense of it. Signed-off-by: Andy Lutomirski Signed-off-by: Kees Cook --- include/linux/seccomp.h | 6 ++ kernel/seccomp.c | 190 +++++++++++++++++++++++++++++++----------------- 2 files changed, 130 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index aa3c040230be..38851085e481 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -35,6 +35,12 @@ static inline int secure_computing(void) return __secure_computing(); return 0; } + +#define SECCOMP_PHASE1_OK 0 +#define SECCOMP_PHASE1_SKIP 1 + +extern u32 seccomp_phase1(void); +int seccomp_phase2(u32 phase1_result); #else extern void secure_computing_strict(int this_syscall); #endif diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5e738e0dd2e9..6c8528ce9df9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -21,8 +21,6 @@ #include #include -/* #define SECCOMP_DEBUG 1 */ - #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER #include #endif @@ -601,10 +599,21 @@ void 
secure_computing_strict(int this_syscall) #else int __secure_computing(void) { - struct pt_regs *regs = task_pt_regs(current); - int this_syscall = syscall_get_nr(current, regs); - int exit_sig = 0; - u32 ret; + u32 phase1_result = seccomp_phase1(); + + if (likely(phase1_result == SECCOMP_PHASE1_OK)) + return 0; + else if (likely(phase1_result == SECCOMP_PHASE1_SKIP)) + return -1; + else + return seccomp_phase2(phase1_result); +} + +#ifdef CONFIG_SECCOMP_FILTER +static u32 __seccomp_phase1_filter(int this_syscall, struct pt_regs *regs) +{ + u32 filter_ret, action; + int data; /* * Make sure that any changes to mode from another thread have @@ -612,73 +621,122 @@ int __secure_computing(void) */ rmb(); - switch (current->seccomp.mode) { + filter_ret = seccomp_run_filters(); + data = filter_ret & SECCOMP_RET_DATA; + action = filter_ret & SECCOMP_RET_ACTION; + + switch (action) { + case SECCOMP_RET_ERRNO: + /* Set the low-order 16-bits as a errno. */ + syscall_set_return_value(current, regs, + -data, 0); + goto skip; + + case SECCOMP_RET_TRAP: + /* Show the handler the original registers. */ + syscall_rollback(current, regs); + /* Let the filter pass back 16 bits of data. */ + seccomp_send_sigsys(this_syscall, data); + goto skip; + + case SECCOMP_RET_TRACE: + return filter_ret; /* Save the rest for phase 2. */ + + case SECCOMP_RET_ALLOW: + return SECCOMP_PHASE1_OK; + + case SECCOMP_RET_KILL: + default: + audit_seccomp(this_syscall, SIGSYS, action); + do_exit(SIGSYS); + } + + unreachable(); + +skip: + audit_seccomp(this_syscall, 0, action); + return SECCOMP_PHASE1_SKIP; +} +#endif + +/** + * seccomp_phase1() - run fast path seccomp checks on the current syscall + * + * This only reads pt_regs via the syscall_xyz helpers. The only change + * it will make to pt_regs is via syscall_set_return_value, and it will + * only do that if it returns SECCOMP_PHASE1_SKIP. + * + * It may also call do_exit or force a signal; these actions must be + * safe. 
+ * + * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should + * be processed normally. + * + * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be + * invoked. In this case, seccomp_phase1 will have set the return value + * using syscall_set_return_value. + * + * If it returns anything else, then the return value should be passed + * to seccomp_phase2 from a context in which ptrace hooks are safe. + */ +u32 seccomp_phase1(void) +{ + int mode = current->seccomp.mode; + struct pt_regs *regs = task_pt_regs(current); + int this_syscall = syscall_get_nr(current, regs); + + switch (mode) { case SECCOMP_MODE_STRICT: - __secure_computing_strict(this_syscall); - return 0; + __secure_computing_strict(this_syscall); /* may call do_exit */ + return SECCOMP_PHASE1_OK; #ifdef CONFIG_SECCOMP_FILTER - case SECCOMP_MODE_FILTER: { - int data; - ret = seccomp_run_filters(); - data = ret & SECCOMP_RET_DATA; - ret &= SECCOMP_RET_ACTION; - switch (ret) { - case SECCOMP_RET_ERRNO: - /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, regs, - -data, 0); - goto skip; - case SECCOMP_RET_TRAP: - /* Show the handler the original registers. */ - syscall_rollback(current, regs); - /* Let the filter pass back 16 bits of data. */ - seccomp_send_sigsys(this_syscall, data); - goto skip; - case SECCOMP_RET_TRACE: - /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { - syscall_set_return_value(current, regs, - -ENOSYS, 0); - goto skip; - } - /* Allow the BPF to provide the event message */ - ptrace_event(PTRACE_EVENT_SECCOMP, data); - /* - * The delivery of a fatal signal during event - * notification may silently skip tracer notification. - * Terminating the task now avoids executing a system - * call that may not be intended. - */ - if (fatal_signal_pending(current)) - break; - if (syscall_get_nr(current, regs) < 0) - goto skip; /* Explicit request to skip. 
*/ - - return 0; - case SECCOMP_RET_ALLOW: - return 0; - case SECCOMP_RET_KILL: - default: - break; - } - exit_sig = SIGSYS; - break; - } + case SECCOMP_MODE_FILTER: + return __seccomp_phase1_filter(this_syscall, regs); #endif default: BUG(); } +} -#ifdef SECCOMP_DEBUG - dump_stack(); -#endif - audit_seccomp(this_syscall, exit_sig, ret); - do_exit(exit_sig); -#ifdef CONFIG_SECCOMP_FILTER -skip: - audit_seccomp(this_syscall, exit_sig, ret); - return -1; -#endif +/** + * seccomp_phase2() - finish slow path seccomp work for the current syscall + * @phase1_result: The return value from seccomp_phase1() + * + * This must be called from a context in which ptrace hooks can be used. + * + * Returns 0 if the syscall should be processed or -1 to skip the syscall. + */ +int seccomp_phase2(u32 phase1_result) +{ + struct pt_regs *regs = task_pt_regs(current); + u32 action = phase1_result & SECCOMP_RET_ACTION; + int data = phase1_result & SECCOMP_RET_DATA; + + BUG_ON(action != SECCOMP_RET_TRACE); + + audit_seccomp(syscall_get_nr(current, regs), 0, action); + + /* Skip these calls if there is no tracer. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, regs, + -ENOSYS, 0); + return -1; + } + + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification. + * Terminating the task now avoids executing a system + * call that may not be intended. + */ + if (fatal_signal_pending(current)) + do_exit(SIGSYS); + if (syscall_get_nr(current, regs) < 0) + return -1; /* Explicit request to skip. 
*/ + + return 0; } #endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */ -- cgit v1.2.3 From d39bd00deabe57420f2a3669eb71b0e0c4997184 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 21 Jul 2014 18:49:16 -0700 Subject: seccomp: Allow arch code to provide seccomp_data populate_seccomp_data is expensive: it works by inspecting task_pt_regs and various other bits to piece together all the information, and it does so in multiple partially redundant steps. Arch-specific code in the syscall entry path can do much better. Admittedly this adds a bit of additional room for error, but the speedup should be worth it. Signed-off-by: Andy Lutomirski Signed-off-by: Kees Cook --- include/linux/seccomp.h | 2 +- kernel/seccomp.c | 32 +++++++++++++++++++------------- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 38851085e481..a19ddacdac30 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -39,7 +39,7 @@ static inline int secure_computing(void) #define SECCOMP_PHASE1_OK 0 #define SECCOMP_PHASE1_SKIP 1 -extern u32 seccomp_phase1(void); +extern u32 seccomp_phase1(struct seccomp_data *sd); int seccomp_phase2(u32 phase1_result); #else extern void secure_computing_strict(int this_syscall); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 6c8528ce9df9..1285cb205d49 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -173,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) * * Returns valid seccomp BPF response codes. */ -static u32 seccomp_run_filters(void) +static u32 seccomp_run_filters(struct seccomp_data *sd) { struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); - struct seccomp_data sd; + struct seccomp_data sd_local; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. 
*/ @@ -186,14 +186,17 @@ static u32 seccomp_run_filters(void) /* Make sure cross-thread synced filter points somewhere sane. */ smp_read_barrier_depends(); - populate_seccomp_data(&sd); + if (!sd) { + populate_seccomp_data(&sd_local); + sd = &sd_local; + } /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (; f; f = f->prev) { - u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); + u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; @@ -599,7 +602,7 @@ void secure_computing_strict(int this_syscall) #else int __secure_computing(void) { - u32 phase1_result = seccomp_phase1(); + u32 phase1_result = seccomp_phase1(NULL); if (likely(phase1_result == SECCOMP_PHASE1_OK)) return 0; @@ -610,7 +613,7 @@ int __secure_computing(void) } #ifdef CONFIG_SECCOMP_FILTER -static u32 __seccomp_phase1_filter(int this_syscall, struct pt_regs *regs) +static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd) { u32 filter_ret, action; int data; @@ -621,20 +624,20 @@ static u32 __seccomp_phase1_filter(int this_syscall, struct pt_regs *regs) */ rmb(); - filter_ret = seccomp_run_filters(); + filter_ret = seccomp_run_filters(sd); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION; switch (action) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, regs, + syscall_set_return_value(current, task_pt_regs(current), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, regs); + syscall_rollback(current, task_pt_regs(current)); /* Let the filter pass back 16 bits of data. 
*/ seccomp_send_sigsys(this_syscall, data); goto skip; @@ -661,11 +664,14 @@ skip: /** * seccomp_phase1() - run fast path seccomp checks on the current syscall + * @arg sd: The seccomp_data or NULL * * This only reads pt_regs via the syscall_xyz helpers. The only change * it will make to pt_regs is via syscall_set_return_value, and it will * only do that if it returns SECCOMP_PHASE1_SKIP. * + * If sd is provided, it will not read pt_regs at all. + * * It may also call do_exit or force a signal; these actions must be * safe. * @@ -679,11 +685,11 @@ skip: * If it returns anything else, then the return value should be passed * to seccomp_phase2 from a context in which ptrace hooks are safe. */ -u32 seccomp_phase1(void) +u32 seccomp_phase1(struct seccomp_data *sd) { int mode = current->seccomp.mode; - struct pt_regs *regs = task_pt_regs(current); - int this_syscall = syscall_get_nr(current, regs); + int this_syscall = sd ? sd->nr : + syscall_get_nr(current, task_pt_regs(current)); switch (mode) { case SECCOMP_MODE_STRICT: @@ -691,7 +697,7 @@ u32 seccomp_phase1(void) return SECCOMP_PHASE1_OK; #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: - return __seccomp_phase1_filter(this_syscall, regs); + return __seccomp_phase1_filter(this_syscall, sd); #endif default: BUG(); -- cgit v1.2.3 From 315427691c7a064718b5ad7d378d7f1c1898a626 Mon Sep 17 00:00:00 2001 From: Mark Rustad Date: Wed, 3 Sep 2014 03:17:24 -0700 Subject: locking/semaphore: Resolve some shadow warnings Resolve some shadow warnings resulting from using the name jiffies, which is a well-known global. This is not a problem of course, but it could be a trap for someone copying and pasting code, and it just makes W=2 a little cleaner. Signed-off-by: Mark Rustad Signed-off-by: Jeff Kirsher Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. 
McKenney Link: http://lkml.kernel.org/r/1409739444-13635-1-git-send-email-jeffrey.t.kirsher@intel.com Signed-off-by: Ingo Molnar --- kernel/locking/semaphore.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 6815171a4fff..b8120abe594b 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -36,7 +36,7 @@ static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); static noinline int __down_killable(struct semaphore *sem); -static noinline int __down_timeout(struct semaphore *sem, long jiffies); +static noinline int __down_timeout(struct semaphore *sem, long timeout); static noinline void __up(struct semaphore *sem); /** @@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock); /** * down_timeout - acquire the semaphore within a specified time * @sem: the semaphore to be acquired - * @jiffies: how long to wait before failing + * @timeout: how long to wait before failing * * Attempts to acquire the semaphore. If no more tasks are allowed to * acquire the semaphore, calling this function will put the task to sleep. * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. 
*/ -int down_timeout(struct semaphore *sem, long jiffies) +int down_timeout(struct semaphore *sem, long timeout) { unsigned long flags; int result = 0; @@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies) if (likely(sem->count > 0)) sem->count--; else - result = __down_timeout(sem, jiffies); + result = __down_timeout(sem, timeout); raw_spin_unlock_irqrestore(&sem->lock, flags); return result; @@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem) return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); } -static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) { - return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); + return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout); } static noinline void __sched __up(struct semaphore *sem) -- cgit v1.2.3 From a4189487da1b4f8260c6006b9dc47c3c4107a5ae Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 4 Sep 2014 14:43:07 +0800 Subject: cgroup: delay the clearing of cgrp->kn->priv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run these two scripts concurrently: for ((; ;)) { mkdir /cgroup/sub rmdir /cgroup/sub } for ((; ;)) { echo $$ > /cgroup/sub/cgroup.procs echo $$ > /cgroup/cgroup.procs } A kernel bug will be triggered: BUG: unable to handle kernel NULL pointer dereference at 00000038 IP: [] cgroup_put+0x9/0x80 ... Call Trace: [] cgroup_kn_unlock+0x39/0x50 [] cgroup_kn_lock_live+0x61/0x70 [] __cgroup_procs_write.isra.26+0x51/0x230 [] cgroup_tasks_write+0x12/0x20 [] cgroup_file_write+0x40/0x130 [] kernfs_fop_write+0xd1/0x160 [] vfs_write+0x98/0x1e0 [] SyS_write+0x4d/0xa0 [] sysenter_do_call+0x12/0x12 We clear cgrp->kn->priv in the end of cgroup_rmdir(), but another concurrent thread can access kn->priv after the clearing. We should move the clearing to css_release_work_fn(). 
At that time no one is holding reference to the cgroup and no one can gain a new reference to access it. v2: - move RCU_INIT_POINTER() into the else block. (Tejun) - remove the cgroup_parent() check. (Tejun) - update the comment in css_tryget_online_from_dir(). Cc: # 3.15+ Reported-by: Toralf Förster Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 50b94113f4f7..bf30076664ca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4396,6 +4396,15 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; + + /* + * There are two control paths which try to determine + * cgroup from dentry without going through kernfs - + * cgroupstats_build() and css_tryget_online_from_dir(). + * Those are supported by RCU protecting clearing of + * cgrp->kn->priv backpointer. + */ + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); } mutex_unlock(&cgroup_mutex); @@ -4834,16 +4843,6 @@ static int cgroup_rmdir(struct kernfs_node *kn) cgroup_kn_unlock(kn); - /* - * There are two control paths which try to determine cgroup from - * dentry without going through kernfs - cgroupstats_build() and - * css_tryget_online_from_dir(). Those are supported by RCU - * protecting clearing of cgrp->kn->priv backpointer, which should - * happen after all files under it have been removed. - */ - if (!ret) - RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL); - cgroup_put(cgrp); return ret; } @@ -5430,7 +5429,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, /* * This path doesn't originate from kernfs and @kn could already * have been or be removed at any point. @kn->priv is RCU - * protected for this access. See cgroup_rmdir() for details. + * protected for this access. 
See css_release_work_fn() for details. */ cgrp = rcu_dereference(kn->priv); if (cgrp) -- cgit v1.2.3 From aa32362f011c6e863132b16c1761487166a4bad2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 4 Sep 2014 14:43:38 +0800 Subject: cgroup: check cgroup liveliness before unbreaking kernfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cgroup_kn_lock_live() is called through some kernfs operation and another thread is calling cgroup_rmdir(), we'll trigger the warning in cgroup_get(). ------------[ cut here ]------------ WARNING: CPU: 1 PID: 1228 at kernel/cgroup.c:1034 cgroup_get+0x89/0xa0() ... Call Trace: [] dump_stack+0x41/0x52 [] warn_slowpath_common+0x7f/0xa0 [] warn_slowpath_null+0x1d/0x20 [] cgroup_get+0x89/0xa0 [] cgroup_kn_lock_live+0x28/0x70 [] __cgroup_procs_write.isra.26+0x51/0x230 [] cgroup_tasks_write+0x12/0x20 [] cgroup_file_write+0x40/0x130 [] kernfs_fop_write+0xd1/0x160 [] vfs_write+0x98/0x1e0 [] SyS_write+0x4d/0xa0 [] sysenter_do_call+0x12/0x12 ---[ end trace 6f2e0c38c2108a74 ]--- Fix this by calling css_tryget() instead of cgroup_get(). v2: - move cgroup_tryget() right below cgroup_get() definition. (Tejun) Cc: # 3.15+ Reported-by: Toralf Förster Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bf30076664ca..940aced4ed00 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1035,6 +1035,11 @@ static void cgroup_get(struct cgroup *cgrp) css_get(&cgrp->self); } +static bool cgroup_tryget(struct cgroup *cgrp) +{ + return css_tryget(&cgrp->self); +} + static void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); @@ -1147,7 +1152,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) * protection against removal. Ensure @cgrp stays accessible and * break the active_ref protection. 
*/ - cgroup_get(cgrp); + if (!cgroup_tryget(cgrp)) + return NULL; kernfs_break_active_protection(kn); mutex_lock(&cgroup_mutex); -- cgit v1.2.3 From 40bea039593dfc7f3f9814dab844f6db43ae580b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Aug 2014 18:50:16 +0200 Subject: nohz: Restore NMI safe local irq work for local nohz kick The local nohz kick is currently used by perf which needs it to be NMI-safe. Recent commit though (7d1311b93e58ed55f3a31cc8f94c4b8fe988a2b9) changed its implementation to fire the local kick using the remote kick API. It was convenient to make the code more generic but the remote kick isn't NMI-safe. As a result: WARNING: CPU: 3 PID: 18062 at kernel/irq_work.c:72 irq_work_queue_on+0x11e/0x140() CPU: 3 PID: 18062 Comm: trinity-subchil Not tainted 3.16.0+ #34 0000000000000009 00000000903774d1 ffff880244e06c00 ffffffff9a7f1e37 0000000000000000 ffff880244e06c38 ffffffff9a0791dd ffff880244fce180 0000000000000003 ffff880244e06d58 ffff880244e06ef8 0000000000000000 Call Trace: [] dump_stack+0x4e/0x7a [] warn_slowpath_common+0x7d/0xa0 [] warn_slowpath_null+0x1a/0x20 [] irq_work_queue_on+0x11e/0x140 [] tick_nohz_full_kick_cpu+0x57/0x90 [] __perf_event_overflow+0x275/0x350 [] ? perf_event_task_disable+0xa0/0xa0 [] ? x86_perf_event_set_period+0xbf/0x150 [] perf_event_overflow+0x14/0x20 [] intel_pmu_handle_irq+0x206/0x410 [] ? arch_vtime_task_switch+0x63/0x130 [] perf_event_nmi_handler+0x2b/0x50 [] nmi_handle+0xd2/0x390 [] ? nmi_handle+0x5/0x390 [] ? lock_release+0xab/0x330 [] default_do_nmi+0x72/0x1c0 [] ? cpuacct_account_field+0xcf/0x200 [] do_nmi+0xb8/0x100 Let's fix this by restoring the use of local irq work for the nohz local kick. 
Reported-by: Catalin Iacob Reported-and-tested-by: Dave Jones Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- include/linux/tick.h | 7 +------ kernel/time/tick-sched.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 059052306831..9a82c7dc3fdd 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -183,13 +183,8 @@ static inline bool tick_nohz_full_cpu(int cpu) extern void tick_nohz_init(void); extern void __tick_nohz_full_check(void); +extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); - -static inline void tick_nohz_full_kick(void) -{ - tick_nohz_full_kick_cpu(smp_processor_id()); -} - extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(struct task_struct *tsk); #else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99aa6ee3908f..f654a8a298fa 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -224,6 +224,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { .func = nohz_full_kick_work_func, }; +/* + * Kick this CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), + * is NMI safe. + */ +void tick_nohz_full_kick(void) +{ + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. 
-- cgit v1.2.3 From 8d38821cbcf51292cd5a23469d03bd38932a3ba9 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 1 Aug 2014 14:15:10 +0200 Subject: resources: Add device-managed request/release_resource() Provide device-managed implementations of the request_resource() and release_resource() functions. Upon failure to request a resource, the new devm_request_resource() function will output an error message for consistent error reporting. Signed-off-by: Thierry Reding Signed-off-by: Bjorn Helgaas Acked-by: Tejun Heo --- Documentation/driver-model/devres.txt | 2 + include/linux/ioport.h | 5 +++ kernel/resource.c | 70 +++++++++++++++++++++++++++++++++++ 3 files changed, 77 insertions(+) (limited to 'kernel') diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index d14710b04439..befc3fe12ba6 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -264,8 +264,10 @@ IIO IO region devm_release_mem_region() devm_release_region() + devm_release_resource() devm_request_mem_region() devm_request_region() + devm_request_resource() IOMAP devm_ioport_map() diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 142ec544167c..2c5250222278 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -215,6 +215,11 @@ static inline int __deprecated check_region(resource_size_t s, /* Wrappers for managed devices */ struct device; + +extern int devm_request_resource(struct device *dev, struct resource *root, + struct resource *new); +extern void devm_release_resource(struct device *dev, struct resource *new); + #define devm_request_region(dev,start,n,name) \ __devm_request_region(dev, &ioport_resource, (start), (n), (name)) #define devm_request_mem_region(dev,start,n,name) \ diff --git a/kernel/resource.c b/kernel/resource.c index da14b8d09296..ca24f19f9d18 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1248,6 +1248,76 @@ int release_mem_region_adjustable(struct 
resource *parent, /* * Managed region resource */ +static void devm_resource_release(struct device *dev, void *ptr) +{ + struct resource **r = ptr; + + release_resource(*r); +} + +/** + * devm_request_resource() - request and reserve an I/O or memory resource + * @dev: device for which to request the resource + * @root: root of the resource tree from which to request the resource + * @new: descriptor of the resource to request + * + * This is a device-managed version of request_resource(). There is usually + * no need to release resources requested by this function explicitly since + * that will be taken care of when the device is unbound from its driver. + * If for some reason the resource needs to be released explicitly, because + * of ordering issues for example, drivers must call devm_release_resource() + * rather than the regular release_resource(). + * + * When a conflict is detected between any existing resources and the newly + * requested resource, an error message will be printed. + * + * Returns 0 on success or a negative error code on failure. 
+ */ +int devm_request_resource(struct device *dev, struct resource *root, + struct resource *new) +{ + struct resource *conflict, **ptr; + + ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + *ptr = new; + + conflict = request_resource_conflict(root, new); + if (conflict) { + dev_err(dev, "resource collision: %pR conflicts with %s %pR\n", + new, conflict->name, conflict); + devres_free(ptr); + return -EBUSY; + } + + devres_add(dev, ptr); + return 0; +} +EXPORT_SYMBOL(devm_request_resource); + +static int devm_resource_match(struct device *dev, void *res, void *data) +{ + struct resource **ptr = res; + + return *ptr == data; +} + +/** + * devm_release_resource() - release a previously requested resource + * @dev: device for which to release the resource + * @new: descriptor of the resource to release + * + * Releases a resource previously requested using devm_request_resource(). + */ +void devm_release_resource(struct device *dev, struct resource *new) +{ + WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match, + new)); +} +EXPORT_SYMBOL(devm_release_resource); + struct region_devres { struct resource *parent; resource_size_t start; -- cgit v1.2.3 From 35b123e2f701b28977db2cde7dbbdb3fad28cad1 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Fri, 22 Aug 2014 17:50:43 +0300 Subject: sched/fair: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. 
The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Peter Zijlstra (Intel) Cc: paulmck@linux.vnet.ibm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140822145043.GA580@ada Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d3427a8f254b..02fc949eb348 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1780,7 +1780,7 @@ void task_numa_free(struct task_struct *p) list_del(&p->numa_entry); grp->nr_tasks--; spin_unlock_irqrestore(&grp->lock, flags); - rcu_assign_pointer(p->numa_group, NULL); + RCU_INIT_POINTER(p->numa_group, NULL); put_numa_group(grp); } -- cgit v1.2.3 From 60a3b2253c413cf601783b070507d7dd6620c954 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Sep 2014 22:53:44 +0200 Subject: net: bpf: make eBPF interpreter images read-only With eBPF getting more extended and exposure to user space is on its way, hardening the memory range the interpreter uses to steer its command flow seems appropriate. This patch moves the to be interpreted bytecode to read-only pages. In case we execute a corrupted BPF interpreter image for some reason e.g. caused by an attacker which got past a verifier stage, it would not only provide arbitrary read/write memory access but arbitrary function calls as well. After setting up the BPF interpreter image, its contents do not change until destruction time, thus we can setup the image on immutable made pages in order to mitigate modifications to that code. The idea is derived from commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks"). This is possible because bpf_prog is not part of sk_filter anymore. After setup bpf_prog cannot be altered during its life-time. This prevents any modifications to the entire bpf_prog structure (incl. 
function/JIT image pointer). Every eBPF program (including classic BPF that are migrated) have to call bpf_prog_select_runtime() to select either interpreter or a JIT image as a last setup step, and they all are being freed via bpf_prog_free(), including non-JIT. Therefore, we can easily integrate this into the eBPF life-time, plus since we directly allocate a bpf_prog, we have no performance penalty. Tested with seccomp and test_bpf testsuite in JIT/non-JIT mode and manual inspection of kernel_page_tables. Brad Spengler proposed the same idea via Twitter during development of this patch. Joint work with Hannes Frederic Sowa. Suggested-by: Brad Spengler Signed-off-by: Daniel Borkmann Signed-off-by: Hannes Frederic Sowa Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 3 +- arch/mips/net/bpf_jit.c | 3 +- arch/powerpc/net/bpf_jit_comp.c | 3 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 3 +- arch/x86/net/bpf_jit_comp.c | 18 ++++------ include/linux/filter.h | 49 ++++++++++++++++++++++--- kernel/bpf/core.c | 80 +++++++++++++++++++++++++++++++++++++++-- kernel/seccomp.c | 7 ++-- lib/test_bpf.c | 2 +- net/core/filter.c | 6 ++-- 11 files changed, 144 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index a37b989a2f91..a76623bcf722 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -930,5 +930,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 05a56619ece2..cfa83cf2447d 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1427,5 +1427,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git 
a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 3afa6f4c1957..40c53ff59124 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -697,5 +697,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 61e45b7c04d7..f2833c5b218a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -887,5 +887,5 @@ void bpf_jit_free(struct bpf_prog *fp) module_free(NULL, header); free_filter: - kfree(fp); + bpf_prog_unlock_free(fp); } diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index 1f76c22a6a75..f7a736b645e8 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -812,5 +812,6 @@ void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) module_free(NULL, fp->bpf_func); - kfree(fp); + + bpf_prog_unlock_free(fp); } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b08a98c59530..39ccfbb4a723 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -972,23 +972,17 @@ out: kfree(addrs); } -static void bpf_jit_free_deferred(struct work_struct *work) +void bpf_jit_free(struct bpf_prog *fp) { - struct bpf_prog *fp = container_of(work, struct bpf_prog, work); unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; struct bpf_binary_header *header = (void *)addr; + if (!fp->jited) + goto free_filter; + set_memory_rw(addr, header->pages); module_free(NULL, header); - kfree(fp); -} -void bpf_jit_free(struct bpf_prog *fp) -{ - if (fp->jited) { - INIT_WORK(&fp->work, bpf_jit_free_deferred); - schedule_work(&fp->work); - } else { - kfree(fp); - } +free_filter: + bpf_prog_unlock_free(fp); } diff --git a/include/linux/filter.h b/include/linux/filter.h index a5227ab8ccb1..c78994593355 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ 
-9,6 +9,11 @@ #include #include #include +#include + +struct sk_buff; +struct sock; +struct seccomp_data; /* Internally used and optimized filter representation with extended * instruction set based on top of classic BPF. @@ -320,20 +325,23 @@ struct sock_fprog_kern { struct sock_filter *filter; }; -struct sk_buff; -struct sock; -struct seccomp_data; +struct bpf_work_struct { + struct bpf_prog *prog; + struct work_struct work; +}; struct bpf_prog { + u32 pages; /* Number of allocated pages */ u32 jited:1, /* Is our filter JIT'ed? */ len:31; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_work_struct *work; /* Deferred free work struct */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); + /* Instructions for interpreter */ union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; - struct work_struct work; }; }; @@ -353,6 +361,26 @@ static inline unsigned int bpf_prog_size(unsigned int proglen) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ + set_memory_ro((unsigned long)fp, fp->pages); +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ + set_memory_rw((unsigned long)fp, fp->pages); +} +#else +static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +{ +} + +static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +{ +} +#endif /* CONFIG_DEBUG_SET_MODULE_RONX */ + int sk_filter(struct sock *sk, struct sk_buff *skb); void bpf_prog_select_runtime(struct bpf_prog *fp); @@ -361,6 +389,17 @@ void bpf_prog_free(struct bpf_prog *fp); int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_insn *new_prog, int *new_len); +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags); +void 
__bpf_prog_free(struct bpf_prog *fp); + +static inline void bpf_prog_unlock_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_ro(fp); + __bpf_prog_free(fp); +} + int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); void bpf_prog_destroy(struct bpf_prog *fp); @@ -450,7 +489,7 @@ static inline void bpf_jit_compile(struct bpf_prog *fp) static inline void bpf_jit_free(struct bpf_prog *fp) { - kfree(fp); + bpf_prog_unlock_free(fp); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7f0dbcbb34af..b54bb2c2e494 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -22,6 +22,7 @@ */ #include #include +#include #include /* Registers */ @@ -63,6 +64,67 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_work_struct *ws; + struct bpf_prog *fp; + + size = round_up(size, PAGE_SIZE); + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp == NULL) + return NULL; + + ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); + if (ws == NULL) { + vfree(fp); + return NULL; + } + + fp->pages = size / PAGE_SIZE; + fp->work = ws; + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_alloc); + +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + BUG_ON(fp_old == NULL); + + size = round_up(size, PAGE_SIZE); + if (size <= fp_old->pages * PAGE_SIZE) + return fp_old; + + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); + fp->pages = size / PAGE_SIZE; + + /* We keep fp->work from fp_old around in the new + * reallocated structure. 
+ */ + fp_old->work = NULL; + __bpf_prog_free(fp_old); + } + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_realloc); + +void __bpf_prog_free(struct bpf_prog *fp) +{ + kfree(fp->work); + vfree(fp); +} +EXPORT_SYMBOL_GPL(__bpf_prog_free); + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. @@ -523,12 +585,26 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) /* Probe if internal BPF can be JITed */ bpf_int_jit_compile(fp); + /* Lock whole bpf_prog as read-only */ + bpf_prog_lock_ro(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); -/* free internal BPF program */ +static void bpf_prog_free_deferred(struct work_struct *work) +{ + struct bpf_work_struct *ws; + + ws = container_of(work, struct bpf_work_struct, work); + bpf_jit_free(ws->prog); +} + +/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - bpf_jit_free(fp); + struct bpf_work_struct *ws = fp->work; + + INIT_WORK(&ws->work, bpf_prog_free_deferred); + ws->prog = fp; + schedule_work(&ws->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 44eb005c6695..84922befea84 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -395,16 +395,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) if (!filter) goto free_prog; - filter->prog = kzalloc(bpf_prog_size(new_len), - GFP_KERNEL|__GFP_NOWARN); + filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); if (!filter->prog) goto free_filter; ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); if (ret) goto free_filter_prog; - kfree(fp); + kfree(fp); atomic_set(&filter->usage, 1); filter->prog->len = new_len; @@ -413,7 +412,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return filter; free_filter_prog: - kfree(filter->prog); + __bpf_prog_free(filter->prog); 
free_filter: kfree(filter); free_prog: diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 8c66c6aace04..9a67456ba29a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1836,7 +1836,7 @@ static struct bpf_prog *generate_filter(int which, int *err) break; case INTERNAL: - fp = kzalloc(bpf_prog_size(flen), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(flen), 0); if (fp == NULL) { pr_cont("UNEXPECTED_FAIL no memory left\n"); *err = -ENOMEM; diff --git a/net/core/filter.c b/net/core/filter.c index d814b8a89d0f..37f8eb06fdee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -933,7 +933,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) /* Expand fp for appending the new filter representation. */ old_fp = fp; - fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL); + fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. @@ -1013,7 +1013,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) if (fprog->filter == NULL) return -EINVAL; - fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL); + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; @@ -1069,7 +1069,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (fprog->filter == NULL) return -EINVAL; - prog = kmalloc(bpf_fsize, GFP_KERNEL); + prog = bpf_prog_alloc(bpf_fsize, 0); if (!prog) return -ENOMEM; -- cgit v1.2.3 From 849151dd5481bc8acb1d287a299b5d6a4ca9f1c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2014 12:18:07 +0200 Subject: compat: nanosleep: Clarify error handling The error handling in compat_sys_nanosleep() is correct, but completely non obvious. Document it and restrict it to the -ERESTART_RESTARTBLOCK return value for clarity. 
Reported-by: Kees Cook Signed-off-by: Thomas Gleixner --- kernel/compat.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 633394f442f8..ebb3c369d03d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) ret = hrtimer_nanosleep_restart(restart); set_fs(oldfs); - if (ret) { + if (ret == -ERESTART_RESTARTBLOCK) { rmtp = restart->nanosleep.compat_rmtp; if (rmtp && compat_put_timespec(&rmt, rmtp)) @@ -256,7 +256,26 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); - if (ret) { + /* + * hrtimer_nanosleep() can only return 0 or + * -ERESTART_RESTARTBLOCK here because: + * + * - we call it with HRTIMER_MODE_REL and therefor exclude the + * -ERESTARTNOHAND return path. + * + * - we supply the rmtp argument from the task stack (due to + * the necessary compat conversion. So the update cannot + * fail, which excludes the -EFAULT return path as well. If + * it fails nevertheless we have a bigger problem and wont + * reach this place anymore. + * + * - if the return value is 0, we do not have to update rmtp + * because there is no remaining time. + * + * We check for -ERESTART_RESTARTBLOCK nevertheless if the + * core implementation decides to return random nonsense. 
+ */ + if (ret == -ERESTART_RESTARTBLOCK) { struct restart_block *restart = &current_thread_info()->restart_block; @@ -266,7 +285,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } - return ret; } -- cgit v1.2.3 From 9bf2419fa7bffa16ce58a4d5c20399eff8c970c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2014 12:24:49 +0200 Subject: timekeeping: Update timekeeper before updating vsyscall and pvclock The update_walltime() code works on the shadow timekeeper to make the seqcount protected region as short as possible. But that update to the shadow timekeeper does not update all timekeeper fields because it's sufficient to do that once before it becomes live. One of these fields is tkr.base_mono. That stays stale in the shadow timekeeper unless an operation happens which copies the real timekeeper to the shadow. The update function is called after the update calls to vsyscall and pvclock. While not correct, it did not cause any problems because none of the invoked update functions used base_mono. commit cbcf2dd3b3d4 (x86: kvm: Make kvm_get_time_and_clockread() nanoseconds based) changed that in the kvm pvclock update function, so the stale base_mono value got used and caused kvm-clock to malfunction. Put the update where it belongs and fix the issue. 
Reported-by: Chris J Arges Reported-by: Paolo Bonzini Cc: Gleb Natapov Cc: John Stultz Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1409050000570.3333@nanos Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb4a9c2cf8d9..ec1791fae965 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -442,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) tk->ntp_error = 0; ntp_clear(); } - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); tk_update_ktime_data(tk); + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); -- cgit v1.2.3 From 177ef2a6315ea7bf173653182324e1dcd08ffeaa Mon Sep 17 00:00:00 2001 From: "xiaofeng.yan" Date: Tue, 26 Aug 2014 03:15:41 +0000 Subject: sched/deadline: Fix a precision problem in the microseconds range An overrun could happen in function start_hrtick_dl() when a task with SCHED_DEADLINE runs in the microseconds range. For example, if a task with SCHED_DEADLINE has the following parameters: Task runtime deadline period P1 200us 500us 500us The deadline and period from task P1 are less than 1ms. In order to achieve microsecond precision, we need to enable HRTICK feature by the next command: PC#echo "HRTICK" > /sys/kernel/debug/sched_features PC#trace-cmd record -e sched_switch & PC#./schedtool -E -t 200000:500000:500000 -e ./test The binary test is in an endless while(1) loop here. 
Some pieces of trace.dat are as follows: -0 157.603157: sched_switch: :R ==> 2481:4294967295: test test-2481 157.603203: sched_switch: 2481:R ==> 0:120: swapper/2 -0 157.605657: sched_switch: :R ==> 2481:4294967295: test test-2481 157.608183: sched_switch: 2481:R ==> 2483:120: trace-cmd trace-cmd-2483 157.609656: sched_switch:2483:R==>2481:4294967295: test We can get the runtime of P1 from the information above: runtime = 157.608183 - 157.605657 runtime = 0.002526(2.526ms) The correct runtime should be less than or equal to 200us at some point. The problem is caused by the conditional check "delta > 10000" in function start_hrtick_dl(): no hrtimer is started to control the rest of the runtime when the rest of the runtime is less than 10us, so the process continues to run until the next tick period arrives. Move the code with the limit of the least time slice from hrtick_start_fair() to hrtick_start() because the EDF schedule class also needs this function in start_hrtick_dl(). To fix this problem, we call hrtick_start() unconditionally in start_hrtick_dl(), and make sure the scheduling slice won't be smaller than 10us in hrtick_start(). Signed-off-by: Xiaofeng Yan Reviewed-by: Li Zefan Acked-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409022941-5880-1-git-send-email-xiaofeng.yan@huawei.com [ Massaged the changelog and the code. 
] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++++- kernel/sched/deadline.c | 5 +---- kernel/sched/fair.c | 8 -------- 3 files changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a773c919d88d..8d00f4a8c126 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -455,7 +455,15 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + ktime_t time; + s64 delta; + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense and can cause timer DoS. + */ + delta = max_t(s64, delay, 10000LL); + time = ktime_add_ns(timer->base->get_time(), delta); hrtimer_set_expires(timer, time); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d21a8e0259d2..cc4eb89019c1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -997,10 +997,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SCHED_HRTICK static void start_hrtick_dl(struct rq *rq, struct task_struct *p) { - s64 delta = p->dl.dl_runtime - p->dl.runtime; - - if (delta > 10000) - hrtick_start(rq, p->dl.runtime); + hrtick_start(rq, p->dl.runtime); } #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02fc949eb348..50d2025c1777 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3897,14 +3897,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) resched_curr(rq); return; } - - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. 
- */ - if (rq->curr != p) - delta = max_t(s64, 10000LL, delta); - hrtick_start(rq, delta); } } -- cgit v1.2.3 From 4de376a1b14e32f550931274f06b571abc0f3d4b Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 8 Jul 2014 17:46:50 -0400 Subject: rcu: Remove remaining read-modify-write ACCESS_ONCE() calls Change the remaining uses of ACCESS_ONCE() so that each ACCESS_ONCE() either does a load or a store, but not both. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++++-- kernel/rcu/tree_plugin.h | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1b70cb6fbe3c..4b526ca46801 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1684,7 +1684,8 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); - ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) = + ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } return fqs_state; @@ -2505,7 +2506,8 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } - ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; + ACCESS_ONCE(rsp->gp_flags) = + ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a7997e272564..218fae30c380 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -897,7 +897,8 @@ void synchronize_rcu_expedited(void) /* Clean up and exit. */ smp_mb(); /* ensure expedited GP seen before counter increment. 
*/ - ACCESS_ONCE(sync_rcu_preempt_exp_count)++; + ACCESS_ONCE(sync_rcu_preempt_exp_count) = + sync_rcu_preempt_exp_count + 1; unlock_mb_ret: mutex_unlock(&sync_rcu_preempt_exp_mutex); mb_ret: @@ -2428,8 +2429,9 @@ static int rcu_nocb_kthread(void *arg) list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); - ACCESS_ONCE(rdp->nocb_p_count) -= c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; + ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) = + rdp->nocb_p_count_lazy - cl; rdp->n_nocbs_invoked += c; } return 0; -- cgit v1.2.3 From bf33eb1aef23e8049cd222471d35b0988c420b18 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 8 Jul 2014 18:26:10 -0400 Subject: rcu: Fix sparse warning about rcu_batches_completed_preempt() being non-static fix sparse warning about rcu_batches_completed_preempt() being non-static by marking it as static Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 218fae30c380..5defa2f089af 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -134,7 +134,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -long rcu_batches_completed_preempt(void) +static long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } -- cgit v1.2.3 From f534ed1fd71cea885a59255d9b44c3b17df03eb1 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 8 Jul 2014 18:26:11 -0400 Subject: rcu: Use bool type for return value in rcu_is_watching() Use a bool type for return in rcu_is_watching(). Signed-off-by: Pranith Kumar Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4b526ca46801..253ea55dc508 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -819,7 +819,7 @@ bool notrace __rcu_is_watching(void) */ bool notrace rcu_is_watching(void) { - int ret; + bool ret; preempt_disable(); ret = __rcu_is_watching(); -- cgit v1.2.3 From d0bc90fd37e50e4ea22c51c26947fd78c2a7a6c2 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 8 Jul 2014 18:26:13 -0400 Subject: rcu: Return bool type for rcu_try_advance_all_cbs() Return a bool type instead of 0 in rcu_try_advance_all_cbs(). Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5defa2f089af..bb564560aeb8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1626,7 +1626,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) /* Exit early if we advanced recently. */ if (jiffies == rdtp->last_advance_all) - return 0; + return false; rdtp->last_advance_all = jiffies; for_each_rcu_flavor(rsp) { -- cgit v1.2.3 From e02b2edfa13878c6671d31d5c736f56f89d99bf1 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 9 Jul 2014 00:08:17 -0400 Subject: rcu: Use true/false instead of 1/0 for a bool type This commit uses true/false instead of 1/0 for bool types in rcu_gp_fqs() and force_qs_rnp(). Signed-off-by: Pranith Kumar Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 253ea55dc508..2719978ea018 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1668,7 +1668,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ if (is_sysidle_rcu_state(rsp)) { - isidle = 1; + isidle = true; maxj = jiffies - ULONG_MAX / 4; } force_qs_rnp(rsp, dyntick_save_progress_counter, @@ -1677,7 +1677,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ - isidle = 0; + isidle = false; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ @@ -2450,7 +2450,7 @@ static void force_qs_rnp(struct rcu_state *rsp, for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { if ((rnp->qsmask & bit) != 0) { if ((rnp->qsmaskinit & bit) != 0) - *isidle = 0; + *isidle = false; if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; } -- cgit v1.2.3 From 85b39d305bfe809a11ff2770d380be3e2465beec Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 8 Jul 2014 15:17:59 -0700 Subject: rcu: Uninline rcu_read_lock_held() This commit uninlines rcu_read_lock_held(). According to "size vmlinux" this saves 28549 in .text: - 5541731 3014560 14757888 23314179 + 5513182 3026848 14757888 23297918 Note: it looks as if the data grows by 12288 bytes but this is not true, it does not actually grow. But .data starts with ALIGN(THREAD_SIZE) and since .text shrinks the padding grows, and thus .data grows too as it seen by /bin/size. 
diff System.map: - ffffffff81510000 D _sdata - ffffffff81510000 D init_thread_union + ffffffff81509000 D _sdata + ffffffff8150c000 D init_thread_union Perhaps we can change vmlinux.lds.S to .data itself, so that /bin/size can't "wrongly" report that .data grows if .text shrinks. Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 36 +----------------------------------- kernel/rcu/update.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 7e47e44bce03..321ed0d4e675 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -371,41 +371,7 @@ extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; int debug_lockdep_rcu_enabled(void); -/** - * rcu_read_lock_held() - might we be in RCU read-side critical section? - * - * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU - * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, - * this assumes we are in an RCU read-side critical section unless it can - * prove otherwise. This is useful for debug checks in functions that - * require that they be called within an RCU read-side critical section. - * - * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot - * and while lockdep is disabled. - * - * Note that rcu_read_lock() and the matching rcu_read_unlock() must - * occur in the same context, for example, it is illegal to invoke - * rcu_read_unlock() in process context if the matching rcu_read_lock() - * was invoked from within an irq handler. - * - * Note that rcu_read_lock() is disallowed if the CPU is either idle or - * offline from an RCU perspective, so check for those as well. 
- */ -static inline int rcu_read_lock_held(void) -{ - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; - return lock_is_held(&rcu_lock_map); -} - -/* - * rcu_read_lock_bh_held() is defined out of line to avoid #include-file - * hell. - */ +int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); /** diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4056d7992a6c..ea8ea7b16e11 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -136,6 +136,38 @@ int notrace debug_lockdep_rcu_enabled(void) } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); +/** + * rcu_read_lock_held() - might we be in RCU read-side critical section? + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU + * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, + * this assumes we are in an RCU read-side critical section unless it can + * prove otherwise. This is useful for debug checks in functions that + * require that they be called within an RCU read-side critical section. + * + * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * + * Note that rcu_read_lock() and the matching rcu_read_unlock() must + * occur in the same context, for example, it is illegal to invoke + * rcu_read_unlock() in process context if the matching rcu_read_lock() + * was invoked from within an irq handler. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. + */ +int rcu_read_lock_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + return lock_is_held(&rcu_lock_map); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_held); + /** * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? 
* -- cgit v1.2.3 From a8a29b3b7b18251c4e3ffce501f25ae868302a75 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 12 Jul 2014 19:01:49 +0200 Subject: rcu: Define tracepoint strings only if CONFIG_TRACING is set Commit f7f7bac9cb1c ("rcu: Have the RCU tracepoints use the tracepoint_string infrastructure") unconditionally populates the __tracepoint_str input section, but this section is not assigned an output section if CONFIG_TRACING is not set. This results in the __tracepoint_str turning up in unexpected places, i.e., after _edata. Signed-off-by: Ard Biesheuvel Reviewed-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2719978ea018..dc52dc3b8c3e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,9 +79,18 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; * the tracing userspace tools to be able to decipher the string * address to the matching string. 
*/ -#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +#ifdef CONFIG_TRACING +# define DEFINE_RCU_TPS(sname) \ static char sname##_varname[] = #sname; \ -static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ +static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; +# define RCU_STATE_NAME(sname) sname##_varname +#else +# define DEFINE_RCU_TPS(sname) +# define RCU_STATE_NAME(sname) __stringify(sname) +#endif + +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +DEFINE_RCU_TPS(sname) \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ @@ -93,7 +102,7 @@ struct rcu_state sname##_state = { \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ - .name = sname##_varname, \ + .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ }; \ DEFINE_PER_CPU(struct rcu_data, sname##_data) -- cgit v1.2.3 From fafb6e843f229a6e842a22773f16d93194ca06e4 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 15 Jul 2014 18:31:47 -0400 Subject: rcu: Update tiny.c references to tree.c This commit updates the references to rcutree.c which is now rcu/tree.c Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d9efcc13008c..6bd785c34add 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -51,7 +51,7 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; #include "tiny_plugin.h" -/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ +/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. 
*/ static void rcu_idle_enter_common(long long newval) { if (newval) { @@ -114,7 +114,7 @@ void rcu_irq_exit(void) } EXPORT_SYMBOL_GPL(rcu_irq_exit); -/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ +/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ static void rcu_idle_exit_common(long long oldval) { if (oldval) { -- cgit v1.2.3 From 66d701ea7e148f8ed8b1497c9159fbf6175d462f Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 16 Jul 2014 22:20:33 -0400 Subject: rcu: Remove stale comment in tree.c This commit removes a stale comment in rcu/tree.c which was left out when some code was moved around previously in commit 2036d94a7b61 ("rcu: Rework detection of use of RCU by offline CPUs") For reference, the following updated comment exists a few lines below this which means the same: /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ Signed-off-by: Pranith Kumar Reviewed-by: Josh Triplett Reviewed-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dc52dc3b8c3e..dd6c8b519691 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2220,8 +2220,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ - /* Exclude any attempts to start a new grace period. */ mutex_lock(&rsp->onoff_mutex); raw_spin_lock_irqsave(&rsp->orphan_lock, flags); -- cgit v1.2.3 From 9fdd3bc9005824704f9802bec7b3e06f5edae434 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 29 Jul 2014 14:50:47 -0700 Subject: rcu: Break more call_rcu() deadlock involving scheduler and perf Commit 96d3fd0d315a9 (rcu: Break call_rcu() deadlock involving scheduler and perf) covered the case where __call_rcu_nocb_enqueue() needs to wake the rcuo kthread due to the queue being initially empty, but did not do anything for the case where the queue was overflowing. This commit therefore also defers wakeup for the overflow case. Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 3 +++ kernel/rcu/tree.h | 9 +++++++-- kernel/rcu/tree_plugin.h | 26 ++++++++++++++++++-------- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index aca382266411..9b56f37148cf 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -180,9 +180,12 @@ TRACE_EVENT(rcu_grace_period_init, * argument is a string as follows: * * "WakeEmpty": Wake rcuo kthread, first CB to empty list. + * "WakeEmptyIsDeferred": Wake rcuo kthread later, first CB to empty list. * "WakeOvf": Wake rcuo kthread, CB list is huge. + * "WakeOvfIsDeferred": Wake rcuo kthread later, CB list is huge. * "WakeNot": Don't wake rcuo kthread. * "WakeNotPoll": Don't wake rcuo kthread because it is polling. + * "DeferredWake": Carried out the "IsDeferred" wakeup. * "Poll": Start of new polling cycle for rcu_nocb_poll. * "Sleep": Sleep waiting for CBs for !rcu_nocb_poll. * "WokeEmpty": rcuo kthread woke to find empty list. diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a86eb7bac45..e33562f2a655 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -350,7 +350,7 @@ struct rcu_data { int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; - bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. 
*/ /* The following fields are used by the leader, hence own cacheline. */ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; @@ -383,6 +383,11 @@ struct rcu_data { #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +/* Values for nocb_defer_wakeup field in struct rcu_data. */ +#define RCU_NOGP_WAKE_NOT 0 +#define RCU_NOGP_WAKE 1 +#define RCU_NOGP_WAKE_FORCE 2 + #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) /* For jiffies_till_first_fqs and */ /* and jiffies_till_next_fqs. */ @@ -589,7 +594,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags); -static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bb564560aeb8..d67cc5c375c5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2121,16 +2121,23 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - rdp->nocb_defer_wakeup = true; + rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { /* ... or if many callbacks queued. 
*/ - wake_nocb_leader(rdp, true); + if (!irqs_disabled_flags(flags)) { + wake_nocb_leader(rdp, true); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeOvf")); + } else { + rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeOvfIsDeferred")); + } rdp->qlen_last_fqs_check = LONG_MAX / 2; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); } @@ -2438,7 +2445,7 @@ static int rcu_nocb_kthread(void *arg) } /* Is a deferred wakeup of rcu_nocb_kthread() required? */ -static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { return ACCESS_ONCE(rdp->nocb_defer_wakeup); } @@ -2446,11 +2453,14 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) /* Do a deferred wakeup of rcu_nocb_kthread(). */ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { + int ndw; + if (!rcu_nocb_need_deferred_wakeup(rdp)) return; - ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; - wake_nocb_leader(rdp, false); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); + ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup); + ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT; + wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } /* Initialize per-rcu_data variables for no-CBs CPUs. */ @@ -2557,7 +2567,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } -static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { return false; } -- cgit v1.2.3 From ade9862470dd0595d8e292ecea8445ed90b98df5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 31 Jul 2014 16:02:33 -0700 Subject: rcu: Make TINY_RCU tinier by putting error checks under #ifdef The rcu_idle_enter_common() and rcu_idle_exit_common() functions contain error checks that to the best of my knowledge have never triggered over the past several years. These are nevertheless valuable when creating new architectures or doing other low-level changes, so the checks should not be deleted. This commit instead places these checks under #ifdef CONFIG_RCU_TRACE so that they are executed only when specifically requested. The savings are significant: Before: text data bss dec hex filename 1749 39 0 1788 6fc /tmp/b/kernel/rcu/tiny.o 632 152 0 784 310 /tmp/b/kernel/rcu/update.o ---- 2572 After: text data bss dec hex filename 1281 37 0 1318 526 /tmp/b/kernel/rcu/tiny.o 632 152 0 784 310 /tmp/b/kernel/rcu/update.o ---- 2102 This amounts to 470 bytes, or 18% of the original. Switched from #ifdef to IS_ENABLED() on Josh Triplett's advice. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tiny.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 6bd785c34add..4a55a2416e3c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -62,7 +62,7 @@ static void rcu_idle_enter_common(long long newval) } RCU_TRACE(trace_rcu_dyntick(TPS("Start"), rcu_dynticks_nesting, newval)); - if (!is_idle_task(current)) { + if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), @@ -123,7 +123,7 @@ static void rcu_idle_exit_common(long long oldval) return; } RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); - if (!is_idle_task(current)) { + if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); 
RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), -- cgit v1.2.3 From 2aa792e6faf1a00f5accf1f69e87e11a390ba2cd Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 12 Aug 2014 13:07:47 -0400 Subject: rcu: Use rcu_gp_kthread_wake() to wake up grace period kthreads The rcu_gp_kthread_wake() function checks for three conditions before waking up grace period kthreads: * Is the thread we are trying to wake up the current thread? * Are the gp_flags zero? (all threads wait on non-zero gp_flags condition) * Is there no thread created for this flavour, hence nothing to wake up? If any one of these conditions is true, we do not call wake_up(). It was found that there are quite a few avoidable wake ups both during idle time and under stress induced by rcutorture. Idle: Total:66000, unnecessary:66000, case1:61827, case2:66000, case3:0 Total:68000, unnecessary:68000, case1:63696, case2:68000, case3:0 rcutorture: Total:254000, unnecessary:254000, case1:199913, case2:254000, case3:0 Total:256000, unnecessary:256000, case1:201784, case2:256000, case3:0 Here case{1-3} are the cases listed above. We can avoid these wake ups by using rcu_gp_kthread_wake() to conditionally wake up the grace period kthreads. There is a comment about an implied barrier supplied by the wake_up() logic. This barrier is necessary for the awakened thread to see the updated ->gp_flags. This flag is always being updated with the root node lock held. Also, the awakened thread tries to acquire the root node lock before reading ->gp_flags because of which there is proper ordering. Hence this commit tries to avoid calling wake_up() whenever we can by using rcu_gp_kthread_wake() function. Signed-off-by: Pranith Kumar CC: Mathieu Desnoyers Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd6c8b519691..9e83cd9a32f1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1938,7 +1938,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* @@ -2516,7 +2516,7 @@ static void force_quiescent_state(struct rcu_state *rsp) ACCESS_ONCE(rsp->gp_flags) = ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ + rcu_gp_kthread_wake(rsp); } /* -- cgit v1.2.3 From 73a860cd58a1eb258e889b615cebf738ab33aa23 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Aug 2014 10:28:23 -0700 Subject: rcu: Replace flush_signals() with WARN_ON(signal_pending()) Currently, when RCU awakens from a wait_event_interruptible() that might have awakened prematurely, it does a flush_signals(). This is done on the off-chance that someone figured out how to deliver a signal to a kthread, which is supposed to be impossible. Given that this is supposed to be impossible, this commit changes the flush_signals() calls into WARN_ON(signal_pending()). Reported-by: Oleg Nesterov Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_plugin.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9e83cd9a32f1..3e002c1cb441 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1796,7 +1796,7 @@ static int __noreturn rcu_gp_kthread(void *arg) if (rcu_gp_init(rsp)) break; cond_resched(); - flush_signals(current); + WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("reqwaitsig")); @@ -1842,7 +1842,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } else { /* Deal with stray signal. */ cond_resched(); - flush_signals(current); + WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqswaitsig")); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d67cc5c375c5..bbb0a0cd091b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2237,7 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); if (likely(d)) break; - flush_signals(current); + WARN_ON(signal_pending(current)); trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); } trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); @@ -2296,7 +2296,7 @@ wait_again: if (!rcu_nocb_poll) trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "WokeEmpty"); - flush_signals(current); + WARN_ON(signal_pending(current)); schedule_timeout_interruptible(1); /* Rescan in case we were a victim of memory ordering. 
*/ @@ -2375,7 +2375,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeEmpty"); - flush_signals(current); + WARN_ON(signal_pending(current)); schedule_timeout_interruptible(1); } } -- cgit v1.2.3 From 58ade2dbe9a253635e0835adedfaa822849aa3a3 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 11 Jun 2014 16:39:43 -0400 Subject: rcutorture: Fix a sparse warning by marking boost_mutex static This commit fixes the following sparse warning by marking boost_mutex static: kernel/rcu/rcutorture.c:185:1: warning: symbol 'boost_mutex' was not declared. Should it be static? Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 948a7693748e..7e67711cbae8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -182,7 +182,7 @@ static u64 notrace rcu_trace_clock_local(void) #endif /* #else #ifdef CONFIG_RCU_TRACE */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ -DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ +static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ static bool barrier_phase; /* Test phase. */ -- cgit v1.2.3 From eea203fea3484598280a07fe503e025e886297fb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 14 Jul 2014 09:16:15 -0400 Subject: rcu: Use pr_alert/pr_cont for printing logs Use pr_alert/pr_cont for printing the logs from rcutorture module directly instead of writing it to a buffer and then printing it. This saves us from having to allocate such buffers. Also remove a resulting empty function. 
I tested this using the parse-torture.sh script as follows: $ dmesg | grep torture > log.txt $ bash parse-torture.sh log.txt test $ There were no warnings which means that parsing went fine. Signed-off-by: Joe Perches Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 2 +- kernel/rcu/rcutorture.c | 127 +++++++++++++++++++++--------------------------- kernel/torture.c | 16 +++--- 3 files changed, 64 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 5ca58fcbaf1b..fec46f8c08eb 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -51,7 +51,7 @@ /* Definitions for online/offline exerciser. */ int torture_onoff_init(long ooholdoff, long oointerval); -char *torture_onoff_stats(char *page); +void torture_onoff_stats(void); bool torture_onoff_failures(void); /* Low-rider random number generator. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7e67711cbae8..ff4f0c756dee 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -242,7 +242,7 @@ struct rcu_torture_ops { void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); - void (*stats)(char *page); + void (*stats)(void); int irq_capable; int can_boost; const char *name; @@ -525,21 +525,21 @@ static void srcu_torture_barrier(void) srcu_barrier(&srcu_ctl); } -static void srcu_torture_stats(char *page) +static void srcu_torture_stats(void) { int cpu; int idx = srcu_ctl.completed & 0x1; - page += sprintf(page, "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); + pr_alert("%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { long c0, c1; c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; - page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } - 
sprintf(page, "\n"); + pr_cont("\n"); } static void srcu_torture_synchronize_expedited(void) @@ -1031,10 +1031,15 @@ rcu_torture_reader(void *arg) } /* - * Create an RCU-torture statistics message in the specified buffer. + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). */ static void -rcu_torture_printk(char *page) +rcu_torture_stats_print(void) { int cpu; int i; @@ -1052,55 +1057,60 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free)); - page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); - page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", - n_rcu_torture_boost_failure, - n_rcu_torture_boosts, - n_rcu_torture_timers); - page = torture_onoff_stats(page); - page += sprintf(page, "barrier: %ld/%ld:%ld", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + pr_cont("rtmbe: 
%d rtbke: %ld rtbre: %ld ", + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror); + pr_cont("rtbf: %ld rtb: %ld nt: %ld ", + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, + n_rcu_torture_timers); + torture_onoff_stats(); + pr_cont("barrier: %ld/%ld:%ld\n", + n_barrier_successes, + n_barrier_attempts, + n_rcu_torture_barrier_error); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || n_rcu_torture_boost_failure != 0 || i > 1) { - page += sprintf(page, "!!! "); + pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); } - page += sprintf(page, "Reader Pipe: "); + pr_cont("Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", pipesummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Reader Batch: "); + pr_cont(" %ld", pipesummary[i]); + pr_cont("\n"); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("Reader Batch: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", batchsummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Free-Block Circulation: "); + pr_cont(" %ld", batchsummary[i]); + pr_cont("\n"); + + pr_alert("%s%s ", torture_type, TORTURE_FLAG); + pr_cont("Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - page += sprintf(page, " %d", - atomic_read(&rcu_torture_wcount[i])); + pr_cont(" %d", atomic_read(&rcu_torture_wcount[i])); } - page += sprintf(page, "\n"); + pr_cont("\n"); + if (cur_ops->stats) - cur_ops->stats(page); + cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { int __maybe_unused flags; @@ -1109,40 +1119,15 @@ rcu_torture_printk(char *page) 
rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - page += sprintf(page, - "??? Writer stall state %d g%lu c%lu f%#x\n", - rcu_torture_writer_state, - gpnum, completed, flags); + pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + rcu_torture_writer_state, + gpnum, completed, flags); show_rcu_gp_kthreads(); rcutorture_trace_dump(); } rtcv_snap = rcu_torture_current_version; } -/* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int size = nr_cpu_ids * 200 + 8192; - char *buf; - - buf = kmalloc(size, GFP_KERNEL); - if (!buf) { - pr_err("rcu-torture: Out of memory, need: %d", size); - return; - } - rcu_torture_printk(buf); - pr_alert("%s", buf); - kfree(buf); -} - /* * Periodically prints torture statistics, if periodic statistics printing * was specified via the stat_interval module parameter. diff --git a/kernel/torture.c b/kernel/torture.c index d600af21f022..ede8b25ec1ae 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup); /* * Print online/offline testing statistics. 
*/ -char *torture_onoff_stats(char *page) +void torture_onoff_stats(void) { #ifdef CONFIG_HOTPLUG_CPU - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); + pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return page; } EXPORT_SYMBOL_GPL(torture_onoff_stats); -- cgit v1.2.3 From 38706bc5a29a73645e512c06ffb759fb56259d83 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Aug 2014 21:12:17 -0700 Subject: rcutorture: Add callback-flood test Although RCU is designed to handle arbitrary floods of callbacks, this capability is not routinely tested. This commit therefore adds a cbflood capability in which kthreads repeatedly register large numbers of callbacks. One such kthread is created for each four CPUs (rounding up), and the test may be controlled by several cbflood_* kernel boot parameters, which control the number of bursts per flood, the number of callbacks per burst, the time between bursts, and the time between floods. The default values are large enough to exercise RCU's emergency responses to callback flooding. Signed-off-by: Paul E. McKenney Cc: David Miller Reviewed-by: Pranith Kumar --- Documentation/kernel-parameters.txt | 18 ++++++++ kernel/rcu/rcutorture.c | 86 ++++++++++++++++++++++++++++++++++++- 2 files changed, 103 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 5ae8608ca9f5..0a104be4ad86 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2881,6 +2881,24 @@ bytes respectively. 
Such letter suffixes can also be entirely omitted. Lazy RCU callbacks are those which RCU can prove do nothing more than free memory. + rcutorture.cbflood_inter_holdoff= [KNL] + Set holdoff time (jiffies) between successive + callback-flood tests. + + rcutorture.cbflood_intra_holdoff= [KNL] + Set holdoff time (jiffies) between successive + bursts of callbacks within a given callback-flood + test. + + rcutorture.cbflood_n_burst= [KNL] + Set the number of bursts making up a given + callback-flood test. Set this to zero to + disable callback-flood testing. + + rcutorture.cbflood_n_per_burst= [KNL] + Set the number of callbacks to be registered + in a given burst of a callback-flood test. + rcutorture.fqs_duration= [KNL] Set duration of force_quiescent_state bursts. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ff4f0c756dee..0bcd53adac73 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -49,11 +49,19 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. 
McKenney and Josh Triplett "); +torture_param(int, cbflood_inter_holdoff, HZ, + "Holdoff between floods (jiffies)"); +torture_param(int, cbflood_intra_holdoff, 1, + "Holdoff between bursts (jiffies)"); +torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable"); +torture_param(int, cbflood_n_per_burst, 20000, + "# callbacks per burst in flood"); torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); @@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); static int nrealreaders; +static int ncbflooders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; +static struct task_struct **cbflood_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; @@ -138,6 +148,7 @@ static long n_rcu_torture_boosts; static long n_rcu_torture_timers; static long n_barrier_attempts; static long n_barrier_successes; +static atomic_long_t n_cbfloods; static struct list_head rcu_torture_removed; static int rcu_torture_writer_state; @@ -707,6 +718,58 @@ checkwait: stutter_wait("rcu_torture_boost"); return 0; } +static void rcu_torture_cbflood_cb(struct rcu_head *rhp) +{ +} + +/* + * RCU torture callback-flood kthread. Repeatedly induces bursts of calls + * to call_rcu() or analogous, increasing the probability of occurrence + * of callback-overflow corner cases. 
+ */ +static int +rcu_torture_cbflood(void *arg) +{ + int err = 1; + int i; + int j; + struct rcu_head *rhp; + + if (cbflood_n_per_burst > 0 && + cbflood_inter_holdoff > 0 && + cbflood_intra_holdoff > 0 && + cur_ops->call && + cur_ops->cb_barrier) { + rhp = vmalloc(sizeof(*rhp) * + cbflood_n_burst * cbflood_n_per_burst); + err = !rhp; + } + if (err) { + VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); + while (!torture_must_stop()) + schedule_timeout_interruptible(HZ); + return 0; + } + VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); + do { + schedule_timeout_interruptible(cbflood_inter_holdoff); + atomic_long_inc(&n_cbfloods); + WARN_ON(signal_pending(current)); + for (i = 0; i < cbflood_n_burst; i++) { + for (j = 0; j < cbflood_n_per_burst; j++) { + cur_ops->call(&rhp[i * cbflood_n_per_burst + j], + rcu_torture_cbflood_cb); + } + schedule_timeout_interruptible(cbflood_intra_holdoff); + WARN_ON(signal_pending(current)); + } + cur_ops->cb_barrier(); + stutter_wait("rcu_torture_cbflood"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_cbflood"); + return 0; +} + /* * RCU torture force-quiescent-state kthread. 
Repeatedly induces * bursts of calls to force_quiescent_state(), increasing the probability @@ -1075,10 +1138,11 @@ rcu_torture_stats_print(void) n_rcu_torture_boosts, n_rcu_torture_timers); torture_onoff_stats(); - pr_cont("barrier: %ld/%ld:%ld\n", + pr_cont("barrier: %ld/%ld:%ld ", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); + pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || @@ -1432,6 +1496,8 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); + for (i = 0; i < ncbflooders; i++) + torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]); if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { unregister_cpu_notifier(&rcutorture_cpu_nb); @@ -1678,6 +1744,24 @@ rcu_torture_init(void) goto unwind; if (object_debug) rcu_test_debug_objects(); + if (cbflood_n_burst > 0) { + /* Create the cbflood threads */ + ncbflooders = (num_online_cpus() + 3) / 4; + cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task), + GFP_KERNEL); + if (!cbflood_task) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < ncbflooders; i++) { + firsterr = torture_create_kthread(rcu_torture_cbflood, + NULL, + cbflood_task[i]); + if (firsterr) + goto unwind; + } + } rcutorture_record_test_transition(); torture_init_end(); return 0; -- cgit v1.2.3 From 8315f42295d2667a7f942f154b73a86fd7cb2227 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 27 Jun 2014 13:42:20 -0700 Subject: rcu: Add call_rcu_tasks() This commit adds a new RCU-tasks flavor of RCU, which provides call_rcu_tasks(). This RCU flavor's quiescent states are voluntary context switch (not preemption!) and userspace execution (not the idle loop -- use some sort of schedule_on_each_cpu() if you need to handle the idle tasks). 
Note that unlike other RCU flavors, these quiescent states occur in tasks, not necessarily CPUs. Includes fixes from Steven Rostedt. This RCU flavor is assumed to have very infrequent latency-tolerant updaters. This assumption permits significant simplifications, including a single global callback list protected by a single global lock, along with a single task-private linked list containing all tasks that have not yet passed through a quiescent state. If experience shows this assumption to be incorrect, the required additional complexity will be added. Suggested-by: Steven Rostedt Signed-off-by: Paul E. McKenney --- include/linux/init_task.h | 9 +++ include/linux/rcupdate.h | 36 ++++++++++ include/linux/sched.h | 23 ++++--- init/Kconfig | 10 +++ kernel/rcu/tiny.c | 2 + kernel/rcu/tree.c | 2 + kernel/rcu/update.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 242 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2bb4c4f3531a..dffd9258ee60 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -117,6 +117,14 @@ extern struct group_info init_groups; #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif +#ifdef CONFIG_TASKS_RCU +#define INIT_TASK_RCU_TASKS(tsk) \ + .rcu_tasks_holdout = false, \ + .rcu_tasks_holdout_list = \ + LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list), +#else +#define INIT_TASK_RCU_TASKS(tsk) +#endif extern struct cred init_cred; @@ -224,6 +232,7 @@ extern struct task_group root_task_group; INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_TASK_RCU_TASKS(tsk) \ INIT_CPUSET_SEQ(tsk) \ INIT_RT_MUTEXES(tsk) \ INIT_VTIME(tsk) \ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d231aa17b1d7..3432063f4c87 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -197,6 +197,26 @@ void call_rcu_sched(struct rcu_head *head, void synchronize_sched(void); +/** + * call_rcu_tasks() - Queue 
an RCU for invocation task-based grace period + * @head: structure to be used for queueing the RCU updates. + * @func: actual callback function to be invoked after the grace period + * + * The callback function will be invoked some time after a full grace + * period elapses, in other words after all currently executing RCU + * read-side critical sections have completed. call_rcu_tasks() assumes + * that the read-side critical sections end at a voluntary context + * switch (not a preemption!), entry into idle, or transition to usermode + * execution. As such, there are no read-side primitives analogous to + * rcu_read_lock() and rcu_read_unlock() because this primitive is intended + * to determine that all tasks have passed through a safe state, not so + * much for data-strcuture synchronization. + * + * See the description of call_rcu() for more detailed information on + * memory ordering guarantees. + */ +void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head)); + #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); @@ -294,6 +314,22 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, rcu_irq_exit(); \ } while (0) +/* + * Note a voluntary context switch for RCU-tasks benefit. This is a + * macro rather than an inline function to avoid #include hell. 
+ */ +#ifdef CONFIG_TASKS_RCU +#define rcu_note_voluntary_context_switch(t) \ + do { \ + preempt_disable(); /* Exclude synchronize_sched(); */ \ + if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ + ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ + preempt_enable(); \ + } while (0) +#else /* #ifdef CONFIG_TASKS_RCU */ +#define rcu_note_voluntary_context_switch(t) do { } while (0) +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885ee52b..eaacac4ae77d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1270,6 +1270,11 @@ struct task_struct { #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_TASKS_RCU + unsigned long rcu_tasks_nvcsw; + bool rcu_tasks_holdout; + struct list_head rcu_tasks_holdout_list; +#endif /* #ifdef CONFIG_TASKS_RCU */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -2000,28 +2005,24 @@ extern void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask); #ifdef CONFIG_PREEMPT_RCU - #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. 
*/ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ static inline void rcu_copy_process(struct task_struct *p) { +#ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special = 0; -#ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ INIT_LIST_HEAD(&p->rcu_node_entry); +#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TASKS_RCU + p->rcu_tasks_holdout = false; + INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); +#endif /* #ifdef CONFIG_TASKS_RCU */ } -#else - -static inline void rcu_copy_process(struct task_struct *p) -{ -} - -#endif - static inline void tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) { diff --git a/init/Kconfig b/init/Kconfig index e84c6423a2e5..c4539c4e177f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -507,6 +507,16 @@ config PREEMPT_RCU This option enables preemptible-RCU code that is common between TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU. +config TASKS_RCU + bool "Task_based RCU implementation using voluntary context switch" + default n + help + This option enables a task-based RCU implementation that uses + only voluntary context switch (not preemption!), idle, and + user-mode execution as quiescent states. + + If unsure, say N. 
+ config RCU_STALL_COMMON def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE ) help diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d9efcc13008c..717f00854fc0 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -254,6 +254,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); + if (user) + rcu_note_voluntary_context_switch(current); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1b70cb6fbe3c..8ad91d1e317d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2410,6 +2410,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) invoke_rcu_core(); + if (user) + rcu_note_voluntary_context_switch(current); trace_rcu_utilization(TPS("End scheduler-tick")); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 4056d7992a6c..19b3dacb0753 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -47,6 +47,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS @@ -347,3 +348,173 @@ static int __init check_cpu_stall_init(void) early_initcall(check_cpu_stall_init); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + +#ifdef CONFIG_TASKS_RCU + +/* + * Simple variant of RCU whose quiescent states are voluntary context switch, + * user-space execution, and idle. As such, grace periods can take one good + * long time. There are no read-side primitives similar to rcu_read_lock() + * and rcu_read_unlock() because this implementation is intended to get + * the system into a safe state for some of the manipulations involved in + * tracing and the like. Finally, this implementation does not support + * high call_rcu_tasks() rates from multiple CPUs. If this is required, + * per-CPU callback lists will be needed. + */ + +/* Global list of callbacks and associated lock. 
*/ +static struct rcu_head *rcu_tasks_cbs_head; +static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; +static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); + +/* Post an RCU-tasks callback. */ +void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) +{ + unsigned long flags; + + rhp->next = NULL; + rhp->func = func; + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + *rcu_tasks_cbs_tail = rhp; + rcu_tasks_cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); +} +EXPORT_SYMBOL_GPL(call_rcu_tasks); + +/* See if the current task has stopped holding out, remove from list if so. */ +static void check_holdout_task(struct task_struct *t) +{ + if (!ACCESS_ONCE(t->rcu_tasks_holdout) || + t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || + !ACCESS_ONCE(t->on_rq)) { + ACCESS_ONCE(t->rcu_tasks_holdout) = false; + list_del_rcu(&t->rcu_tasks_holdout_list); + put_task_struct(t); + } +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + unsigned long flags; + struct task_struct *g, *t; + struct rcu_head *list; + struct rcu_head *next; + LIST_HEAD(rcu_tasks_holdouts); + + /* FIXME: Add housekeeping affinity. */ + + /* + * Each pass through the following loop makes one check for + * newly arrived callbacks, and, if there are some, waits for + * one RCU-tasks grace period and then invokes the callbacks. + * This loop is terminated by the system going down. ;-) + */ + for (;;) { + + /* Pick up any new callbacks. */ + raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + list = rcu_tasks_cbs_head; + rcu_tasks_cbs_head = NULL; + rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; + raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + + /* If there were none, wait a bit and start over. 
*/ + if (!list) { + schedule_timeout_interruptible(HZ); + WARN_ON(signal_pending(current)); + continue; + } + + /* + * Wait for all pre-existing t->on_rq and t->nvcsw + * transitions to complete. Invoking synchronize_sched() + * suffices because all these transitions occur with + * interrupts disabled. Without this synchronize_sched(), + * a read-side critical section that started before the + * grace period might be incorrectly seen as having started + * after the grace period. + * + * This synchronize_sched() also dispenses with the + * need for a memory barrier on the first store to + * ->rcu_tasks_holdout, as it forces the store to happen + * after the beginning of the grace period. + */ + synchronize_sched(); + + /* + * There were callbacks, so we need to wait for an + * RCU-tasks grace period. Start off by scanning + * the task list for tasks that are not already + * voluntarily blocked. Mark these tasks and make + * a list of them in rcu_tasks_holdouts. + */ + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t != current && ACCESS_ONCE(t->on_rq) && + !is_idle_task(t)) { + get_task_struct(t); + t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw); + ACCESS_ONCE(t->rcu_tasks_holdout) = true; + list_add(&t->rcu_tasks_holdout_list, + &rcu_tasks_holdouts); + } + } + rcu_read_unlock(); + + /* + * Each pass through the following loop scans the list + * of holdout tasks, removing any that are no longer + * holdouts. When the list is empty, we are done. 
+ */ + while (!list_empty(&rcu_tasks_holdouts)) { + schedule_timeout_interruptible(HZ); + WARN_ON(signal_pending(current)); + rcu_read_lock(); + list_for_each_entry_rcu(t, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) + check_holdout_task(t); + rcu_read_unlock(); + } + + /* + * Because ->on_rq and ->nvcsw are not guaranteed + * to have a full memory barriers prior to them in the + * schedule() path, memory reordering on other CPUs could + * cause their RCU-tasks read-side critical sections to + * extend past the end of the grace period. However, + * because these ->nvcsw updates are carried out with + * interrupts disabled, we can use synchronize_sched() + * to force the needed ordering on all such CPUs. + * + * This synchronize_sched() also confines all + * ->rcu_tasks_holdout accesses to be within the grace + * period, avoiding the need for memory barriers for + * ->rcu_tasks_holdout accesses. + */ + synchronize_sched(); + + /* Invoke the callbacks. */ + while (list) { + next = list->next; + local_bh_disable(); + list->func(list); + local_bh_enable(); + list = next; + cond_resched(); + } + } +} + +/* Spawn rcu_tasks_kthread() at boot time. */ +static int __init rcu_spawn_tasks_kthread(void) +{ + struct task_struct __maybe_unused *t; + + t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); + BUG_ON(IS_ERR(t)); + return 0; +} +early_initcall(rcu_spawn_tasks_kthread); + +#endif /* #ifdef CONFIG_TASKS_RCU */ -- cgit v1.2.3 From bde6c3aa993066acb0d6ce32ecabe03b9d5df92d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2014 11:26:57 -0700 Subject: rcu: Provide cond_resched_rcu_qs() to force quiescent states in long loops RCU-tasks requires the occasional voluntary context switch from CPU-bound in-kernel tasks. In some cases, this requires instrumenting cond_resched(). 
However, there is some reluctance to countenance unconditionally instrumenting cond_resched() (see http://lwn.net/Articles/603252/), so this commit creates a separate cond_resched_rcu_qs() that may be used in place of cond_resched() in locations prone to long-duration in-kernel looping. This commit currently instruments only RCU-tasks. Future possibilities include also instrumenting RCU, RCU-bh, and RCU-sched in order to reduce IPI usage. Signed-off-by: Paul E. McKenney --- fs/file.c | 2 +- include/linux/rcupdate.h | 13 +++++++++++++ kernel/rcu/rcutorture.c | 4 ++-- kernel/rcu/tree.c | 12 ++++++------ kernel/rcu/tree_plugin.h | 2 +- mm/mlock.c | 2 +- 6 files changed, 24 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/fs/file.c b/fs/file.c index 66923fe3176e..1cafc4c9275b 100644 --- a/fs/file.c +++ b/fs/file.c @@ -367,7 +367,7 @@ static struct fdtable *close_files(struct files_struct * files) struct file * file = xchg(&fdt->fd[i], NULL); if (file) { filp_close(file, files); - cond_resched(); + cond_resched_rcu_qs(); } } i++; diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3432063f4c87..473350462d04 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -330,6 +330,19 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, #define rcu_note_voluntary_context_switch(t) do { } while (0) #endif /* #else #ifdef CONFIG_TASKS_RCU */ +/** + * cond_resched_rcu_qs - Report potential quiescent states to RCU + * + * This macro resembles cond_resched(), except that it is defined to + * report potential quiescent states to RCU-tasks even if the cond_resched() + * machinery were to be shut off, as some advocate for PREEMPT kernels. 
+ */ +#define cond_resched_rcu_qs() \ +do { \ + rcu_note_voluntary_context_switch(current); \ + cond_resched(); \ +} while (0) + #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 948a7693748e..178716713e11 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -667,7 +667,7 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched(); + cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -1019,7 +1019,7 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched(); + cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ad91d1e317d..e23dad0661e2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1647,7 +1647,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); - cond_resched(); + cond_resched_rcu_qs(); } mutex_unlock(&rsp->onoff_mutex); @@ -1736,7 +1736,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); - cond_resched(); + cond_resched_rcu_qs(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); @@ -1785,7 +1785,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Locking provides needed memory barrier. 
*/ if (rcu_gp_init(rsp)) break; - cond_resched(); + cond_resched_rcu_qs(); flush_signals(current); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -1828,10 +1828,10 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), TPS("fqsend")); - cond_resched(); + cond_resched_rcu_qs(); } else { /* Deal with stray signal. */ - cond_resched(); + cond_resched_rcu_qs(); flush_signals(current); trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), @@ -2434,7 +2434,7 @@ static void force_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched(); + cond_resched_rcu_qs(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a7997e272564..7672586d3920 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1848,7 +1848,7 @@ static int rcu_oom_notify(struct notifier_block *self, get_online_cpus(); for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched(); + cond_resched_rcu_qs(); } put_online_cpus(); diff --git a/mm/mlock.c b/mm/mlock.c index ce84cb0b83ef..ab3150c26711 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -789,7 +789,7 @@ static int do_mlockall(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); - cond_resched(); + cond_resched_rcu_qs(); } out: return 0; -- cgit v1.2.3 From 53c6d4edf874d3cbc031a53738c6cba9277faea5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2014 12:22:23 -0700 Subject: rcu: Add synchronous grace-period waiting for RCU-tasks It turns out to be easier to add the synchronous grace-period waiting functions to RCU-tasks than to work around their absence in rcutorture, so this commit adds them. The key point is that the existence of call_rcu_tasks() means that rcutorture needs an rcu_barrier_tasks(). Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 2 ++ kernel/rcu/update.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 473350462d04..640152fedcde 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -216,6 +216,8 @@ void synchronize_sched(void); * memory ordering guarantees. */ void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head)); +void synchronize_rcu_tasks(void); +void rcu_barrier_tasks(void); #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 19b3dacb0753..5fd1ddbfcc55 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -381,6 +381,61 @@ void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) } EXPORT_SYMBOL_GPL(call_rcu_tasks); +/** + * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-tasks + * grace period has elapsed, in other words after all currently + * executing rcu-tasks read-side critical sections have elapsed. These + * read-side critical sections are delimited by calls to schedule(), + * cond_resched_rcu_qs(), idle execution, userspace execution, calls + * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). + * + * This is a very specialized primitive, intended only for a few uses in + * tracing and other situations requiring manipulation of function + * preambles and profiling hooks. The synchronize_rcu_tasks() function + * is not (yet) intended for heavy use from multiple CPUs. + * + * Note that this guarantee implies further memory-ordering guarantees. 
+ * On systems with more than one CPU, when synchronize_rcu_tasks() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-tasks read-side critical section whose beginning + * preceded the call to synchronize_rcu_tasks(). In addition, each CPU + * having an RCU-tasks read-side critical section that extends beyond + * the return from synchronize_rcu_tasks() is guaranteed to have executed + * a full memory barrier after the beginning of synchronize_rcu_tasks() + * and before the beginning of that RCU-tasks read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU + * (but again only if the system has more than one CPU). + */ +void synchronize_rcu_tasks(void) +{ + /* Complain if the scheduler has not started. */ + rcu_lockdep_assert(!rcu_scheduler_active, + "synchronize_rcu_tasks called too soon"); + + /* Wait for the grace period. */ + wait_rcu_gp(call_rcu_tasks); +} + +/** + * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. + * + * Although the current implementation is guaranteed to wait, it is not + * obligated to, for example, if there are no pending callbacks. + */ +void rcu_barrier_tasks(void) +{ + /* There is only one callback queue, so this is easy. ;-) */ + synchronize_rcu_tasks(); +} + /* See if the current task has stopped holding out, remove from list if so. */ static void check_holdout_task(struct task_struct *t) { -- cgit v1.2.3 From 3f95aa81d265223fdb13ea2b59883766a05adbdf Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 4 Aug 2014 06:10:23 -0700 Subject: rcu: Make TASKS_RCU handle tasks that are almost done exiting Once a task has passed exit_notify() in the do_exit() code path, it is no longer on the task lists, and is therefore no longer visible to rcu_tasks_kthread(). This means that an almost-exited task might be preempted while within a trampoline, and this task won't be waited on by rcu_tasks_kthread(). This commit fixes this bug by adding an srcu_struct. An exiting task does srcu_read_lock() just before calling exit_notify(), and does the corresponding srcu_read_unlock() after doing the final preempt_disable(). This means that rcu_tasks_kthread() can do synchronize_srcu() to wait for all mostly-exited tasks to reach their final preempt_disable() region, and then use synchronize_sched() to wait for those tasks to finish exiting. Reported-by: Oleg Nesterov Suggested-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 3 +++ kernel/exit.c | 3 +++ kernel/rcu/update.c | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 640152fedcde..54b2ebb20313 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -321,6 +321,8 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, * macro rather than an inline function to avoid #include hell. 
*/ #ifdef CONFIG_TASKS_RCU +#define TASKS_RCU(x) x +extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ preempt_disable(); /* Exclude synchronize_sched(); */ \ @@ -329,6 +331,7 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, preempt_enable(); \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ +#define TASKS_RCU(x) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #endif /* #else #ifdef CONFIG_TASKS_RCU */ diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7433a3..d13f2eec4bb8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -667,6 +667,7 @@ void do_exit(long code) { struct task_struct *tsk = current; int group_dead; + TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); @@ -775,6 +776,7 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA @@ -814,6 +816,7 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); /* * The setting of TASK_RUNNING by try_to_wake_up() may be delayed diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5fd1ddbfcc55..403fc4ae539e 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -367,6 +367,13 @@ static struct rcu_head *rcu_tasks_cbs_head; static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); +/* Track exiting tasks in order to allow them to be waited for. */ +DEFINE_SRCU(tasks_rcu_exit_srcu); + +/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ +static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 3; +module_param(rcu_task_stall_timeout, int, 0644); + /* Post an RCU-tasks callback. 
*/ void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) { @@ -517,6 +524,15 @@ static int __noreturn rcu_tasks_kthread(void *arg) } rcu_read_unlock(); + /* + * Wait for tasks that are in the process of exiting. + * This does only part of the job, ensuring that all + * tasks that were previously exiting reach the point + * where they have disabled preemption, allowing the + * later synchronize_sched() to finish the job. + */ + synchronize_srcu(&tasks_rcu_exit_srcu); + /* * Each pass through the following loop scans the list * of holdout tasks, removing any that are no longer @@ -546,6 +562,11 @@ static int __noreturn rcu_tasks_kthread(void *arg) * ->rcu_tasks_holdout accesses to be within the grace * period, avoiding the need for memory barriers for * ->rcu_tasks_holdout accesses. + * + * In addition, this synchronize_sched() waits for exiting + * tasks to complete their final preempt_disable() region + * of execution, cleaning up after the synchronize_srcu() + * above. */ synchronize_sched(); -- cgit v1.2.3 From 06c2a9238fad48ec38f1be00455bf942d54377ee Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Jul 2014 18:17:19 -0700 Subject: rcu: Export RCU-tasks APIs to GPL modules This commit exports the RCU-tasks synchronous APIs, synchronize_rcu_tasks() and rcu_barrier_tasks(), to GPL-licensed kernel modules. Signed-off-by: Steven Rostedt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/update.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 403fc4ae539e..aef8109152ce 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -430,6 +430,7 @@ void synchronize_rcu_tasks(void) /* Wait for the grace period. */ wait_rcu_gp(call_rcu_tasks); } +EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); /** * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks. 
@@ -442,6 +443,7 @@ void rcu_barrier_tasks(void) /* There is only one callback queue, so this is easy. ;-) */ synchronize_rcu_tasks(); } +EXPORT_SYMBOL_GPL(rcu_barrier_tasks); /* See if the current task has stopped holding out, remove from list if so. */ static void check_holdout_task(struct task_struct *t) -- cgit v1.2.3 From 69c604557ce34015629b325b85ff1a4996038a3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2014 11:59:36 -0700 Subject: rcutorture: Add torture tests for RCU-tasks This commit adds torture tests for RCU-tasks. It also fixes a bug that would segfault for an RCU flavor lacking a callback-barrier function. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 1 + kernel/rcu/rcutorture.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 54b2ebb20313..a3123f53a4ce 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -55,6 +55,7 @@ enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, RCU_SCHED_FLAVOR, + RCU_TASKS_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR }; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 178716713e11..75b1abf78c48 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -601,6 +601,52 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; +#ifdef CONFIG_TASKS_RCU + +/* + * Definitions for RCU-tasks torture testing. + */ + +static int tasks_torture_read_lock(void) +{ + return 0; +} + +static void tasks_torture_read_unlock(int idx) +{ +} + +static void rcu_tasks_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops tasks_ops = { + .ttype = RCU_TASKS_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = tasks_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ + .readunlock = tasks_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_tasks_torture_deferred_free, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .call = call_rcu_tasks, + .cb_barrier = rcu_barrier_tasks, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "tasks" +}; + +#define RCUTORTURE_TASKS_OPS &tasks_ops, + +#else /* #ifdef CONFIG_TASKS_RCU */ + +#define RCUTORTURE_TASKS_OPS + +#endif /* #else #ifdef CONFIG_TASKS_RCU */ + /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1295,7 +1341,8 @@ static int rcu_torture_barrier_cbs(void *arg) if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); - cur_ops->cb_barrier(); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); destroy_rcu_head_on_stack(&rcu); torture_kthread_stopping("rcu_torture_barrier_cbs"); return 0; @@ -1534,6 +1581,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, + RCUTORTURE_TASKS_OPS }; if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) -- cgit v1.2.3 From 52db30ab23b6d00cf80b22a510c4ea4be4458031 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2014 18:16:30 -0700 Subject: rcu: Add stall-warning checks for RCU-tasks This commit adds a ten-minute RCU-tasks stall warning. The actual time is controlled by the boot/sysfs parameter rcu_task_stall_timeout, with values less than or equal to zero disabling the stall warnings. The default value is ten minutes, which means that the tasks that have not yet responded will get their stacks dumped every ten minutes, until they pass through a voluntary context switch. Signed-off-by: Paul E. 
McKenney --- Documentation/kernel-parameters.txt | 5 +++++ kernel/rcu/update.c | 29 +++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 5ae8608ca9f5..e98be953d96c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2982,6 +2982,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. rcupdate.rcu_cpu_stall_timeout= [KNL] Set timeout for RCU CPU stall warning messages. + rcupdate.rcu_task_stall_timeout= [KNL] + Set timeout in jiffies for RCU task stall warning + messages. Disable with a value less than or equal + to zero. + rdinit= [KNL] Format: Run specified binary instead of /init from the ramdisk, diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index aef8109152ce..bad7dbd4c2e3 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -371,7 +371,7 @@ static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); DEFINE_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ -static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 3; +static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; module_param(rcu_task_stall_timeout, int, 0644); /* Post an RCU-tasks callback. */ @@ -445,8 +445,9 @@ void rcu_barrier_tasks(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); -/* See if the current task has stopped holding out, remove from list if so. */ -static void check_holdout_task(struct task_struct *t) +/* See if tasks are still holding out, complain if so. 
*/ +static void check_holdout_task(struct task_struct *t, + bool needreport, bool *firstreport) { if (!ACCESS_ONCE(t->rcu_tasks_holdout) || t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || @@ -454,7 +455,15 @@ static void check_holdout_task(struct task_struct *t) ACCESS_ONCE(t->rcu_tasks_holdout) = false; list_del_rcu(&t->rcu_tasks_holdout_list); put_task_struct(t); + return; } + if (!needreport) + return; + if (*firstreport) { + pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); + *firstreport = false; + } + sched_show_task(t); } /* RCU-tasks kthread that detects grace periods and invokes callbacks. */ @@ -462,6 +471,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) { unsigned long flags; struct task_struct *g, *t; + unsigned long lastreport; struct rcu_head *list; struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); @@ -540,13 +550,24 @@ static int __noreturn rcu_tasks_kthread(void *arg) * of holdout tasks, removing any that are no longer * holdouts. When the list is empty, we are done. */ + lastreport = jiffies; while (!list_empty(&rcu_tasks_holdouts)) { + bool firstreport; + bool needreport; + int rtst; + schedule_timeout_interruptible(HZ); + rtst = ACCESS_ONCE(rcu_task_stall_timeout); + needreport = rtst > 0 && + time_after(jiffies, lastreport + rtst); + if (needreport) + lastreport = jiffies; + firstreport = true; WARN_ON(signal_pending(current)); rcu_read_lock(); list_for_each_entry_rcu(t, &rcu_tasks_holdouts, rcu_tasks_holdout_list) - check_holdout_task(t); + check_holdout_task(t, needreport, &firstreport); rcu_read_unlock(); } -- cgit v1.2.3 From c7b24d2b9a0f2ce19fdf631d3148c80a8f6010b1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Jul 2014 14:39:25 -0700 Subject: rcu: Improve RCU-tasks energy efficiency The current RCU-tasks implementation uses strict polling to detect callback arrivals. This works quite well, but is not so good for energy efficiency. This commit therefore replaces the strict polling with a wait queue. 
Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index bad7dbd4c2e3..444c8a303963 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -365,6 +365,7 @@ early_initcall(check_cpu_stall_init); /* Global list of callbacks and associated lock. */ static struct rcu_head *rcu_tasks_cbs_head; static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head; +static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); /* Track exiting tasks in order to allow them to be waited for. */ @@ -378,13 +379,17 @@ module_param(rcu_task_stall_timeout, int, 0644); void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) { unsigned long flags; + bool needwake; rhp->next = NULL; rhp->func = func; raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags); + needwake = !rcu_tasks_cbs_head; *rcu_tasks_cbs_tail = rhp; rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); + if (needwake) + wake_up(&rcu_tasks_cbs_wq); } EXPORT_SYMBOL_GPL(call_rcu_tasks); @@ -495,8 +500,12 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* If there were none, wait a bit and start over. */ if (!list) { - schedule_timeout_interruptible(HZ); - WARN_ON(signal_pending(current)); + wait_event_interruptible(rcu_tasks_cbs_wq, + rcu_tasks_cbs_head); + if (!rcu_tasks_cbs_head) { + WARN_ON(signal_pending(current)); + schedule_timeout_interruptible(HZ/10); + } continue; } @@ -602,6 +611,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) list = next; cond_resched(); } + schedule_timeout_uninterruptible(HZ/10); } } -- cgit v1.2.3 From 84a8f446ffd70c2799a96268aaa4d47c22a83ff0 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 4 Aug 2014 07:24:21 -0700 Subject: rcu: Defer rcu_tasks_kthread() creation till first call_rcu_tasks() It is expected that many sites will have CONFIG_TASKS_RCU=y, but will never actually invoke call_rcu_tasks(). For such sites, creating rcu_tasks_kthread() at boot is wasteful. This commit therefore defers creation of this kthread until the time of the first call_rcu_tasks(). This of course means that the first call_rcu_tasks() must be invoked from process context after the scheduler is fully operational. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 444c8a303963..e1d71741958f 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -375,7 +375,12 @@ DEFINE_SRCU(tasks_rcu_exit_srcu); static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; module_param(rcu_task_stall_timeout, int, 0644); -/* Post an RCU-tasks callback. */ +static void rcu_spawn_tasks_kthread(void); + +/* + * Post an RCU-tasks callback. First call must be from process context + * after the scheduler if fully operational. + */ void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) { unsigned long flags; @@ -388,8 +393,10 @@ void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) *rcu_tasks_cbs_tail = rhp; rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - if (needwake) + if (needwake) { + rcu_spawn_tasks_kthread(); wake_up(&rcu_tasks_cbs_wq); + } } EXPORT_SYMBOL_GPL(call_rcu_tasks); @@ -615,15 +622,27 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn rcu_tasks_kthread() at boot time. */ -static int __init rcu_spawn_tasks_kthread(void) +/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). 
*/ +static void rcu_spawn_tasks_kthread(void) { - struct task_struct __maybe_unused *t; + static DEFINE_MUTEX(rcu_tasks_kthread_mutex); + static struct task_struct *rcu_tasks_kthread_ptr; + struct task_struct *t; + if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) { + smp_mb(); /* Ensure caller sees full kthread. */ + return; + } + mutex_lock(&rcu_tasks_kthread_mutex); + if (rcu_tasks_kthread_ptr) { + mutex_unlock(&rcu_tasks_kthread_mutex); + return; + } t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread"); BUG_ON(IS_ERR(t)); - return 0; + smp_mb(); /* Ensure others see full kthread. */ + ACCESS_ONCE(rcu_tasks_kthread_ptr) = t; + mutex_unlock(&rcu_tasks_kthread_mutex); } -early_initcall(rcu_spawn_tasks_kthread); #endif /* #ifdef CONFIG_TASKS_RCU */ -- cgit v1.2.3 From 176f8f7a52cc6d09d686f0d900abda6942a52fbb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 4 Aug 2014 17:43:50 -0700 Subject: rcu: Make TASKS_RCU handle nohz_full= CPUs Currently TASKS_RCU would ignore a CPU running a task in nohz_full= usermode execution. There would be neither a context switch nor a scheduling-clock interrupt to tell TASKS_RCU that the task in question had passed through a quiescent state. The grace period would therefore extend indefinitely. This commit therefore makes RCU's dyntick-idle subsystem record the task_struct structure of the task that is running in dyntick-idle mode on each CPU. The TASKS_RCU grace period can then access this information and record a quiescent state on behalf of any CPU running in dyntick-idle usermode. Signed-off-by: Paul E. 
McKenney --- include/linux/init_task.h | 3 ++- include/linux/sched.h | 2 ++ kernel/rcu/tree.c | 2 ++ kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_plugin.h | 16 ++++++++++++++++ kernel/rcu/update.c | 4 +++- 6 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index dffd9258ee60..03b274873b06 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -121,7 +121,8 @@ extern struct group_info init_groups; #define INIT_TASK_RCU_TASKS(tsk) \ .rcu_tasks_holdout = false, \ .rcu_tasks_holdout_list = \ - LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list), + LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list), \ + .rcu_tasks_idle_cpu = -1, #else #define INIT_TASK_RCU_TASKS(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index eaacac4ae77d..ec8b34722bcc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1274,6 +1274,7 @@ struct task_struct { unsigned long rcu_tasks_nvcsw; bool rcu_tasks_holdout; struct list_head rcu_tasks_holdout_list; + int rcu_tasks_idle_cpu; #endif /* #ifdef CONFIG_TASKS_RCU */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -2020,6 +2021,7 @@ static inline void rcu_copy_process(struct task_struct *p) #ifdef CONFIG_TASKS_RCU p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); + p->rcu_tasks_idle_cpu = -1; #endif /* #ifdef CONFIG_TASKS_RCU */ } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e23dad0661e2..c880f5387b1f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -526,6 +526,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, atomic_inc(&rdtp->dynticks); smp_mb__after_atomic(); /* Force ordering with next sojourn. 
*/ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + rcu_dynticks_task_enter(); /* * It is illegal to enter an extended quiescent state while @@ -642,6 +643,7 @@ void rcu_irq_exit(void) static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, int user) { + rcu_dynticks_task_exit(); smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a86eb7bac45..3a92000c354f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -605,6 +605,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); static bool rcu_nohz_full_cpu(struct rcu_state *rsp); +static void rcu_dynticks_task_enter(void); +static void rcu_dynticks_task_exit(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7672586d3920..e466b40052a7 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -3036,3 +3036,19 @@ static void rcu_bind_gp_kthread(void) housekeeping_affine(current); #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } + +/* Record the current task on dyntick-idle entry. */ +static void rcu_dynticks_task_enter(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id(); +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} + +/* Record no current task on dyntick-idle exit. 
*/ +static void rcu_dynticks_task_exit(void) +{ +#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) + ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1; +#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ +} diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e1d71741958f..2658de4a5975 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -463,7 +463,9 @@ static void check_holdout_task(struct task_struct *t, { if (!ACCESS_ONCE(t->rcu_tasks_holdout) || t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || - !ACCESS_ONCE(t->on_rq)) { + !ACCESS_ONCE(t->on_rq) || + (IS_ENABLED(CONFIG_NO_HZ_FULL) && + !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { ACCESS_ONCE(t->rcu_tasks_holdout) = false; list_del_rcu(&t->rcu_tasks_holdout_list); put_task_struct(t); -- cgit v1.2.3 From 8f20a5e83d2c5d0e126a2fc9bca67f7430dac907 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Aug 2014 05:10:24 -0700 Subject: rcu: Make rcu_tasks_kthread()'s GP-wait loop allow preemption The grace-period-wait loop in rcu_tasks_kthread() is under (unnecessary) RCU protection, and therefore has no preemption points in a PREEMPT=n kernel. This commit therefore removes the RCU protection and inserts cond_resched(). Reported-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- kernel/rcu/update.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 2658de4a5975..f86d1ae50005 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -467,7 +467,7 @@ static void check_holdout_task(struct task_struct *t, (IS_ENABLED(CONFIG_NO_HZ_FULL) && !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) { ACCESS_ONCE(t->rcu_tasks_holdout) = false; - list_del_rcu(&t->rcu_tasks_holdout_list); + list_del_init(&t->rcu_tasks_holdout_list); put_task_struct(t); return; } @@ -573,6 +573,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) bool firstreport; bool needreport; int rtst; + struct task_struct *t1; schedule_timeout_interruptible(HZ); rtst = ACCESS_ONCE(rcu_task_stall_timeout); @@ -582,11 +583,11 @@ static int __noreturn rcu_tasks_kthread(void *arg) lastreport = jiffies; firstreport = true; WARN_ON(signal_pending(current)); - rcu_read_lock(); - list_for_each_entry_rcu(t, &rcu_tasks_holdouts, - rcu_tasks_holdout_list) + list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts, + rcu_tasks_holdout_list) { check_holdout_task(t, needreport, &firstreport); - rcu_read_unlock(); + cond_resched(); + } } /* -- cgit v1.2.3 From 4ff475ed4cf61a7f56bbfbc424147189d0022b38 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Aug 2014 19:47:12 -0700 Subject: rcu: Additional information on RCU-tasks stall-warning messages Signed-off-by: Paul E. 
McKenney --- kernel/rcu/update.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f86d1ae50005..9487b4898e51 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -48,6 +48,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS @@ -461,6 +462,8 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static void check_holdout_task(struct task_struct *t, bool needreport, bool *firstreport) { + int cpu; + if (!ACCESS_ONCE(t->rcu_tasks_holdout) || t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) || !ACCESS_ONCE(t->on_rq) || @@ -477,6 +480,12 @@ static void check_holdout_task(struct task_struct *t, pr_err("INFO: rcu_tasks detected stalls on tasks:\n"); *firstreport = false; } + cpu = task_cpu(t); + pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n", + t, ".I"[is_idle_task(t)], + "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)], + t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout, + t->rcu_tasks_idle_cpu, cpu); sched_show_task(t); } -- cgit v1.2.3 From 1d082fd061884a587c490c4fc8a2056ce1e47624 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Aug 2014 16:01:53 -0700 Subject: rcu: Remove local_irq_disable() in rcu_preempt_note_context_switch() The rcu_preempt_note_context_switch() function is on a scheduling fast path, so it would be good to avoid disabling irqs. The reason that irqs are disabled is to synchronize process-level and irq-handler access to the task_struct ->rcu_read_unlock_special bitmask. This commit therefore makes ->rcu_read_unlock_special instead be a union of bools with a short allowing single-access checks in RCU's __rcu_read_unlock(). This results in the process-level and irq-handler accesses being simple loads and stores, so that irqs need no longer be disabled. This commit therefore removes the irq disabling from rcu_preempt_note_context_switch(). Reported-by: Peter Zijlstra Signed-off-by: Paul E. 
McKenney --- include/linux/init_task.h | 2 +- include/linux/sched.h | 16 +++++++++------- kernel/rcu/tree_plugin.h | 32 +++++++++++++++----------------- kernel/rcu/update.c | 2 +- 4 files changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 03b274873b06..77fc43f8fb72 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -111,7 +111,7 @@ extern struct group_info init_groups; #ifdef CONFIG_PREEMPT_RCU #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ - .rcu_read_unlock_special = 0, \ + .rcu_read_unlock_special.s = 0, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ INIT_TASK_RCU_TREE_PREEMPT() #else diff --git a/include/linux/sched.h b/include/linux/sched.h index ec8b34722bcc..42888d715fb1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1212,6 +1212,13 @@ struct sched_dl_entity { struct hrtimer dl_timer; }; +union rcu_special { + struct { + bool blocked; + bool need_qs; + } b; + short s; +}; struct rcu_node; enum perf_event_task_context { @@ -1264,7 +1271,7 @@ struct task_struct { #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; - char rcu_read_unlock_special; + union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TREE_PREEMPT_RCU @@ -2005,16 +2012,11 @@ extern void task_clear_jobctl_trapping(struct task_struct *task); extern void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask); -#ifdef CONFIG_PREEMPT_RCU -#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. 
*/ -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - static inline void rcu_copy_process(struct task_struct *p) { #ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; - p->rcu_read_unlock_special = 0; + p->rcu_read_unlock_special.s = 0; p->rcu_blocked_node = NULL; INIT_LIST_HEAD(&p->rcu_node_entry); #endif /* #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e466b40052a7..0981c0cd70fe 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -155,9 +155,8 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * not in a quiescent state. There might be any number of tasks blocked * while in an RCU read-side critical section. * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. + * As with the other rcu_*_qs() functions, callers to this function + * must disable preemption. */ static void rcu_preempt_qs(int cpu) { @@ -166,7 +165,7 @@ static void rcu_preempt_qs(int cpu) if (rdp->passed_quiesce == 0) trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; - current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + current->rcu_read_unlock_special.b.need_qs = false; } /* @@ -190,14 +189,14 @@ static void rcu_preempt_note_context_switch(int cpu) struct rcu_node *rnp; if (t->rcu_read_lock_nesting > 0 && - (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + !t->rcu_read_unlock_special.b.blocked) { /* Possibly blocking in an RCU read-side critical section. 
*/ rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; /* @@ -239,7 +238,7 @@ static void rcu_preempt_note_context_switch(int cpu) : rnp->gpnum + 1); raw_spin_unlock_irqrestore(&rnp->lock, flags); } else if (t->rcu_read_lock_nesting < 0 && - t->rcu_read_unlock_special) { + t->rcu_read_unlock_special.s) { /* * Complete exit from RCU read-side critical section on @@ -257,9 +256,7 @@ static void rcu_preempt_note_context_switch(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. */ - local_irq_save(flags); rcu_preempt_qs(cpu); - local_irq_restore(flags); } /* @@ -340,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t) bool drop_boost_mutex = false; #endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; - int special; + union rcu_special special; /* NMI handlers cannot block and cannot safely manipulate state. */ if (in_nmi()) @@ -350,12 +347,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. + * let it know that we have done so. Because irqs are disabled, + * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; - if (special & RCU_READ_UNLOCK_NEED_QS) { + if (special.b.need_qs) { rcu_preempt_qs(smp_processor_id()); - if (!t->rcu_read_unlock_special) { + if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; } @@ -368,8 +366,8 @@ void rcu_read_unlock_special(struct task_struct *t) } /* Clean up if blocked during RCU read-side critical section. 
*/ - if (special & RCU_READ_UNLOCK_BLOCKED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + if (special.b.blocked) { + t->rcu_read_unlock_special.b.blocked = false; /* * Remove this task from the list it blocked on. The @@ -658,7 +656,7 @@ static void rcu_preempt_check_callbacks(int cpu) } if (t->rcu_read_lock_nesting > 0 && per_cpu(rcu_preempt_data, cpu).qs_pending) - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; + t->rcu_read_unlock_special.b.need_qs = true; } #ifdef CONFIG_RCU_BOOST @@ -941,7 +939,7 @@ void exit_rcu(void) return; t->rcu_read_lock_nesting = 1; barrier(); - t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + t->rcu_read_unlock_special.b.blocked = true; __rcu_read_unlock(); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 9487b4898e51..6fb911558562 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -93,7 +93,7 @@ void __rcu_read_unlock(void) barrier(); /* critical section before exit code. */ t->rcu_read_lock_nesting = INT_MIN; barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); barrier(); /* ->rcu_read_unlock_special load before assign */ t->rcu_read_lock_nesting = 0; -- cgit v1.2.3 From 284a8c93af47306beed967a303d84730b32bab39 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Aug 2014 16:38:46 -0700 Subject: rcu: Per-CPU operation cleanups to rcu_*_qs() functions The rcu_bh_qs(), rcu_preempt_qs(), and rcu_sched_qs() functions use old-style per-CPU variable access and write to ->passed_quiesce even if it is already set. This commit therefore updates to use the new-style per-CPU variable access functions and avoids the spurious writes. This commit also eliminates the "cpu" argument to these functions because they are always invoked on the indicated CPU. Reported-by: Peter Zijlstra Signed-off-by: Paul E. 
McKenney --- include/linux/rcupdate.h | 4 ++-- include/linux/rcutiny.h | 2 +- kernel/rcu/tiny.c | 10 +++++----- kernel/rcu/tree.c | 34 ++++++++++++++++++---------------- kernel/rcu/tree_plugin.h | 27 +++++++++++++++------------ kernel/softirq.c | 2 +- 6 files changed, 42 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 132e1e34cdca..2fab0e37afe0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -261,8 +261,8 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); -void rcu_sched_qs(int cpu); -void rcu_bh_qs(int cpu); +void rcu_sched_qs(void); +void rcu_bh_qs(void); void rcu_check_callbacks(int cpu, int user); struct notifier_block; void rcu_idle_enter(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index d40a6a451330..38cc5b1e252d 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -80,7 +80,7 @@ static inline void kfree_call_rcu(struct rcu_head *head, static inline void rcu_note_context_switch(int cpu) { - rcu_sched_qs(cpu); + rcu_sched_qs(); } /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 717f00854fc0..61b8d2ccc2cb 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval) current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + rcu_sched_qs(); /* implies rcu_bh_inc() */ barrier(); rcu_dynticks_nesting = newval; } @@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. */ -void rcu_sched_qs(int cpu) +void rcu_sched_qs(void) { unsigned long flags; @@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu) /* * Record an rcu_bh quiescent state. 
*/ -void rcu_bh_qs(int cpu) +void rcu_bh_qs(void) { unsigned long flags; @@ -251,9 +251,9 @@ void rcu_check_callbacks(int cpu, int user) { RCU_TRACE(check_cpu_stalls()); if (user || rcu_is_cpu_rrupt_from_idle()) - rcu_sched_qs(cpu); + rcu_sched_qs(); else if (!in_softirq()) - rcu_bh_qs(cpu); + rcu_bh_qs(); if (user) rcu_note_voluntary_context_switch(current); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c880f5387b1f..4c340625ffd4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -188,22 +188,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) * one since the start of the grace period, this just sets a flag. * The caller must have disabled preemption. */ -void rcu_sched_qs(int cpu) +void rcu_sched_qs(void) { - struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; + if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.passed_quiesce, 1); + } } -void rcu_bh_qs(int cpu) +void rcu_bh_qs(void) { - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; + if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_bh"), + __this_cpu_read(rcu_bh_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_bh_data.passed_quiesce, 1); + } } static DEFINE_PER_CPU(int, rcu_sched_qs_mask); @@ -278,7 +280,7 @@ static void rcu_momentary_dyntick_idle(void) void rcu_note_context_switch(int cpu) { trace_rcu_utilization(TPS("Start context switch")); - rcu_sched_qs(cpu); + rcu_sched_qs(); rcu_preempt_note_context_switch(cpu); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); @@ -2395,8 +2397,8 @@ void rcu_check_callbacks(int 
cpu, int user) * at least not while the corresponding CPU is online. */ - rcu_sched_qs(cpu); - rcu_bh_qs(cpu); + rcu_sched_qs(); + rcu_bh_qs(); } else if (!in_softirq()) { @@ -2407,7 +2409,7 @@ void rcu_check_callbacks(int cpu, int user) * critical section, so note it. */ - rcu_bh_qs(cpu); + rcu_bh_qs(); } rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0981c0cd70fe..25e692a36280 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -158,14 +158,16 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); * As with the other rcu_*_qs() functions, callers to this function * must disable preemption. */ -static void rcu_preempt_qs(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; - current->rcu_read_unlock_special.b.need_qs = false; +static void rcu_preempt_qs(void) +{ + if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) { + trace_rcu_grace_period(TPS("rcu_preempt"), + __this_cpu_read(rcu_preempt_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_preempt_data.passed_quiesce, 1); + barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ + current->rcu_read_unlock_special.b.need_qs = false; + } } /* @@ -256,7 +258,7 @@ static void rcu_preempt_note_context_switch(int cpu) * grace period, then the fact that the task has been enqueued * means that we continue to block the current grace period. 
*/ - rcu_preempt_qs(cpu); + rcu_preempt_qs(); } /* @@ -352,7 +354,7 @@ void rcu_read_unlock_special(struct task_struct *t) */ special = t->rcu_read_unlock_special; if (special.b.need_qs) { - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); return; @@ -651,11 +653,12 @@ static void rcu_preempt_check_callbacks(int cpu) struct task_struct *t = current; if (t->rcu_read_lock_nesting == 0) { - rcu_preempt_qs(cpu); + rcu_preempt_qs(); return; } if (t->rcu_read_lock_nesting > 0 && - per_cpu(rcu_preempt_data, cpu).qs_pending) + per_cpu(rcu_preempt_data, cpu).qs_pending && + !per_cpu(rcu_preempt_data, cpu).passed_quiesce) t->rcu_read_unlock_special.b.need_qs = true; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 5918d227730f..348ec763b104 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -278,7 +278,7 @@ restart: pending >>= softirq_bit; } - rcu_bh_qs(smp_processor_id()); + rcu_bh_qs(); local_irq_disable(); pending = local_softirq_pending(); -- cgit v1.2.3 From a34375ef9e65340a138fc0be287de5c940d260fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Sep 2014 09:51:30 +0900 Subject: percpu-refcount: add @gfp to percpu_ref_init() Percpu allocator now supports allocation mask. Add @gfp to percpu_ref_init() so that !GFP_KERNEL allocation masks can be used with percpu_refs too. This patch doesn't make any functional difference. v2: blk-mq conversion was missing. Updated. Signed-off-by: Tejun Heo Cc: Kent Overstreet Cc: Benjamin LaHaise Cc: Li Zefan Cc: Nicholas A. 
Bellinger Cc: Jens Axboe --- block/blk-mq.c | 3 ++- drivers/target/target_core_tpg.c | 3 ++- fs/aio.c | 4 ++-- include/linux/percpu-refcount.h | 3 ++- kernel/cgroup.c | 6 +++--- lib/percpu-refcount.c | 6 ++++-- 6 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5189cb1e478a..702df07b980d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1776,7 +1776,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!q) goto err_hctxs; - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) + if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, + GFP_KERNEL)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index fddfae61222f..4ab6da338585 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -819,7 +819,8 @@ int core_tpg_add_lun( { int ret; - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release); + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, + GFP_KERNEL); if (ret < 0) return ret; diff --git a/fs/aio.c b/fs/aio.c index bd7ec2cc2674..93fbcc0f5696 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -666,10 +666,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3dfbf237cd8f..ee8325122dbd 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -49,6 +49,7 @@ #include #include #include +#include struct percpu_ref; typedef void 
(percpu_ref_func_t)(struct percpu_ref *); @@ -66,7 +67,7 @@ struct percpu_ref { }; int __must_check percpu_ref_init(struct percpu_ref *ref, - percpu_ref_func_t *release); + percpu_ref_func_t *release, gfp_t gfp); void percpu_ref_reinit(struct percpu_ref *ref); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7dc8788cfd52..589b4d89a0a5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1628,7 +1628,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, GFP_KERNEL); if (ret) goto out; @@ -4487,7 +4487,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release); + err = percpu_ref_init(&css->refcnt, css_release, GFP_KERNEL); if (err) goto err_free_css; @@ -4555,7 +4555,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, GFP_KERNEL); if (ret) goto out_free_cgrp; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index fe5a3342e960..ff9903264a91 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -40,6 +40,7 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 + * @gfp: allocation mask to use * * Initializes the refcount in single atomic counter mode with a refcount of 1; * analagous to atomic_set(ref, 1). 
@@ -47,11 +48,12 @@ static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ -int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) +int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, + gfp_t gfp) { atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); - ref->pcpu_count_ptr = (unsigned long)alloc_percpu(unsigned); + ref->pcpu_count_ptr = (unsigned long)alloc_percpu_gfp(unsigned, gfp); if (!ref->pcpu_count_ptr) return -ENOMEM; -- cgit v1.2.3 From 90ed9cbe765ad358b3151a12b8bf889a3cbcd573 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 15 Aug 2014 16:05:36 -0400 Subject: exit: Always reap resource stats in __exit_signal() Oleg pointed out that wait_task_zombie adds a task's usage statistics to the parent's signal struct, but the task's own signal struct should also propagate the statistics at exit time. This allows thread_group_cputime(reaped_zombie) to get the statistics after __unhash_process() has made the task invisible to for_each_thread, but before the thread has actually been rcu freed, making sure no non-monotonic results are returned inside that window. 
Suggested-by: Oleg Nesterov Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: David Rientjes Cc: Guillaume Morin Cc: Ionut Alexa Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/1408133138-22048-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/exit.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 32c58f7433a3..b93d46dab6fc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -115,30 +115,29 @@ static void __exit_signal(struct task_struct *tsk) if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + /* + * Accumulate here the counters for all threads but the group leader + * as they die, so they can be added into the process-wide totals + * when those are taken. 
The group leader stays around as a zombie as + * long as there are other threads. When it gets reaped, the exit.c + * code will add its counts into these totals. We won't ever get here + * for the group leader, since it will have been the last reference on + * the signal_struct. + */ + task_cputime(tsk, &utime, &stime); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); -- cgit v1.2.3 From e78c3496790ee8a36522a838b59b388e8a709e65 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 16 Aug 2014 13:40:10 -0400 Subject: time, signal: Protect resource use statistics with seqlock Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability issues on large systems, due to both functions being serialized with a lock. The lock protects against reporting a wrong value, due to a thread in the task group exiting, its statistics reporting up to the signal struct, and that exited task's statistics being counted twice (or not at all). Protecting that with a lock results in times() and clock_gettime() being completely serialized on large systems. This can be fixed by using a seqlock around the events that gather and propagate statistics. As an additional benefit, the protection code can be moved into thread_group_cputime(), slightly simplifying the calling functions. In the case of posix_cpu_clock_get_task() things can be simplified a lot, because the calling function already ensures that the task sticks around, and the rest is now taken care of in thread_group_cputime(). This way the statistics reporting code can run lockless. 
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Alex Thorlton Cc: Andrew Morton Cc: Daeseok Youn Cc: David Rientjes Cc: Dongsheng Yang Cc: Geert Uytterhoeven Cc: Guillaume Morin Cc: Ionut Alexa Cc: Kees Cook Cc: Linus Torvalds Cc: Li Zefan Cc: Michal Hocko Cc: Michal Schmidt Cc: Oleg Nesterov Cc: Vladimir Davydov Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/exit.c | 4 ++++ kernel/fork.c | 1 + kernel/sched/cputime.c | 33 ++++++++++++++++++++------------- kernel/sys.c | 2 -- kernel/time/posix-cpu-timers.c | 14 -------------- 6 files changed, 26 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885ee52b..dd9eb4807389 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -645,6 +645,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ + seqlock_t stats_lock; cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; diff --git a/kernel/exit.c b/kernel/exit.c index b93d46dab6fc..fa09b86609db 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -127,6 +127,7 @@ static void __exit_signal(struct task_struct *tsk) * the signal_struct. 
*/ task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); @@ -140,6 +141,7 @@ static void __exit_signal(struct task_struct *tsk) sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -1042,6 +1044,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1064,6 +1067,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); + write_sequnlock(&psig->stats_lock); spin_unlock_irq(&p->real_parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb6e491..9387ae8ab048 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1068,6 +1068,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); + seqlock_init(&sig->stats_lock); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 3e52836359ba..49b7cfe98f7a 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,18 +288,28 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) struct signal_struct *sig = tsk->signal; cputime_t utime, stime; struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = 
sig->sum_sched_runtime; + unsigned int seq, nextseq; rcu_read_lock(); - for_each_thread(tsk, t) { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } + /* Attempt a lockless read on the first round. */ + nextseq = 0; + do { + seq = nextseq; + read_seqbegin_or_lock(&sig->stats_lock, &seq); + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + for_each_thread(tsk, t) { + task_cputime(t, &utime, &stime); + times->utime += utime; + times->stime += stime; + times->sum_exec_runtime += task_sched_runtime(t); + } + /* If lockless access failed, take the lock. */ + nextseq = 1; + } while (need_seqretry(&sig->stats_lock, seq)); + done_seqretry(&sig->stats_lock, seq); rcu_read_unlock(); } @@ -611,9 +621,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) cputime_adjust(&cputime, &p->prev_cputime, ut, st); } -/* - * Must be called with siglock held. 
- */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a26..b6636643cbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -862,11 +862,9 @@ void do_sys_times(struct tms *tms) { cputime_t tgutime, tgstime, cutime, cstime; - spin_lock_irq(&current->sighand->siglock); thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; - spin_unlock_irq(&current->sighand->siglock); tms->tms_utime = cputime_to_clock_t(tgutime); tms->tms_stime = cputime_to_clock_t(tgstime); tms->tms_cutime = cputime_to_clock_t(cutime); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..492b986195d5 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - unsigned long flags; - struct sighand_struct *sighand; - - /* - * while_each_thread() is not yet entirely RCU safe, - * keep locking the group while sampling process - * clock for now. - */ - sighand = lock_task_sighand(tsk, &flags); - if (!sighand) - return err; - if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - - unlock_task_sighand(tsk, &flags); } if (!err) -- cgit v1.2.3 From eb1b4af0a64ac7bb0ee36f579c1c7cefcbc3ac2c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 15 Aug 2014 16:05:38 -0400 Subject: sched, time: Atomically increment stime & utime The functions task_cputime_adjusted and thread_group_cputime_adjusted() can be called locklessly, as well as concurrently on many different CPUs. This can occasionally lead to the utime and stime reported by times(), and other syscalls like it, going backward. 
The cause for this appears to be multiple threads racing in cputime_adjust(), both with values for utime or stime that is larger than the original, but each with a different value. Sometimes the larger value gets saved first, only to be immediately overwritten with a smaller value by another thread. Using atomic exchange prevents that problem, and ensures time progresses monotonically. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: akpm@linux-foundation.org Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/1408133138-22048-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 49b7cfe98f7a..2b57031afc19 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -602,9 +602,12 @@ static void cputime_adjust(struct task_cputime *curr, * If the tick based count grows faster than the scheduler one, * the result of the scaling may go backward. * Let's enforce monotonicity. + * Atomic exchange protects against concurrent cputime_adjust(). */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); + while (stime > (rtime = ACCESS_ONCE(prev->stime))) + cmpxchg(&prev->stime, rtime, stime); + while (utime > (rtime = ACCESS_ONCE(prev->utime))) + cmpxchg(&prev->utime, rtime, utime); out: *ut = prev->utime; -- cgit v1.2.3 From bc7115b1447fe88d065e7f85078ed776ebe7be74 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 2 Sep 2014 11:54:39 -0700 Subject: PM / sleep: Support freeze as test_suspend option Added freeze as one of the option for test_suspend boot param. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend_test.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index bd91bc177c93..379f36de348a 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -100,7 +100,14 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) if (state == PM_SUSPEND_STANDBY) { printk(info_test, pm_states[state]); status = pm_suspend(state); + if (status < 0) + state = PM_SUSPEND_FREEZE; } + if (state == PM_SUSPEND_FREEZE) { + printk(info_test, pm_states[state]); + status = pm_suspend(state); + } + if (status < 0) printk(err_suspend, status); -- cgit v1.2.3 From 2ce986892faf843785f8cdab1c2ed6cd4a3c20aa Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 2 Sep 2014 11:54:40 -0700 Subject: PM / sleep: Enhance test_suspend option with repeat capability Enhanced test_suspend boot parameter to repeat tests multiple times, by adding optional repeat count. The new boot param syntax: test_suspend="mem|freeze|standby[,N]" Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend_test.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 379f36de348a..084452e34a12 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -22,6 +22,8 @@ #define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; +static u32 test_repeat_count_max = 1; +static u32 test_repeat_count_current; void suspend_test_start(void) { @@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) int status; /* this may fail if the RTC hasn't been initialized */ +repeat: status = rtc_read_time(rtc, &alm.time); if (status < 0) { printk(err_readtime, dev_name(&rtc->dev), status); @@ -111,6 +114,10 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) if (status < 0) printk(err_suspend, status); + test_repeat_count_current++; + if (test_repeat_count_current < test_repeat_count_max) + goto repeat; + /* Some platforms can't detect that the alarm triggered the * wakeup, or (accordingly) disable it after it afterwards. * It's supposed to give oneshot behavior; cope. 
@@ -144,16 +151,28 @@ static char warn_bad_state[] __initdata = static int __init setup_test_suspend(char *value) { int i; + char *repeat; + char *suspend_type; - /* "=mem" ==> "mem" */ + /* example : "=mem[,N]" ==> "mem[,N]" */ value++; + suspend_type = strsep(&value, ","); + if (!suspend_type) + return 0; + + repeat = strsep(&value, ","); + if (repeat) { + if (kstrtou32(repeat, 0, &test_repeat_count_max)) + return 0; + } + for (i = 0; pm_labels[i]; i++) - if (!strcmp(pm_labels[i], value)) { + if (!strcmp(pm_labels[i], suspend_type)) { test_state_label = pm_labels[i]; return 0; } - printk(warn_bad_state, value); + printk(warn_bad_state, suspend_type); return 0; } __setup("test_suspend", setup_test_suspend); -- cgit v1.2.3 From 5cd038f53ed9ec7a17ab7d536a727363080f4210 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 4 Jun 2014 16:25:15 +0800 Subject: sched: Migrate waking tasks Current code can fail to migrate a waking task (silently) when TTWU_QUEUE is enabled. When a task is waking, it is pending on the wake_list of the rq, but it is not queued (task->on_rq == 0). In this case, set_cpus_allowed_ptr() and __migrate_task() will not migrate it because its invisible to them. This behavior is incorrect, because the task has been already woken, it will be running on the wrong CPU without correct placement until the next wake-up or update for cpus_allowed. To fix this problem, we need to finish the wakeup (so they appear on the runqueue) before we migrate them. Reported-by: Sasha Levin Reported-by: Jason J. Herne Tested-by: Jason J. 
Herne Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/538ED7EB.5050303@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a814b3c88029..78e5c839df13 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4666,7 +4666,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (task_on_rq_queued(p)) { + if (task_on_rq_queued(p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); @@ -4799,6 +4799,12 @@ static int migration_cpu_stop(void *data) * be on another cpu but it doesn't matter. */ local_irq_disable(); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_allowed + * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. + */ + sched_ttwu_pending(); __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); local_irq_enable(); return 0; -- cgit v1.2.3 From 8236d907ab3411ad452280faa8b26c1347327380 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 2 Sep 2014 00:41:24 -0700 Subject: sched: Reduce contention in update_cfs_rq_blocked_load() When running workloads on 2+ socket systems, based on perf profiles, the update_cfs_rq_blocked_load() function often shows up as taking up a noticeable % of run time. Much of the contention is in __update_cfs_rq_tg_load_contrib() when we update the tg load contribution stats. However, it turns out that in many cases, they don't need to be updated and "tg_contrib" is 0. This patch adds a check in __update_cfs_rq_tg_load_contrib() to skip updating tg load contribution stats when nothing needs to be updated. 
This reduces the cacheline contention that would be unnecessary. Reviewed-by: Ben Segall Reviewed-by: Waiman Long Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Paul Turner Cc: jason.low2@hp.com Cc: Yuyang Du Cc: Aswin Chandramouleeswaran Cc: Chegu Vinod Cc: Scott J Norton Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409643684.19197.15.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 50d2025c1777..be9e97b0d76f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2382,6 +2382,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; tg_contrib -= cfs_rq->tg_load_contrib; + if (!tg_contrib) + return; + if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { atomic_long_add(tg_contrib, &tg->load_avg); cfs_rq->tg_load_contrib += tg_contrib; -- cgit v1.2.3 From e0455e194a5e0cf49bc7596a20d4f7e47995b9c6 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Fri, 22 Aug 2014 17:15:36 +0300 Subject: perf/callchain: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. 
The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Peter Zijlstra (Intel) Cc: paulmck@linux.vnet.ibm.com Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20140822141536.GA32051@ada Signed-off-by: Ingo Molnar --- kernel/events/callchain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 97b67df8fbfe..f2a88de87a49 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -52,7 +52,7 @@ static void release_callchain_buffers(void) struct callchain_cpus_entries *entries; entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); + RCU_INIT_POINTER(callchain_cpus_entries, NULL); call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); } -- cgit v1.2.3 From 70691d4a0bf7c871559d4ef1b0056edefbca123b Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Fri, 22 Aug 2014 16:26:05 +0300 Subject: perf/core: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. 
The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20140822132605.GA20130@ada Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 01bd42ed516c..f917dec6f897 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5908,7 +5908,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) if (!hlist) return; - rcu_assign_pointer(swhash->swevent_hlist, NULL); + RCU_INIT_POINTER(swhash->swevent_hlist, NULL); kfree_rcu(hlist, rcu_head); } -- cgit v1.2.3 From 3577af70a2ce4853d58e57d832e687d739281479 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 2 Sep 2014 15:27:20 -0700 Subject: perf: Fix a race condition in perf_remove_from_context() We saw a kernel soft lockup in perf_remove_from_context(), it looks like the `perf` process, when exiting, could not go out of the retry loop. Meanwhile, the target process was forking a child. So either the target process should execute the smp function call to deactivate the event (if it was running) or it should do a context switch which deactivates the event. It seems we optimize out a context switch in perf_event_context_sched_out(), and what's more important, we still test an obsolete task pointer when retrying, so no one actually would deactivate that event in this situation. Fix it directly by reloading the task pointer in perf_remove_from_context(). This should cure the above soft lockup. 
Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/1409696840-843-1-git-send-email-xiyou.wangcong@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f9c1ed002dbc..d640a8b4dcbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1524,6 +1524,11 @@ retry: */ if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -1967,6 +1972,11 @@ retry: */ if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } -- cgit v1.2.3 From f1ff6348b30b3658d138f05643149706f99078ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 22 Jul 2014 20:16:57 -0400 Subject: ftrace: Add separate function for non recursive callbacks Instead of using the generic list function for callbacks that are not recursive, call a new helper function from the mcount trampoline called ftrace_ops_recur_func() that will do the recursion checking for the callback. This eliminates an indirection as well as will help in future code that will use dynamically allocated trampolines. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5916a8e59e87..17b606362ab4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -113,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; +static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs); + #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs); @@ -258,11 +261,18 @@ static void update_ftrace_function(void) if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && - (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ set_function_trace_op = ftrace_ops_list; - func = ftrace_ops_list->func; + /* + * If the func handles its own recursion, call it directly. + * Otherwise call the recursion protected function that + * will call the ftrace ops function. + */ + if (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) + func = ftrace_ops_list->func; + else + func = ftrace_ops_recurs_func; } else { /* Just use the default ftrace_ops */ set_function_trace_op = &ftrace_list_end; @@ -4827,6 +4837,25 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) } #endif +/* + * If there's only one function registered but it does not support + * recursion, this function will be called by the mcount trampoline. + * This function will handle recursion protection. 
+ */ +static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *regs) +{ + int bit; + + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + if (bit < 0) + return; + + op->func(ip, parent_ip, op, regs); + + trace_clear_recursion(bit); +} + static void clear_ftrace_swapper(void) { struct task_struct *p; -- cgit v1.2.3 From 02ab695bb37ee9ad515df0d0790d5977505dd04a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Sep 2014 22:17:17 -0700 Subject: net: filter: add "load 64-bit immediate" eBPF instruction add BPF_LD_IMM64 instruction to load 64-bit immediate value into a register. All previous instructions were 8-byte. This is first 16-byte instruction. Two consecutive 'struct bpf_insn' blocks are interpreted as single instruction: insn[0].code = BPF_LD | BPF_DW | BPF_IMM insn[0].dst_reg = destination register insn[0].imm = lower 32-bit insn[1].code = 0 insn[1].imm = upper 32-bit All unused fields must be zero. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. x64 JITs it as single 'movabsq %rax, imm64' arm64 may JIT as sequence of four 'movk x0, #imm16, lsl #shift' insn Note that old eBPF programs are binary compatible with new interpreter. It helps eBPF programs load 64-bit constant into a register with one instruction instead of using two registers and 4 instructions: BPF_MOV32_IMM(R1, imm32) BPF_ALU64_IMM(BPF_LSH, R1, 32) BPF_MOV32_IMM(R2, imm32) BPF_ALU64_REG(BPF_OR, R1, R2) User space generated programs will use this instruction to load constants only. To tell kernel that user space needs a pointer the _pseudo_ variant of this instruction may be added later, which will use extra bits of encoding to indicate what type of pointer user space is asking kernel to provide. For example 'off' or 'src_reg' fields can be used for such purpose. 
src_reg = 1 could mean that user space is asking kernel to validate and load in-kernel map pointer. src_reg = 2 could mean that user space needs readonly data section pointer src_reg = 3 could mean that user space needs a pointer to per-cpu local data All such future pseudo instructions will not be carrying the actual pointer as part of the instruction, but rather will be treated as a request to kernel to provide one. The kernel will verify the request_for_a_pointer, then will drop _pseudo_ marking and will store actual internal pointer inside the instruction, so the end result is the interpreter and JITs never see pseudo BPF_LD_IMM64 insns and only operate on generic BPF_LD_IMM64 that loads 64-bit immediate into a register. User space never operates on direct pointers and verifier can easily recognize request_for_pointer vs other instructions. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 8 +++++++- arch/x86/net/bpf_jit_comp.c | 17 +++++++++++++++++ include/linux/filter.h | 18 ++++++++++++++++++ kernel/bpf/core.c | 5 +++++ lib/test_bpf.c | 21 +++++++++++++++++++++ 5 files changed, 68 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index c48a9704bda8..81916ab5d96f 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -951,7 +951,7 @@ Size modifier is one of ... Mode modifier is one of: - BPF_IMM 0x00 /* classic BPF only, reserved in eBPF */ + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ BPF_ABS 0x20 BPF_IND 0x40 BPF_MEM 0x60 @@ -995,6 +995,12 @@ BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and 2 byte atomic increments are not supported. 
+eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists +of two consecutive 'struct bpf_insn' 8-byte blocks and interpreted as single +instruction that loads 64-bit immediate value into a dst_reg. +Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads +32-bit immediate value into a register. + Testing ------- diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 39ccfbb4a723..06f8c17f5484 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -393,6 +393,23 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, EMIT1_off32(add_1reg(0xB8, dst_reg), imm32); break; + case BPF_LD | BPF_IMM | BPF_DW: + if (insn[1].code != 0 || insn[1].src_reg != 0 || + insn[1].dst_reg != 0 || insn[1].off != 0) { + /* verifier must catch invalid insns */ + pr_err("invalid BPF_LD_IMM64 insn\n"); + return -EINVAL; + } + + /* movabsq %rax, imm64 */ + EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg)); + EMIT(insn[0].imm, 4); + EMIT(insn[1].imm, 4); + + insn++; + i++; + break; + /* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */ case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_DIV | BPF_X: diff --git a/include/linux/filter.h b/include/linux/filter.h index c78994593355..bf323da77950 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -166,6 +166,24 @@ enum { .off = 0, \ .imm = IMM }) +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define 
BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b54bb2c2e494..2c2bfaacce66 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -242,6 +242,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; void *ptr; int off; @@ -301,6 +302,10 @@ select_insn: ALU64_MOV_K: DST = IMM; CONT; + LD_IMM_DW: + DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; + insn++; + CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; CONT; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9a67456ba29a..413890815d3e 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -1735,6 +1735,27 @@ static struct bpf_test tests[] = { { }, { { 1, 0 } }, }, + { + "load 64-bit immediate", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x567800001234L), + BPF_MOV64_REG(R2, R1), + BPF_MOV64_REG(R3, R2), + BPF_ALU64_IMM(BPF_RSH, R2, 32), + BPF_ALU64_IMM(BPF_LSH, R3, 32), + BPF_ALU64_IMM(BPF_RSH, R3, 32), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_JMP_IMM(BPF_JEQ, R2, 0x5678, 1), + BPF_EXIT_INSN(), + BPF_JMP_IMM(BPF_JEQ, R3, 0x1234, 1), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } } + }, }; static struct net_device dev; -- cgit v1.2.3 From 87354059881ce9315181604dc17076c535f4d744 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 22 Jul 2014 20:41:42 -0400 Subject: ftrace: Add helper function ftrace_ops_get_func() Add the helper function to what the mcount trampoline is to call for a ftrace_ops function. This helper will be used by arch code in the future to set up dynamic trampolines. But as this does the same tests that are performed in choosing what function to call for the default mcount trampoline, might as well use it to clean up the existing code. 
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 47 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 37 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f0b0edbf55a9..ef37286547fc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -56,6 +56,8 @@ struct ftrace_ops; typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs); +ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); + /* * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are * set in the flags member. diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 17b606362ab4..dabf734f909c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -259,20 +259,12 @@ static void update_ftrace_function(void) * then have the mcount trampoline call the function directly. */ if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && - !FTRACE_FORCE_LIST_FUNC)) { + (ftrace_ops_list->next == &ftrace_list_end)) { + /* Set the ftrace_ops that the arch callback uses */ set_function_trace_op = ftrace_ops_list; - /* - * If the func handles its own recursion, call it directly. - * Otherwise call the recursion protected function that - * will call the ftrace ops function. 
- */ - if (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) - func = ftrace_ops_list->func; - else - func = ftrace_ops_recurs_func; + + func = ftrace_ops_get_func(ftrace_ops_list); } else { /* Just use the default ftrace_ops */ set_function_trace_op = &ftrace_list_end; @@ -4856,6 +4848,37 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, trace_clear_recursion(bit); } +/** + * ftrace_ops_get_func - get the function a trampoline should call + * @ops: the ops to get the function for + * + * Normally the mcount trampoline will call the ops->func, but there + * are times that it should not. For example, if the ops does not + * have its own recursion protection, then it should call the + * ftrace_ops_recurs_func() instead. + * + * Returns the function that the trampoline should call for @ops. + */ +ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) +{ + /* + * If this is a dynamic ops or we force list func, + * then it needs to call the list anyway. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + return ftrace_ops_list_func; + + /* + * If the func handles its own recursion, call it directly. + * Otherwise call the recursion protected function that + * will call the ftrace ops function. 
+ */ + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) + return ftrace_ops_recurs_func; + + return ops->func; +} + static void clear_ftrace_swapper(void) { struct task_struct *p; -- cgit v1.2.3 From 738cbe72adc5c8f2016c4c68aa5162631d4f27e1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 8 Sep 2014 08:04:47 +0200 Subject: net: bpf: consolidate JIT binary allocator Introduced in commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") and later on replicated in aa2d2c73c21f ("s390/bpf,jit: address randomize and write protect jit code") for s390 architecture, write protection for BPF JIT images got added and a random start address of the JIT code, so that it's not on a page boundary anymore. Since both use a very similar allocator for the BPF binary header, we can consolidate this code into the BPF core as it's mostly JIT independent anyway. This will also allow for future archs that support DEBUG_SET_MODULE_RONX to just reuse instead of reimplementing it. JIT tested on x86_64 and s390x with BPF test suite. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Eric Dumazet Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: David S. 
Miller --- arch/s390/net/bpf_jit_comp.c | 45 ++++++++------------------------------- arch/x86/net/bpf_jit_comp.c | 50 ++++++++++---------------------------------- include/linux/filter.h | 13 ++++++++++++ kernel/bpf/core.c | 39 ++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index f2833c5b218a..b734f975c22e 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -5,11 +5,9 @@ * * Author(s): Martin Schwidefsky */ -#include #include #include #include -#include #include #include #include @@ -148,6 +146,12 @@ struct bpf_jit { ret; \ }) +static void bpf_jit_fill_hole(void *area, unsigned int size) +{ + /* Fill whole space with illegal instructions */ + memset(area, 0, size); +} + static void bpf_jit_prologue(struct bpf_jit *jit) { /* Save registers and create stack frame if necessary */ @@ -780,38 +784,6 @@ out: return -1; } -/* - * Note: for security reasons, bpf code will follow a randomly - * sized amount of illegal instructions. - */ -struct bpf_binary_header { - unsigned int pages; - u8 image[]; -}; - -static struct bpf_binary_header *bpf_alloc_binary(unsigned int bpfsize, - u8 **image_ptr) -{ - struct bpf_binary_header *header; - unsigned int sz, hole; - - /* Most BPF filters are really small, but if some of them fill a page, - * allow at least 128 extra bytes for illegal instructions. - */ - sz = round_up(bpfsize + sizeof(*header) + 128, PAGE_SIZE); - header = module_alloc(sz); - if (!header) - return NULL; - memset(header, 0, sz); - header->pages = sz / PAGE_SIZE; - hole = min(sz - (bpfsize + sizeof(*header)), PAGE_SIZE - sizeof(*header)); - /* Insert random number of illegal instructions before BPF code - * and make sure the first instruction starts at an even address. 
- */ - *image_ptr = &header->image[(prandom_u32() % hole) & -2]; - return header; -} - void bpf_jit_compile(struct bpf_prog *fp) { struct bpf_binary_header *header = NULL; @@ -850,7 +822,8 @@ void bpf_jit_compile(struct bpf_prog *fp) size = prg_len + lit_len; if (size >= BPF_SIZE_MAX) goto out; - header = bpf_alloc_binary(size, &jit.start); + header = bpf_jit_binary_alloc(size, &jit.start, + 2, bpf_jit_fill_hole); if (!header) goto out; jit.prg = jit.mid = jit.start + prg_len; @@ -884,7 +857,7 @@ void bpf_jit_free(struct bpf_prog *fp) goto free_filter; set_memory_rw(addr, header->pages); - module_free(NULL, header); + bpf_jit_binary_free(header); free_filter: bpf_prog_unlock_free(fp); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 06f8c17f5484..9de0b5476b0c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,12 +8,10 @@ * as published by the Free Software Foundation; version 2 * of the License. */ -#include -#include #include #include #include -#include +#include int bpf_jit_enable __read_mostly; @@ -109,39 +107,6 @@ static inline void bpf_flush_icache(void *start, void *end) #define CHOOSE_LOAD_FUNC(K, func) \ ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? 
func##_negative_offset : func) : func##_positive_offset) -struct bpf_binary_header { - unsigned int pages; - /* Note : for security reasons, bpf code will follow a randomly - * sized amount of int3 instructions - */ - u8 image[]; -}; - -static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen, - u8 **image_ptr) -{ - unsigned int sz, hole; - struct bpf_binary_header *header; - - /* Most of BPF filters are really small, - * but if some of them fill a page, allow at least - * 128 extra bytes to insert a random section of int3 - */ - sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE); - header = module_alloc(sz); - if (!header) - return NULL; - - memset(header, 0xcc, sz); /* fill whole space with int3 instructions */ - - header->pages = sz / PAGE_SIZE; - hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header)); - - /* insert a random number of int3 instructions before BPF code */ - *image_ptr = &header->image[prandom_u32() % hole]; - return header; -} - /* pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_REG + 1) @@ -206,6 +171,12 @@ static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg) return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3); } +static void jit_fill_hole(void *area, unsigned int size) +{ + /* fill whole space with int3 instructions */ + memset(area, 0xcc, size); +} + struct jit_context { unsigned int cleanup_addr; /* epilogue code offset */ bool seen_ld_abs; @@ -959,7 +930,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) if (proglen <= 0) { image = NULL; if (header) - module_free(NULL, header); + bpf_jit_binary_free(header); goto out; } if (image) { @@ -969,7 +940,8 @@ void bpf_int_jit_compile(struct bpf_prog *prog) break; } if (proglen == oldproglen) { - header = bpf_alloc_binary(proglen, &image); + header = bpf_jit_binary_alloc(proglen, &image, + 1, jit_fill_hole); if (!header) goto out; } @@ -998,7 +970,7 @@ void bpf_jit_free(struct bpf_prog *fp) goto 
free_filter; set_memory_rw(addr, header->pages); - module_free(NULL, header); + bpf_jit_binary_free(header); free_filter: bpf_prog_unlock_free(fp); diff --git a/include/linux/filter.h b/include/linux/filter.h index 8f82ef3f1cdd..868764fcffb8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -289,6 +289,11 @@ struct sock_fprog_kern { struct sock_filter *filter; }; +struct bpf_binary_header { + unsigned int pages; + u8 image[]; +}; + struct bpf_work_struct { struct bpf_prog *prog; struct work_struct work; @@ -358,6 +363,14 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); +typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); + +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_jit_binary_free(struct bpf_binary_header *hdr); + static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { bpf_prog_unlock_ro(fp); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2c2bfaacce66..8ee520f0ec70 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -20,9 +20,12 @@ * Andi Kleen - Fix a few bad bugs and races. * Kris Katterjohn - Added many additional checks in bpf_check_classic() */ + #include #include #include +#include +#include #include /* Registers */ @@ -125,6 +128,42 @@ void __bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(__bpf_prog_free); +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *hdr; + unsigned int size, hole, start; + + /* Most of BPF filters are really small, but if some of them + * fill a page, allow at least 128 extra bytes to insert a + * random section of illegal instructions. 
+ */ + size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + hdr = module_alloc(size); + if (hdr == NULL) + return NULL; + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(hdr, size); + + hdr->pages = size / PAGE_SIZE; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), + PAGE_SIZE - sizeof(*hdr)); + start = (prandom_u32() % hole) & ~(alignment - 1); + + /* Leave a random number of instructions before BPF code. */ + *image_ptr = &hdr->image[start]; + + return hdr; +} + +void bpf_jit_binary_free(struct bpf_binary_header *hdr) +{ + module_free(NULL, hdr); +} + /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. -- cgit v1.2.3 From f7aad4e1a8221210db7eb434349cc6fe87aeee8c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 10 Sep 2014 10:42:46 -0400 Subject: ftrace: Set callback to ftrace_stub when no ops are registered The clean up that adds the helper function ftrace_ops_get_func() caused the default function to not change when DYNAMIC_FTRACE was not set and no ftrace_ops were registered. Although static tracing is not very useful (not having DYNAMIC_FTRACE set), it is still supported and we don't want to break it. Clean up the if statement even more to specifically have the default function call ftrace_stub when no ftrace_ops are registered. This fixes the small bug for static tracing as well as makes the code a bit more understandable. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dabf734f909c..708aea493d96 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -253,18 +253,25 @@ static void update_ftrace_function(void) { ftrace_func_t func; + /* + * Prepare the ftrace_ops that the arch callback will use. + * If there's only one ftrace_ops registered, the ftrace_ops_list + * will point to the ops we want. + */ + set_function_trace_op = ftrace_ops_list; + + /* If there's no ftrace_ops registered, just call the stub function */ + if (ftrace_ops_list == &ftrace_list_end) { + func = ftrace_stub; + /* * If we are at the end of the list and this ops is * recursion safe and not dynamic and the arch supports passing ops, * then have the mcount trampoline call the function directly. */ - if (ftrace_ops_list == &ftrace_list_end || - (ftrace_ops_list->next == &ftrace_list_end)) { - - /* Set the ftrace_ops that the arch callback uses */ - set_function_trace_op = ftrace_ops_list; - + } else if (ftrace_ops_list->next == &ftrace_list_end) { func = ftrace_ops_get_func(ftrace_ops_list); + } else { /* Just use the default ftrace_ops */ set_function_trace_op = &ftrace_list_end; -- cgit v1.2.3 From 3296fc4e2509fa8870923ed52e7990040b151847 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 24 Jul 2014 15:33:41 -0400 Subject: ftrace: Remove freeing of old_hash from ftrace_hash_move() ftrace_hash_move() currently frees the old hash that is passed to it after replacing the pointer with the new hash. Instead of having the function do that chore, have the caller perform the free. This lets the ftrace_hash_move() be used a bit more freely, which is needed for changing the way the trampoline logic is done. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 708aea493d96..2c4eef49b1af 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1316,7 +1316,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_func_entry *entry; struct hlist_node *tn; struct hlist_head *hhd; - struct ftrace_hash *old_hash; struct ftrace_hash *new_hash; int size = src->count; int bits = 0; @@ -1361,9 +1360,7 @@ update: */ ftrace_hash_rec_disable_modify(ops, enable); - old_hash = *dst; rcu_assign_pointer(*dst, new_hash); - free_ftrace_hash_rcu(old_hash); ftrace_hash_rec_enable_modify(ops, enable); @@ -3408,6 +3405,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, { struct ftrace_func_probe *entry; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; + struct ftrace_hash *old_hash = *orig_hash; struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -3426,7 +3424,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&trace_probe_ops.func_hash->regex_lock); - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); if (!hash) { count = -ENOMEM; goto out; @@ -3485,7 +3483,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); - if (ret < 0) + if (!ret) + free_ftrace_hash_rcu(old_hash); + else count = ret; __enable_ftrace_function_probe(); @@ -3512,6 +3512,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, struct ftrace_func_probe *entry; struct ftrace_func_probe *p; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; + struct 
ftrace_hash *old_hash = *orig_hash; struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *tmp; @@ -3519,6 +3520,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, int type = MATCH_FULL; int i, len = 0; char *search; + int ret; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) glob = NULL; @@ -3577,8 +3579,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, * Remove after the disable is called. Otherwise, if the last * probe is removed, a null hash means *all enabled*. */ - ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); synchronize_sched(); + if (!ret) + free_ftrace_hash_rcu(old_hash); + list_for_each_entry_safe(entry, p, &free_list, free_list) { list_del(&entry->free_list); ftrace_free_entry(entry); @@ -3776,6 +3781,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct ftrace_hash *hash; int ret; @@ -3810,10 +3816,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } mutex_lock(&ftrace_lock); + old_hash = *orig_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret) + if (!ret) { ftrace_ops_update_code(ops); - + free_ftrace_hash_rcu(old_hash); + } mutex_unlock(&ftrace_lock); out_regex_unlock: @@ -4022,6 +4030,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct trace_parser *parser; int filter_hash; int ret; @@ -4051,11 +4060,13 @@ int ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); + old_hash = *orig_hash; ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, 
iter->hash); - if (!ret) + if (!ret) { ftrace_ops_update_code(iter->ops); - + free_ftrace_hash_rcu(old_hash); + } mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From 5fecaa044af3dc52e4bc138842bdf1c6676105b1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 24 Jul 2014 16:00:31 -0400 Subject: ftrace: Grab any ops for a rec for enabled_functions output When dumping the enabled_functions, use the first op that is found with a trampoline to the record, as there should only be one, as only one ops can be registered to a function that has a trampoline. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c4eef49b1af..858ac16f8492 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1900,6 +1900,25 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) return ftrace_check_record(rec, enable, 0); } +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (!op->trampoline) + continue; + + if (ftrace_lookup_ip(op->func_hash->filter_hash, rec->ip) && + (ftrace_hash_empty(op->func_hash->notrace_hash) || + !ftrace_lookup_ip(op->func_hash->notrace_hash, rec->ip))) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + static struct ftrace_ops * ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) { @@ -2966,7 +2985,7 @@ static int t_show(struct seq_file *m, void *v) if (rec->flags & FTRACE_FL_TRAMP_EN) { struct ftrace_ops *ops; - ops = ftrace_find_tramp_ops_curr(rec); + ops = ftrace_find_tramp_ops_any(rec); if (ops && ops->trampoline) seq_printf(m, "\ttramp: %pS", (void *)ops->trampoline); -- cgit v1.2.3 From e1effa0144a1ddf5b456c388ffaf784f3c5163fd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 5 Aug 2014 17:19:38 -0400 Subject: ftrace: 
Annotate the ops operation on update Add three new flags for ftrace_ops: FTRACE_OPS_FL_ADDING FTRACE_OPS_FL_REMOVING FTRACE_OPS_FL_MODIFYING These will be set for the ftrace_ops when they are first added to the function tracing, being removed from function tracing or just having their functions changed from function tracing, respectively. This will be needed to remove the tramp_hash, which can grow quite big. The tramp_hash is used to note what functions a ftrace_ops is using a trampoline for. Denoting which ftrace_ops is being modified, will allow us to use the ftrace_ops hashes themselves, which are much smaller as they have a global flag to denote if a ftrace_ops is tracing all functions, as well as a notrace hash if the ftrace_ops is tracing all but a few. The tramp_hash just creates a hash item for every function, which can go into the 10s of thousands if all functions are using the ftrace_ops trampoline. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 6 ++++++ kernel/trace/ftrace.c | 45 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ef37286547fc..d9216f6385d9 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -91,6 +91,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * INITIALIZED - The ftrace_ops has already been initialized (first use time * register_ftrace_function() is called, it will initialized the ops) * DELETED - The ops are being deleted, do not let them be registered again. + * ADDING - The ops is in the process of being added. + * REMOVING - The ops is in the process of being removed. + * MODIFYING - The ops is in the process of changing its filter functions. 
*/ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -102,6 +105,9 @@ enum { FTRACE_OPS_FL_STUB = 1 << 6, FTRACE_OPS_FL_INITIALIZED = 1 << 7, FTRACE_OPS_FL_DELETED = 1 << 8, + FTRACE_OPS_FL_ADDING = 1 << 9, + FTRACE_OPS_FL_REMOVING = 1 << 10, + FTRACE_OPS_FL_MODIFYING = 1 << 11, }; #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 858ac16f8492..e43c793093e5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1057,6 +1057,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid; static struct ftrace_ops *removed_ops; +/* + * Set when doing a global update, like enabling all recs or disabling them. + * It is not set when just updating a single ftrace_ops. + */ +static bool update_all_ops; + #ifndef CONFIG_FTRACE_MCOUNT_RECORD # error Dynamic ftrace depends on MCOUNT_RECORD #endif @@ -2366,6 +2372,13 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); } +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) +{ + ops->flags |= FTRACE_OPS_FL_MODIFYING; + ftrace_run_update_code(command); + ops->flags &= ~FTRACE_OPS_FL_MODIFYING; +} + static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; @@ -2387,6 +2400,13 @@ static void ftrace_startup_enable(int command) ftrace_run_update_code(command); } +static void ftrace_startup_all(int command) +{ + update_all_ops = true; + ftrace_startup_enable(command); + update_all_ops = false; +} + static int ftrace_startup(struct ftrace_ops *ops, int command) { int ret; @@ -2401,12 +2421,22 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; - ops->flags |= FTRACE_OPS_FL_ENABLED; + /* + * Note that ftrace probes uses this to start up + * and modify functions it will probe. But we still + * set the ADDING flag for modification, as probes + * do not have trampolines. 
If they add them in the + * future, then the probes will need to distinguish + * between adding and updating probes. + */ + ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); + ops->flags &= ~FTRACE_OPS_FL_ADDING; + return 0; } @@ -2456,11 +2486,12 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * If the ops uses a trampoline, then it needs to be * tested first on update. */ + ops->flags |= FTRACE_OPS_FL_REMOVING; removed_ops = ops; ftrace_run_update_code(command); - removed_ops = NULL; + ops->flags &= ~FTRACE_OPS_FL_REMOVING; /* * Dynamic ops may be freed, we must make sure that all @@ -3373,7 +3404,7 @@ static void __enable_ftrace_function_probe(void) if (ftrace_probe_registered) { /* still need to update the function call sites */ if (ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS); return; } @@ -3792,7 +3823,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) static void ftrace_ops_update_code(struct ftrace_ops *ops) { if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS); } static int @@ -4717,6 +4748,7 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } +static inline void ftrace_startup_all(int command) { } /* Keep as macros so we do not need to define the commands */ # define ftrace_startup(ops, command) \ ({ \ @@ -5016,7 +5048,8 @@ static int ftrace_pid_add(int p) set_ftrace_pid_task(pid); ftrace_update_pid_func(); - ftrace_startup_enable(0); + + ftrace_startup_all(0); mutex_unlock(&ftrace_lock); return 0; @@ -5045,7 +5078,7 @@ static void ftrace_pid_reset(void) } ftrace_update_pid_func(); - ftrace_startup_enable(0); + 
ftrace_startup_all(0); mutex_unlock(&ftrace_lock); } -- cgit v1.2.3 From fef5aeeee9e3717e7aea991a7ae9ff6a7a2d4c85 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 24 Jul 2014 12:25:47 -0400 Subject: ftrace: Replace tramp_hash with old_*_hash to save space Allowing function callbacks to declare their own trampolines requires that each ftrace_ops that has a trampoline must have some sort of accounting that keeps track of which ops has a trampoline attached to a record. The easy way to solve this was to add a "tramp_hash" that created a hash entry for every function that a ops uses with a trampoline. But since we can have literally tens of thousands of functions being traced, that means we need tens of thousands of descriptors to map the ops to the function in the hash. This is quite expensive and can cause enabling and disabling the function graph tracer to take some time to start and stop. It can take up to several seconds to disable or enable all functions in the function graph tracer for this reason. The better approach, albeit more complex, is to keep track of how ops are being enabled and disabled, and use that along with the counting of the number of ops attached to records, to determine what ops has a trampoline attached to a record at enabling and disabling of tracing. To do this, the tramp_hash has been replaced with an old_filter_hash and old_notrace_hash, which get the copy of the ops filter_hash and notrace_hash respectively. The old hashes are kept until the ops has been modified or removed and the old hashes are used with the logic of the accounting to determine the ops that have the trampoline of a record. The reason this has less of a footprint is due to the trick that an "empty" hash in the filter_hash means "all functions" and an empty hash in the notrace hash means "no functions" in the hash.
This is much more efficient, doesn't have the delay, and takes up much less memory, as we do not need to map all the functions but just figure out which functions are mapped at the time it is enabled or disabled. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 239 +++++++++++++++++-------------------------------- 2 files changed, 85 insertions(+), 156 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d9216f6385d9..662697babd48 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -140,7 +140,7 @@ struct ftrace_ops { int nr_trampolines; struct ftrace_ops_hash local_hash; struct ftrace_ops_hash *func_hash; - struct ftrace_hash *tramp_hash; + struct ftrace_ops_hash old_hash; unsigned long trampoline; #endif }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e43c793093e5..d325a1e76554 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1373,6 +1373,21 @@ update: return 0; } +static bool hash_contains_ip(unsigned long ip, + struct ftrace_ops_hash *hash) +{ + /* + * The function record is a match if it exists in the filter + * hash and not in the notrace hash. Note, an empty hash is + * considered a match for the filter hash, but an empty + * notrace hash is considered not in the notrace hash. + */ + return (ftrace_hash_empty(hash->filter_hash) || + ftrace_lookup_ip(hash->filter_hash, ip)) && + (ftrace_hash_empty(hash->notrace_hash) || + !ftrace_lookup_ip(hash->notrace_hash, ip)); +} + /* * Test the hashes for this ops to see if we want to call * the ops->func or not.
@@ -1388,8 +1403,7 @@ update: static int ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_ops_hash hash; int ret; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS @@ -1402,13 +1416,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 0; #endif - filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); - notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); + hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash); + hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash); - if ((ftrace_hash_empty(filter_hash) || - ftrace_lookup_ip(filter_hash, ip)) && - (ftrace_hash_empty(notrace_hash) || - !ftrace_lookup_ip(notrace_hash, ip))) + if (hash_contains_ip(ip, &hash)) ret = 1; else ret = 0; @@ -1520,46 +1531,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } -static void ftrace_remove_tramp(struct ftrace_ops *ops, - struct dyn_ftrace *rec) -{ - /* If TRAMP is not set, no ops should have a trampoline for this */ - if (!(rec->flags & FTRACE_FL_TRAMP)) - return; - - rec->flags &= ~FTRACE_FL_TRAMP; - - if ((!ftrace_hash_empty(ops->func_hash->filter_hash) && - !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) || - ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) - return; - /* - * The tramp_hash entry will be removed at time - * of update. - */ - ops->nr_trampolines--; -} - -static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops) -{ - struct ftrace_ops *op; - - /* If TRAMP is not set, no ops should have a trampoline for this */ - if (!(rec->flags & FTRACE_FL_TRAMP)) - return; - - do_for_each_ftrace_op(op, ftrace_ops_list) { - /* - * This function is called to clear other tramps - * not the one that is being updated. 
- */ - if (op == ops) - continue; - if (op->nr_trampolines) - ftrace_remove_tramp(op, rec); - } while_for_each_ftrace_op(op); -} - static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1648,18 +1619,16 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * function, and the ops has a trampoline registered * for it, then we can call it directly. */ - if (ftrace_rec_count(rec) == 1 && ops->trampoline) { + if (ftrace_rec_count(rec) == 1 && ops->trampoline) rec->flags |= FTRACE_FL_TRAMP; - ops->nr_trampolines++; - } else { + else /* * If we are adding another function callback * to this function, and the previous had a * custom trampoline in use, then we need to go * back to the default trampoline. */ - ftrace_clear_tramps(rec, ops); - } + rec->flags &= ~FTRACE_FL_TRAMP; /* * If any ops wants regs saved for this function @@ -1672,9 +1641,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, return; rec->flags--; - if (ops->trampoline && !ftrace_rec_count(rec)) - ftrace_remove_tramp(ops, rec); - /* * If the rec had REGS enabled and the ops that is * being removed had REGS set, then see if there is @@ -1688,6 +1654,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags &= ~FTRACE_FL_REGS; } + /* + * If the rec had TRAMP enabled, then it needs to + * be cleared. As TRAMP can only be enabled iff + * there is only a single ops attached to it. + * In otherwords, always disable it on decrementing. + * In the future, we may set it if rec count is + * decremented to one, and the ops that is left + * has a trampoline. + */ + rec->flags &= ~FTRACE_FL_TRAMP; + /* * flags will be cleared in ftrace_check_record() * if rec count is zero. 
@@ -1910,15 +1887,14 @@ static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) { struct ftrace_ops *op; + unsigned long ip = rec->ip; do_for_each_ftrace_op(op, ftrace_ops_list) { if (!op->trampoline) continue; - if (ftrace_lookup_ip(op->func_hash->filter_hash, rec->ip) && - (ftrace_hash_empty(op->func_hash->notrace_hash) || - !ftrace_lookup_ip(op->func_hash->notrace_hash, rec->ip))) + if (hash_contains_ip(ip, op->func_hash)) return op; } while_for_each_ftrace_op(op); @@ -1929,18 +1905,51 @@ static struct ftrace_ops * ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) { struct ftrace_ops *op; + unsigned long ip = rec->ip; - /* Removed ops need to be tested first */ - if (removed_ops && removed_ops->tramp_hash) { - if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) + /* + * Need to check removed ops first. + * If they are being removed, and this rec has a tramp, + * and this rec is in the ops list, then it would be the + * one with the tramp. + */ + if (removed_ops) { + if (hash_contains_ip(ip, &removed_ops->old_hash)) return removed_ops; } + /* + * Need to find the current trampoline for a rec. + * Now, a trampoline is only attached to a rec if there + * was a single 'ops' attached to it. But this can be called + * when we are adding another op to the rec or removing the + * current one. Thus, if the op is being added, we can + * ignore it because it hasn't attached itself to the rec + * yet. That means we just need to find the op that has a + * trampoline and is not beeing added. + */ do_for_each_ftrace_op(op, ftrace_ops_list) { - if (!op->tramp_hash) + + if (!op->trampoline) + continue; + + /* + * If the ops is being added, it hasn't gotten to + * the point to be removed from this tree yet. + */ + if (op->flags & FTRACE_OPS_FL_ADDING) continue; - if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) + /* + * If the ops is not being added and has a trampoline, + * then it must be the one that we want! 
+ */ + if (hash_contains_ip(ip, op->func_hash)) + return op; + + /* If the ops is being modified, it may be in the old hash. */ + if ((op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, &op->old_hash)) return op; } while_for_each_ftrace_op(op); @@ -1952,10 +1961,11 @@ static struct ftrace_ops * ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) { struct ftrace_ops *op; + unsigned long ip = rec->ip; do_for_each_ftrace_op(op, ftrace_ops_list) { /* pass rec in as regs to have non-NULL val */ - if (ftrace_ops_test(op, rec->ip, rec)) + if (hash_contains_ip(ip, op->func_hash)) return op; } while_for_each_ftrace_op(op); @@ -2262,92 +2272,6 @@ void __weak arch_ftrace_update_code(int command) ftrace_run_stop_machine(command); } -static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) -{ - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int size, bits; - int ret; - - size = ops->nr_trampolines; - bits = 0; - /* - * Make the hash size about 1/2 the # found - */ - for (size /= 2; size; size >>= 1) - bits++; - - ops->tramp_hash = alloc_ftrace_hash(bits); - /* - * TODO: a failed allocation is going to screw up - * the accounting of what needs to be modified - * and not. For now, we kill ftrace if we fail - * to allocate here. But there are ways around this, - * but that will take a little more work. - */ - if (!ops->tramp_hash) - return -ENOMEM; - - do_for_each_ftrace_rec(pg, rec) { - if (ftrace_rec_count(rec) == 1 && - ftrace_ops_test(ops, rec->ip, rec)) { - - /* - * If another ops adds to a rec, the rec will - * lose its trampoline and never get it back - * until all ops are off of it. 
- */ - if (!(rec->flags & FTRACE_FL_TRAMP)) - continue; - - /* This record had better have a trampoline */ - if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN))) - return -1; - - ret = add_hash_entry(ops->tramp_hash, rec->ip); - if (ret < 0) - return ret; - } - } while_for_each_ftrace_rec(); - - /* The number of recs in the hash must match nr_trampolines */ - if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines)) - pr_warn("count=%ld trampolines=%d\n", - ops->tramp_hash->count, - ops->nr_trampolines); - - return 0; -} - -static int ftrace_save_tramp_hashes(void) -{ - struct ftrace_ops *op; - int ret; - - /* - * Now that any trampoline is being used, we need to save the - * hashes for the ops that have them. This allows the mapping - * back from the record to the ops that has the trampoline to - * know what code is being replaced. Modifying code must always - * verify what it is changing. - */ - do_for_each_ftrace_op(op, ftrace_ops_list) { - - /* The tramp_hash is recreated each time. 
*/ - free_ftrace_hash(op->tramp_hash); - op->tramp_hash = NULL; - - if (op->nr_trampolines) { - ret = ftrace_save_ops_tramp_hash(op); - if (ret) - return ret; - } - - } while_for_each_ftrace_op(op); - - return 0; -} - static void ftrace_run_update_code(int command) { int ret; @@ -2367,9 +2291,6 @@ static void ftrace_run_update_code(int command) ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); - - ret = ftrace_save_tramp_hashes(); - FTRACE_WARN_ON(ret); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) @@ -2489,8 +2410,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) ops->flags |= FTRACE_OPS_FL_REMOVING; removed_ops = ops; + /* The trampoline logic checks the old hashes */ + ops->old_hash.filter_hash = ops->func_hash->filter_hash; + ops->old_hash.notrace_hash = ops->func_hash->notrace_hash; + ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; + + removed_ops = NULL; ops->flags &= ~FTRACE_OPS_FL_REMOVING; /* @@ -3017,7 +2946,7 @@ static int t_show(struct seq_file *m, void *v) struct ftrace_ops *ops; ops = ftrace_find_tramp_ops_any(rec); - if (ops && ops->trampoline) + if (ops) seq_printf(m, "\ttramp: %pS", (void *)ops->trampoline); else -- cgit v1.2.3 From fb5a613b4f310d6d520daf295547ab35b0ac58a3 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Fri, 22 Aug 2014 17:28:22 +0300 Subject: kernel: trace_syscalls: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The uses of "rcu_assign_pointer()" are NULLing out the pointers. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. 
The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Link: http://lkml.kernel.org/p/20140822142822.GA32391@ada Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 759d5e004517..4dc8b79c5f75 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -425,7 +425,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_enter--; - rcu_assign_pointer(tr->enter_syscall_files[num], NULL); + RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); @@ -463,7 +463,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - rcu_assign_pointer(tr->exit_syscall_files[num], NULL); + RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); -- cgit v1.2.3 From b954d83421d51d822c42e5ab7b65069b25ad3005 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Sep 2014 15:01:02 +0200 Subject: net: bpf: only build bpf_jit_binary_{alloc, free}() when jit selected Since BPF JIT depends on the availability of module_alloc() and module_free() helpers (HAVE_BPF_JIT and MODULES), we better build that code only in case we have BPF_JIT in our config enabled, just like with other JIT code. Fixes builds for arm/marzen_defconfig and sh/rsk7269_defconfig. 
==================== kernel/built-in.o: In function `bpf_jit_binary_alloc': /home/cwang/linux/kernel/bpf/core.c:144: undefined reference to `module_alloc' kernel/built-in.o: In function `bpf_jit_binary_free': /home/cwang/linux/kernel/bpf/core.c:164: undefined reference to `module_free' make: *** [vmlinux] Error 1 ==================== Reported-by: Fengguang Wu Fixes: 738cbe72adc5 ("net: bpf: consolidate JIT binary allocator") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 78 +++++++++++++++++++++++++------------------------- kernel/bpf/core.c | 2 ++ 2 files changed, 41 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4b59edead908..1a0bc6d134d7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -4,12 +4,18 @@ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ +#include + #include #include #include +#include +#include #include -#include + #include + +#include #include struct sk_buff; @@ -363,14 +369,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); -typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); - -struct bpf_binary_header * -bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, - unsigned int alignment, - bpf_jit_fill_hole_t bpf_fill_ill_insns); -void bpf_jit_binary_free(struct bpf_binary_header *hdr); - static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { bpf_prog_unlock_ro(fp); @@ -393,6 +391,38 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct bpf_prog *fp); +#ifdef CONFIG_BPF_JIT +typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); + +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, 
+ bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_jit_binary_free(struct bpf_binary_header *hdr); + +void bpf_jit_compile(struct bpf_prog *fp); +void bpf_jit_free(struct bpf_prog *fp); + +static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, + u32 pass, void *image) +{ + pr_err("flen=%u proglen=%u pass=%u image=%pK\n", + flen, proglen, pass, image); + if (image) + print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, + 16, 1, image, proglen, false); +} +#else +static inline void bpf_jit_compile(struct bpf_prog *fp) +{ +} + +static inline void bpf_jit_free(struct bpf_prog *fp) +{ + bpf_prog_unlock_free(fp); +} +#endif /* CONFIG_BPF_JIT */ + #define BPF_ANC BIT(15) static inline u16 bpf_anc_helper(const struct sock_filter *ftest) @@ -440,36 +470,6 @@ static inline void *bpf_load_pointer(const struct sk_buff *skb, int k, return bpf_internal_load_pointer_neg_helper(skb, k, size); } -#ifdef CONFIG_BPF_JIT -#include -#include -#include - -void bpf_jit_compile(struct bpf_prog *fp); -void bpf_jit_free(struct bpf_prog *fp); - -static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, - u32 pass, void *image) -{ - pr_err("flen=%u proglen=%u pass=%u image=%pK\n", - flen, proglen, pass, image); - if (image) - print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, - 16, 1, image, proglen, false); -} -#else -#include - -static inline void bpf_jit_compile(struct bpf_prog *fp) -{ -} - -static inline void bpf_jit_free(struct bpf_prog *fp) -{ - bpf_prog_unlock_free(fp); -} -#endif /* CONFIG_BPF_JIT */ - static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8ee520f0ec70..8b7002488251 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -128,6 +128,7 @@ void __bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(__bpf_prog_free); +#ifdef CONFIG_BPF_JIT struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ 
-163,6 +164,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) { module_free(NULL, hdr); } +#endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs -- cgit v1.2.3 From 000a7d66ec30898f46869be01ab8205b056385d0 Mon Sep 17 00:00:00 2001 From: Patrick Palka Date: Tue, 9 Sep 2014 14:50:48 -0700 Subject: kernel/printk/printk.c: fix faulty logic in the case of recursive printk We shouldn't set text_len in the code path that detects printk recursion because text_len corresponds to the length of the string inside textbuf. A few lines down from the line text_len = strlen(recursion_msg); is the line text_len += vscnprintf(text + text_len, ...); So if printk detects recursion, it sets text_len to 29 (the length of recursion_msg) and logs an error. Then the message supplied by the caller of printk is stored inside textbuf but offset by 29 bytes. This means that the output of the recursive call to printk will contain 29 bytes of garbage in front of it. This defect is caused by commit 458df9fd4815 ("printk: remove separate printk_sched buffers and use printk buf instead") which turned the line text_len = vscnprintf(text, ...); into text_len += vscnprintf(text + text_len, ...); To fix this, this patch avoids setting text_len when logging the printk recursion error. This patch also marks unlikely() the branch leading up to this code. 
Fixes: 458df9fd4815b478 ("printk: remove separate printk_sched buffers and use printk buf instead") Signed-off-by: Patrick Palka Reviewed-by: Petr Mladek Reviewed-by: Jan Kara Acked-by: Steven Rostedt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e04c455a0e38..1ce770687ea8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1665,15 +1665,15 @@ asmlinkage int vprintk_emit(int facility, int level, raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; - if (recursion_bug) { + if (unlikely(recursion_bug)) { static const char recursion_msg[] = "BUG: recent printk recursion!"; recursion_bug = 0; - text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, text_len); + NULL, 0, recursion_msg, + strlen(recursion_msg)); } /* -- cgit v1.2.3 From acbbe6fbb240a927ee1f5994f04d31267d422215 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 9 Sep 2014 14:51:01 -0700 Subject: kcmp: fix standard comparison bug The C operator <= defines a perfectly fine total ordering on the set of values representable in a long. However, unlike its namesake in the integers, it is not translation invariant, meaning that we do not have "b <= c" iff "a+b <= a+c" for all a,b,c. This means that it is always wrong to try to boil down the relationship between two longs to a question about the sign of their difference, because the resulting relation [a LEQ b iff a-b <= 0] is neither anti-symmetric or transitive. The former is due to -LONG_MIN==LONG_MIN (take any two a,b with a-b = LONG_MIN; then a LEQ b and b LEQ a, but a != b). The latter can either be seen observing that x LEQ x+1 for all x, implying x LEQ x+1 LEQ x+2 ... 
LEQ x-1 LEQ x; or more directly with the simple example a=LONG_MIN, b=0, c=1, for which a-b < 0, b-c < 0, but a-c > 0. Note that it makes absolutely no difference that a transmogrifying bijection has been applied before the comparison is done. In fact, had the obfuscation not been done, one could probably not observe the bug (assuming all values being compared always lie in one half of the address space, the mathematical value of a-b is always representable in a long). As it stands, one can easily obtain three file descriptors exhibiting the non-transitivity of kcmp(). Side note 1: I can't see that ensuring the MSB of the multiplier is set serves any purpose other than obfuscating the obfuscating code. Side note 2: #include #include #include #include #include #include #include enum kcmp_type { KCMP_FILE, KCMP_VM, KCMP_FILES, KCMP_FS, KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, KCMP_TYPES, }; pid_t pid; int kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) { return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2); } int cmp_fd(int fd1, int fd2) { int c = kcmp(pid, pid, KCMP_FILE, fd1, fd2); if (c < 0) { perror("kcmp"); exit(1); } assert(0 <= c && c < 3); return c; } int cmp_fdp(const void *a, const void *b) { static const int normalize[] = {0, -1, 1}; return normalize[cmp_fd(*(int*)a, *(int*)b)]; } #define MAX 100 /* This is plenty; I've seen it trigger for MAX==3 */ int main(int argc, char *argv[]) { int r, s, count = 0; int REL[3] = {0,0,0}; int fd[MAX]; pid = getpid(); while (count < MAX) { r = open("/dev/null", O_RDONLY); if (r < 0) break; fd[count++] = r; } printf("opened %d file descriptors\n", count); for (r = 0; r < count; ++r) { for (s = r+1; s < count; ++s) { REL[cmp_fd(fd[r], fd[s])]++; } } printf("== %d\t< %d\t> %d\n", REL[0], REL[1], REL[2]); qsort(fd, count, sizeof(fd[0]), cmp_fdp); memset(REL, 0, sizeof(REL)); for (r = 0; r < count; ++r) { for (s = r+1; s < count; ++s) { REL[cmp_fd(fd[r], fd[s])]++; } } printf("== %d\t< %d\t> %d\n", 
REL[0], REL[1], REL[2]); return (REL[0] + REL[2] != 0); } Signed-off-by: Rasmus Villemoes Reviewed-by: Cyrill Gorcunov "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcmp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcmp.c b/kernel/kcmp.c index e30ac0fe61c3..0aa69ea1d8fd 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type) */ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) { - long ret; + long t1, t2; - ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + t1 = kptr_obfuscate((long)v1, type); + t2 = kptr_obfuscate((long)v2, type); - return (ret < 0) | ((ret > 0) << 1); + return (t1 < t2) | ((t1 > t2) << 1); } /* The caller must have pinned the task */ -- cgit v1.2.3 From 13c42c2f43b19aab3195f2d357db00d1e885eaa8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Sep 2014 23:44:35 +0200 Subject: futex: Unlock hb->lock in futex_wait_requeue_pi() error path futex_wait_requeue_pi() calls futex_wait_setup(). If futex_wait_setup() succeeds it returns with hb->lock held and preemption disabled. Now the sanity check after this does: if (match_futex(&q.key, &key2)) { ret = -EINVAL; goto out_put_keys; } which releases the keys but does not release hb->lock. So we happily return to user space with hb->lock held and therefore preemption disabled. Unlock hb->lock before taking the exit route. 
Reported-by: Dave "Trinity" Jones Signed-off-by: Thomas Gleixner Reviewed-by: Darren Hart Reviewed-by: Davidlohr Bueso Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1409112318500.4178@nanos Signed-off-by: Thomas Gleixner --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d3a9d946d0b7..815d7af2ffe8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2592,6 +2592,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * shared futexes. We need to compare the keys: */ if (match_futex(&q.key, &key2)) { + queue_unlock(hb); ret = -EINVAL; goto out_put_keys; } -- cgit v1.2.3 From d78c9300c51d6ceed9f6d078d4e9366f259de28c Mon Sep 17 00:00:00 2001 From: Andrew Hunter Date: Thu, 4 Sep 2014 14:17:16 -0700 Subject: jiffies: Fix timeval conversion to jiffies timeval_to_jiffies tried to round a timeval up to an integral number of jiffies, but the logic for doing so was incorrect: intervals corresponding to exactly N jiffies would become N+1. This manifested itself particularly when repeatedly stopping/starting an itimer: setitimer(ITIMER_PROF, &val, NULL); setitimer(ITIMER_PROF, NULL, &val); would add a full tick to val, _even if it was exactly representable in terms of jiffies_ (say, the result of a previous rounding.) Doing this repeatedly would cause unbounded growth in val. So fix the math. Here's what was wrong with the conversion: we essentially computed (eliding seconds) jiffies = usec * (NSEC_PER_USEC/TICK_NSEC) by using scaling arithmetic, which took the best approximation of NSEC_PER_USEC/TICK_NSEC with denominator of 2^USEC_JIFFIE_SC = x/(2^USEC_JIFFIE_SC), and computed: jiffies = (usec * x) >> USEC_JIFFIE_SC and rounded this calculation up in the intermediate form (since we can't necessarily exactly represent TICK_NSEC in usec.) 
But the scaling arithmetic is a (very slight) *over*approximation of the true value; that is, instead of dividing by (1 usec/ 1 jiffie), we effectively divided by (1 usec/1 jiffie)-epsilon (rounding down). This would normally be fine, but we want to round timeouts up, and we did so by adding 2^USEC_JIFFIE_SC - 1 before the shift; this would be fine if our division was exact, but dividing this by the slightly smaller factor was equivalent to adding just _over_ 1 to the final result (instead of just _under_ 1, as desired.) In particular, with HZ=1000, we consistently computed that 10000 usec was 11 jiffies; the same was true for any exact multiple of TICK_NSEC. We could possibly still round in the intermediate form, adding something less than 2^USEC_JIFFIE_SC - 1, but easier still is to convert usec->nsec, round in nanoseconds, and then convert using time*spec*_to_jiffies. This adds one constant multiplication, and is not observably slower in microbenchmarks on recent x86 hardware. Tested: the following program: int main() { struct itimerval zero = {{0, 0}, {0, 0}}; /* Initially set to 10 ms. */ struct itimerval initial = zero; initial.it_interval.tv_usec = 10000; setitimer(ITIMER_PROF, &initial, NULL); /* Save and restore several times. 
*/ for (size_t i = 0; i < 10; ++i) { struct itimerval prev; setitimer(ITIMER_PROF, &zero, &prev); /* on old kernels, this goes up by TICK_USEC every iteration */ printf("previous value: %ld %ld %ld %ld\n", prev.it_interval.tv_sec, prev.it_interval.tv_usec, prev.it_value.tv_sec, prev.it_value.tv_usec); setitimer(ITIMER_PROF, &prev, NULL); } return 0; } Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Paul Turner Cc: Richard Cochran Cc: Prarit Bhargava Reviewed-by: Paul Turner Reported-by: Aaron Jacobs Signed-off-by: Andrew Hunter [jstultz: Tweaked to apply to 3.17-rc] Signed-off-by: John Stultz --- include/linux/jiffies.h | 12 ----------- kernel/time/time.c | 56 +++++++++++++++++++++++++++---------------------- 2 files changed, 31 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 1f44466c1e9d..c367cbdf73ab 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -258,23 +258,11 @@ extern unsigned long preset_lpj; #define SEC_JIFFIE_SC (32 - SHIFT_HZ) #endif #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) -#define USEC_JIFFIE_SC (SEC_JIFFIE_SC + 19) #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) -#define USEC_CONVERSION \ - ((unsigned long)((((u64)NSEC_PER_USEC << USEC_JIFFIE_SC) +\ - TICK_NSEC -1) / (u64)TICK_NSEC)) -/* - * USEC_ROUND is used in the timeval to jiffie conversion. See there - * for more details. It is the scaled resolution rounding value. Note - * that it is a 64-bit value. Since, when it is applied, we are already - * in jiffies (albit scaled), it is nothing but the bits we will shift - * off. - */ -#define USEC_ROUND (u64)(((u64)1 << USEC_JIFFIE_SC) - 1) /* * The maximum jiffie value is (MAX_INT >> 1). Here we translate that * into seconds. 
The 64-bit case will overflow if we are not careful, diff --git a/kernel/time/time.c b/kernel/time/time.c index f0294ba14634..a9ae20fb0b11 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -559,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies); * that a remainder subtract here would not do the right thing as the * resolution values don't fall on second boundries. I.e. the line: * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * Note that due to the small error in the multiplier here, this + * rounding is incorrect for sufficiently large values of tv_nsec, but + * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're + * OK. * * Rather, we just shift the bits off the right. * * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec * value to a scaled second value. */ -unsigned long -timespec_to_jiffies(const struct timespec *value) +static unsigned long +__timespec_to_jiffies(unsigned long sec, long nsec) { - unsigned long sec = value->tv_sec; - long nsec = value->tv_nsec + TICK_NSEC - 1; + nsec = nsec + TICK_NSEC - 1; if (sec >= MAX_SEC_IN_JIFFIES){ sec = MAX_SEC_IN_JIFFIES; @@ -580,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value) (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; } + +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); +} + EXPORT_SYMBOL(timespec_to_jiffies); void @@ -596,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) } EXPORT_SYMBOL(jiffies_to_timespec); -/* Same for "timeval" - * - * Well, almost. The problem here is that the real system resolution is - * in nanoseconds and the value being converted is in micro seconds. - * Also for some machines (those that use HZ = 1024, in-particular), - * there is a LARGE error in the tick size in microseconds. - - * The solution we use is to do the rounding AFTER we convert the - * microsecond part. 
Thus the USEC_ROUND, the bits to be shifted off. - * Instruction wise, this should cost only an additional add with carry - * instruction above the way it was done above. +/* + * We could use a similar algorithm to timespec_to_jiffies (with a + * different multiplier for usec instead of nsec). But this has a + * problem with rounding: we can't exactly add TICK_NSEC - 1 to the + * usec value, since it's not necessarily integral. + * + * We could instead round in the intermediate scaled representation + * (i.e. in units of 1/2^(large scale) jiffies) but that's also + * perilous: the scaling introduces a small positive error, which + * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1 + * units to the intermediate before shifting) leads to accidental + * overflow and overestimates. + * + * At the cost of one additional multiplication by a constant, just + * use the timespec implementation. */ unsigned long timeval_to_jiffies(const struct timeval *value) { - unsigned long sec = value->tv_sec; - long usec = value->tv_usec; - - if (sec >= MAX_SEC_IN_JIFFIES){ - sec = MAX_SEC_IN_JIFFIES; - usec = 0; - } - return (((u64)sec * SEC_CONVERSION) + - (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> - (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + return __timespec_to_jiffies(value->tv_sec, + value->tv_usec * NSEC_PER_USEC); } EXPORT_SYMBOL(timeval_to_jiffies); -- cgit v1.2.3 From e86fea764991e00a03ff1e56409ec9cacdbda4c9 Mon Sep 17 00:00:00 2001 From: Richard Larocque Date: Tue, 9 Sep 2014 18:31:03 -0700 Subject: alarmtimer: Return relative times in timer_gettime Returns the time remaining for an alarm timer, rather than the time at which it is scheduled to expire. If the timer has already expired or it is not currently scheduled, the it_value's members are set to zero. This new behavior matches that of the other posix-timers and the POSIX specifications. This is a change in user-visible behavior, and may break existing applications. 
Hopefully, few users rely on the old incorrect behavior. Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Sharvil Nanavati Signed-off-by: Richard Larocque [jstultz: minor style tweak] Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4aec4a457431..b4bce62e47b2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -541,18 +541,22 @@ static int alarm_timer_create(struct k_itimer *new_timer) * @new_timer: k_itimer pointer * @cur_setting: itimerspec data to fill * - * Copies the itimerspec data out from the k_itimer + * Copies out the current itimerspec data */ static void alarm_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) { - memset(cur_setting, 0, sizeof(struct itimerspec)); + ktime_t relative_expiry_time = + alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); - cur_setting->it_interval = - ktime_to_timespec(timr->it.alarm.interval); - cur_setting->it_value = - ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); - return; + if (ktime_to_ns(relative_expiry_time) > 0) { + cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + } else { + cur_setting->it_value.tv_sec = 0; + cur_setting->it_value.tv_nsec = 0; + } + + cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); } /** -- cgit v1.2.3 From 265b81d23a46c39df0a735a3af4238954b41a4c2 Mon Sep 17 00:00:00 2001 From: Richard Larocque Date: Tue, 9 Sep 2014 18:31:04 -0700 Subject: alarmtimer: Do not signal SIGEV_NONE timers Avoids sending a signal to alarm timers created with sigev_notify set to SIGEV_NONE by checking for that special case in the timeout callback. The regular posix timers avoid sending signals to SIGEV_NONE timers by not scheduling any callbacks for them in the first place. 
Although it would be possible to do something similar for alarm timers, it's simpler to handle this as a special case in the timeout. Prior to this patch, the alarm timer would ignore the sigev_notify value and try to deliver signals to the process anyway. Even worse, the sanity check for the value of sigev_signo is skipped when SIGEV_NONE was specified, so the signal number could be bogus. If sigev_signo was an uninitialized value (as it often would be if SIGEV_NONE is used), then it's hard to predict which signal will be sent. Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Sharvil Nanavati Signed-off-by: Richard Larocque Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index b4bce62e47b2..41a925396830 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -466,8 +466,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, { struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); - if (posix_timer_event(ptr, 0) != 0) - ptr->it_overrun++; + if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { + if (posix_timer_event(ptr, 0) != 0) + ptr->it_overrun++; + } /* Re-add periodic timers */ if (ptr->it.alarm.interval.tv64) { -- cgit v1.2.3 From 474e941bed9262f5fa2394f9a4a67e24499e5926 Mon Sep 17 00:00:00 2001 From: Richard Larocque Date: Tue, 9 Sep 2014 18:31:05 -0700 Subject: alarmtimer: Lock k_itimer during timer callback Locks the k_itimer's it_lock member when handling the alarm timer's expiry callback. The regular posix timers defined in posix-timers.c have this lock held during timeout processing because their callbacks are routed through posix_timer_fn(). The alarm timers follow a different path, so they ought to grab the lock somewhere else. 
Cc: stable@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Sharvil Nanavati Signed-off-by: Richard Larocque Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 41a925396830..a7077d3ae52f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -464,8 +464,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) { + unsigned long flags; struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); + enum alarmtimer_restart result = ALARMTIMER_NORESTART; + + spin_lock_irqsave(&ptr->it_lock, flags); if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { if (posix_timer_event(ptr, 0) != 0) ptr->it_overrun++; @@ -475,9 +479,11 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, if (ptr->it.alarm.interval.tv64) { ptr->it_overrun += alarm_forward(alarm, now, ptr->it.alarm.interval); - return ALARMTIMER_RESTART; + result = ALARMTIMER_RESTART; } - return ALARMTIMER_NORESTART; + spin_unlock_irqrestore(&ptr->it_lock, flags); + + return result; } /** -- cgit v1.2.3 From 84bde62ca4b49701190dbd953c1e04024860c1f5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 12 Sep 2014 14:21:13 -0400 Subject: ftrace: Add sanity check when unregistering last ftrace_ops When the last ftrace_ops is unregistered, all the function records should have a zeroed flags value. Make sure that is the case when the last ftrace_ops is unregistered. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d325a1e76554..fb186b9ddf51 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2416,6 +2416,21 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) ftrace_run_update_code(command); + /* + * If there's no more ops registered with ftrace, run a + * sanity check to make sure all rec flags are cleared. + */ + if (ftrace_ops_list == &ftrace_list_end) { + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + do_for_each_ftrace_rec(pg, rec) { + if (FTRACE_WARN_ON_ONCE(rec->flags)) + pr_warn(" %pS flags:%lx\n", + (void *)rec->ip, rec->flags); + } while_for_each_ftrace_rec(); + } + ops->old_hash.filter_hash = NULL; ops->old_hash.notrace_hash = NULL; -- cgit v1.2.3 From 3ddee63a099ebbdc8f84697fe46730b58240c09d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 12 Sep 2014 14:26:51 -0400 Subject: ftrace: Only disable ftrace_enabled to test buffer in selftest The ftrace_enabled variable is set to zero in the self tests to keep delayed functions from being traced and messing with the checks. This only needs to be done when the checks are being performed, otherwise, if ftrace_enabled is off when calls back to the utility that is being tested, it can cause errors to happen and the tests can fail with false positives. 
Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5ef60499dc8e..61a6acd6025d 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); + + ftrace_enabled = 1; tracing_start(); /* we should only have one item */ @@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); + + ftrace_enabled = 1; trace->reset(tr); tracing_start(); -- cgit v1.2.3 From a80e49e2cc3145af014a8ae44f575829cc236192 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 Aug 2014 17:47:18 +0200 Subject: nohz: Move nohz full init call to tick init This way we unbloat a bit main.c and more importantly we initialize nohz full after init_IRQ(). This dependency will be needed in further patches because nohz full needs irq work to raise its own IRQ. Information about the support for this ability on ARM64 is obtained on init_IRQ() which initializes the pointer to __smp_call_function. Since tick_init() is called right after init_IRQ(), this is a good place to call tick_nohz_init() and prepare for that dependency. Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- include/linux/tick.h | 2 -- init/main.c | 1 - kernel/time/tick-common.c | 1 + kernel/time/tick-internal.h | 7 +++++++ 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 9a82c7dc3fdd..595ee86f5e0d 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -181,14 +181,12 @@ static inline bool tick_nohz_full_cpu(int cpu) return cpumask_test_cpu(cpu, tick_nohz_full_mask); } -extern void tick_nohz_init(void); extern void __tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(struct task_struct *tsk); #else -static inline void tick_nohz_init(void) { } static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void __tick_nohz_full_check(void) { } diff --git a/init/main.c b/init/main.c index bb1aed928f21..8af2f1abfe38 100644 --- a/init/main.c +++ b/init/main.c @@ -577,7 +577,6 @@ asmlinkage __visible void __init start_kernel(void) local_irq_disable(); idr_init_cache(); rcu_init(); - tick_nohz_init(); context_tracking_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0a0608edeb26..052b4b53c3d6 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -400,4 +400,5 @@ void tick_resume(void) void __init tick_init(void) { tick_broadcast_init(); + tick_nohz_init(); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c19c1d84b6f3..366aeb4f2c66 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline bool 
tick_broadcast_oneshot_available(void) { return false; } #endif /* !TICK_ONESHOT */ +/* NO_HZ_FULL internal */ +#ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); +# else +static inline void tick_nohz_init(void) { } +#endif + /* * Broadcasting support */ -- cgit v1.2.3 From 76a33061b9323b7fdb220ae5fa116c10833ec22e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 Aug 2014 18:37:19 +0200 Subject: irq_work: Force raised irq work to run on irq work interrupt The nohz full kick, which restarts the tick when any resource depend on it, can't be executed anywhere given the operation it does on timers. If it is called from the scheduler or timers code, chances are that we run into a deadlock. This is why we run the nohz full kick from an irq work. That way we make sure that the kick runs on a virgin context. However if that's the case when irq work runs in its own dedicated self-ipi, things are different for the big bunch of archs that don't support the self triggered way. In order to support them, irq works are also handled by the timer interrupt as fallback. Now when irq works run on the timer interrupt, the context isn't blank. More precisely, they can run in the context of the hrtimer that runs the tick. But the nohz kick cancels and restarts this hrtimer and cancelling an hrtimer from itself isn't allowed. This is why we run in an endless loop: Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 2 CPU: 2 PID: 7538 Comm: kworker/u8:8 Not tainted 3.16.0+ #34 Workqueue: btrfs-endio-write normal_work_helper [btrfs] ffff880244c06c88 000000001b486fe1 ffff880244c06bf0 ffffffff8a7f1e37 ffffffff8ac52a18 ffff880244c06c78 ffffffff8a7ef928 0000000000000010 ffff880244c06c88 ffff880244c06c20 000000001b486fe1 0000000000000000 Call Trace: ] dump_stack+0x4e/0x7a [] panic+0xd4/0x207 [] watchdog_overflow_callback+0x118/0x120 [] __perf_event_overflow+0xae/0x350 [] ? perf_event_task_disable+0xa0/0xa0 [] ? 
x86_perf_event_set_period+0xbf/0x150 [] perf_event_overflow+0x14/0x20 [] intel_pmu_handle_irq+0x206/0x410 [] perf_event_nmi_handler+0x2b/0x50 [] nmi_handle+0xd2/0x390 [] ? nmi_handle+0x5/0x390 [] ? match_held_lock+0x8/0x1b0 [] default_do_nmi+0x72/0x1c0 [] do_nmi+0xb8/0x100 [] end_repeat_nmi+0x1e/0x2e [] ? match_held_lock+0x8/0x1b0 [] ? match_held_lock+0x8/0x1b0 [] ? match_held_lock+0x8/0x1b0 <] lock_acquired+0xaf/0x450 [] ? lock_hrtimer_base.isra.20+0x25/0x50 [] _raw_spin_lock_irqsave+0x78/0x90 [] ? lock_hrtimer_base.isra.20+0x25/0x50 [] lock_hrtimer_base.isra.20+0x25/0x50 [] hrtimer_try_to_cancel+0x33/0x1e0 [] hrtimer_cancel+0x1a/0x30 [] tick_nohz_restart+0x17/0x90 [] __tick_nohz_full_check+0xc3/0x100 [] nohz_full_kick_work_func+0xe/0x10 [] irq_work_run_list+0x44/0x70 [] irq_work_run+0x2a/0x50 [] update_process_times+0x5b/0x70 [] tick_sched_handle.isra.21+0x25/0x60 [] tick_sched_timer+0x41/0x60 [] __run_hrtimer+0x72/0x470 [] ? tick_sched_do_timer+0xb0/0xb0 [] hrtimer_interrupt+0x117/0x270 [] local_apic_timer_interrupt+0x37/0x60 [] smp_apic_timer_interrupt+0x3f/0x50 [] apic_timer_interrupt+0x6f/0x80 To fix this we force non-lazy irq works to run on irq work self-IPIs when available. That ability of the arch to trigger irq work self IPIs is available with arch_irq_work_has_interrupt(). Reported-by: Catalin Iacob Reported-by: Dave Jones Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- include/linux/irq_work.h | 1 + kernel/irq_work.c | 15 +++++++++++++-- kernel/time/timer.c | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 6b47b2ede405..bf3fe719c7ce 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -39,6 +39,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu); #endif void irq_work_run(void); +void irq_work_tick(void); void irq_work_sync(struct irq_work *work); #ifdef CONFIG_IRQ_WORK diff --git a/kernel/irq_work.c b/kernel/irq_work.c index e6bcbe756663..385b85aded19 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -115,8 +115,10 @@ bool irq_work_needs_cpu(void) raised = &__get_cpu_var(raised_list); lazy = &__get_cpu_var(lazy_list); - if (llist_empty(raised) && llist_empty(lazy)) - return false; + + if (llist_empty(raised) || arch_irq_work_has_interrupt()) + if (llist_empty(lazy)) + return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); @@ -171,6 +173,15 @@ void irq_work_run(void) } EXPORT_SYMBOL_GPL(irq_work_run); +void irq_work_tick(void) +{ + struct llist_head *raised = &__get_cpu_var(raised_list); + + if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) + irq_work_run_list(raised); + irq_work_run_list(&__get_cpu_var(lazy_list)); +} + /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index aca5dfe2fa3d..9bbb8344ed3b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1385,7 +1385,7 @@ void update_process_times(int user_tick) rcu_check_callbacks(cpu, user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) - irq_work_run(); + irq_work_tick(); #endif scheduler_tick(); run_posix_cpu_timers(p); -- cgit v1.2.3 From 4327b15f64b2580dad40d2674d50fc44f1b699c1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Aug 2014 22:02:55 +0200 Subject: nohz: Consolidate nohz full init code The support code for CONFIG_NO_HZ_FULL_ALL=y and for the nohz_full= kernel parameter each have their own way to do the same thing: allocate full dynticks cpumasks, fill them and initialize some state variables. Let's consolidate all of that in the same place. While at it, convert some regular printk messages to warnings when fundamental allocations fail. Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f654a8a298fa..eb4af016ac65 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -295,22 +295,12 @@ out: /* Parse the boot-time nohz CPU list from the kernel parameters. 
*/ static int __init tick_nohz_full_setup(char *str) { - int cpu; - alloc_bootmem_cpumask_var(&tick_nohz_full_mask); - alloc_bootmem_cpumask_var(&housekeeping_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } - - cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); - cpumask_clear_cpu(cpu, tick_nohz_full_mask); - } - cpumask_andnot(housekeeping_mask, - cpu_possible_mask, tick_nohz_full_mask); tick_nohz_full_running = true; return 1; @@ -349,18 +339,11 @@ static int tick_nohz_init_all(void) #ifdef CONFIG_NO_HZ_FULL_ALL if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { - pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); - return err; - } - if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { - pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); + WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } err = 0; cpumask_setall(tick_nohz_full_mask); - cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); - cpumask_clear(housekeeping_mask); - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); tick_nohz_full_running = true; #endif return err; @@ -375,6 +358,23 @@ void __init tick_nohz_init(void) return; } + if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { + WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n"); + cpumask_clear(tick_nohz_full_mask); + tick_nohz_full_running = false; + return; + } + + cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } + + cpumask_andnot(housekeeping_mask, + cpu_possible_mask, tick_nohz_full_mask); + for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); -- cgit v1.2.3 From 
9b01f5bf3999a3db5b1bbd9fdfd80d8d304e94ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 18 Aug 2014 01:36:07 +0200 Subject: nohz: nohz full depends on irq work self IPI support The nohz full functionality depends on IRQ work to trigger its own interrupts. As it's used to restart the tick, we can't rely on the tick fallback for irq work callbacks, i.e. we can't use the tick to restart the tick itself. Let's reject the full dynticks initialization if that arch support isn't available. As a side effect, this makes sure that the nohz kick is never called from the tick. Otherwise that would result in an illegal hrtimer self-cancellation and a lockup. Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker --- kernel/time/tick-sched.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index eb4af016ac65..5a9ff243588c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -365,6 +365,20 @@ void __init tick_nohz_init(void) return; } + /* + * Full dynticks uses irq work to drive the tick rescheduling on safe + * locking contexts. 
But then we need irq work to raise its own + * interrupts to avoid circular dependency on the tick + */ + if (!arch_irq_work_has_interrupt()) { + pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " + "support irq work self-IPIs\n"); + cpumask_clear(tick_nohz_full_mask); + cpumask_copy(housekeeping_mask, cpu_possible_mask); + tick_nohz_full_running = false; + return; + } + cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { -- cgit v1.2.3 From db0e716a1512179e8374a74c1f3184e9ce15d138 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 11 Sep 2014 22:34:25 -0700 Subject: locking/rwsem: Move EXPORT_SYMBOL() lines to follow function definition rw-semaphore is the only type of lock doing this ugliness of exporting at the end of the file. Signed-off-by: Davidlohr Bueso Cc: dave@stgolabs.net Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1410500066-5909-1-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index d6203faf2eb1..12166ec9b7e7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -246,6 +246,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) return sem; } +EXPORT_SYMBOL(rwsem_down_read_failed); static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { @@ -465,6 +466,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) return sem; } +EXPORT_SYMBOL(rwsem_down_write_failed); /* * handle waking up a waiter on the semaphore @@ -485,6 +487,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) return sem; } +EXPORT_SYMBOL(rwsem_wake); /* * downgrade a write lock into a read lock @@ -506,8 +509,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) return sem; } - 
-EXPORT_SYMBOL(rwsem_down_read_failed); -EXPORT_SYMBOL(rwsem_down_write_failed); -EXPORT_SYMBOL(rwsem_wake); EXPORT_SYMBOL(rwsem_downgrade_wake); -- cgit v1.2.3 From c88f2096136416b261bd3647cc260935f6e95805 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 8 Sep 2014 16:31:07 +0200 Subject: perf: Do not check PERF_EVENT_STATE_EXIT on syscall read path Revert PERF_EVENT_STATE_EXIT check on read syscall path. It breaks standard way to read counter, which is to open the counter, wait for the monitored process to die and read the counter. Reported-by: Stephane Eranian Signed-off-by: Jiri Olsa Acked-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Paul Mackerras Cc: Stephane Eranian Cc: David Ahern Link: http://lkml.kernel.org/r/20140908143107.GG17728@krava.brq.redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f917dec6f897..733c61636f0d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3601,8 +3601,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ - if ((event->state == PERF_EVENT_STATE_ERROR) || - (event->state == PERF_EVENT_STATE_EXIT)) + if (event->state == PERF_EVENT_STATE_ERROR) return 0; if (count < event->read_size) -- cgit v1.2.3 From f4579fc57cf4244057b713b1f73f4dc9f0b11e97 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 25 Jul 2014 11:21:47 -0700 Subject: rcu: Fix attempt to avoid unsolicited offloading of callbacks Commit b58cc46c5f6b (rcu: Don't offload callbacks unless specifically requested) failed to adjust the callback lists of the CPUs that are known to be no-CBs CPUs only because they are also nohz_full= CPUs. 
This failure can result in callbacks that are posted during early boot getting stranded on nxtlist for CPUs whose no-CBs property becomes apparent late, and there can also be spurious warnings about offline CPUs posting callbacks. This commit fixes these problems by adding an early-boot rcu_init_nohz() that properly initializes the no-CBs CPUs. Note that kernels built with CONFIG_RCU_NOCB_CPU_ALL=y or with CONFIG_RCU_NOCB_CPU=n do not exhibit this bug. Neither do kernels booted without the nohz_full= boot parameter. Signed-off-by: Paul E. McKenney Reviewed-by: Pranith Kumar Tested-by: Paul Gortmaker --- include/linux/rcupdate.h | 8 +++++ init/Kconfig | 4 +-- init/main.c | 1 + kernel/rcu/tree_plugin.h | 92 ++++++++++++++++++++++++++++++++---------------- 4 files changed, 72 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d231aa17b1d7..cc7bed1c90dc 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -269,6 +269,14 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, struct task_struct *next) { } #endif /* CONFIG_RCU_USER_QS */ +#ifdef CONFIG_RCU_NOCB_CPU +void rcu_init_nohz(void); +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static inline void rcu_init_nohz(void) +{ +} +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + /** * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers * @a: Code that RCU needs to pay attention to. diff --git a/init/Kconfig b/init/Kconfig index e84c6423a2e5..64ee4d967786 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -737,7 +737,7 @@ choice config RCU_NOCB_CPU_NONE bool "No build_forced no-CBs CPUs" - depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL + depends on RCU_NOCB_CPU help This option does not force any of the CPUs to be no-CBs CPUs. 
Only CPUs designated by the rcu_nocbs= boot parameter will be @@ -751,7 +751,7 @@ config RCU_NOCB_CPU_NONE config RCU_NOCB_CPU_ZERO bool "CPU 0 is a build_forced no-CBs CPU" - depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL + depends on RCU_NOCB_CPU help This option forces CPU 0 to be a no-CBs CPU, so that its RCU callbacks are invoked by a per-CPU kthread whose name begins diff --git a/init/main.c b/init/main.c index bb1aed928f21..e3c4cdd94d5b 100644 --- a/init/main.c +++ b/init/main.c @@ -578,6 +578,7 @@ asmlinkage __visible void __init start_kernel(void) idr_init_cache(); rcu_init(); tick_nohz_init(); + rcu_init_nohz(); context_tracking_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a7997e272564..06d077ccf8d5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -85,33 +85,6 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); -#ifdef CONFIG_RCU_NOCB_CPU -#ifndef CONFIG_RCU_NOCB_CPU_NONE - if (!have_rcu_nocb_mask) { - zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); - have_rcu_nocb_mask = true; - } -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - if (have_rcu_nocb_mask) { - if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); - cpumask_and(rcu_nocb_mask, cpu_possible_mask, - rcu_nocb_mask); - } - cpulist_scnprintf(nocb_buf, 
sizeof(nocb_buf), rcu_nocb_mask); - pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); - if (rcu_nocb_poll) - pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - } -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ } #ifdef CONFIG_TREE_PREEMPT_RCU @@ -2451,6 +2424,67 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); } +void __init rcu_init_nohz(void) +{ + int cpu; + bool need_rcu_nocb_mask = true; + struct rcu_state *rsp; + +#ifdef CONFIG_RCU_NOCB_CPU_NONE + need_rcu_nocb_mask = false; +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ + +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) + need_rcu_nocb_mask = true; +#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + + if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { + zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); + have_rcu_nocb_mask = true; + } + if (!have_rcu_nocb_mask) + return; + +#ifdef CONFIG_RCU_NOCB_CPU_ZERO + pr_info("\tOffload RCU callbacks from CPU 0\n"); + cpumask_set_cpu(0, rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ +#ifdef CONFIG_RCU_NOCB_CPU_ALL + pr_info("\tOffload RCU callbacks from all CPUs\n"); + cpumask_copy(rcu_nocb_mask, cpu_possible_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ +#if defined(CONFIG_NO_HZ_FULL) + if (tick_nohz_full_running) + cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #if defined(CONFIG_NO_HZ_FULL) */ + + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } + cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); + pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); + if (rcu_nocb_poll) + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); + + for_each_rcu_flavor(rsp) { + for_each_cpu(cpu, rcu_nocb_mask) { + struct 
rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + + /* + * If there are early callbacks, they will need + * to be moved to the nocb lists. + */ + WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != + &rdp->nxtlist && + rdp->nxttail[RCU_NEXT_TAIL] != NULL); + init_nocb_callback_list(rdp); + } + } +} + /* Initialize per-rcu_data variables for no-CBs CPUs. */ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { @@ -2479,10 +2513,6 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) if (rcu_nocb_mask == NULL) return; -#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) - if (tick_nohz_full_running) - cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); -#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ if (ls == -1) { ls = int_sqrt(nr_cpu_ids); rcu_nocb_leader_stride = ls; -- cgit v1.2.3 From 949cccdbe6d286544ce3fe170298183eb7ada81c Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Fri, 25 Jul 2014 16:02:07 -0700 Subject: rcu: Check the return value of zalloc_cpumask_var() This commit checks the return value of the zalloc_cpumask_var() used for allocating cpumask for rcu_nocb_mask. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. 
McKenney Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 06d077ccf8d5..105b0ce3d78f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2440,7 +2440,10 @@ void __init rcu_init_nohz(void) #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { - zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); + if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { + pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); + return; + } have_rcu_nocb_mask = true; } if (!have_rcu_nocb_mask) -- cgit v1.2.3 From c271d3a957384a162f7a6aae53455d8e8afd1f3e Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 8 Jul 2014 18:26:14 -0400 Subject: rcu: Use true/false for return in __call_rcu_nocb() Return true/false instead of 0/1 in __call_rcu_nocb() as this returns a bool type. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. 
McKenney Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 105b0ce3d78f..36c678b898fa 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2123,7 +2123,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, { if (!rcu_is_nocb_cpu(rdp->cpu)) - return 0; + return false; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) trace_rcu_kfree_callback(rdp->rsp->name, rhp, @@ -2134,7 +2134,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_callback(rdp->rsp->name, rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), -atomic_long_read(&rdp->nocb_q_count)); - return 1; + return true; } /* -- cgit v1.2.3 From 0a9e1e111b3a9e1c21d2dd27ca361cd9601d99af Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 8 Jul 2014 18:26:15 -0400 Subject: rcu: Use true/false for return in rcu_nocb_adopt_orphan_cbs() Return true/false in rcu_nocb_adopt_orphan_cbs() instead of 0/1 as this function has return type of bool. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 36c678b898fa..662584142e0c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2150,7 +2150,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, /* If this is not a no-CBs CPU, tell the caller to do it the old way. 
*/ if (!rcu_is_nocb_cpu(smp_processor_id())) - return 0; + return false; rsp->qlen = 0; rsp->qlen_lazy = 0; @@ -2169,7 +2169,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, rsp->orphan_nxtlist = NULL; rsp->orphan_nxttail = &rsp->orphan_nxtlist; } - return 1; + return true; } /* -- cgit v1.2.3 From 4afc7e269befc7b6e09a994e48c67e36f4a378e1 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 8 Jul 2014 18:26:16 -0400 Subject: rcu: Use false for return in __call_rcu_nocb() Return false instead of 0 in __call_rcu_nocb() as this has bool as return type. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 662584142e0c..427110475e33 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2574,7 +2574,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags) { - return 0; + return false; } static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, -- cgit v1.2.3 From f4aa84ba24872e3a8e59b58bc8533cae95597f2e Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 8 Jul 2014 18:26:17 -0400 Subject: rcu: Return false instead of 0 in rcu_nocb_adopt_orphan_cbs() Return false instead of 0 in rcu_nocb_adopt_orphan_cbs() as this has bool as return type. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. 
McKenney Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 427110475e33..4c1af96836f6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2581,7 +2581,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags) { - return 0; + return false; } static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -- cgit v1.2.3 From 9386c0b75dda05f535a10ea1abf1817fe292c81c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 13 Jul 2014 12:00:53 -0700 Subject: rcu: Rationalize kthread spawning Currently, RCU spawns kthreads from several different early_initcall() functions. Although this has served RCU well for quite some time, as more kthreads are added a more deterministic approach is required. This commit therefore causes all of RCU's early-boot kthreads to be spawned from a single early_initcall() function. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Tested-by: Paul Gortmaker --- kernel/rcu/tree.c | 4 +++- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 12 +++--------- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1b70cb6fbe3c..9be47f43903b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3489,7 +3489,7 @@ static int rcu_pm_notify(struct notifier_block *self, } /* - * Spawn the kthread that handles this RCU flavor's grace periods. + * Spawn the kthreads that handle each RCU flavor's grace periods. 
*/ static int __init rcu_spawn_gp_kthread(void) { @@ -3498,6 +3498,7 @@ static int __init rcu_spawn_gp_kthread(void) struct rcu_state *rsp; struct task_struct *t; + rcu_scheduler_fully_active = 1; for_each_rcu_flavor(rsp) { t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); @@ -3507,6 +3508,7 @@ static int __init rcu_spawn_gp_kthread(void) raw_spin_unlock_irqrestore(&rnp->lock, flags); rcu_spawn_nocb_kthreads(rsp); } + rcu_spawn_boost_kthreads(); return 0; } early_initcall(rcu_spawn_gp_kthread); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 6a86eb7bac45..a966092fdfd7 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -572,6 +572,7 @@ static void rcu_preempt_do_callbacks(void); static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ +static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4c1af96836f6..410c74424d96 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1435,14 +1435,13 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = { }; /* - * Spawn all kthreads -- called as soon as the scheduler is running. + * Spawn boost kthreads -- called as soon as the scheduler is running. 
*/ -static int __init rcu_spawn_kthreads(void) +static void __init rcu_spawn_boost_kthreads(void) { struct rcu_node *rnp; int cpu; - rcu_scheduler_fully_active = 1; for_each_possible_cpu(cpu) per_cpu(rcu_cpu_has_work, cpu) = 0; BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); @@ -1452,9 +1451,7 @@ static int __init rcu_spawn_kthreads(void) rcu_for_each_leaf_node(rcu_state_p, rnp) (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); } - return 0; } -early_initcall(rcu_spawn_kthreads); static void rcu_prepare_kthreads(int cpu) { @@ -1492,12 +1489,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { } -static int __init rcu_scheduler_really_started(void) +static void __init rcu_spawn_boost_kthreads(void) { - rcu_scheduler_fully_active = 1; - return 0; } -early_initcall(rcu_scheduler_really_started); static void rcu_prepare_kthreads(int cpu) { -- cgit v1.2.3 From 35ce7f29a44a888c45c0a9f202f69e10613c5306 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Jul 2014 11:30:24 -0700 Subject: rcu: Create rcuo kthreads only for onlined CPUs RCU currently uses for_each_possible_cpu() to spawn rcuo kthreads, which can result in more rcuo kthreads than one would expect, for example, derRichard reported 64 CPUs worth of rcuo kthreads on an 8-CPU image. This commit therefore creates rcuo kthreads only for those CPUs that actually come online. This was reported by derRichard on the OFTC IRC network. Reported-by: Richard Weinberger Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Tested-by: Paul Gortmaker --- kernel/rcu/tree.c | 3 +- kernel/rcu/tree.h | 6 +++- kernel/rcu/tree_plugin.h | 90 ++++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 86 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9be47f43903b..b49c8433f834 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3442,6 +3442,7 @@ static int rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_prepare_cpu(cpu); rcu_prepare_kthreads(cpu); + rcu_spawn_all_nocb_kthreads(cpu); break; case CPU_ONLINE: case CPU_DOWN_FAILED: @@ -3506,8 +3507,8 @@ static int __init rcu_spawn_gp_kthread(void) raw_spin_lock_irqsave(&rnp->lock, flags); rsp->gp_kthread = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_spawn_nocb_kthreads(rsp); } + rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); return 0; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index a966092fdfd7..a9a226d2e80a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -593,7 +593,11 @@ static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); -static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); +static void rcu_spawn_all_nocb_kthreads(int cpu); +static void __init rcu_spawn_nocb_kthreads(void); +#ifdef CONFIG_RCU_NOCB_CPU +static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 410c74424d96..31c7afb611fd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2479,6 
+2479,7 @@ void __init rcu_init_nohz(void) rdp->nxttail[RCU_NEXT_TAIL] != NULL); init_nocb_callback_list(rdp); } + rcu_organize_nocb_kthreads(rsp); } } @@ -2490,15 +2491,85 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) rdp->nocb_follower_tail = &rdp->nocb_follower_head; } +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are + * brought online out of order, this can require re-organizing the + * leader-follower relationships. + */ +static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp; + struct rcu_data *rdp_last; + struct rcu_data *rdp_old_leader; + struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu); + struct task_struct *t; + + /* + * If this isn't a no-CBs CPU or if it already has an rcuo kthread, + * then nothing to do. + */ + if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) + return; + + /* If we didn't spawn the leader first, reorganize! */ + rdp_old_leader = rdp_spawn->nocb_leader; + if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { + rdp_last = NULL; + rdp = rdp_old_leader; + do { + rdp->nocb_leader = rdp_spawn; + if (rdp_last && rdp != rdp_spawn) + rdp_last->nocb_next_follower = rdp; + rdp_last = rdp; + rdp = rdp->nocb_next_follower; + rdp_last->nocb_next_follower = NULL; + } while (rdp); + rdp_spawn->nocb_next_follower = rdp_old_leader; + } + + /* Spawn the kthread for this CPU and RCU flavor. */ + t = kthread_run(rcu_nocb_kthread, rdp_spawn, + "rcuo%c/%d", rsp->abbr, cpu); + BUG_ON(IS_ERR(t)); + ACCESS_ONCE(rdp_spawn->nocb_kthread) = t; +} + +/* + * If the specified CPU is a no-CBs CPU that does not already have its + * rcuo kthreads, spawn them. 
+ */ +static void rcu_spawn_all_nocb_kthreads(int cpu) +{ + struct rcu_state *rsp; + + if (rcu_scheduler_fully_active) + for_each_rcu_flavor(rsp) + rcu_spawn_one_nocb_kthread(rsp, cpu); +} + +/* + * Once the scheduler is running, spawn rcuo kthreads for all online + * no-CBs CPUs. This assumes that the early_initcall()s happen before + * non-boot CPUs come online -- if this changes, we will need to add + * some mutual exclusion. + */ +static void __init rcu_spawn_nocb_kthreads(void) +{ + int cpu; + + for_each_online_cpu(cpu) + rcu_spawn_all_nocb_kthreads(cpu); +} + /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ static int rcu_nocb_leader_stride = -1; module_param(rcu_nocb_leader_stride, int, 0444); /* - * Create a kthread for each RCU flavor for each no-CBs CPU. - * Also initialize leader-follower relationships. + * Initialize leader-follower relationships for all no-CBs CPU. */ -static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) { int cpu; int ls = rcu_nocb_leader_stride; @@ -2506,7 +2577,6 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ struct rcu_data *rdp_prev = NULL; - struct task_struct *t; if (rcu_nocb_mask == NULL) return; @@ -2532,12 +2602,6 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) rdp_prev->nocb_next_follower = rdp; } rdp_prev = rdp; - - /* Spawn the kthread for this CPU. 
*/ - t = kthread_run(rcu_nocb_kthread, rdp, - "rcuo%c/%d", rsp->abbr, cpu); - BUG_ON(IS_ERR(t)); - ACCESS_ONCE(rdp->nocb_kthread) = t; } } @@ -2591,7 +2655,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { } -static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +static void rcu_spawn_all_nocb_kthreads(int cpu) +{ +} + +static void __init rcu_spawn_nocb_kthreads(void) { } -- cgit v1.2.3 From 22c2f669611590b428647ac9a73bc63ef3989d4b Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Thu, 17 Jul 2014 20:11:01 -0400 Subject: rcu: Check for have_rcu_nocb_mask instead of rcu_nocb_mask If we configure a kernel with CONFIG_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_NONE=y and CONFIG_CPUMASK_OFFSTACK=n and do not pass in a rcu_nocb= boot parameter, the cpumask rcu_nocb_mask can be garbage instead of NULL. Hence this commit replaces checks for rcu_nocb_mask == NULL with a check for have_rcu_nocb_mask. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 31c7afb611fd..39e68bcf6d83 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2578,7 +2578,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. 
*/ struct rcu_data *rdp_prev = NULL; - if (rcu_nocb_mask == NULL) + if (!have_rcu_nocb_mask) return; if (ls == -1) { ls = int_sqrt(nr_cpu_ids); @@ -2608,9 +2608,9 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ static bool init_nocb_callback_list(struct rcu_data *rdp) { - if (rcu_nocb_mask == NULL || - !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) + if (!rcu_is_nocb_cpu(rdp->cpu)) return false; + rdp->nxttail[RCU_NEXT_TAIL] = NULL; return true; } -- cgit v1.2.3 From 417e8d26557c4264a484d78a7491316751afa46f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Jul 2014 11:26:54 -0700 Subject: rcu: Eliminate redundant rcu_sysidle_state variable Now that we have rcu_state_p, which references rcu_preempt_state for TREE_PREEMPT_RCU and rcu_sched_state for TREE_RCU, we don't need a separate rcu_sysidle_state variable. This commit therefore eliminates rcu_sysidle_state in favor of rcu_state_p. Signed-off-by: Paul E. McKenney Reviewed-by: Pranith Kumar Acked-by: Frederic Weisbecker Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 39e68bcf6d83..3ddad4fb11a8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2690,16 +2690,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu) #ifdef CONFIG_NO_HZ_FULL_SYSIDLE -/* - * Define RCU flavor that holds sysidle state. This needs to be the - * most active flavor of RCU. - */ -#ifdef CONFIG_PREEMPT_RCU -static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; -#else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - static int full_sysidle_state; /* Current system-idle state. */ #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle.
*/ #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ @@ -2841,7 +2831,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, * not the flavor of RCU that tracks sysidle state, or if this * is an offline or the timekeeping CPU, nothing to do. */ - if (!*isidle || rdp->rsp != rcu_sysidle_state || + if (!*isidle || rdp->rsp != rcu_state_p || cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) return; if (rcu_gp_in_progress(rdp->rsp)) @@ -2867,7 +2857,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, */ static bool is_sysidle_rcu_state(struct rcu_state *rsp) { - return rsp == rcu_sysidle_state; + return rsp == rcu_state_p; } /* @@ -2945,7 +2935,7 @@ static void rcu_sysidle_cancel(void) static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, unsigned long maxj, bool gpkt) { - if (rsp != rcu_sysidle_state) + if (rsp != rcu_state_p) return; /* Wrong flavor, ignore. */ if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) return; /* Running state machine from timekeeping CPU. */ @@ -3014,13 +3004,12 @@ bool rcu_sys_is_idle(void) /* Scan all the CPUs looking for nonidle CPUs. */ for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); + rdp = per_cpu_ptr(rcu_state_p->rda, cpu); rcu_sysidle_check_cpu(rdp, &isidle, &maxj); if (!isidle) break; } - rcu_sysidle_report(rcu_sysidle_state, - isidle, maxj, false); + rcu_sysidle_report(rcu_state_p, isidle, maxj, false); oldrss = rss; rss = ACCESS_ONCE(full_sysidle_state); } @@ -3047,7 +3036,7 @@ bool rcu_sys_is_idle(void) * provided by the memory allocator. */ if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_sysidle_state) && + !rcu_gp_in_progress(rcu_state_p) && !rsh.inuse && xchg(&rsh.inuse, 1) == 0) call_rcu(&rsh.rh, rcu_sysidle_cb); return false; -- cgit v1.2.3 From 663e131090dd10bac9dc0b4f5b624dd3211b20f6 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 21 Jul 2014 11:34:33 -0700 Subject: rcu: Don't track sysidle state if no nohz_full= CPUs If there are no nohz_full= CPUs, then there is currently no reason to track sysidle state. This commit therefore short-circuits this state tracking if !tick_nohz_full_enabled(). Note that these checks will need to be revisited if nohz_full= state can ever be changed at runtime. Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ddad4fb11a8..d5aec549558d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2707,6 +2707,10 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) { unsigned long j; + /* If there are no nohz_full= CPUs, no need to track this. */ + if (!tick_nohz_full_enabled()) + return; + /* Adjust nesting, check for fully idle. */ if (irq) { rdtp->dynticks_idle_nesting--; @@ -2772,6 +2776,10 @@ void rcu_sysidle_force_exit(void) */ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) { + /* If there are no nohz_full= CPUs, no need to track this. */ + if (!tick_nohz_full_enabled()) + return; + /* Adjust nesting, check for already non-idle. */ if (irq) { rdtp->dynticks_idle_nesting++; @@ -2826,6 +2834,10 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long j; struct rcu_dynticks *rdtp = rdp->dynticks; + /* If there are no nohz_full= CPUs, don't check system-wide idleness. 
*/ + if (!tick_nohz_full_enabled()) + return; + /* * If some other CPU has already reported non-idle, if this is * not the flavor of RCU that tracks sysidle state, or if this @@ -2952,6 +2964,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj) { + /* If there are no nohz_full= CPUs, no need to track this. */ + if (!tick_nohz_full_enabled()) + return; + rcu_sysidle_report(rsp, isidle, maxj, true); } @@ -2978,7 +2994,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp) /* * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. + * The caller must have disabled interrupts. This is not intended to be + * called unless tick_nohz_full_enabled(). */ bool rcu_sys_is_idle(void) { -- cgit v1.2.3 From 39953dfd40077c7480b1d5deb4d617e086b1c865 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Aug 2014 10:47:48 -0700 Subject: rcu: Avoid misordering in __call_rcu_nocb_enqueue() The NOCB leader wakeup ordering depends on the store to the header happening before the check for the leader already being awake. However, because atomic_long_add() does not return a value, it does not provide ordering guarantees, the incorrect comment in wake_nocb_leader() notwithstanding. This commit therefore adds a smp_mb__after_atomic() after the final atomic_long_add() to provide the needed ordering guarantee. Reported-by: Amit Shah Signed-off-by: Paul E. 
McKenney Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d5aec549558d..4ad63d861599 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2042,7 +2042,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) return; if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) { - /* Prior xchg orders against prior callback enqueue. */ + /* Prior smp_mb__after_atomic() orders against prior enqueue. */ ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false; wake_up(&rdp_leader->nocb_wq); } @@ -2071,6 +2071,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, ACCESS_ONCE(*old_rhpp) = rhp; atomic_long_add(rhcount, &rdp->nocb_q_count); atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); + smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ /* If we are not being polled and there is a kthread, awaken it ... */ t = ACCESS_ONCE(rdp->nocb_kthread); -- cgit v1.2.3 From 1772947bd0126661866069157e95197e9c0020e9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Aug 2014 11:27:31 -0700 Subject: rcu: Handle NOCB callbacks from irq-disabled idle code If an RCU callback is queued on a no-CBs CPU from idle code with irqs disabled, and if that CPU stays idle forever after, the callback will never be invoked. This commit therefore adds a check for this situation in ____call_rcu_nocb(), invoking the RCU core solely for the purpose of the ensuing return-to-idle transition. (If the CPU doesn't return to idle, the next scheduling-clock interrupt will fix things up.) Reported-by: Amit Shah Signed-off-by: Paul E. 
McKenney Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4ad63d861599..8b7351836228 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2129,6 +2129,17 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_callback(rdp->rsp->name, rhp, -atomic_long_read(&rdp->nocb_q_count_lazy), -atomic_long_read(&rdp->nocb_q_count)); + + /* + * If called from an extended quiescent state with interrupts + * disabled, invoke the RCU core in order to allow the idle-entry + * deferred-wakeup check to function. + */ + if (irqs_disabled_flags(flags) && + !rcu_is_watching() && + cpu_online(smp_processor_id())) + invoke_rcu_core(); + return true; } -- cgit v1.2.3 From c847f14217d5aec5336272a54a32ffcf6e06ddcb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Aug 2014 13:54:21 -0700 Subject: rcu: Avoid misordering in nocb_leader_wait() The NOCB follower wakeup ordering depends on the store to the tail pointer happening before the wakeup. However, because atomic_long_add() does not return a value, it does not provide ordering guarantees, and the locking in wake_up() only guarantees that the store will happen before the unlock, which might be too late. Even though this is only a theoretical issue, this commit adds a smp_mb__after_atomic() after the final atomic_long_add() to provide the needed ordering guarantee. Reported-by: Amit Shah Signed-off-by: Paul E. 
McKenney Tested-by: Paul Gortmaker --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8b7351836228..c554accfc5f5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2306,6 +2306,7 @@ wait_again: atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); atomic_long_add(rdp->nocb_gp_count_lazy, &rdp->nocb_follower_count_lazy); + smp_mb__after_atomic(); /* Store *tail before wakeup. */ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { /* * List was empty, wake up the follower. -- cgit v1.2.3 From 23a8e5c2d2a481fcf382490369c27b405a650212 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 11 Sep 2014 20:40:16 -0700 Subject: locktorture: Rename locktorture_runnable parameter ... to just 'torture_runnable'. It follows other variable naming and is shorter. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 0955b885d0dc..8c770b2c6e2a 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -87,9 +87,9 @@ static struct lock_writer_stress_stats *lwsa; #else #define LOCKTORTURE_RUNNABLE_INIT 0 #endif -int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; -module_param(locktorture_runnable, int, 0444); -MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); +int torture_runnable = LOCKTORTURE_RUNNABLE_INIT; +module_param(torture_runnable, int, 0444); +MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); /* Forward reference. 
*/ static void lock_torture_cleanup(void); @@ -355,7 +355,7 @@ static int __init lock_torture_init(void) &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, }; - if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) + if (!torture_init_begin(torture_type, verbose, &torture_runnable)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ -- cgit v1.2.3 From 42ddc75ddd478edac6ad9dc8c63abb4441541af2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 11 Sep 2014 20:40:18 -0700 Subject: locktorture: Support mutexes Add a "mutex_lock" torture test. The main difference with the already existing spinlock tests is that the latency of the critical region is much larger. We randomly delay for (arbitrarily) either 500 ms or, otherwise, 25 ms. While this can considerably reduce the amount of writes compared to non blocking locks, if run long enough it can have the same torturous effect. Furthermore it is more representative of mutex hold times and can stress better things like thrashing. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- Documentation/locking/locktorture.txt | 2 ++ kernel/locking/locktorture.c | 41 +++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt index 3eb9b81454d0..f2a905b27862 100644 --- a/Documentation/locking/locktorture.txt +++ b/Documentation/locking/locktorture.txt @@ -40,6 +40,8 @@ torture_type Type of lock to torture. By default, only spinlocks will o "spin_lock_irq": spin_lock_irq() and spin_unlock_irq() pairs. + o "mutex_lock": mutex_lock() and mutex_unlock() pairs. + torture_runnable Start locktorture at boot time in the case where the module is built into the kernel, otherwise wait for torture_runnable to be set via sysfs before starting. 
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8c770b2c6e2a..414ba45d580f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -66,7 +67,7 @@ torture_param(bool, verbose, true, static char *torture_type = "spin_lock"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, - "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); + "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); static atomic_t n_lock_torture_errors; @@ -206,6 +207,42 @@ static struct lock_torture_ops spin_lock_irq_ops = { .name = "spin_lock_irq" }; +static DEFINE_MUTEX(torture_mutex); + +static int torture_mutex_lock(void) __acquires(torture_mutex) +{ + mutex_lock(&torture_mutex); + return 0; +} + +static void torture_mutex_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_ms = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms * 5); + else + mdelay(longdelay_ms / 5); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_mutex_unlock(void) __releases(torture_mutex) +{ + mutex_unlock(&torture_mutex); +} + +static struct lock_torture_ops mutex_lock_ops = { + .writelock = torture_mutex_lock, + .write_delay = torture_mutex_delay, + .writeunlock = torture_mutex_unlock, + .name = "mutex_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. 
@@ -352,7 +389,7 @@ static int __init lock_torture_init(void) int i; int firsterr = 0; static struct lock_torture_ops *torture_ops[] = { - &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, &mutex_lock_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) -- cgit v1.2.3 From f095bfc0ea04829d6962edaf06a5c56e0c251f5b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 11 Sep 2014 20:40:19 -0700 Subject: locktorture: Teach about lock debugging Regular locks are very different than locks with debugging. For instance for mutexes, debugging forces to only take the slowpaths. As such, the locktorture module should take this into account when printing related information -- specifically when printing user passed parameters, it seems the right place for such info. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 414ba45d580f..a6049fa2287e 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -64,6 +64,7 @@ torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); +static bool debug_lock = false; static char *torture_type = "spin_lock"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, @@ -349,8 +350,9 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG - "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealwriters_stress, stat_interval, verbose, + "--- %s%s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d 
onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, debug_lock ? " [debug]": "", + nrealwriters_stress, stat_interval, verbose, shuffle_interval, stutter, shutdown_secs, onoff_interval, onoff_holdoff); } @@ -418,6 +420,15 @@ static int __init lock_torture_init(void) nrealwriters_stress = nwriters_stress; else nrealwriters_stress = 2 * num_online_cpus(); + +#ifdef CONFIG_DEBUG_MUTEXES + if (strncmp(torture_type, "mutex", 5) == 0) + debug_lock = true; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + if (strncmp(torture_type, "spin", 4) == 0) + debug_lock = true; +#endif lock_torture_print_module_parms(cur_ops, "Start of test"); /* Initialize the statistics so that each run gets its own numbers. */ -- cgit v1.2.3 From 1e6757a92189278c484799ea98fc69bdc528940e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 11 Sep 2014 20:40:20 -0700 Subject: locktorture: Make statistics generic The statistics structure can serve well for both reader and writer locks, thus simply rename some fields that mention 'write' and leave the declaration of lwsa. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. 
McKenney --- kernel/locking/locktorture.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index a6049fa2287e..de703a769c1d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -78,11 +78,11 @@ static struct task_struct **writer_tasks; static int nrealwriters_stress; static bool lock_is_write_held; -struct lock_writer_stress_stats { - long n_write_lock_fail; - long n_write_lock_acquired; +struct lock_stress_stats { + long n_lock_fail; + long n_lock_acquired; }; -static struct lock_writer_stress_stats *lwsa; +static struct lock_stress_stats *lwsa; /* writer statistics */ #if defined(MODULE) #define LOCKTORTURE_RUNNABLE_INIT 1 @@ -250,7 +250,7 @@ static struct lock_torture_ops mutex_lock_ops = { */ static int lock_torture_writer(void *arg) { - struct lock_writer_stress_stats *lwsp = arg; + struct lock_stress_stats *lwsp = arg; static DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("lock_torture_writer task started"); @@ -261,9 +261,9 @@ static int lock_torture_writer(void *arg) schedule_timeout_uninterruptible(1); cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) - lwsp->n_write_lock_fail++; + lwsp->n_lock_fail++; lock_is_write_held = 1; - lwsp->n_write_lock_acquired++; + lwsp->n_lock_acquired++; cur_ops->write_delay(&rand); lock_is_write_held = 0; cur_ops->writeunlock(); @@ -281,17 +281,17 @@ static void lock_torture_printk(char *page) bool fail = 0; int i; long max = 0; - long min = lwsa[0].n_write_lock_acquired; + long min = lwsa[0].n_lock_acquired; long long sum = 0; for (i = 0; i < nrealwriters_stress; i++) { - if (lwsa[i].n_write_lock_fail) + if (lwsa[i].n_lock_fail) fail = true; - sum += lwsa[i].n_write_lock_acquired; - if (max < lwsa[i].n_write_lock_fail) - max = lwsa[i].n_write_lock_fail; - if (min > lwsa[i].n_write_lock_fail) - min = lwsa[i].n_write_lock_fail; + sum += 
lwsa[i].n_lock_acquired; + if (max < lwsa[i].n_lock_fail) + max = lwsa[i].n_lock_fail; + if (min > lwsa[i].n_lock_fail) + min = lwsa[i].n_lock_fail; } page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); page += sprintf(page, @@ -441,8 +441,8 @@ static int __init lock_torture_init(void) goto unwind; } for (i = 0; i < nrealwriters_stress; i++) { - lwsa[i].n_write_lock_fail = 0; - lwsa[i].n_write_lock_acquired = 0; + lwsa[i].n_lock_fail = 0; + lwsa[i].n_lock_acquired = 0; } /* Start up the kthreads. */ -- cgit v1.2.3 From d36a7a0d5e8b5bff1671723d733eb61621b0cee4 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 11 Sep 2014 20:40:21 -0700 Subject: torture: Address race in module cleanup When performing module cleanups by calling torture_cleanup() the 'torture_type' string is nullified. However, callers are not necessarily done, and might still need to reference the variable. This impacts both rcutorture and locktorture, causing printing things like: [ 94.226618] (null)-torture: Stopping lock_torture_writer task [ 94.226624] (null)-torture: Stopping lock_torture_stats task Thus delay this operation until the very end of the cleanup process. The consequence (which shouldn't matter for this kind of program) is, of course, that we delay the window between rmmod and modprobing, for instance in module_torture_begin(). Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 3 ++- kernel/locking/locktorture.c | 3 ++- kernel/rcu/rcutorture.c | 3 ++- kernel/torture.c | 16 +++++++++++++--- 4 files changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index fec46f8c08eb..7759fc3c622d 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -77,7 +77,8 @@ int torture_stutter_init(int s); /* Initialization and cleanup.
*/ bool torture_init_begin(char *ttype, bool v, int *runnable); void torture_init_end(void); -bool torture_cleanup(void); +bool torture_cleanup_begin(void); +void torture_cleanup_end(void); bool torture_must_stop(void); bool torture_must_stop_irq(void); void torture_kthread_stopping(char *title); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index de703a769c1d..988267cc92c1 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -361,7 +361,7 @@ static void lock_torture_cleanup(void) { int i; - if (torture_cleanup()) + if (torture_cleanup_begin()) return; if (writer_tasks) { @@ -384,6 +384,7 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); + torture_cleanup_end(); } static int __init lock_torture_init(void) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6d1509500d2b..04c4b5afb759 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1514,7 +1514,7 @@ rcu_torture_cleanup(void) int i; rcutorture_record_test_transition(); - if (torture_cleanup()) { + if (torture_cleanup_begin()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; @@ -1566,6 +1566,7 @@ rcu_torture_cleanup(void) "End of test: RCU_HOTPLUG"); else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); + torture_cleanup_end(); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD diff --git a/kernel/torture.c b/kernel/torture.c index ede8b25ec1ae..dd70993c266c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -633,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end); * * This must be called before the caller starts shutting down its own * kthreads. + * + * Both torture_cleanup_begin() and torture_cleanup_end() must be paired, + * in order to correctly perform the cleanup. They are separated because + * threads can still need to reference the torture_type type, thus nullify + * only after completing all other relevant calls. 
*/ -bool torture_cleanup(void) +bool torture_cleanup_begin(void) { mutex_lock(&fullstop_mutex); if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { @@ -649,12 +654,17 @@ bool torture_cleanup(void) torture_shuffle_cleanup(); torture_stutter_cleanup(); torture_onoff_cleanup(); + return false; +} +EXPORT_SYMBOL_GPL(torture_cleanup_begin); + +void torture_cleanup_end(void) +{ mutex_lock(&fullstop_mutex); torture_type = NULL; mutex_unlock(&fullstop_mutex); - return false; } -EXPORT_SYMBOL_GPL(torture_cleanup); +EXPORT_SYMBOL_GPL(torture_cleanup_end); /* * Is it time for the current torture test to stop? -- cgit v1.2.3 From 4f6332c1dce9c64ef6bf93842067250dd850e482 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 11 Sep 2014 21:40:41 -0700 Subject: locktorture: Add infrastructure for torturing read locks Most of it is based on what we already have for writers. This allows readers to be very independent (and thus configurable), enabling future module parameters to control things such as rw distribution. Furthermore, readers have their own delaying function, allowing us to test different rw critical region latencies, and stress locking internals. Similarly, statistics, for now will only serve for the number of lock acquisitions -- as opposed to writers, readers have no failure detection. In addition, introduce a new nreaders_stress module parameter. The default number of readers will be the same number of writers threads. Writer threads are interleaved with readers. Documentation is updated, respectively. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. 
McKenney --- Documentation/locking/locktorture.txt | 16 +++- kernel/locking/locktorture.c | 176 ++++++++++++++++++++++++++++++---- 2 files changed, 168 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt index f2a905b27862..7a72621b924f 100644 --- a/Documentation/locking/locktorture.txt +++ b/Documentation/locking/locktorture.txt @@ -29,6 +29,11 @@ nwriters_stress Number of kernel threads that will stress exclusive lock ownership (writers). The default value is twice the number of online CPUs. +nreaders_stress Number of kernel threads that will stress shared lock + ownership (readers). The default is the same amount of writer + locks. If the user did not specify nwriters_stress, then + both readers and writers be the amount of online CPUs. + torture_type Type of lock to torture. By default, only spinlocks will be tortured. This module can torture the following locks, with string values as follows: @@ -97,15 +102,18 @@ STATISTICS Statistics are printed in the following format: spin_lock-torture: Writes: Total: 93746064 Max/Min: 0/0 Fail: 0 - (A) (B) (C) (D) + (A) (B) (C) (D) (E) (A): Lock type that is being tortured -- torture_type parameter. -(B): Number of times the lock was acquired. +(B): Number of writer lock acquisitions. If dealing with a read/write primitive + a second "Reads" statistics line is printed. + +(C): Number of times the lock was acquired. -(C): Min and max number of times threads failed to acquire the lock. +(D): Min and max number of times threads failed to acquire the lock. -(D): true/false values if there were errors acquiring the lock. This should +(E): true/false values if there were errors acquiring the lock. This should -only- be positive if there is a bug in the locking primitive's implementation. Otherwise a lock should never fail (i.e., spin_lock()). Of course, the same applies for (C), above. 
A dummy example of this is diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 988267cc92c1..c1073d79e440 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -52,6 +52,8 @@ MODULE_AUTHOR("Paul E. McKenney "); torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); +torture_param(int, nreaders_stress, -1, + "Number of read-locking stress-test threads"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); @@ -74,15 +76,19 @@ static atomic_t n_lock_torture_errors; static struct task_struct *stats_task; static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; static int nrealwriters_stress; static bool lock_is_write_held; +static int nrealreaders_stress; +static bool lock_is_read_held; struct lock_stress_stats { long n_lock_fail; long n_lock_acquired; }; static struct lock_stress_stats *lwsa; /* writer statistics */ +static struct lock_stress_stats *lrsa; /* reader statistics */ #if defined(MODULE) #define LOCKTORTURE_RUNNABLE_INIT 1 @@ -104,6 +110,9 @@ struct lock_torture_ops { int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); void (*writeunlock)(void); + int (*readlock)(void); + void (*read_delay)(struct torture_random_state *trsp); + void (*readunlock)(void); unsigned long flags; const char *name; }; @@ -142,6 +151,9 @@ static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, .writeunlock = torture_lock_busted_write_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, .name = "lock_busted" }; @@ -182,6 +194,9 @@ static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, .writeunlock = torture_spin_lock_write_unlock, + .readlock = 
NULL, + .read_delay = NULL, + .readunlock = NULL, .name = "spin_lock" }; @@ -205,6 +220,9 @@ static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, .writeunlock = torture_lock_spin_write_unlock_irq, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, .name = "spin_lock_irq" }; @@ -241,6 +259,9 @@ static struct lock_torture_ops mutex_lock_ops = { .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, .writeunlock = torture_mutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, .name = "mutex_lock" }; @@ -273,29 +294,58 @@ static int lock_torture_writer(void *arg) return 0; } +/* + * Lock torture reader kthread. Repeatedly acquires and releases + * the reader lock. + */ +static int lock_torture_reader(void *arg) +{ + struct lock_stress_stats *lrsp = arg; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_TOROUT_STRING("lock_torture_reader task started"); + set_user_nice(current, MAX_NICE); + + do { + if ((torture_random(&rand) & 0xfffff) == 0) + schedule_timeout_uninterruptible(1); + cur_ops->readlock(); + lock_is_read_held = 1; + lrsp->n_lock_acquired++; + cur_ops->read_delay(&rand); + lock_is_read_held = 0; + cur_ops->readunlock(); + stutter_wait("lock_torture_reader"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_reader"); + return 0; +} + /* * Create an lock-torture-statistics message in the specified buffer. */ -static void lock_torture_printk(char *page) +static void __torture_print_stats(char *page, + struct lock_stress_stats *statp, bool write) { bool fail = 0; - int i; + int i, n_stress; long max = 0; - long min = lwsa[0].n_lock_acquired; + long min = statp[0].n_lock_acquired; long long sum = 0; - for (i = 0; i < nrealwriters_stress; i++) { - if (lwsa[i].n_lock_fail) + n_stress = write ? 
nrealwriters_stress : nrealreaders_stress; + for (i = 0; i < n_stress; i++) { + if (statp[i].n_lock_fail) fail = true; - sum += lwsa[i].n_lock_acquired; - if (max < lwsa[i].n_lock_fail) - max = lwsa[i].n_lock_fail; - if (min > lwsa[i].n_lock_fail) - min = lwsa[i].n_lock_fail; + sum += statp[i].n_lock_acquired; + if (max < statp[i].n_lock_fail) + max = statp[i].n_lock_fail; + if (min > statp[i].n_lock_fail) + min = statp[i].n_lock_fail; } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); page += sprintf(page, - "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + write ? "Writes" : "Reads ", sum, max, min, max / 2 > min ? "???" : "", fail, fail ? "!!!" : ""); if (fail) @@ -315,15 +365,32 @@ static void lock_torture_stats_print(void) int size = nrealwriters_stress * 200 + 8192; char *buf; + if (cur_ops->readlock) + size += nrealreaders_stress * 200 + 8192; + buf = kmalloc(size, GFP_KERNEL); if (!buf) { pr_err("lock_torture_stats_print: Out of memory, need: %d", size); return; } - lock_torture_printk(buf); + + __torture_print_stats(buf, lwsa, true); pr_alert("%s", buf); kfree(buf); + + if (cur_ops->readlock) { + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("lock_torture_stats_print: Out of memory, need: %d", + size); + return; + } + + __torture_print_stats(buf, lrsa, false); + pr_alert("%s", buf); + kfree(buf); + } } /* @@ -350,10 +417,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG - "--- %s%s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, debug_lock ? 
" [debug]": "", - nrealwriters_stress, stat_interval, verbose, - shuffle_interval, stutter, shutdown_secs, + nrealwriters_stress, nrealreaders_stress, stat_interval, + verbose, shuffle_interval, stutter, shutdown_secs, onoff_interval, onoff_holdoff); } @@ -372,6 +439,14 @@ static void lock_torture_cleanup(void) writer_tasks = NULL; } + if (reader_tasks) { + for (i = 0; i < nrealreaders_stress; i++) + torture_stop_kthread(lock_torture_reader, + reader_tasks[i]); + kfree(reader_tasks); + reader_tasks = NULL; + } + torture_stop_kthread(lock_torture_stats, stats_task); lock_torture_stats_print(); /* -After- the stats thread is stopped! */ @@ -389,7 +464,7 @@ static void lock_torture_cleanup(void) static int __init lock_torture_init(void) { - int i; + int i, j; int firsterr = 0; static struct lock_torture_ops *torture_ops[] = { &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, &mutex_lock_ops, @@ -430,7 +505,6 @@ static int __init lock_torture_init(void) if (strncmp(torture_type, "spin", 4) == 0) debug_lock = true; #endif - lock_torture_print_module_parms(cur_ops, "Start of test"); /* Initialize the statistics so that each run gets its own numbers. */ @@ -446,8 +520,37 @@ static int __init lock_torture_init(void) lwsa[i].n_lock_acquired = 0; } - /* Start up the kthreads. */ + if (cur_ops->readlock) { + if (nreaders_stress >= 0) + nrealreaders_stress = nreaders_stress; + else { + /* + * By default distribute evenly the number of + * readers and writers. We still run the same number + * of threads as the writer-only locks default. 
+ */ + if (nwriters_stress < 0) /* user doesn't care */ + nrealwriters_stress = num_online_cpus(); + nrealreaders_stress = nrealwriters_stress; + } + + lock_is_read_held = 0; + lrsa = kmalloc(sizeof(*lrsa) * nrealreaders_stress, GFP_KERNEL); + if (lrsa == NULL) { + VERBOSE_TOROUT_STRING("lrsa: Out of memory"); + firsterr = -ENOMEM; + kfree(lwsa); + goto unwind; + } + for (i = 0; i < nrealreaders_stress; i++) { + lrsa[i].n_lock_fail = 0; + lrsa[i].n_lock_acquired = 0; + } + } + lock_torture_print_module_parms(cur_ops, "Start of test"); + + /* Prepare torture context. */ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); @@ -478,11 +581,44 @@ static int __init lock_torture_init(void) firsterr = -ENOMEM; goto unwind; } - for (i = 0; i < nrealwriters_stress; i++) { + + if (cur_ops->readlock) { + reader_tasks = kzalloc(nrealreaders_stress * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + } + + /* + * Create the kthreads and start torturing (oh, those poor little locks). + * + * TODO: Note that we interleave writers with readers, giving writers a + * slight advantage, by creating its kthread first. This can be modified + * for very specific needs, or even let the user choose the policy, if + * ever wanted. + */ + for (i = 0, j = 0; i < nrealwriters_stress || + j < nrealreaders_stress; i++, j++) { + if (i >= nrealwriters_stress) + goto create_reader; + + /* Create writer. */ firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], writer_tasks[i]); if (firsterr) goto unwind; + + create_reader: + if (cur_ops->readlock == NULL || (j >= nrealreaders_stress)) + continue; + /* Create reader. 
*/ + firsterr = torture_create_kthread(lock_torture_reader, &lrsa[j], + reader_tasks[j]); + if (firsterr) + goto unwind; } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, -- cgit v1.2.3 From 4a3b427f0b27c7e15edfa607524ff012a155337a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 11 Sep 2014 21:41:30 -0700 Subject: locktorture: Support rwsems We can easily do so with our new reader lock support. Just an arbitrary design default: readers have higher (5x) critical region latencies than writers: 50 ms and 10 ms, respectively. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- Documentation/locking/locktorture.txt | 2 ++ kernel/locking/locktorture.c | 68 ++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt index 7a72621b924f..be715015e0f7 100644 --- a/Documentation/locking/locktorture.txt +++ b/Documentation/locking/locktorture.txt @@ -47,6 +47,8 @@ torture_type Type of lock to torture. By default, only spinlocks will o "mutex_lock": mutex_lock() and mutex_unlock() pairs. + o "rwsem_lock": read/write down() and up() semaphore pairs. + torture_runnable Start locktorture at boot time in the case where the module is built into the kernel, otherwise wait for torture_runnable to be set via sysfs before starting. 
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index c1073d79e440..8480118c0ca8 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -265,6 +265,71 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +static DECLARE_RWSEM(torture_rwsem); +static int torture_rwsem_down_write(void) __acquires(torture_rwsem) +{ + down_write(&torture_rwsem); + return 0; +} + +static void torture_rwsem_write_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_ms = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms * 10); + else + mdelay(longdelay_ms / 10); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rwsem_up_write(void) __releases(torture_rwsem) +{ + up_write(&torture_rwsem); +} + +static int torture_rwsem_down_read(void) __acquires(torture_rwsem) +{ + down_read(&torture_rwsem); + return 0; +} + +static void torture_rwsem_read_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_ms = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms * 2); + else + mdelay(longdelay_ms / 2); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealreaders_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. 
*/ +#endif +} + +static void torture_rwsem_up_read(void) __releases(torture_rwsem) +{ + up_read(&torture_rwsem); +} + +static struct lock_torture_ops rwsem_lock_ops = { + .writelock = torture_rwsem_down_write, + .write_delay = torture_rwsem_write_delay, + .writeunlock = torture_rwsem_up_write, + .readlock = torture_rwsem_down_read, + .read_delay = torture_rwsem_read_delay, + .readunlock = torture_rwsem_up_read, + .name = "rwsem_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. @@ -467,7 +532,8 @@ static int __init lock_torture_init(void) int i, j; int firsterr = 0; static struct lock_torture_ops *torture_ops[] = { - &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, &mutex_lock_ops, + &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + &mutex_lock_ops, &rwsem_lock_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) -- cgit v1.2.3 From 630952c22b04ada7e88ad93b87ad893cd818cc6b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 11 Sep 2014 21:42:25 -0700 Subject: locktorture: Introduce torture context The amount of global variables is getting pretty ugly. Group variables related to the execution (ie: not parameters) in a new context structure. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. 
McKenney --- kernel/locking/locktorture.c | 161 ++++++++++++++++++++++--------------------- 1 file changed, 82 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8480118c0ca8..540d5dfe1112 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -66,29 +66,22 @@ torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(bool, verbose, true, "Enable verbose debugging printk()s"); -static bool debug_lock = false; static char *torture_type = "spin_lock"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); -static atomic_t n_lock_torture_errors; - static struct task_struct *stats_task; static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; -static int nrealwriters_stress; static bool lock_is_write_held; -static int nrealreaders_stress; static bool lock_is_read_held; struct lock_stress_stats { long n_lock_fail; long n_lock_acquired; }; -static struct lock_stress_stats *lwsa; /* writer statistics */ -static struct lock_stress_stats *lrsa; /* reader statistics */ #if defined(MODULE) #define LOCKTORTURE_RUNNABLE_INIT 1 @@ -117,8 +110,18 @@ struct lock_torture_ops { const char *name; }; -static struct lock_torture_ops *cur_ops; - +struct lock_torture_cxt { + int nrealwriters_stress; + int nrealreaders_stress; + bool debug_lock; + atomic_t n_lock_torture_errors; + struct lock_torture_ops *cur_ops; + struct lock_stress_stats *lwsa; /* writer statistics */ + struct lock_stress_stats *lrsa; /* reader statistics */ +}; +static struct lock_torture_cxt cxt = { 0, 0, false, + ATOMIC_INIT(0), + NULL, NULL}; /* * Definitions for lock torture testing. */ @@ -134,10 +137,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. 
*/ if (!(torture_random(trsp) % - (nrealwriters_stress * 2000 * longdelay_us))) + (cxt.nrealwriters_stress * 2000 * longdelay_us))) mdelay(longdelay_us); #ifdef CONFIG_PREEMPT - if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. */ #endif } @@ -174,13 +177,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) * we want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (nrealwriters_stress * 2000 * longdelay_us))) + (cxt.nrealwriters_stress * 2000 * longdelay_us))) mdelay(longdelay_us); if (!(torture_random(trsp) % - (nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT - if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. */ #endif } @@ -206,14 +209,14 @@ __acquires(torture_spinlock_irq) unsigned long flags; spin_lock_irqsave(&torture_spinlock, flags); - cur_ops->flags = flags; + cxt.cur_ops->flags = flags; return 0; } static void torture_lock_spin_write_unlock_irq(void) __releases(torture_spinlock) { - spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); + spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags); } static struct lock_torture_ops spin_lock_irq_ops = { @@ -240,12 +243,12 @@ static void torture_mutex_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. 
*/ if (!(torture_random(trsp) % - (nrealwriters_stress * 2000 * longdelay_ms))) + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 5); else mdelay(longdelay_ms / 5); #ifdef CONFIG_PREEMPT - if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. */ #endif } @@ -278,12 +281,12 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (nrealwriters_stress * 2000 * longdelay_ms))) + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 10); else mdelay(longdelay_ms / 10); #ifdef CONFIG_PREEMPT - if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. */ #endif } @@ -305,12 +308,12 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (nrealwriters_stress * 2000 * longdelay_ms))) + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 2); else mdelay(longdelay_ms / 2); #ifdef CONFIG_PREEMPT - if (!(torture_random(trsp) % (nrealreaders_stress * 20000))) + if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) preempt_schedule(); /* Allow test to be preempted. 
*/ #endif } @@ -345,14 +348,14 @@ static int lock_torture_writer(void *arg) do { if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cur_ops->writelock(); + cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = 1; lwsp->n_lock_acquired++; - cur_ops->write_delay(&rand); + cxt.cur_ops->write_delay(&rand); lock_is_write_held = 0; - cur_ops->writeunlock(); + cxt.cur_ops->writeunlock(); stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); torture_kthread_stopping("lock_torture_writer"); @@ -374,12 +377,12 @@ static int lock_torture_reader(void *arg) do { if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); - cur_ops->readlock(); + cxt.cur_ops->readlock(); lock_is_read_held = 1; lrsp->n_lock_acquired++; - cur_ops->read_delay(&rand); + cxt.cur_ops->read_delay(&rand); lock_is_read_held = 0; - cur_ops->readunlock(); + cxt.cur_ops->readunlock(); stutter_wait("lock_torture_reader"); } while (!torture_must_stop()); torture_kthread_stopping("lock_torture_reader"); @@ -398,7 +401,7 @@ static void __torture_print_stats(char *page, long min = statp[0].n_lock_acquired; long long sum = 0; - n_stress = write ? nrealwriters_stress : nrealreaders_stress; + n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; for (i = 0; i < n_stress; i++) { if (statp[i].n_lock_fail) fail = true; @@ -414,7 +417,7 @@ static void __torture_print_stats(char *page, sum, max, min, max / 2 > min ? "???" : "", fail, fail ? "!!!" 
: ""); if (fail) - atomic_inc(&n_lock_torture_errors); + atomic_inc(&cxt.n_lock_torture_errors); } /* @@ -427,11 +430,11 @@ static void __torture_print_stats(char *page, */ static void lock_torture_stats_print(void) { - int size = nrealwriters_stress * 200 + 8192; + int size = cxt.nrealwriters_stress * 200 + 8192; char *buf; - if (cur_ops->readlock) - size += nrealreaders_stress * 200 + 8192; + if (cxt.cur_ops->readlock) + size += cxt.nrealreaders_stress * 200 + 8192; buf = kmalloc(size, GFP_KERNEL); if (!buf) { @@ -440,11 +443,11 @@ static void lock_torture_stats_print(void) return; } - __torture_print_stats(buf, lwsa, true); + __torture_print_stats(buf, cxt.lwsa, true); pr_alert("%s", buf); kfree(buf); - if (cur_ops->readlock) { + if (cxt.cur_ops->readlock) { buf = kmalloc(size, GFP_KERNEL); if (!buf) { pr_err("lock_torture_stats_print: Out of memory, need: %d", @@ -452,7 +455,7 @@ static void lock_torture_stats_print(void) return; } - __torture_print_stats(buf, lrsa, false); + __torture_print_stats(buf, cxt.lrsa, false); pr_alert("%s", buf); kfree(buf); } @@ -483,8 +486,8 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, { pr_alert("%s" TORTURE_FLAG "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, debug_lock ? " [debug]": "", - nrealwriters_stress, nrealreaders_stress, stat_interval, + torture_type, tag, cxt.debug_lock ? 
" [debug]": "", + cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval, verbose, shuffle_interval, stutter, shutdown_secs, onoff_interval, onoff_holdoff); } @@ -497,7 +500,7 @@ static void lock_torture_cleanup(void) return; if (writer_tasks) { - for (i = 0; i < nrealwriters_stress; i++) + for (i = 0; i < cxt.nrealwriters_stress; i++) torture_stop_kthread(lock_torture_writer, writer_tasks[i]); kfree(writer_tasks); @@ -505,7 +508,7 @@ static void lock_torture_cleanup(void) } if (reader_tasks) { - for (i = 0; i < nrealreaders_stress; i++) + for (i = 0; i < cxt.nrealreaders_stress; i++) torture_stop_kthread(lock_torture_reader, reader_tasks[i]); kfree(reader_tasks); @@ -515,14 +518,14 @@ static void lock_torture_cleanup(void) torture_stop_kthread(lock_torture_stats, stats_task); lock_torture_stats_print(); /* -After- the stats thread is stopped! */ - if (atomic_read(&n_lock_torture_errors)) - lock_torture_print_module_parms(cur_ops, + if (atomic_read(&cxt.n_lock_torture_errors)) + lock_torture_print_module_parms(cxt.cur_ops, "End of test: FAILURE"); else if (torture_onoff_failures()) - lock_torture_print_module_parms(cur_ops, + lock_torture_print_module_parms(cxt.cur_ops, "End of test: LOCK_HOTPLUG"); else - lock_torture_print_module_parms(cur_ops, + lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); } @@ -541,8 +544,8 @@ static int __init lock_torture_init(void) /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) + cxt.cur_ops = torture_ops[i]; + if (strcmp(torture_type, cxt.cur_ops->name) == 0) break; } if (i == ARRAY_SIZE(torture_ops)) { @@ -555,40 +558,40 @@ static int __init lock_torture_init(void) torture_init_end(); return -EINVAL; } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! 
*/ + if (cxt.cur_ops->init) + cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ if (nwriters_stress >= 0) - nrealwriters_stress = nwriters_stress; + cxt.nrealwriters_stress = nwriters_stress; else - nrealwriters_stress = 2 * num_online_cpus(); + cxt.nrealwriters_stress = 2 * num_online_cpus(); #ifdef CONFIG_DEBUG_MUTEXES if (strncmp(torture_type, "mutex", 5) == 0) - debug_lock = true; + cxt.debug_lock = true; #endif #ifdef CONFIG_DEBUG_SPINLOCK if (strncmp(torture_type, "spin", 4) == 0) - debug_lock = true; + cxt.debug_lock = true; #endif /* Initialize the statistics so that each run gets its own numbers. */ lock_is_write_held = 0; - lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); - if (lwsa == NULL) { - VERBOSE_TOROUT_STRING("lwsa: Out of memory"); + cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + if (cxt.lwsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); firsterr = -ENOMEM; goto unwind; } - for (i = 0; i < nrealwriters_stress; i++) { - lwsa[i].n_lock_fail = 0; - lwsa[i].n_lock_acquired = 0; + for (i = 0; i < cxt.nrealwriters_stress; i++) { + cxt.lwsa[i].n_lock_fail = 0; + cxt.lwsa[i].n_lock_acquired = 0; } - if (cur_ops->readlock) { + if (cxt.cur_ops->readlock) { if (nreaders_stress >= 0) - nrealreaders_stress = nreaders_stress; + cxt.nrealreaders_stress = nreaders_stress; else { /* * By default distribute evenly the number of @@ -596,25 +599,25 @@ static int __init lock_torture_init(void) * of threads as the writer-only locks default. 
*/ if (nwriters_stress < 0) /* user doesn't care */ - nrealwriters_stress = num_online_cpus(); - nrealreaders_stress = nrealwriters_stress; + cxt.nrealwriters_stress = num_online_cpus(); + cxt.nrealreaders_stress = cxt.nrealwriters_stress; } lock_is_read_held = 0; - lrsa = kmalloc(sizeof(*lrsa) * nrealreaders_stress, GFP_KERNEL); - if (lrsa == NULL) { - VERBOSE_TOROUT_STRING("lrsa: Out of memory"); + cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + if (cxt.lrsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; - kfree(lwsa); + kfree(cxt.lwsa); goto unwind; } - for (i = 0; i < nrealreaders_stress; i++) { - lrsa[i].n_lock_fail = 0; - lrsa[i].n_lock_acquired = 0; + for (i = 0; i < cxt.nrealreaders_stress; i++) { + cxt.lrsa[i].n_lock_fail = 0; + cxt.lrsa[i].n_lock_acquired = 0; } } - lock_torture_print_module_parms(cur_ops, "Start of test"); + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ if (onoff_interval > 0) { @@ -640,7 +643,7 @@ static int __init lock_torture_init(void) goto unwind; } - writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), + writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), GFP_KERNEL); if (writer_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); @@ -648,8 +651,8 @@ static int __init lock_torture_init(void) goto unwind; } - if (cur_ops->readlock) { - reader_tasks = kzalloc(nrealreaders_stress * sizeof(reader_tasks[0]), + if (cxt.cur_ops->readlock) { + reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); @@ -666,22 +669,22 @@ static int __init lock_torture_init(void) * for very specific needs, or even let the user choose the policy, if * ever wanted. 
*/ - for (i = 0, j = 0; i < nrealwriters_stress || - j < nrealreaders_stress; i++, j++) { - if (i >= nrealwriters_stress) + for (i = 0, j = 0; i < cxt.nrealwriters_stress || + j < cxt.nrealreaders_stress; i++, j++) { + if (i >= cxt.nrealwriters_stress) goto create_reader; /* Create writer. */ - firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], + firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], writer_tasks[i]); if (firsterr) goto unwind; create_reader: - if (cur_ops->readlock == NULL || (j >= nrealreaders_stress)) + if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) continue; /* Create reader. */ - firsterr = torture_create_kthread(lock_torture_reader, &lrsa[j], + firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j], reader_tasks[j]); if (firsterr) goto unwind; -- cgit v1.2.3 From 59da22a02032cf1a069ec431f93d403b321ff6b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Sep 2014 10:36:15 -0700 Subject: rcutorture: Rename rcutorture_runnable parameter This commit changes rcutorture_runnable to torture_runnable, which is consistent with the names of the other parameters and is a bit shorter as well. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 2 +- include/linux/rcupdate.h | 3 --- kernel/rcu/rcutorture.c | 8 ++++---- kernel/sysctl.c | 9 --------- tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh | 2 +- 5 files changed, 6 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e1147bc62633..7aba744afcde 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2938,7 +2938,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Set time (s) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. 
- rcutorture.rcutorture_runnable= [BOOT] + rcutorture.torture_runnable= [BOOT] Start rcutorture running at boot time. rcutorture.shuffle_interval= [KNL] diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5cafd60c1ee4..a4a819ffb2d1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -47,9 +47,6 @@ #include extern int rcu_expedited; /* for sysctl */ -#ifdef CONFIG_RCU_TORTURE_TEST -extern int rcutorture_runnable; /* for sysctl */ -#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ enum rcutorture_type { RCU_FLAVOR, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 04c4b5afb759..240fa9094f83 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -168,9 +168,9 @@ static int rcu_torture_writer_state; #else #define RCUTORTURE_RUNNABLE_INIT 0 #endif -int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -module_param(rcutorture_runnable, int, 0444); -MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); +static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +module_param(torture_runnable, int, 0444); +MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 @@ -1636,7 +1636,7 @@ rcu_torture_init(void) RCUTORTURE_TASKS_OPS }; - if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) + if (!torture_init_begin(torture_type, verbose, &torture_runnable)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75875a741b5e..ab456664609d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1055,15 +1055,6 @@ static struct ctl_table kern_table[] = { .child = key_sysctls, }, #endif -#ifdef CONFIG_RCU_TORTURE_TEST - { - .procname = "rcutorture_runnable", - .data = &rcutorture_runnable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PERF_EVENTS /* * User-space scripts rely on the existence of this file diff --git a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh index 8977d8d31b19..ffb85ed786fa 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh @@ -51,7 +51,7 @@ per_version_boot_params () { `rcutorture_param_n_barrier_cbs "$1"` \ rcutorture.stat_interval=15 \ rcutorture.shutdown_secs=$3 \ - rcutorture.rcutorture_runnable=1 \ + rcutorture.torture_runnable=1 \ rcutorture.test_no_idle_hz=1 \ rcutorture.verbose=1 } -- cgit v1.2.3 From 6213daab2547fdc0d02a86abf3ac209ac6881ae3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 17 Sep 2014 18:18:09 +0800 Subject: cgroup: remove some useless forward declarations Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b5223c570eba..f7898e0bce1e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,7 +27,6 @@ struct cgroup_root; struct cgroup_subsys; -struct inode; struct cgroup; extern int cgroup_init_early(void); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ebd4476c57de..619aae399a3a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_dfl_base_files[]; static 
struct cftype cgroup_legacy_base_files[]; -static void cgroup_put(struct cgroup *cgrp); static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask); static int cgroup_destroy_locked(struct cgroup *cgrp); @@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /* IDR wrappers which synchronize using cgroup_idr_lock */ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, -- cgit v1.2.3 From 244bb9a6336d2aa53526261ec35c593ebd5c1a33 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 17 Sep 2014 18:18:34 +0800 Subject: cgroup: remove redundant code in cgroup_rmdir() We no longer clear kn->priv in cgroup_rmdir(), so we don't need to get an extra refcnt. Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 619aae399a3a..d739a732edb9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4841,13 +4841,10 @@ static int cgroup_rmdir(struct kernfs_node *kn) cgrp = cgroup_kn_lock_live(kn); if (!cgrp) return 0; - cgroup_get(cgrp); /* for @kn->priv clearing */ ret = cgroup_destroy_locked(cgrp); cgroup_kn_unlock(kn); - - cgroup_put(cgrp); return ret; } -- cgit v1.2.3 From 0c8fc2c1210556434835adfb2274f41704853e8a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 17 Sep 2014 18:19:24 +0800 Subject: cgroup: remove bogus comments We never grab cgroup mutex in fork and exit paths no matter whether notify_on_release is set or not. 
Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d739a732edb9..4ddc75588983 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -967,14 +967,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * knows that the cgroup won't be removed, as cgroup_rmdir() * needs that mutex. * - * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't - * (usually) take cgroup_mutex. These are the two most performance - * critical pieces of code here. The exception occurs on cgroup_exit(), - * when a task in a notify_on_release cgroup exits. Then cgroup_mutex - * is taken, and if the cgroup count is zero, a usermode call made - * to the release agent with the name of the cgroup (path relative to - * the root of cgroup file system) as the argument. - * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at -- cgit v1.2.3 From eb4aec84d6bdf98d00cedb41c18000f7a31e648a Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Thu, 18 Sep 2014 17:28:46 +0800 Subject: cgroup: fix unbalanced locking cgroup_pidlist_start() holds cgrp->pidlist_mutex and then calls pidlist_array_load(), and cgroup_pidlist_stop() releases the mutex. It is wrong that we release the mutex in the failure path in pidlist_array_load(), because cgroup_pidlist_stop() will be called no matter if cgroup_pidlist_start() returns errno or not. 
Fixes: 4bac00d16a8760eae7205e41d2c246477d42a210 Cc: <stable@vger.kernel.org> # 3.14+ Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Acked-by: Cong Wang --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 940aced4ed00..3a73f995a81e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3985,7 +3985,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l = cgroup_pidlist_find_create(cgrp, type); if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); pidlist_free(array); return -ENOMEM; } -- cgit v1.2.3 From 971ff49355387fef41d1327434d8939721a4eb35 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Thu, 18 Sep 2014 16:06:19 +0800 Subject: cgroup: use a per-cgroup work for release agent Instead of using a global work to schedule release agent on removable cgroups, we change to use a per-cgroup work to do this, which makes the code much simpler. v2: use a dedicated work instead of reusing css->destroy_work. (Tejun) Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 10 ++--- kernel/cgroup.c | 108 +++++++++++++++---------------------------------- 2 files changed, 36 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f7898e0bce1e..51958d0fb88f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -233,13 +233,6 @@ struct cgroup { */ struct list_head e_csets[CGROUP_SUBSYS_COUNT]; - /* - * Linked list running through all cgroups that can - * potentially be reaped by the release agent. Protected by - * release_list_lock - */ - struct list_head release_list; - /* * list of pidlists, up to two for each namespace (one for procs, one * for tasks); created on demand. 
@@ -249,6 +242,9 @@ struct cgroup { /* used to wait for offlining of csses */ wait_queue_head_t offline_waitq; + + /* used to schedule release agent */ + struct work_struct release_agent_work; }; #define MAX_CGROUP_ROOT_NAMELEN 64 diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4ddc75588983..db19a4884a7f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -392,12 +392,7 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else -/* the list of cgroups eligible for automatic release. Protected by - * release_list_lock */ -static LIST_HEAD(release_list); -static DEFINE_RAW_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); -static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); /* @@ -1577,7 +1572,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); INIT_LIST_HEAD(&cgrp->cset_links); - INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->self.cgroup = cgrp; @@ -1587,6 +1581,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); init_waitqueue_head(&cgrp->offline_waitq); + INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent); } static void init_cgroup_root(struct cgroup_root *root, @@ -4342,6 +4337,7 @@ static void css_free_work_fn(struct work_struct *work) /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); cgroup_pidlist_destroy_all(cgrp); + cancel_work_sync(&cgrp->release_agent_work); if (cgroup_parent(cgrp)) { /* @@ -4804,12 +4800,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_css(css, ssid, cgrp) kill_css(css); - /* CSS_ONLINE is clear, remove from ->release_list for the last time */ - raw_spin_lock(&release_list_lock); - if (!list_empty(&cgrp->release_list)) - list_del_init(&cgrp->release_list); - raw_spin_unlock(&release_list_lock); 
- /* * Remove @cgrp directory along with the base files. @cgrp has an * extra ref on its kn. @@ -5271,25 +5261,9 @@ void cgroup_exit(struct task_struct *tsk) static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && - !css_has_online_children(&cgrp->self)) { - /* - * Control Group is currently removeable. If it's not - * already queued for a userspace notification, queue - * it now - */ - int need_schedule_work = 0; - - raw_spin_lock(&release_list_lock); - if (!cgroup_is_dead(cgrp) && - list_empty(&cgrp->release_list)) { - list_add(&cgrp->release_list, &release_list); - need_schedule_work = 1; - } - raw_spin_unlock(&release_list_lock); - if (need_schedule_work) - schedule_work(&release_agent_work); - } + if (cgroup_is_releasable(cgrp) && !cgroup_has_tasks(cgrp) && + !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) + schedule_work(&cgrp->release_agent_work); } /* @@ -5317,52 +5291,36 @@ static void check_for_release(struct cgroup *cgrp) */ static void cgroup_release_agent(struct work_struct *work) { - BUG_ON(work != &release_agent_work); + struct cgroup *cgrp = + container_of(work, struct cgroup, release_agent_work); + char *pathbuf = NULL, *agentbuf = NULL, *path; + char *argv[3], *envp[3]; + mutex_lock(&cgroup_mutex); - raw_spin_lock(&release_list_lock); - while (!list_empty(&release_list)) { - char *argv[3], *envp[3]; - int i; - char *pathbuf = NULL, *agentbuf = NULL, *path; - struct cgroup *cgrp = list_entry(release_list.next, - struct cgroup, - release_list); - list_del_init(&cgrp->release_list); - raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!pathbuf) - goto continue_free; - path = cgroup_path(cgrp, pathbuf, PATH_MAX); - if (!path) - goto continue_free; - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!agentbuf) - goto continue_free; - - i = 0; - argv[i++] = agentbuf; - argv[i++] = path; - argv[i] = NULL; - - i = 0; - 
/* minimal command environment */ - envp[i++] = "HOME=/"; - envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[i] = NULL; - - /* Drop the lock while we invoke the usermode helper, - * since the exec could involve hitting disk and hence - * be a slow process */ - mutex_unlock(&cgroup_mutex); - call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - mutex_lock(&cgroup_mutex); - continue_free: - kfree(pathbuf); - kfree(agentbuf); - raw_spin_lock(&release_list_lock); - } - raw_spin_unlock(&release_list_lock); + + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out; + + path = cgroup_path(cgrp, pathbuf, PATH_MAX); + if (!path) + goto out; + + argv[0] = agentbuf; + argv[1] = path; + argv[2] = NULL; + + /* minimal command environment */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); +out: + kfree(agentbuf); + kfree(pathbuf); } static int __init cgroup_disable(char *str) -- cgit v1.2.3 From 006f4ac49742b5f70ef7e39176fd42a500144ccc Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Thu, 18 Sep 2014 16:03:15 +0800 Subject: cgroup: simplify proc_cgroup_show() Use the ONE macro instead of REG, and we can simplify proc_cgroup_show(). 
Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- fs/proc/base.c | 19 ++----------------- include/linux/cgroup.h | 3 ++- kernel/cgroup.c | 18 +++--------------- 3 files changed, 7 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index baf852b648ad..6b96892015ec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -376,21 +376,6 @@ static const struct file_operations proc_lstats_operations = { #endif -#ifdef CONFIG_CGROUPS -static int cgroup_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cgroup_show, pid); -} - -static const struct file_operations proc_cgroup_operations = { - .open = cgroup_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - #ifdef CONFIG_PROC_PID_CPUSET static int cpuset_open(struct inode *inode, struct file *file) @@ -2576,7 +2561,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("cpuset", S_IRUGO, proc_cpuset_operations), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, proc_cgroup_operations), + ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), @@ -2922,7 +2907,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("cpuset", S_IRUGO, proc_cpuset_operations), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, proc_cgroup_operations), + ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 51958d0fb88f..77a1d37b742b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -37,7 +37,8 @@ extern void cgroup_exit(struct task_struct *p); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); -extern int proc_cgroup_show(struct seq_file *, void *); +extern int 
proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk); /* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _cgrp_id, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index db19a4884a7f..df7733b48d2e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5030,12 +5030,9 @@ core_initcall(cgroup_wq_init); * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc//cgroup. */ - -/* TODO: Use a proper seq_file iterator */ -int proc_cgroup_show(struct seq_file *m, void *v) +int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) { - struct pid *pid; - struct task_struct *tsk; char *buf, *path; int retval; struct cgroup_root *root; @@ -5045,14 +5042,6 @@ int proc_cgroup_show(struct seq_file *m, void *v) if (!buf) goto out; - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - - retval = 0; - mutex_lock(&cgroup_mutex); down_read(&css_set_rwsem); @@ -5082,11 +5071,10 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_putc(m, '\n'); } + retval = 0; out_unlock: up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); - put_task_struct(tsk); -out_free: kfree(buf); out: return retval; -- cgit v1.2.3 From 52de4779f201758ddcf37360f09a16895756e708 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Thu, 18 Sep 2014 16:03:36 +0800 Subject: cpuset: simplify proc_cpuset_show() Use the ONE macro instead of REG, and we can simplify proc_cpuset_show(). 
Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- fs/proc/base.c | 20 ++------------------ include/linux/cpuset.h | 3 ++- kernel/cpuset.c | 15 +++------------ 3 files changed, 7 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6b96892015ec..4e8aa35fc3eb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -376,22 +376,6 @@ static const struct file_operations proc_lstats_operations = { #endif -#ifdef CONFIG_PROC_PID_CPUSET - -static int cpuset_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cpuset_show, pid); -} - -static const struct file_operations proc_cpuset_operations = { - .open = cpuset_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -2558,7 +2542,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, proc_cpuset_operations), + ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), @@ -2904,7 +2888,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, proc_cpuset_operations), + ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS ONE("cgroup", S_IRUGO, proc_cgroup_show), diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index ade2390ffe92..0d4e0675b318 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -86,7 +86,8 @@ extern void __cpuset_memory_pressure_bump(void); extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); -extern int proc_cpuset_show(struct seq_file *, void *); +extern int proc_cpuset_show(struct 
seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk); extern int cpuset_mem_spread_node(void); extern int cpuset_slab_spread_node(void); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 22874d7cf2c0..a37f4ed24867 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2729,10 +2729,9 @@ void __cpuset_memory_pressure_bump(void) * and we take cpuset_mutex, keeping cpuset_attach() from changing it * anyway. */ -int proc_cpuset_show(struct seq_file *m, void *unused_v) +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) { - struct pid *pid; - struct task_struct *tsk; char *buf, *p; struct cgroup_subsys_state *css; int retval; @@ -2742,24 +2741,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!buf) goto out; - retval = -ESRCH; - pid = m->private; - tsk = get_pid_task(pid, PIDTYPE_PID); - if (!tsk) - goto out_free; - retval = -ENAMETOOLONG; rcu_read_lock(); css = task_css(tsk, cpuset_cgrp_id); p = cgroup_path(css->cgroup, buf, PATH_MAX); rcu_read_unlock(); if (!p) - goto out_put_task; + goto out_free; seq_puts(m, p); seq_putc(m, '\n'); retval = 0; -out_put_task: - put_task_struct(tsk); out_free: kfree(buf); out: -- cgit v1.2.3 From dd56af42bd829c6e770ed69812bd65a04eaeb1e4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Aug 2014 20:25:06 -0700 Subject: rcu: Eliminate deadlock between CPU hotplug and expedited grace periods Currently, the expedited grace-period primitives do get_online_cpus(). This greatly simplifies their implementation, but means that calls to them holding locks that are acquired by CPU-hotplug notifiers (to say nothing of calls to these primitives from CPU-hotplug notifiers) can deadlock. But this is starting to become inconvenient, as can be seen here: https://lkml.org/lkml/2014/8/5/754. 
The problem in this case is that some developers need to acquire a mutex from a CPU-hotplug notifier, but also need to hold it across a synchronize_rcu_expedited(). As noted above, this currently results in deadlock. This commit avoids the deadlock and retains the simplicity by creating a try_get_online_cpus(), which returns false if the get_online_cpus() reference count could not immediately be incremented. If a call to try_get_online_cpus() returns true, the expedited primitives operate as before. If a call returns false, the expedited primitives fall back to normal grace-period operations. This falling back of course results in increased grace-period latency, but only during times when CPU hotplug operations are actually in flight. The effect should therefore be negligible during normal operation. Signed-off-by: Paul E. McKenney Cc: Josh Triplett Cc: "Rafael J. Wysocki" Tested-by: Lan Tianyu --- include/linux/cpu.h | 2 ++ include/linux/lockdep.h | 1 + kernel/cpu.c | 16 +++++++++++++++- kernel/rcu/tree.c | 19 ++++++++++++------- kernel/rcu/tree_plugin.h | 11 +++++------ 5 files changed, 35 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 95978ad7fcdd..b2d9a43012b2 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -213,6 +213,7 @@ extern struct bus_type cpu_subsys; extern void cpu_hotplug_begin(void); extern void cpu_hotplug_done(void); extern void get_online_cpus(void); +extern bool try_get_online_cpus(void); extern void put_online_cpus(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); @@ -230,6 +231,7 @@ int cpu_down(unsigned int cpu); static inline void cpu_hotplug_begin(void) {} static inline void cpu_hotplug_done(void) {} #define get_online_cpus() do { } while (0) +#define try_get_online_cpus() true #define put_online_cpus() do { } while (0) #define cpu_hotplug_disable() do { } while (0) #define cpu_hotplug_enable() do { } while (0) diff --git 
a/include/linux/lockdep.h b/include/linux/lockdep.h index 008388f920d7..4f86465cc317 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -505,6 +505,7 @@ static inline void print_irqtrace_events(struct task_struct *curr) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) +#define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, 1, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING diff --git a/kernel/cpu.c b/kernel/cpu.c index 81e2a388a0f6..356450f09c1f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -79,6 +79,8 @@ static struct { /* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ #define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) +#define cpuhp_lock_acquire_tryread() \ + lock_map_acquire_tryread(&cpu_hotplug.dep_map) #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) @@ -91,10 +93,22 @@ void get_online_cpus(void) mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); - } EXPORT_SYMBOL_GPL(get_online_cpus); +bool try_get_online_cpus(void) +{ + if (cpu_hotplug.active_writer == current) + return true; + if (!mutex_trylock(&cpu_hotplug.lock)) + return false; + cpuhp_lock_acquire_tryread(); + cpu_hotplug.refcount++; + mutex_unlock(&cpu_hotplug.lock); + return true; +} +EXPORT_SYMBOL_GPL(try_get_online_cpus); + void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d7a3b13bc94c..133e47223095 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2940,11 +2940,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * restructure your code to batch your updates, and then use a single * synchronize_sched() instead. 
* - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. Failing to observe - * these restriction will result in deadlock. - * * This implementation can be thought of as an application of ticket * locking to RCU, with sync_sched_expedited_started and * sync_sched_expedited_done taking on the roles of the halves @@ -2994,7 +2989,12 @@ void synchronize_sched_expedited(void) */ snap = atomic_long_inc_return(&rsp->expedited_start); firstsnap = snap; - get_online_cpus(); + if (!try_get_online_cpus()) { + /* CPU hotplug operation in flight, fall back to normal GP. */ + wait_rcu_gp(call_rcu_sched); + atomic_long_inc(&rsp->expedited_normal); + return; + } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); /* @@ -3041,7 +3041,12 @@ void synchronize_sched_expedited(void) * and they started after our first try, so their grace * period works for us. */ - get_online_cpus(); + if (!try_get_online_cpus()) { + /* CPU hotplug operation in flight, use normal GP. */ + wait_rcu_gp(call_rcu_sched); + atomic_long_inc(&rsp->expedited_normal); + return; + } snap = atomic_long_read(&rsp->expedited_start); smp_mb(); /* ensure read is before try_stop_cpus(). */ } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e2c5910546f6..387dd4599344 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -793,11 +793,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) * In fact, if you are using synchronize_rcu_expedited() in a loop, * please restructure your code to batch your updates, and then Use a * single synchronize_rcu() instead. - * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. 
Failing to observe - * these restriction will result in deadlock. */ void synchronize_rcu_expedited(void) { @@ -819,7 +814,11 @@ void synchronize_rcu_expedited(void) * being boosted. This simplifies the process of moving tasks * from leaf to root rcu_node structures. */ - get_online_cpus(); + if (!try_get_online_cpus()) { + /* CPU-hotplug operation in flight, fall back to normal GP. */ + wait_rcu_gp(call_rcu); + return; + } /* * Acquire lock, falling back to synchronize_rcu() if too many -- cgit v1.2.3 From ba7e5a279e72f4b246dc7a419ac707e1936ede3e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 4 Sep 2014 16:35:30 -0400 Subject: sched/numa: Use select_idle_sibling() to select a destination for task_numa_move() The code in task_numa_compare() will only examine at most one idle CPU per node, because they all have the same score. However, some idle CPUs are better candidates than others, due to busy or idle SMT siblings, etc... The scheduler has logic to find the best CPU within an LLC to place a task. The NUMA code should probably use it. This seems to reduce the standard deviation for single instance SPECjbb2005 with a low warehouse count on my 4 node test system. 
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: mgorman@suse.de Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140904163530.189d410a@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be9e97b0d76f..96e7147044bb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -665,6 +665,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP +static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); @@ -1257,6 +1258,13 @@ balance: if (load_too_imbalanced(src_load, dst_load, env)) goto unlock; + /* + * One idle CPU per node is evaluated for a task numa move. + * Call select_idle_sibling to maybe find a better one. + */ + if (!cur) + env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + assign: task_numa_assign(env, cur, imp); unlock: -- cgit v1.2.3 From f6be8af1c95de4a46e325e728900a70ceadb52cf Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 4 Sep 2014 15:17:53 +0800 Subject: sched: Add new API wake_up_if_idle() to wake up the idle cpu Implementing one new API wake_up_if_idle(), which is used to wake up the idle CPU. 
Suggested-by: Andy Lutomirski Signed-off-by: Chuansheng Liu Signed-off-by: Peter Zijlstra (Intel) Cc: daniel.lezcano@linaro.org Cc: rjw@rjwysocki.net Cc: linux-pm@vger.kernel.org Cc: changcheng.liu@intel.com Cc: xiaoming.wang@intel.com Cc: souvik.k.chakravarty@intel.com Cc: chuansheng.liu@intel.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409815075-4180-1-git-send-email-chuansheng.liu@intel.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index dd9eb4807389..82ff3d6efb19 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1024,6 +1024,7 @@ struct sched_domain_topology_level { extern struct sched_domain_topology_level *sched_domain_topology; extern void set_sched_topology(struct sched_domain_topology_level *tl); +extern void wake_up_if_idle(int cpu); #ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 78e5c839df13..f7c6ed2fd69d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1634,6 +1634,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) } } +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!is_idle_task(rq->curr)) + return; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + raw_spin_lock_irqsave(&rq->lock, flags); + if (is_idle_task(rq->curr)) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -- cgit v1.2.3 From c6f4459fc3ba532e896cb678e29b45cb985f82bf Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Thu, 4 Sep 2014 15:17:54 +0800 Subject: smp: Add new 
wake_up_all_idle_cpus() function Currently kick_all_cpus_sync() can break non-polling idle cpus out of idle through IPI interrupts. But sometimes we also need to break the polling idle cpus out of idle immediately, so that they reselect a suitable c-state; non-idle cpus should be left alone when we try to wake them up. Add a new function, wake_up_all_idle_cpus(), which brings all cpus out of idle using wake_up_if_idle(). Signed-off-by: Chuansheng Liu Signed-off-by: Peter Zijlstra (Intel) Cc: daniel.lezcano@linaro.org Cc: rjw@rjwysocki.net Cc: linux-pm@vger.kernel.org Cc: changcheng.liu@intel.com Cc: xiaoming.wang@intel.com Cc: souvik.k.chakravarty@intel.com Cc: luto@amacapital.net Cc: Andrew Morton Cc: Christoph Hellwig Cc: Frederic Weisbecker Cc: Geert Uytterhoeven Cc: Jan Kara Cc: Jens Axboe Cc: Jens Axboe Cc: Linus Torvalds Cc: Michal Hocko Cc: Paul Gortmaker Cc: Roman Gushchin Cc: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/1409815075-4180-2-git-send-email-chuansheng.liu@intel.com Signed-off-by: Ingo Molnar --- include/linux/smp.h | 2 ++ kernel/smp.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index 34347f26be9b..93dff5fff524 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -100,6 +100,7 @@ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); void kick_all_cpus_sync(void); +void wake_up_all_idle_cpus(void); /* * Generic and arch helpers @@ -148,6 +149,7 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, } static inline void kick_all_cpus_sync(void) { } +static inline void wake_up_all_idle_cpus(void) { } #endif /* !SMP */ diff --git a/kernel/smp.c b/kernel/smp.c index aff8aa14f547..9e0d0b289118 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "smpboot.h" @@ -699,3 +700,24 @@ void kick_all_cpus_sync(void) smp_call_function(do_nothing, NULL, 
1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); + +/** + * wake_up_all_idle_cpus - break all cpus out of idle + * wake_up_all_idle_cpus try to break all cpus which is in idle state even + * including idle polling cpus, for non-idle cpus, we will do nothing + * for them. + */ +void wake_up_all_idle_cpus(void) +{ + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu == smp_processor_id()) + continue; + + wake_up_if_idle(cpu); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); -- cgit v1.2.3 From 9c368b5b6eccce1cbd7f68142106b3b4ddb1c5b5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 12 Sep 2014 09:12:15 -0400 Subject: sched, time: Fix lock inversion in thread_group_cputime() The sig->stats_lock nests inside the tasklist_lock and the sighand->siglock in __exit_signal and wait_task_zombie. However, both of those locks can be taken from irq context, which means we need to use the interrupt safe variant of read_seqbegin_or_lock. This blocks interrupts when the "lock" branch is taken (seq is odd), preventing the lock inversion. On the first (lockless) pass through the loop, irqs are not blocked. Reported-by: Stanislaw Gruszka Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: prarit@redhat.com Cc: oleg@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410527535-9814-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2b57031afc19..64492dff8a81 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -289,13 +289,14 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) cputime_t utime, stime; struct task_struct *t; unsigned int seq, nextseq; + unsigned long flags; rcu_read_lock(); /* Attempt a lockless read on the first round. 
*/ nextseq = 0; do { seq = nextseq; - read_seqbegin_or_lock(&sig->stats_lock, &seq); + flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); times->utime = sig->utime; times->stime = sig->stime; times->sum_exec_runtime = sig->sum_sched_runtime; @@ -309,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) /* If lockless access failed, take the lock. */ nextseq = 1; } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry(&sig->stats_lock, seq); + done_seqretry_irqrestore(&sig->stats_lock, seq, flags); rcu_read_unlock(); } -- cgit v1.2.3 From f139caf2e89713687514d9db847a4fa2e29c87a2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:40:54 +0400 Subject: sched, cleanup, treewide: Remove set_current_state(TASK_RUNNING) after schedule() schedule(), io_schedule() and schedule_timeout() always return with TASK_RUNNING state set, so one more setting is unnecessary. (All places in patch are visible good, only exception is kiblnd_scheduler() from: drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c Its schedule() is one line above standard 3 lines of unified diff) No places where set_current_state() is used for mb(). Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529254.3569.23.camel@tkhai Cc: Alasdair Kergon Cc: Anil Belur Cc: Arnd Bergmann Cc: Dave Kleikamp Cc: David Airlie Cc: David Howells Cc: Dmitry Eremin Cc: Frank Blaschka Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Helge Deller Cc: Isaac Huang Cc: James E.J. Bottomley Cc: James E.J. Bottomley Cc: J. 
Bruce Fields Cc: Jeff Dike Cc: Jesper Nilsson Cc: Jiri Slaby Cc: Laura Abbott Cc: Liang Zhen Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Masaru Nomura Cc: Michael Opdenacker Cc: Mikael Starvik Cc: Mike Snitzer Cc: Neil Brown Cc: Oleg Drokin Cc: Peng Tao Cc: Richard Weinberger Cc: Robert Love Cc: Steven Rostedt Cc: Trond Myklebust Cc: Ursula Braun Cc: Zi Shen Lim Cc: devel@driverdev.osuosl.org Cc: dm-devel@redhat.com Cc: dri-devel@lists.freedesktop.org Cc: fcoe-devel@open-fcoe.org Cc: jfs-discussion@lists.sourceforge.net Cc: linux390@de.ibm.com Cc: linux-afs@lists.infradead.org Cc: linux-cris-kernel@axis.com Cc: linux-kernel@vger.kernel.org Cc: linux-nfs@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linux-raid@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: qla2xxx-upstream@qlogic.com Cc: user-mode-linux-devel@lists.sourceforge.net Cc: user-mode-linux-user@lists.sourceforge.net Signed-off-by: Ingo Molnar --- arch/cris/arch-v10/drivers/sync_serial.c | 1 - arch/cris/arch-v32/drivers/sync_serial.c | 1 - arch/um/drivers/random.c | 1 - drivers/gpu/vga/vgaarb.c | 1 - drivers/md/dm-bufio.c | 1 - drivers/parisc/power.c | 1 - drivers/s390/net/claw.c | 2 -- drivers/scsi/fcoe/fcoe.c | 1 - drivers/scsi/qla2xxx/qla_os.c | 1 - drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c | 3 --- drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c | 1 - drivers/staging/lustre/lustre/libcfs/fail.c | 1 - drivers/tty/bfin_jtag_comm.c | 1 - fs/afs/vlocation.c | 1 - fs/jfs/jfs_logmgr.c | 2 -- fs/jfs/jfs_txnmgr.c | 3 --- fs/nfs/blocklayout/blocklayoutdev.c | 1 - fs/nfs/blocklayout/blocklayoutdm.c | 1 - fs/nfsd/nfs4recover.c | 1 - kernel/time/hrtimer.c | 1 - kernel/trace/ring_buffer_benchmark.c | 3 --- 21 files changed, 29 deletions(-) (limited to 'kernel') diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c index 29eb02ab3f25..0f3983241e60 100644 --- a/arch/cris/arch-v10/drivers/sync_serial.c +++ 
b/arch/cris/arch-v10/drivers/sync_serial.c @@ -1086,7 +1086,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf, } local_irq_restore(flags); schedule(); - set_current_state(TASK_RUNNING); remove_wait_queue(&port->out_wait_q, &wait); if (signal_pending(current)) return -EINTR; diff --git a/arch/cris/arch-v32/drivers/sync_serial.c b/arch/cris/arch-v32/drivers/sync_serial.c index bbb806b68838..5a149134cfb5 100644 --- a/arch/cris/arch-v32/drivers/sync_serial.c +++ b/arch/cris/arch-v32/drivers/sync_serial.c @@ -1089,7 +1089,6 @@ static ssize_t sync_serial_write(struct file *file, const char *buf, } schedule(); - set_current_state(TASK_RUNNING); remove_wait_queue(&port->out_wait_q, &wait); if (signal_pending(current)) diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index 9e3a72205827..dd16c902ff70 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -79,7 +79,6 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, set_task_state(current, TASK_INTERRUPTIBLE); schedule(); - set_task_state(current, TASK_RUNNING); remove_wait_queue(&host_read_wait, &wait); if (atomic_dec_and_test(&host_sleep_count)) { diff --git a/drivers/gpu/vga/vgaarb.c b/drivers/gpu/vga/vgaarb.c index d2077f040f3e..d07f810c7087 100644 --- a/drivers/gpu/vga/vgaarb.c +++ b/drivers/gpu/vga/vgaarb.c @@ -403,7 +403,6 @@ int vga_get(struct pci_dev *pdev, unsigned int rsrc, int interruptible) } schedule(); remove_wait_queue(&vga_wait_queue, &wait); - set_current_state(TASK_RUNNING); } return rc; } diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ab472c557d18..0505559f0965 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -720,7 +720,6 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) io_schedule(); - set_task_state(current, TASK_RUNNING); remove_wait_queue(&c->free_buffer_wait, &wait); dm_bufio_lock(c); diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c index 
90cca5e3805f..ef31b77404ef 100644 --- a/drivers/parisc/power.c +++ b/drivers/parisc/power.c @@ -121,7 +121,6 @@ static int kpowerswd(void *param) unsigned long soft_power_reg = (unsigned long) param; schedule_timeout_interruptible(pwrsw_enabled ? HZ : HZ/POWERSWITCH_POLL_PER_SEC); - __set_current_state(TASK_RUNNING); if (unlikely(!pwrsw_enabled)) continue; diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c index fbc6701bef30..213e54ee8a66 100644 --- a/drivers/s390/net/claw.c +++ b/drivers/s390/net/claw.c @@ -481,7 +481,6 @@ claw_open(struct net_device *dev) spin_unlock_irqrestore( get_ccwdev_lock(privptr->channel[i].cdev), saveflags); schedule(); - set_current_state(TASK_RUNNING); remove_wait_queue(&privptr->channel[i].wait, &wait); if(rc != 0) ccw_check_return_code(privptr->channel[i].cdev, rc); @@ -828,7 +827,6 @@ claw_release(struct net_device *dev) spin_unlock_irqrestore( get_ccwdev_lock(privptr->channel[i].cdev), saveflags); schedule(); - set_current_state(TASK_RUNNING); remove_wait_queue(&privptr->channel[i].wait, &wait); if (rc != 0) { ccw_check_return_code(privptr->channel[i].cdev, rc); diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 00ee0ed642aa..4a8ac7d8c76b 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1884,7 +1884,6 @@ retry: set_current_state(TASK_INTERRUPTIBLE); spin_unlock_bh(&p->fcoe_rx_list.lock); schedule(); - set_current_state(TASK_RUNNING); goto retry; } diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index be9698d920c2..8b5a5dc129b4 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -4853,7 +4853,6 @@ qla2x00_do_dpc(void *data) "DPC handler sleeping.\n"); schedule(); - __set_current_state(TASK_RUNNING); if (!base_vha->flags.init_done || ha->flags.mbox_busy) goto end_loop; diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c index 306d72876432..b94f7436ec19 
100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3215,7 +3215,6 @@ kiblnd_connd (void *arg) schedule_timeout(timeout); - set_current_state(TASK_RUNNING); remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait); spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); } @@ -3432,7 +3431,6 @@ kiblnd_scheduler(void *arg) busy_loops = 0; remove_wait_queue(&sched->ibs_waitq, &wait); - set_current_state(TASK_RUNNING); spin_lock_irqsave(&sched->ibs_lock, flags); } @@ -3507,7 +3505,6 @@ kiblnd_failover_thread(void *arg) rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) : cfs_time_seconds(1)); - set_current_state(TASK_RUNNING); remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait); write_lock_irqsave(glock, flags); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c index 521439954fcb..9994fc66111b 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -2233,7 +2233,6 @@ ksocknal_connd (void *arg) nloops = 0; schedule_timeout(timeout); - set_current_state(TASK_RUNNING); remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); spin_lock_bh(connd_lock); } diff --git a/drivers/staging/lustre/lustre/libcfs/fail.c b/drivers/staging/lustre/lustre/libcfs/fail.c index 1bf9c90b4789..e73ca3df9734 100644 --- a/drivers/staging/lustre/lustre/libcfs/fail.c +++ b/drivers/staging/lustre/lustre/libcfs/fail.c @@ -131,7 +131,6 @@ int __cfs_fail_timeout_set(__u32 id, __u32 value, int ms, int set) id, ms); set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(cfs_time_seconds(ms) / 1000); - set_current_state(TASK_RUNNING); CERROR("cfs_fail_timeout id %x awake\n", id); } return ret; diff --git a/drivers/tty/bfin_jtag_comm.c b/drivers/tty/bfin_jtag_comm.c index 8096fcbe2dc1..d7b198c400c7 100644 --- a/drivers/tty/bfin_jtag_comm.c +++ 
b/drivers/tty/bfin_jtag_comm.c @@ -77,7 +77,6 @@ bfin_jc_emudat_manager(void *arg) pr_debug("waiting for readers\n"); __set_current_state(TASK_UNINTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); continue; } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index b6df2e83809f..52976785a32c 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -130,7 +130,6 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, /* second+ BUSY - sleep a little bit */ set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); - __set_current_state(TASK_RUNNING); } continue; } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 0acddf60af55..bc462dcd7a40 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1585,7 +1585,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait) set_current_state(TASK_UNINTERRUPTIBLE); LOGGC_UNLOCK(log); schedule(); - __set_current_state(TASK_RUNNING); LOGGC_LOCK(log); remove_wait_queue(&target->gcwait, &__wait); } @@ -2359,7 +2358,6 @@ int jfsIOWait(void *arg) set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&log_redrive_lock); schedule(); - __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 564c4f279ac6..d595856453b2 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -136,7 +136,6 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) set_current_state(TASK_UNINTERRUPTIBLE); TXN_UNLOCK(); io_schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(event, &wait); } @@ -2808,7 +2807,6 @@ int jfs_lazycommit(void *arg) set_current_state(TASK_INTERRUPTIBLE); LAZY_UNLOCK(flags); schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(&jfs_commit_thread_wait, &wq); } } while (!kthread_should_stop()); @@ -2996,7 +2994,6 @@ int jfs_sync(void *arg) set_current_state(TASK_INTERRUPTIBLE); TXN_UNLOCK(); schedule(); - __set_current_state(TASK_RUNNING); } } while 
(!kthread_should_stop()); diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c index 04303b5c9361..9fde840c42d3 100644 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ b/fs/nfs/blocklayout/blocklayoutdev.c @@ -146,7 +146,6 @@ nfs4_blk_decode_device(struct nfs_server *server, set_current_state(TASK_UNINTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(&nn->bl_wq, &wq); if (reply->status != BL_DEVICE_REQUEST_PROC) { diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 8999cfddd866..b18680eb6b39 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -76,7 +76,6 @@ static void dev_remove(struct net *net, dev_t dev) set_current_state(TASK_UNINTERRUPTIBLE); schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(&nn->bl_wq, &wq); out: diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 9c271f42604a..8f1af78ebb67 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -670,7 +670,6 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) } schedule(); - set_current_state(TASK_RUNNING); if (msg.errno < 0) ret = msg.errno; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1c2fe7de2842..ab370ffffd53 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, */ if (!expires) { schedule(); - __set_current_state(TASK_RUNNING); return -EINTR; } diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 0434ff1b808e..3f9e328c30b5 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -205,7 +205,6 @@ static void ring_buffer_consumer(void) break; schedule(); - __set_current_state(TASK_RUNNING); } reader_finish = 0; complete(&read_done); @@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg) break; 
schedule(); - __set_current_state(TASK_RUNNING); } __set_current_state(TASK_RUNNING); @@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg) trace_printk("Sleeping for 10 secs\n"); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ * SLEEP_TIME); - __set_current_state(TASK_RUNNING); } if (kill_test) -- cgit v1.2.3 From a8edd075323cec607797fdd1d7b1222c987f4a47 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:41:16 +0400 Subject: sched/fair: cleanup: Remove useless assignment in select_task_rq_fair() new_cpu is reassigned below, so we do not need this here. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529276.3569.24.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96e7147044bb..9807a991dc0e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4521,11 +4521,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (p->nr_cpus_allowed == 1) return prev_cpu; - if (sd_flag & SD_BALANCE_WAKE) { - if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) - want_affine = 1; - new_cpu = prev_cpu; - } + if (sd_flag & SD_BALANCE_WAKE) + want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { -- cgit v1.2.3 From f3cd1c4ec059c956d3346705e453aff3ace3b494 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:41:40 +0400 Subject: sched/core: Use put_prev_task() accessor where possible Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529300.3569.25.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f7c6ed2fd69d..5536397a0309 100644 
--- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3033,7 +3033,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); /* * Boosting condition are: @@ -3586,7 +3586,7 @@ change: if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); prev_class = p->sched_class; __setscheduler(rq, p, attr); @@ -4792,7 +4792,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (queued) dequeue_task(rq, p, 0); if (running) - p->sched_class->put_prev_task(rq, p); + put_prev_task(rq, p); p->numa_preferred_nid = nid; @@ -7374,7 +7374,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) dequeue_task(rq, tsk, 0); if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); + put_prev_task(rq, tsk); tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), -- cgit v1.2.3 From f3f1768f89d601ad29f4701deef91caaa82b9f57 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:42:01 +0400 Subject: sched/rt: Remove useless if from cleanup pick_next_task_rt() _pick_next_task_rt() never returns NULL. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529321.3569.26.camel@tkhai Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4feac8fcb47f..2e6a7743703e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ - if (p) - dequeue_pushable_task(rq, p); + dequeue_pushable_task(rq, p); set_post_schedule(rq); -- cgit v1.2.3 From 1ba93d42727c44001aa8ccffd39c8ab5705379e2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 17:42:20 +0400 Subject: sched/dl: Simplify pick_dl_task() 1) Nobody calls pick_dl_task() with negative cpu, it's an old RT leftover. 2) If p->nr_cpus_allowed is 1, then the affinity has just been changed in set_cpus_allowed_ptr(); we'll pick it just earlier than the migration thread. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410529340.3569.27.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cc4eb89019c1..aaa5abbff2f1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1121,10 +1121,8 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && - (p->nr_cpus_allowed > 1)) + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) return 1; - return 0; } -- cgit v1.2.3 From a15b12ac36ad4e7b856a4ae54937ae26a51aebad Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Sep 2014 15:03:34 +0400 Subject: sched: Do not stop cpu in set_cpus_allowed_ptr() if task is not running If a task is queued but not running on its rq, we can simply migrate it without the migration thread and a context switch. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410519814.3569.7.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5536397a0309..4b1ddebed54a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4629,6 +4629,33 @@ void init_idle(struct task_struct *idle, int cpu) } #ifdef CONFIG_SMP +/* + * move_queued_task - move a queued task to new rq. + * + * Returns (locked) new rq. Old rq's lock is released. 
+ */ +static struct rq *move_queued_task(struct task_struct *p, int new_cpu) +{ + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); + + dequeue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; + set_task_cpu(p, new_cpu); + raw_spin_unlock(&rq->lock); + + rq = cpu_rq(new_cpu); + + raw_spin_lock(&rq->lock); + BUG_ON(task_cpu(p) != new_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; + enqueue_task(rq, p, 0); + check_preempt_curr(rq, p, 0); + + return rq; +} + void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { if (p->sched_class && p->sched_class->set_cpus_allowed) @@ -4685,14 +4712,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (task_on_rq_queued(p) || p->state == TASK_WAKING) { + if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; - } + } else if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); out: task_rq_unlock(rq, p, &flags); @@ -4735,19 +4763,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. 
*/ - if (task_on_rq_queued(p)) { - dequeue_task(rq, p, 0); - p->on_rq = TASK_ON_RQ_MIGRATING; - set_task_cpu(p, dest_cpu); - raw_spin_unlock(&rq->lock); - - rq = cpu_rq(dest_cpu); - raw_spin_lock(&rq->lock); - BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; - enqueue_task(rq, p, 0); - check_preempt_curr(rq, p, 0); - } + if (task_on_rq_queued(p)) + rq = move_queued_task(p, dest_cpu); done: ret = 1; fail: -- cgit v1.2.3 From d4311ff1a8da48d609db9500f121c15580dfeeb7 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:17 +0100 Subject: init/main.c: Give init_task a canary Tasks get their end of stack set to STACK_END_MAGIC with the aim to catch stack overruns. Currently this feature does not apply to init_task. This patch removes this restriction. Note that a similar patch was posted by Prarit Bhargava some time ago but was never merged: http://marc.info/?l=linux-kernel&m=127144305403241&w=2 Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Acked-by: Michael Ellerman Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: tglx@linutronix.de Cc: hannes@cmpxchg.org Cc: Alex Thorlton Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Daeseok Youn Cc: David Rientjes Cc: Fabian Frederick Cc: Geert Uytterhoeven Cc: Jiri Olsa Cc: Kees Cook Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael Opdenacker Cc: Paul Mackerras Cc: Prarit Bhargava Cc: Rik van Riel Cc: Rusty Russell Cc: Seiji Aguchi Cc: Steven Rostedt Cc: Vladimir Davydov Cc: Yasuaki Ishimatsu Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1410527779-8133-2-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 3 +-- arch/x86/mm/fault.c | 3 +-- include/linux/sched.h | 2 ++ init/main.c | 1 + kernel/fork.c | 12 +++++++++--- kernel/trace/trace_stack.c | 4 +--- 6 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 51ab9e7e6c39..35d0760c3fa4 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -538,7 +537,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) regs->nip); stackend = end_of_stack(current); - if (current != &init_task && *stackend != STACK_END_MAGIC) + if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); die("Kernel access of bad area", regs, sig); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a24194681513..bc23a7043c65 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -3,7 +3,6 @@ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ -#include /* STACK_END_MAGIC */ #include /* test_thread_flag(), ... */ #include /* oops_begin/end, ... 
*/ #include /* search_exception_table */ @@ -710,7 +709,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, show_fault_oops(regs, error_code, address); stackend = end_of_stack(tsk); - if (tsk != &init_task && *stackend != STACK_END_MAGIC) + if (*stackend != STACK_END_MAGIC) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; diff --git a/include/linux/sched.h b/include/linux/sched.h index 82ff3d6efb19..118dca7d5a28 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -57,6 +57,7 @@ struct sched_param { #include #include #include +#include #include @@ -2638,6 +2639,7 @@ static inline unsigned long stack_not_used(struct task_struct *p) return (unsigned long)n - (unsigned long)end_of_stack(p); } #endif +extern void set_task_stack_end_magic(struct task_struct *tsk); /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available diff --git a/init/main.c b/init/main.c index bb1aed928f21..5fc3fc7bd475 100644 --- a/init/main.c +++ b/init/main.c @@ -508,6 +508,7 @@ asmlinkage __visible void __init start_kernel(void) * lockdep hash: */ lockdep_init(); + set_task_stack_end_magic(&init_task); smp_setup_processor_id(); debug_objects_early_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 9387ae8ab048..ad64248c4b18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst, return 0; } +void set_task_stack_end_magic(struct task_struct *tsk) +{ + unsigned long *stackend; + + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; - unsigned long *stackend; int node = tsk_fork_get_node(orig); int err; @@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); 
clear_tsk_need_resched(tsk); - stackend = end_of_stack(tsk); - *stackend = STACK_END_MAGIC; /* for overflow detection */ + set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 8a4e5cb66a4c..1636e41828c2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,7 +13,6 @@ #include #include #include -#include #include @@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - if ((current != &init_task && - *(end_of_stack(current)) != STACK_END_MAGIC)) { + if (*end_of_stack(current) != STACK_END_MAGIC) { print_max_stack(); BUG(); } -- cgit v1.2.3 From a70857e46dd13e87ae06bf0e64cb6a2d4f436265 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:18 +0100 Subject: sched: Add helper for task stack page overrun checking This facility is used in a few places so let's introduce a helper function to improve code readability. 
Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: oleg@redhat.com Cc: riel@redhat.com Cc: prarit@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: mpe@ellerman.id.au Cc: tglx@linutronix.de Cc: hannes@cmpxchg.org Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael Ellerman Cc: Paul Mackerras Cc: Seiji Aguchi Cc: Steven Rostedt Cc: Yasuaki Ishimatsu Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1410527779-8133-3-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 4 +--- arch/x86/mm/fault.c | 4 +--- include/linux/sched.h | 2 ++ kernel/trace/trace_stack.c | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 35d0760c3fa4..99b2f2775658 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -507,7 +507,6 @@ bail: void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) { const struct exception_table_entry *entry; - unsigned long *stackend; /* Are we prepared to handle this fault? 
*/ if ((entry = search_exception_tables(regs->nip)) != NULL) { @@ -536,8 +535,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", regs->nip); - stackend = end_of_stack(current); - if (*stackend != STACK_END_MAGIC) + if (task_stack_end_corrupted(current)) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); die("Kernel access of bad area", regs, sig); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index bc23a7043c65..6240bc7ae741 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -648,7 +648,6 @@ no_context(struct pt_regs *regs, unsigned long error_code, unsigned long address, int signal, int si_code) { struct task_struct *tsk = current; - unsigned long *stackend; unsigned long flags; int sig; @@ -708,8 +707,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, show_fault_oops(regs, error_code, address); - stackend = end_of_stack(tsk); - if (*stackend != STACK_END_MAGIC) + if (task_stack_end_corrupted(tsk)) printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; diff --git a/include/linux/sched.h b/include/linux/sched.h index 118dca7d5a28..18f52624eaa6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2617,6 +2617,8 @@ static inline unsigned long *end_of_stack(struct task_struct *p) } #endif +#define task_stack_end_corrupted(task) \ + (*(end_of_stack(task)) != STACK_END_MAGIC) static inline int object_is_on_stack(void *obj) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 1636e41828c2..16eddb308c33 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -170,7 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - if (*end_of_stack(current) != STACK_END_MAGIC) { + if (task_stack_end_corrupted(current)) { print_max_stack(); BUG(); } -- cgit v1.2.3 From 0d9e26329b0c9263d4d9e0422d80a0e73268c52f Mon Sep 17 00:00:00 2001 From: 
Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:19 +0100 Subject: sched: Add default-disabled option to BUG() when stack end location is overwritten Currently in the event of a stack overrun a call to schedule() does not check for this type of corruption. This corruption is often silent and can go unnoticed. However once the corrupted region is examined at a later stage, the outcome is undefined and often results in a sporadic page fault which cannot be handled. This patch checks for a stack overrun and takes appropriate action since the damage is already done, there is no point in continuing. Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: oleg@redhat.com Cc: riel@redhat.com Cc: prarit@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: mpe@ellerman.id.au Cc: tglx@linutronix.de Cc: rostedt@goodmis.org Cc: hannes@cmpxchg.org Cc: Alexei Starovoitov Cc: Al Viro Cc: Andi Kleen Cc: Andrew Morton Cc: Dan Streetman Cc: Davidlohr Bueso Cc: David S. Miller Cc: Kees Cook Cc: Linus Torvalds Cc: Lubomir Rintel Cc: Paul E. McKenney Link: http://lkml.kernel.org/r/1410527779-8133-4-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ lib/Kconfig.debug | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b1ddebed54a..61ee2b327a27 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2693,6 +2693,9 @@ static noinline void __schedule_bug(struct task_struct *prev) */ static inline void schedule_debug(struct task_struct *prev) { +#ifdef CONFIG_SCHED_STACK_END_CHECK + BUG_ON(unlikely(task_stack_end_corrupted(prev))); +#endif /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path. 
Otherwise whine diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a28590083622..e58163d69db1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -824,6 +824,18 @@ config SCHEDSTATS application, you can say N to avoid the very slight overhead this adds. +config SCHED_STACK_END_CHECK + bool "Detect stack corruption on calls to schedule()" + depends on DEBUG_KERNEL + default n + help + This option checks for a stack overrun on calls to schedule(). + If the stack end location is found to be over written always panic as + the content of the corrupted region can no longer be trusted. + This is to ensure no erroneous behaviour occurs which could result in + data corruption or a sporadic crash at a later stage once the region + is examined. The runtime overhead introduced is minimal. + config TIMER_STATS bool "Collect kernel timers statistics" depends on DEBUG_KERNEL && PROC_FS -- cgit v1.2.3 From afdeee0510db918b31bb4aba47452df2ddbdbcf2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:44 +0200 Subject: sched: Fix imbalance flag reset The imbalance flag can stay set even though there is no imbalance. Let's assume that we have 3 tasks that run on a dual-core / dual-cluster system. We will have some idle load balances which are triggered during tick. Unfortunately, the tick is also used to queue background work so we can reach the situation where short work has been queued on a CPU which already runs a task. The load balance will detect this imbalance (2 tasks on 1 CPU and an idle CPU) and will try to pull the waiting task on the idle CPU. The waiting task is a worker thread that is pinned on a CPU so an imbalance due to a pinned task is detected and the imbalance flag is set. Then, we will not be able to clear the flag because we have at most 1 task on each CPU but the imbalance flag will trigger useless active load balancing between the idle CPU and the busy CPU. 
We need to reset the imbalance flag as soon as we have reached a balanced state. If all tasks are pinned, we don't consider that as a balanced state and leave the imbalance flag set. Signed-off-by: Vincent Guittot Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra (Intel) Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9807a991dc0e..01856a8bcd4c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6765,10 +6765,8 @@ more_balance: if (sd_parent) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) *group_imbalance = 1; - } else if (*group_imbalance) - *group_imbalance = 0; } /* All tasks on this runqueue were pinned by CPU affinity */ @@ -6779,7 +6777,7 @@ more_balance: env.loop_break = sched_nr_migrate_break; goto redo; } - goto out_balanced; + goto out_all_pinned; } } @@ -6853,6 +6851,23 @@ more_balance: goto out; out_balanced: + /* + * We reach balance although we may have faced some affinity + * constraints. Clear the imbalance flag if it was set. + */ + if (sd_parent) { + int *group_imbalance = &sd_parent->groups->sgc->imbalance; + + if (*group_imbalance) + *group_imbalance = 0; + } + +out_all_pinned: + /* + * We reach balance because all tasks are pinned at this level so + * we can't migrate them. Let the imbalance flag set so parent level + * can try to migrate them. 
+ */ schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; -- cgit v1.2.3 From 05bfb65f52cbdabe26ebb629959416a6cffb034d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:45 +0200 Subject: sched: Remove a wake_affine() condition In wake_affine() I have tried to understand the meaning of the condition: (this_load <= load && this_load + target_load(prev_cpu, idx) <= tl_per_task) but I failed to find a use case that can take advantage of it and I haven't found a clear description in the previous commit's log. Furthermore, the comment of the condition refers to the task_hot function that was used before being replaced by the current condition: /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad imbalance. */ If we look more deeply at the below condition: this_load + target_load(prev_cpu, idx) <= tl_per_task When sync is clear, we have: tl_per_task = runnable_load_avg / nr_running this_load = max(runnable_load_avg, cpuload[idx]) target_load = max(runnable_load_avg', cpuload'[idx]) It implies that runnable_load_avg == 0 and nr_running <= 1 in order to match the condition. This implies that runnable_load_avg == 0 too because of the condition: this_load <= load. But if this_load is null, 'balanced' is already set and the test is redundant. If sync is set, it's not as straightforward as above (especially if cgroups are involved) but the policy should be similar as we have removed a task that's going to sleep in order to get a more accurate load and this_load values. The current conclusion is that these additional conditions don't give any benefit so we can remove them. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 01856a8bcd4c..391eaf25a2aa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4285,7 +4285,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; int idx, this_cpu, prev_cpu; - unsigned long tl_per_task; struct task_group *tg; unsigned long weight; int balanced; @@ -4343,32 +4342,15 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; } else balanced = true; - - /* - * If the currently running task will sleep within - * a reasonable amount of time then attract this newly - * woken task: - */ - if (sync && balanced) - return 1; - schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || - (this_load <= load && - this_load + target_load(prev_cpu, idx) <= tl_per_task)) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. 
- */ - schedstat_inc(sd, ttwu_move_affine); - schedstat_inc(p, se.statistics.nr_wakeups_affine); + if (!balanced) + return 0; - return 1; - } - return 0; + schedstat_inc(sd, ttwu_move_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); + + return 1; } /* -- cgit v1.2.3 From 65fdac08c264506ff95ee1e34ae066e308c9e6e3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:46 +0200 Subject: sched: Fix avg_load computation The computation of avg_load and avg_load_per_task should only take into account the number of CFS tasks. The non-CFS tasks are already taken into account by decreasing the CPU's capacity and they will be tracked in the CPU's utilization (group_utilization) of the next patches. Reviewed-by: Preeti U Murthy Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 391eaf25a2aa..eb87229ed4af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4095,7 +4095,7 @@ static unsigned long capacity_of(int cpu) static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running); unsigned long load_avg = rq->cfs.runnable_load_avg; if (nr_running) @@ -5985,7 +5985,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; + sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) *overload = true; -- cgit v1.2.3 From 
26bc3c50d3b3984564c270da86f1fbbfb774dbcd Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:47 +0200 Subject: sched: Allow all architectures to set 'capacity_orig' 'capacity_orig' is only changed for systems with an SMT sched_domain level in order to reflect the lower capacity of CPUs. Heterogenous systems also have to reflect an original capacity that is different from the default value. Create a more generic function arch_scale_cpu_capacity that can be also used by non SMT platforms to set capacity_orig. The __weak implementation of arch_scale_cpu_capacity() is the previous SMT variant, in order to keep backward compatibility with the use of capacity_orig. arch_scale_smt_capacity() and default_scale_smt_capacity() have been removed as they were not used elsewhere than in arch_scale_cpu_capacity(). Signed-off-by: Vincent Guittot Reviewed-by: Kamalesh Babulal Reviewed-by: Preeti U. Murthy [ Added default_scale_cpu_capacity() back. ] Signed-off-by: Peter Zijlstra (Intel) Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eb87229ed4af..be530e40ceb9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5705,19 +5705,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) return default_scale_capacity(sd, cpu); } -static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) +static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; - unsigned long smt_gain = sd->smt_gain; + if ((sd->flags & 
SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; - smt_gain /= weight; - - return smt_gain; + return SCHED_CAPACITY_SCALE; } -unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - return default_scale_smt_capacity(sd, cpu); + return default_scale_cpu_capacity(sd, cpu); } static unsigned long scale_rt_capacity(int cpu) @@ -5756,18 +5754,15 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long weight = sd->span_weight; unsigned long capacity = SCHED_CAPACITY_SCALE; struct sched_group *sdg = sd->groups; - if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_smt_capacity(sd, cpu); - else - capacity *= default_scale_smt_capacity(sd, cpu); + if (sched_feat(ARCH_CAPACITY)) + capacity *= arch_scale_cpu_capacity(sd, cpu); + else + capacity *= default_scale_cpu_capacity(sd, cpu); - capacity >>= SCHED_CAPACITY_SHIFT; - } + capacity >>= SCHED_CAPACITY_SHIFT; sdg->sgc->capacity_orig = capacity; -- cgit v1.2.3 From bd61c98f9b3f142cd63f9e15acfe203bec9e5f5a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 26 Aug 2014 13:06:50 +0200 Subject: sched: Test the CPU's capacity in wake_affine() Currently the task always wakes affine on this_cpu if the latter is idle. Before waking up the task on this_cpu, we check that this_cpu capacity is not significantly reduced because of RT tasks or irq activity. Use cases where the number of irqs and/or the time spent under irq is important will benefit from this because the task that is woken up by irq or softirq will not use the same CPU as the irq (and softirq) but an idle one. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: Morten.Rasmussen@arm.com Cc: efault@gmx.de Cc: nicolas.pitre@linaro.org Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1409051215-16788-8-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be530e40ceb9..74fa2c210b6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4284,6 +4284,7 @@ static int wake_wide(struct task_struct *p) static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; + s64 this_eff_load, prev_eff_load; int idx, this_cpu, prev_cpu; struct task_group *tg; unsigned long weight; @@ -4327,21 +4328,21 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. 
*/ - if (this_load > 0) { - s64 this_eff_load, prev_eff_load; + this_eff_load = 100; + this_eff_load *= capacity_of(prev_cpu); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= capacity_of(this_cpu); - this_eff_load = 100; - this_eff_load *= capacity_of(prev_cpu); + if (this_load > 0) { this_eff_load *= this_load + effective_load(tg, this_cpu, weight, weight); - prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; - prev_eff_load *= capacity_of(this_cpu); prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + } + + balanced = this_eff_load <= prev_eff_load; - balanced = this_eff_load <= prev_eff_load; - } else - balanced = true; schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); if (!balanced) -- cgit v1.2.3 From 4e2ba65068ac1d0e8c9df78a4ad787cf39640418 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Fri, 19 Sep 2014 16:53:14 +0800 Subject: perf/cgroup: Remove perf_put_cgroup() Commit 5a17f543ed68 ("cgroup: improve css_from_dir() into css_tryget_from_dir()") removed perf_tryget_cgroup(), so let's also remove perf_put_cgroup(). 
Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/events/core.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cf24b3e42ec..8be3e34274b9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -391,14 +391,9 @@ perf_cgroup_match(struct perf_event *event) event->cgrp->css.cgroup); } -static inline void perf_put_cgroup(struct perf_event *event) -{ - css_put(&event->cgrp->css); -} - static inline void perf_detach_cgroup(struct perf_event *event) { - perf_put_cgroup(event); + css_put(&event->cgrp->css); event->cgrp = NULL; } -- cgit v1.2.3 From a25eb52e81a40e986179a790fbb5a1f02f482b7a Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Fri, 19 Sep 2014 16:51:00 +0800 Subject: cgroup: remove CGRP_RELEASABLE flag We call put_css_set() after setting CGRP_RELEASABLE flag in cgroup_task_migrate(), but in other places we call it without setting the flag. I don't see the necessity of this flag. Moreover once the flag is set, it will never be cleared, unless writing to the notify_on_release control file, so it can be quite confusing if we look at the output of debug.releasable. # mount -t cgroup -o debug xxx /cgroup # mkdir /cgroup/child # cat /cgroup/child/debug.releasable 0 <-- shows 0 though the cgroup is empty # echo $$ > /cgroup/child/tasks # cat /cgroup/child/debug.releasable 0 # echo $$ > /cgroup/tasks && echo $$ > /cgroup/child/tasks # cat /cgroup/child/debug.releasable 1 <-- shows 1 though the cgroup is not empty This patch removes the flag, and now debug.releasable shows if the cgroup is empty or not. 
Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 ----- kernel/cgroup.c | 40 +++++++++++++--------------------------- 2 files changed, 13 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 818a81fe7ccc..1d5196889048 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -161,11 +161,6 @@ static inline void css_put(struct cgroup_subsys_state *css) /* bits in struct cgroup flags field */ enum { - /* - * Control Group has previously had a child cgroup or a task, - * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) - */ - CGRP_RELEASABLE, /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index df7733b48d2e..16e3a4f5c9dc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -329,14 +329,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) return false; } -static int cgroup_is_releasable(const struct cgroup *cgrp) -{ - const int bits = - (1 << CGRP_RELEASABLE) | - (1 << CGRP_NOTIFY_ON_RELEASE); - return (cgrp->flags & bits) == bits; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -491,7 +483,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -static void put_css_set_locked(struct css_set *cset, bool taskexit) +static void put_css_set_locked(struct css_set *cset) { struct cgrp_cset_link *link, *tmp_link; struct cgroup_subsys *ss; @@ -517,11 +509,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) /* @cgrp can't go away while we're holding css_set_rwsem */ if (list_empty(&cgrp->cset_links)) { cgroup_update_populated(cgrp, false); - if (notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } + check_for_release(cgrp); } kfree(link); @@ -530,7 +518,7 @@ static 
void put_css_set_locked(struct css_set *cset, bool taskexit) kfree_rcu(cset, rcu_head); } -static void put_css_set(struct css_set *cset, bool taskexit) +static void put_css_set(struct css_set *cset) { /* * Ensure that the refcount doesn't hit zero while any readers @@ -541,7 +529,7 @@ static void put_css_set(struct css_set *cset, bool taskexit) return; down_write(&css_set_rwsem); - put_css_set_locked(cset, taskexit); + put_css_set_locked(cset); up_write(&css_set_rwsem); } @@ -2037,8 +2025,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * task. As trading it for new_cset is protected by cgroup_mutex, * we're safe to drop it here; it will be freed under RCU. */ - set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set_locked(old_cset, false); + put_css_set_locked(old_cset); } /** @@ -2059,7 +2046,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) cset->mg_src_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); - put_css_set_locked(cset, false); + put_css_set_locked(cset); } up_write(&css_set_rwsem); } @@ -2153,8 +2140,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; list_del_init(&src_cset->mg_preload_node); - put_css_set(src_cset, false); - put_css_set(dst_cset, false); + put_css_set(src_cset); + put_css_set(dst_cset); continue; } @@ -2163,7 +2150,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, if (list_empty(&dst_cset->mg_preload_node)) list_add(&dst_cset->mg_preload_node, &csets); else - put_css_set(dst_cset, false); + put_css_set(dst_cset); } list_splice_tail(&csets, preloaded_csets); @@ -4159,7 +4146,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else @@ 
-4806,7 +4792,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ kernfs_remove(cgrp->kn); - set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags); check_for_release(cgroup_parent(cgrp)); /* put the base reference */ @@ -5244,12 +5229,12 @@ void cgroup_exit(struct task_struct *tsk) } if (put_cset) - put_css_set(cset, true); + put_css_set(cset); } static void check_for_release(struct cgroup *cgrp) { - if (cgroup_is_releasable(cgrp) && !cgroup_has_tasks(cgrp) && + if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } @@ -5496,7 +5481,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); + return (!cgroup_has_tasks(css->cgroup) && + !css_has_online_children(&css->cgroup->self)); } static struct cftype debug_files[] = { -- cgit v1.2.3 From 3e2cd91ab92665148616a80dc0745c499d2746a7 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Sat, 20 Sep 2014 14:35:43 +0800 Subject: cgroup: fix missing unlock in cgroup_release_agent() The patch 971ff4935538: "cgroup: use a per-cgroup work for release agent" from Sep 18, 2014, leads to the following static checker warning: kernel/cgroup.c:5310 cgroup_release_agent() warn: 'mutex:&cgroup_mutex' is sometimes locked here and sometimes unlocked. 
Reported-by: Dan Carpenter Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 16e3a4f5c9dc..f873c4681316 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5291,7 +5291,10 @@ static void cgroup_release_agent(struct work_struct *work) mutex_unlock(&cgroup_mutex); call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + goto out_free; out: + mutex_unlock(&cgroup_mutex); +out_free: kfree(agentbuf); kfree(pathbuf); } -- cgit v1.2.3 From 0c7bf3e8cab7900e17ce7f97104c39927d835469 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Sat, 20 Sep 2014 14:49:10 +0800 Subject: cgroup: remove redundant variable in cgroup_mount() Both pinned_sb and new_sb indicate if a new superblock is needed, so we can just remove new_sb. Note now we must check if kernfs_pin_sb() returns NULL, because when it returns NULL, kernfs_mount() may still re-use an existing superblock, which is just allocated by another concurrent mount. Suggested-by: Tejun Heo Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f873c4681316..5eb20cd1709c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1694,7 +1694,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct dentry *dentry; int ret; int i; - bool new_sb; /* * The first time anyone tries to mount a cgroup, enable the list @@ -1785,7 +1784,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * path is super cold. Let's just sleep a bit and retry. 
*/ pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || + if (IS_ERR_OR_NULL(pinned_sb) || !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); if (!IS_ERR_OR_NULL(pinned_sb)) @@ -1831,18 +1830,16 @@ out_free: return ERR_PTR(ret); dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); - if (IS_ERR(dentry) || !new_sb) + CGROUP_SUPER_MAGIC, NULL); + if (IS_ERR(dentry) || pinned_sb) cgroup_put(&root->cgrp); /* * If @pinned_sb, we're reusing an existing root and holding an * extra ref on its sb. Mount is complete. Put the extra ref. */ - if (pinned_sb) { - WARN_ON(new_sb); + if (pinned_sb) deactivate_super(pinned_sb); - } return dentry; } -- cgit v1.2.3 From 9c58c79a8a76c510cd3a5012c536d4fe3c81ec3b Mon Sep 17 00:00:00 2001 From: Zhihui Zhang Date: Sat, 20 Sep 2014 21:24:36 -0400 Subject: sched: Clean up some typos and grammatical errors in code/comments Signed-off-by: Zhihui Zhang Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1411262676-19928-1-git-send-email-zzhsuny@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 61ee2b327a27..a2841904f2d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8069,7 +8069,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; quota = normalize_cfs_quota(tg, d); - parent_quota = parent_b->hierarchal_quota; + parent_quota = parent_b->hierarchical_quota; /* * ensure max(child_quota) <= parent_quota, inherit when no @@ -8080,7 +8080,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) else if (parent_quota != RUNTIME_INF && quota > parent_quota) return -EINVAL; } - cfs_b->hierarchal_quota = quota; + cfs_b->hierarchical_quota = 
quota; return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 74fa2c210b6d..2a1e6ac6bb32 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2224,8 +2224,8 @@ static __always_inline u64 decay_load(u64 val, u64 n) /* * As y^PERIOD = 1/2, we can combine - * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) - * With a look-up table which covers k^n (navg_load >= busiest->avg_load) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aa0f73ba3777..1bc6aad1391a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -188,7 +188,7 @@ struct cfs_bandwidth { raw_spinlock_t lock; ktime_t period; u64 quota, runtime; - s64 hierarchal_quota; + s64 hierarchical_quota; u64 runtime_expires; int idle, timer_active; -- cgit v1.2.3 From 0cadc70282df0d957c00e8e68ba58afeefdf0f64 Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Fri, 19 Sep 2014 14:07:12 -0700 Subject: PM / sleep: new suspend_resume trace event for console resume This patch adds another suspend_resume trace event for analyze_suspend to capture. The resume_console call can take several hundred milliseconds if the printk buffer is full of debug info. The tool will now inform testers of the wasted time and encourage them to disable it in production builds. Signed-off-by: Todd Brandt Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 18c62195660f..e837dd6783c6 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -361,7 +361,9 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + trace_suspend_resume(TPS("resume_console"), state, true); resume_console(); + trace_suspend_resume(TPS("resume_console"), state, false); Close: platform_suspend_end(state); -- cgit v1.2.3 From aa42240ab2544a8bcb2efb400193826f57f3175e Mon Sep 17 00:00:00 2001 From: Tomasz Figa Date: Fri, 19 Sep 2014 20:27:36 +0200 Subject: PM / Domains: Add generic OF-based PM domain look-up This patch introduces generic code to perform PM domain look-up using device tree and automatically bind devices to their PM domains. Generic device tree bindings are introduced to specify PM domains of devices in their device tree nodes. Backwards compatibility with legacy Samsung-specific PM domain bindings is provided, but for now the new code is not compiled when CONFIG_ARCH_EXYNOS is selected to avoid collision with legacy code. This will change as soon as the Exynos PM domain code gets converted to use the generic framework in further patch. This patch was originally submitted by Tomasz Figa when he was employed by Samsung. Link: http://marc.info/?l=linux-pm&m=139955349702152&w=2 Signed-off-by: Ulf Hansson Acked-by: Rob Herring Tested-by: Philipp Zabel Reviewed-by: Kevin Hilman Signed-off-by: Rafael J. 
Wysocki --- .../devicetree/bindings/power/power_domain.txt | 49 ++++ drivers/base/power/domain.c | 289 +++++++++++++++++++++ include/linux/pm_domain.h | 52 ++++ kernel/power/Kconfig | 4 + 4 files changed, 394 insertions(+) create mode 100644 Documentation/devicetree/bindings/power/power_domain.txt (limited to 'kernel') diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt new file mode 100644 index 000000000000..98c16672ab5f --- /dev/null +++ b/Documentation/devicetree/bindings/power/power_domain.txt @@ -0,0 +1,49 @@ +* Generic PM domains + +System on chip designs are often divided into multiple PM domains that can be +used for power gating of selected IP blocks for power saving by reduced leakage +current. + +This device tree binding can be used to bind PM domain consumer devices with +their PM domains provided by PM domain providers. A PM domain provider can be +represented by any node in the device tree and can provide one or more PM +domains. A consumer node can refer to the provider by a phandle and a set of +phandle arguments (so called PM domain specifiers) of length specified by the +#power-domain-cells property in the PM domain provider node. + +==PM domain providers== + +Required properties: + - #power-domain-cells : Number of cells in a PM domain specifier; + Typically 0 for nodes representing a single PM domain and 1 for nodes + providing multiple PM domains (e.g. power controllers), but can be any value + as specified by device tree binding documentation of particular provider. + +Example: + + power: power-controller@12340000 { + compatible = "foo,power-controller"; + reg = <0x12340000 0x1000>; + #power-domain-cells = <1>; + }; + +The node above defines a power controller that is a PM domain provider and +expects one cell as its phandle argument. 
+ +==PM domain consumers== + +Required properties: + - power-domains : A phandle and PM domain specifier as defined by bindings of + the power controller specified by phandle. + +Example: + + leaky-device@12350000 { + compatible = "foo,i-leak-current"; + reg = <0x12350000 0x1000>; + power-domains = <&power 0>; + }; + +The node above defines a typical PM domain consumer device, which is located +inside a PM domain with index 0 of a power controller represented by a node +with the label "power". diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index e6a11ca3ce26..a3d41a883d83 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -1933,3 +1934,291 @@ void pm_genpd_init(struct generic_pm_domain *genpd, list_add(&genpd->gpd_list_node, &gpd_list); mutex_unlock(&gpd_list_lock); } + +#ifdef CONFIG_PM_GENERIC_DOMAINS_OF +/* + * Device Tree based PM domain providers. + * + * The code below implements generic device tree based PM domain providers that + * bind device tree nodes with generic PM domains registered in the system. + * + * Any driver that registers generic PM domains and needs to support binding of + * devices to these domains is supposed to register a PM domain provider, which + * maps a PM domain specifier retrieved from the device tree to a PM domain. + * + * Two simple mapping functions have been provided for convenience: + * - __of_genpd_xlate_simple() for 1:1 device tree node to PM domain mapping. + * - __of_genpd_xlate_onecell() for mapping of multiple PM domains per node by + * index. + */ + +/** + * struct of_genpd_provider - PM domain provider registration structure + * @link: Entry in global list of PM domain providers + * @node: Pointer to device tree node of PM domain provider + * @xlate: Provider-specific xlate callback mapping a set of specifier cells + * into a PM domain. 
+ * @data: context pointer to be passed into @xlate callback + */ +struct of_genpd_provider { + struct list_head link; + struct device_node *node; + genpd_xlate_t xlate; + void *data; +}; + +/* List of registered PM domain providers. */ +static LIST_HEAD(of_genpd_providers); +/* Mutex to protect the list above. */ +static DEFINE_MUTEX(of_genpd_mutex); + +/** + * __of_genpd_xlate_simple() - Xlate function for direct node-domain mapping + * @genpdspec: OF phandle args to map into a PM domain + * @data: xlate function private data - pointer to struct generic_pm_domain + * + * This is a generic xlate function that can be used to model PM domains that + * have their own device tree nodes. The private data of xlate function needs + * to be a valid pointer to struct generic_pm_domain. + */ +struct generic_pm_domain *__of_genpd_xlate_simple( + struct of_phandle_args *genpdspec, + void *data) +{ + if (genpdspec->args_count != 0) + return ERR_PTR(-EINVAL); + return data; +} +EXPORT_SYMBOL_GPL(__of_genpd_xlate_simple); + +/** + * __of_genpd_xlate_onecell() - Xlate function using a single index. + * @genpdspec: OF phandle args to map into a PM domain + * @data: xlate function private data - pointer to struct genpd_onecell_data + * + * This is a generic xlate function that can be used to model simple PM domain + * controllers that have one device tree node and provide multiple PM domains. + * A single cell is used as an index into an array of PM domains specified in + * the genpd_onecell_data struct when registering the provider. 
+ */ +struct generic_pm_domain *__of_genpd_xlate_onecell( + struct of_phandle_args *genpdspec, + void *data) +{ + struct genpd_onecell_data *genpd_data = data; + unsigned int idx = genpdspec->args[0]; + + if (genpdspec->args_count != 1) + return ERR_PTR(-EINVAL); + + if (idx >= genpd_data->num_domains) { + pr_err("%s: invalid domain index %u\n", __func__, idx); + return ERR_PTR(-EINVAL); + } + + if (!genpd_data->domains[idx]) + return ERR_PTR(-ENOENT); + + return genpd_data->domains[idx]; +} +EXPORT_SYMBOL_GPL(__of_genpd_xlate_onecell); + +/** + * __of_genpd_add_provider() - Register a PM domain provider for a node + * @np: Device node pointer associated with the PM domain provider. + * @xlate: Callback for decoding PM domain from phandle arguments. + * @data: Context pointer for @xlate callback. + */ +int __of_genpd_add_provider(struct device_node *np, genpd_xlate_t xlate, + void *data) +{ + struct of_genpd_provider *cp; + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) + return -ENOMEM; + + cp->node = of_node_get(np); + cp->data = data; + cp->xlate = xlate; + + mutex_lock(&of_genpd_mutex); + list_add(&cp->link, &of_genpd_providers); + mutex_unlock(&of_genpd_mutex); + pr_debug("Added domain provider from %s\n", np->full_name); + + return 0; +} +EXPORT_SYMBOL_GPL(__of_genpd_add_provider); + +/** + * of_genpd_del_provider() - Remove a previously registered PM domain provider + * @np: Device node pointer associated with the PM domain provider + */ +void of_genpd_del_provider(struct device_node *np) +{ + struct of_genpd_provider *cp; + + mutex_lock(&of_genpd_mutex); + list_for_each_entry(cp, &of_genpd_providers, link) { + if (cp->node == np) { + list_del(&cp->link); + of_node_put(cp->node); + kfree(cp); + break; + } + } + mutex_unlock(&of_genpd_mutex); +} +EXPORT_SYMBOL_GPL(of_genpd_del_provider); + +/** + * of_genpd_get_from_provider() - Look-up PM domain + * @genpdspec: OF phandle args to use for look-up + * + * Looks for a PM domain provider under the node 
specified by @genpdspec and if + * found, uses xlate function of the provider to map phandle args to a PM + * domain. + * + * Returns a valid pointer to struct generic_pm_domain on success or ERR_PTR() + * on failure. + */ +static struct generic_pm_domain *of_genpd_get_from_provider( + struct of_phandle_args *genpdspec) +{ + struct generic_pm_domain *genpd = ERR_PTR(-ENOENT); + struct of_genpd_provider *provider; + + mutex_lock(&of_genpd_mutex); + + /* Check if we have such a provider in our array */ + list_for_each_entry(provider, &of_genpd_providers, link) { + if (provider->node == genpdspec->np) + genpd = provider->xlate(genpdspec, provider->data); + if (!IS_ERR(genpd)) + break; + } + + mutex_unlock(&of_genpd_mutex); + + return genpd; +} + +/** + * genpd_dev_pm_detach - Detach a device from its PM domain. + * @dev: Device to attach. + * @power_off: Currently not used + * + * Try to locate a corresponding generic PM domain, which the device was + * attached to previously. If such is found, the device is detached from it. + */ +static void genpd_dev_pm_detach(struct device *dev, bool power_off) +{ + struct generic_pm_domain *pd = NULL, *gpd; + int ret = 0; + + if (!dev->pm_domain) + return; + + mutex_lock(&gpd_list_lock); + list_for_each_entry(gpd, &gpd_list, gpd_list_node) { + if (&gpd->domain == dev->pm_domain) { + pd = gpd; + break; + } + } + mutex_unlock(&gpd_list_lock); + + if (!pd) + return; + + dev_dbg(dev, "removing from PM domain %s\n", pd->name); + + while (1) { + ret = pm_genpd_remove_device(pd, dev); + if (ret != -EAGAIN) + break; + cond_resched(); + } + + if (ret < 0) { + dev_err(dev, "failed to remove from PM domain %s: %d", + pd->name, ret); + return; + } + + /* Check if PM domain can be powered off after removing this device. */ + genpd_queue_power_off_work(pd); +} + +/** + * genpd_dev_pm_attach - Attach a device to its PM domain using DT. + * @dev: Device to attach. + * + * Parse device's OF node to find a PM domain specifier. 
If such is found, + * attaches the device to retrieved pm_domain ops. + * + * Both generic and legacy Samsung-specific DT bindings are supported to keep + * backwards compatibility with existing DTBs. + * + * Returns 0 on successfully attached PM domain or negative error code. + */ +int genpd_dev_pm_attach(struct device *dev) +{ + struct of_phandle_args pd_args; + struct generic_pm_domain *pd; + int ret; + + if (!dev->of_node) + return -ENODEV; + + if (dev->pm_domain) + return -EEXIST; + + ret = of_parse_phandle_with_args(dev->of_node, "power-domains", + "#power-domain-cells", 0, &pd_args); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + + /* + * Try legacy Samsung-specific bindings + * (for backwards compatibility of DT ABI) + */ + pd_args.args_count = 0; + pd_args.np = of_parse_phandle(dev->of_node, + "samsung,power-domain", 0); + if (!pd_args.np) + return -ENOENT; + } + + pd = of_genpd_get_from_provider(&pd_args); + if (IS_ERR(pd)) { + dev_dbg(dev, "%s() failed to find PM domain: %ld\n", + __func__, PTR_ERR(pd)); + of_node_put(dev->of_node); + return PTR_ERR(pd); + } + + dev_dbg(dev, "adding to PM domain %s\n", pd->name); + + while (1) { + ret = pm_genpd_add_device(pd, dev); + if (ret != -EAGAIN) + break; + cond_resched(); + } + + if (ret < 0) { + dev_err(dev, "failed to add to PM domain %s: %d", + pd->name, ret); + of_node_put(dev->of_node); + return ret; + } + + dev->pm_domain->detach = genpd_dev_pm_detach; + + return 0; +} +EXPORT_SYMBOL_GPL(genpd_dev_pm_attach); +#endif diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index aa03586c94a2..292079d8da6b 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -264,4 +264,56 @@ static inline void pm_genpd_syscore_poweroff(struct device *dev) {} static inline void pm_genpd_syscore_poweron(struct device *dev) {} #endif +/* OF PM domain providers */ +struct of_device_id; + +struct genpd_onecell_data { + struct generic_pm_domain **domains; + unsigned int num_domains; +}; + 
+typedef struct generic_pm_domain *(*genpd_xlate_t)(struct of_phandle_args *args, + void *data); + +#ifdef CONFIG_PM_GENERIC_DOMAINS_OF +int __of_genpd_add_provider(struct device_node *np, genpd_xlate_t xlate, + void *data); +void of_genpd_del_provider(struct device_node *np); + +struct generic_pm_domain *__of_genpd_xlate_simple( + struct of_phandle_args *genpdspec, + void *data); +struct generic_pm_domain *__of_genpd_xlate_onecell( + struct of_phandle_args *genpdspec, + void *data); + +int genpd_dev_pm_attach(struct device *dev); +#else /* !CONFIG_PM_GENERIC_DOMAINS_OF */ +static inline int __of_genpd_add_provider(struct device_node *np, + genpd_xlate_t xlate, void *data) +{ + return 0; +} +static inline void of_genpd_del_provider(struct device_node *np) {} + +#define __of_genpd_xlate_simple NULL +#define __of_genpd_xlate_onecell NULL + +static inline int genpd_dev_pm_attach(struct device *dev) +{ + return -ENODEV; +} +#endif /* CONFIG_PM_GENERIC_DOMAINS_OF */ + +static inline int of_genpd_add_provider_simple(struct device_node *np, + struct generic_pm_domain *genpd) +{ + return __of_genpd_add_provider(np, __of_genpd_xlate_simple, genpd); +} +static inline int of_genpd_add_provider_onecell(struct device_node *np, + struct genpd_onecell_data *data) +{ + return __of_genpd_add_provider(np, __of_genpd_xlate_onecell, data); +} + #endif /* _LINUX_PM_DOMAIN_H */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e4e4121fa327..897619b11fb2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -302,6 +302,10 @@ config PM_GENERIC_DOMAINS_RUNTIME def_bool y depends on PM_RUNTIME && PM_GENERIC_DOMAINS +config PM_GENERIC_DOMAINS_OF + def_bool y + depends on PM_GENERIC_DOMAINS && OF && !ARCH_EXYNOS + config CPU_PM bool depends on SUSPEND || CPU_IDLE -- cgit v1.2.3 From a4a8c2c4962bb655e7152c53a0eb6ca31c47f159 Mon Sep 17 00:00:00 2001 From: Tomasz Figa Date: Fri, 19 Sep 2014 20:27:43 +0200 Subject: ARM: exynos: Move to generic PM domain DT bindings This 
patch moves Exynos PM domain code to use the new generic PM domain look-up framework introduced in previous patches, thus also allowing the new code to be compiled with CONFIG_ARCH_EXYNOS. This patch was originally submitted by Tomasz Figa when he was employed by Samsung. Link: http://marc.info/?l=linux-pm&m=139955336002083&w=2 Signed-off-by: Ulf Hansson Reviewed-by: Kevin Hilman Reviewed-by: Dmitry Torokhov Signed-off-by: Rafael J. Wysocki --- .../bindings/arm/exynos/power_domain.txt | 13 ++-- arch/arm/mach-exynos/pm_domains.c | 78 +--------------------- kernel/power/Kconfig | 2 +- 3 files changed, 8 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/Documentation/devicetree/bindings/arm/exynos/power_domain.txt b/Documentation/devicetree/bindings/arm/exynos/power_domain.txt index 8b4f7b7fe88b..abde1ea8a119 100644 --- a/Documentation/devicetree/bindings/arm/exynos/power_domain.txt +++ b/Documentation/devicetree/bindings/arm/exynos/power_domain.txt @@ -8,6 +8,8 @@ Required Properties: * samsung,exynos4210-pd - for exynos4210 type power domain. - reg: physical base address of the controller and length of memory mapped region. +- #power-domain-cells: number of cells in power domain specifier; + must be 0. Optional Properties: - clocks: List of clock handles. The parent clocks of the input clocks to the @@ -29,6 +31,7 @@ Example: lcd0: power-domain-lcd0 { compatible = "samsung,exynos4210-pd"; reg = <0x10023C00 0x10>; + #power-domain-cells = <0>; }; mfc_pd: power-domain@10044060 { @@ -37,12 +40,8 @@ Example: clocks = <&clock CLK_FIN_PLL>, <&clock CLK_MOUT_SW_ACLK333>, <&clock CLK_MOUT_USER_ACLK333>; clock-names = "oscclk", "pclk0", "clk0"; + #power-domain-cells = <0>; }; -Example of the node using power domain: - - node { - /* ... */ - samsung,power-domain = <&lcd0>; - /* ... */ - }; +See Documentation/devicetree/bindings/power/power_domain.txt for description +of consumer-side bindings. 
diff --git a/arch/arm/mach-exynos/pm_domains.c b/arch/arm/mach-exynos/pm_domains.c index fd76e1b5a471..20f267121b3e 100644 --- a/arch/arm/mach-exynos/pm_domains.c +++ b/arch/arm/mach-exynos/pm_domains.c @@ -105,78 +105,6 @@ static int exynos_pd_power_off(struct generic_pm_domain *domain) return exynos_pd_power(domain, false); } -static void exynos_add_device_to_domain(struct exynos_pm_domain *pd, - struct device *dev) -{ - int ret; - - dev_dbg(dev, "adding to power domain %s\n", pd->pd.name); - - while (1) { - ret = pm_genpd_add_device(&pd->pd, dev); - if (ret != -EAGAIN) - break; - cond_resched(); - } - - pm_genpd_dev_need_restore(dev, true); -} - -static void exynos_remove_device_from_domain(struct device *dev) -{ - struct generic_pm_domain *genpd = dev_to_genpd(dev); - int ret; - - dev_dbg(dev, "removing from power domain %s\n", genpd->name); - - while (1) { - ret = pm_genpd_remove_device(genpd, dev); - if (ret != -EAGAIN) - break; - cond_resched(); - } -} - -static void exynos_read_domain_from_dt(struct device *dev) -{ - struct platform_device *pd_pdev; - struct exynos_pm_domain *pd; - struct device_node *node; - - node = of_parse_phandle(dev->of_node, "samsung,power-domain", 0); - if (!node) - return; - pd_pdev = of_find_device_by_node(node); - if (!pd_pdev) - return; - pd = platform_get_drvdata(pd_pdev); - exynos_add_device_to_domain(pd, dev); -} - -static int exynos_pm_notifier_call(struct notifier_block *nb, - unsigned long event, void *data) -{ - struct device *dev = data; - - switch (event) { - case BUS_NOTIFY_BIND_DRIVER: - if (dev->of_node) - exynos_read_domain_from_dt(dev); - - break; - - case BUS_NOTIFY_UNBOUND_DRIVER: - exynos_remove_device_from_domain(dev); - - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block platform_nb = { - .notifier_call = exynos_pm_notifier_call, -}; - static __init int exynos4_pm_init_power_domain(void) { struct platform_device *pdev; @@ -202,7 +130,6 @@ static __init int exynos4_pm_init_power_domain(void) 
pd->base = of_iomap(np, 0); pd->pd.power_off = exynos_pd_power_off; pd->pd.power_on = exynos_pd_power_on; - pd->pd.of_node = np; pd->oscclk = clk_get(dev, "oscclk"); if (IS_ERR(pd->oscclk)) @@ -228,15 +155,12 @@ static __init int exynos4_pm_init_power_domain(void) clk_put(pd->oscclk); no_clk: - platform_set_drvdata(pdev, pd); - on = __raw_readl(pd->base + 0x4) & INT_LOCAL_PWR_EN; pm_genpd_init(&pd->pd, NULL, !on); + of_genpd_add_provider_simple(np, &pd->pd); } - bus_register_notifier(&platform_bus_type, &platform_nb); - return 0; } arch_initcall(exynos4_pm_init_power_domain); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 897619b11fb2..bbef57f5bdfd 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -304,7 +304,7 @@ config PM_GENERIC_DOMAINS_RUNTIME config PM_GENERIC_DOMAINS_OF def_bool y - depends on PM_GENERIC_DOMAINS && OF && !ARCH_EXYNOS + depends on PM_GENERIC_DOMAINS && OF config CPU_PM bool -- cgit v1.2.3 From 4a99854c5840065e7d3a464523cbe1993acb4f00 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 28 Feb 2014 14:30:45 -0500 Subject: audit: __audit_syscall_entry: ignore arch arg and call syscall_get_arch() directly Since every arch should have syscall_get_arch() defined, stop using the function argument and just collect this ourselves. We do not drop the argument as fixing some code paths (in assembly) to not pass this first argument is non-trivial. The argument will be dropped when that is fixed. 
Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 21eae3c05ec0..dff2a2325655 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1536,7 +1536,7 @@ void __audit_syscall_entry(int arch, int major, if (!audit_enabled) return; - context->arch = arch; + context->arch = syscall_get_arch(); context->major = major; context->argv[0] = a1; context->argv[1] = a2; -- cgit v1.2.3 From 84db564aad45774ab64375ee019d5e7a42675b1f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 29 Jan 2014 16:17:58 -0500 Subject: audit: add arch field to seccomp event log The AUDIT_SECCOMP record looks something like this: type=SECCOMP msg=audit(1373478171.953:32775): auid=4325 uid=4325 gid=4325 ses=1 subj=unconfined_u:unconfined_r:unconfined_t:s0 pid=12381 comm="test" sig=31 syscall=231 compat=0 ip=0x39ea8bca89 code=0x0 In order to determine what syscall 231 maps to, we need to have the arch= field right before it. 
To see the event, compile this test.c program: ===== int main(void) { return seccomp_load(seccomp_init(SCMP_ACT_KILL)); } ===== gcc -g test.c -o test -lseccomp After running the program, find the record by: ausearch --start recent -m SECCOMP -i Signed-off-by: Richard Guy Briggs signed-off-by: Eric Paris --- kernel/auditsc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index dff2a2325655..9f03ac205e1f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -2488,11 +2489,9 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) if (unlikely(!ab)) return; audit_log_task(ab); - audit_log_format(ab, " sig=%ld", signr); - audit_log_format(ab, " syscall=%ld", syscall); - audit_log_format(ab, " compat=%d", is_compat_task()); - audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); - audit_log_format(ab, " code=0x%x", code); + audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", + signr, syscall_get_arch(), syscall, is_compat_task(), + KSTK_EIP(current), code); audit_log_end(ab); } -- cgit v1.2.3 From b4f0d3755c5e9cc86292d5fd78261903b4f23d4a Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 4 Mar 2014 10:38:06 -0500 Subject: audit: x86: drop arch from __audit_syscall_entry() interface Since the arch is found locally in __audit_syscall_entry(), there is no need to pass it in as a parameter. Delete it from the parameter list. x86* was the only arch to call __audit_syscall_entry() directly and did so from assembly code. Signed-off-by: Richard Guy Briggs Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-audit@redhat.com Signed-off-by: Eric Paris --- As this patch relies on changes in the audit tree, I think it appropriate to send it through my tree rather than the x86 tree. 
--- arch/x86/ia32/ia32entry.S | 12 ++++++------ arch/x86/kernel/entry_32.S | 11 +++++------ arch/x86/kernel/entry_64.S | 11 +++++------ include/linux/audit.h | 5 ++--- kernel/auditsc.c | 6 ++---- 5 files changed, 20 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 4299eb05023c..f5bdd2881815 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -186,12 +186,12 @@ sysexit_from_sys_call: #ifdef CONFIG_AUDITSYSCALL .macro auditsys_entry_common - movl %esi,%r9d /* 6th arg: 4th syscall arg */ - movl %edx,%r8d /* 5th arg: 3rd syscall arg */ - /* (already in %ecx) 4th arg: 2nd syscall arg */ - movl %ebx,%edx /* 3rd arg: 1st syscall arg */ - movl %eax,%esi /* 2nd arg: syscall number */ - movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ + movl %esi,%r8d /* 5th arg: 4th syscall arg */ + movl %ecx,%r9d /*swap with edx*/ + movl %edx,%ecx /* 4th arg: 3rd syscall arg */ + movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ + movl %ebx,%esi /* 2nd arg: 1st syscall arg */ + movl %eax,%edi /* 1st arg: syscall number */ call __audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 0d0c9d4ab6d5..f9e3fabc8716 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -449,12 +449,11 @@ sysenter_audit: jnz syscall_trace_entry addl $4,%esp CFI_ADJUST_CFA_OFFSET -4 - /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ - /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ - /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ - movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ - movl %eax,%edx /* 2nd arg: syscall number */ - movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + movl %esi,4(%esp) /* 5th arg: 4th syscall arg */ + movl %edx,(%esp) /* 4th arg: 3rd syscall arg */ + /* %ecx already in %ecx 3rd arg: 2nd syscall arg */ + movl %ebx,%edx /* 
2nd arg: 1st syscall arg */ + /* %eax already in %eax 1st arg: syscall number */ call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c844f0816ab8..5e8cb2ad9fb3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -488,12 +488,11 @@ badsys: * jump back to the normal fast path. */ auditsys: - movq %r10,%r9 /* 6th arg: 4th syscall arg */ - movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ - movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ - movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ - movq %rax,%rsi /* 2nd arg: syscall number */ - movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + movq %r10,%r8 /* 5th arg: 4th syscall arg */ + movq %rdx,%rcx /* 4th arg: 3rd syscall arg */ + movq %rsi,%rdx /* 3rd arg: 2nd syscall arg */ + movq %rdi,%rsi /* 2nd arg: 1st syscall arg */ + movq %rax,%rdi /* 1st arg: syscall number */ call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath diff --git a/include/linux/audit.h b/include/linux/audit.h index 783157b289e8..1ae00891aff9 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -115,8 +115,7 @@ extern void audit_log_session_info(struct audit_buffer *ab); /* Public API */ extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); -extern void __audit_syscall_entry(int arch, - int major, unsigned long a0, unsigned long a1, +extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); @@ -148,7 +147,7 @@ static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a3) { if (unlikely(current->audit_context)) - __audit_syscall_entry(syscall_get_arch(), major, a0, a1, a2, a3); + 
__audit_syscall_entry(major, a0, a1, a2, a3); } static inline void audit_syscall_exit(void *pt_regs) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9f03ac205e1f..4e17443fd1ef 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1506,7 +1506,6 @@ void __audit_free(struct task_struct *tsk) /** * audit_syscall_entry - fill in an audit record at syscall entry - * @arch: architecture type * @major: major syscall type (function) * @a1: additional syscall register 1 * @a2: additional syscall register 2 @@ -1521,9 +1520,8 @@ void __audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). */ -void __audit_syscall_entry(int arch, int major, - unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4) +void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4) { struct task_struct *tsk = current; struct audit_context *context = tsk->audit_context; -- cgit v1.2.3 From c0a8d9b0692cced5b0701ed501012e28b224d32b Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 26 May 2014 10:59:28 -0400 Subject: audit: reduce scope of audit_net_id audit_net_id isn't used outside kernel/audit.c. Reduce its scope. Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3ef2e0e797e8..9a951e67a89e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. 
*/ static struct sock *audit_sock; -int audit_net_id; +static int audit_net_id; /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; -- cgit v1.2.3 From 691e6d59d2b6cdb4595e5f626503a1c9e98b8baf Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 26 May 2014 11:02:48 -0400 Subject: audit: reduce scope of audit_log_fcaps audit_log_fcaps() isn't used outside kernel/audit.c. Reduce its scope. Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 2 +- kernel/audit.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 9a951e67a89e..de991950091f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1681,7 +1681,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) } } -void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) { kernel_cap_t *perm = &name->fcap.permitted; kernel_cap_t *inh = &name->fcap.inheritable; diff --git a/kernel/audit.h b/kernel/audit.h index 7bb65730c890..3cdffad5a1d9 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -222,7 +222,6 @@ extern void audit_copy_inode(struct audit_names *name, const struct inode *inode); extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); -extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name); extern void audit_log_name(struct audit_context *context, struct audit_names *n, struct path *path, int record_num, int *call_panic); -- cgit v1.2.3 From 6eed9b261334932c742458edd64b7b9fd0b981a9 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 3 Jun 2014 22:05:10 +0200 Subject: kernel/audit.c: use ARRAY_SIZE instead of sizeof/sizeof[0] Use kernel.h definition. 
Cc: Eric Paris Cc: Andrew Morton Signed-off-by: Fabian Frederick Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index de991950091f..8a82d481393d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -750,7 +750,7 @@ static int audit_set_feature(struct sk_buff *skb) struct audit_features *uaf; int i; - BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); + BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names)); uaf = nlmsg_data(nlmsg_hdr(skb)); /* if there is ever a version 2 we should handle that here */ -- cgit v1.2.3 From 01478d7d60f654419ba863856cad0446bcb73a59 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 13 Jun 2014 18:22:00 -0400 Subject: audit: use atomic_t to simplify audit_serial() Since there is already a primitive to do this operation in the atomic_t, use it to simplify audit_serial(). 
Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 8a82d481393d..7aef7cbd7bcf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1301,19 +1301,9 @@ err: */ unsigned int audit_serial(void) { - static DEFINE_SPINLOCK(serial_lock); - static unsigned int serial = 0; + static atomic_t serial = ATOMIC_INIT(0); - unsigned long flags; - unsigned int ret; - - spin_lock_irqsave(&serial_lock, flags); - do { - ret = ++serial; - } while (unlikely(!ret)); - spin_unlock_irqrestore(&serial_lock, flags); - - return ret; + return atomic_add_return(1, &serial); } static inline void audit_get_stamp(struct audit_context *ctx, -- cgit v1.2.3 From e7df61f4d1ddb7fdd654dde6cd40f7cc398c3932 Mon Sep 17 00:00:00 2001 From: Burn Alting Date: Fri, 4 Apr 2014 16:00:38 +1100 Subject: audit: invalid op= values for rules Various audit events dealing with adding, removing and updating rules result in invalid values set for the op keys which result in embedded spaces in op= values. The invalid values are op="add rule" set in kernel/auditfilter.c op="remove rule" set in kernel/auditfilter.c op="remove rule" set in kernel/audit_tree.c op="updated rules" set in kernel/audit_watch.c op="remove rule" set in kernel/audit_watch.c Replace the space in the above values with an underscore character ('_'). 
Coded-by: Burn Alting Signed-off-by: Richard Guy Briggs --- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 4 ++-- kernel/auditfilter.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 135944a7b28a..bd418c486e9a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -457,7 +457,7 @@ static void audit_log_remove_rule(struct audit_krule *rule) if (unlikely(!ab)) return; audit_log_format(ab, "op="); - audit_log_string(ab, "remove rule"); + audit_log_string(ab, "remove_rule"); audit_log_format(ab, " dir="); audit_log_untrustedstring(ab, rule->tree->pathname); audit_log_key(ab, rule->filterkey); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 70b4554d2fbe..ad9c1682f616 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -314,7 +314,7 @@ static void audit_update_watch(struct audit_parent *parent, &nentry->rule.list); } - audit_watch_log_rule_change(r, owatch, "updated rules"); + audit_watch_log_rule_change(r, owatch, "updated_rules"); call_rcu(&oentry->rcu, audit_free_rule_rcu); } @@ -342,7 +342,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); - audit_watch_log_rule_change(r, w, "remove rule"); + audit_watch_log_rule_change(r, w, "remove_rule"); list_del(&r->rlist); list_del(&r->list); list_del_rcu(&e->list); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c3dbb7..b65a138250b8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1060,7 +1060,7 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, return PTR_ERR(entry); err = audit_add_rule(entry); - audit_log_rule_change("add rule", &entry->rule, !err); + audit_log_rule_change("add_rule", &entry->rule, !err); if (err) audit_free_rule(entry); break; @@ -1070,7 
+1070,7 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, return PTR_ERR(entry); err = audit_del_rule(entry); - audit_log_rule_change("remove rule", &entry->rule, !err); + audit_log_rule_change("remove_rule", &entry->rule, !err); audit_free_rule(entry); break; default: -- cgit v1.2.3 From 219ca39427bf6c46c4e1473493e33bc00635e99b Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 26 Mar 2014 07:26:47 -0400 Subject: audit: use union for audit_field values since they are mutually exclusive Since only one of val, uid, gid and lsm* are used at any given time, combine them to reduce the size of the struct audit_field. Signed-off-by: Richard Guy Briggs --- include/linux/audit.h | 14 +++++++++----- kernel/auditfilter.c | 29 ++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 1ae00891aff9..36dffeccebdb 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -66,12 +66,16 @@ struct audit_krule { struct audit_field { u32 type; - u32 val; - kuid_t uid; - kgid_t gid; + union { + u32 val; + kuid_t uid; + kgid_t gid; + struct { + char *lsm_str; + void *lsm_rule; + }; + }; u32 op; - char *lsm_str; - void *lsm_rule; }; extern int is_audit_feature_set(int which); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b65a138250b8..40ed9813d4b2 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -71,6 +71,24 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { DEFINE_MUTEX(audit_filter_mutex); +static void audit_free_lsm_field(struct audit_field *f) +{ + switch (f->type) { + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + kfree(f->lsm_str); + security_audit_rule_free(f->lsm_rule); + } +} + static inline void 
audit_free_rule(struct audit_entry *e) { int i; @@ -80,11 +98,8 @@ static inline void audit_free_rule(struct audit_entry *e) if (erule->watch) audit_put_watch(erule->watch); if (erule->fields) - for (i = 0; i < erule->field_count; i++) { - struct audit_field *f = &erule->fields[i]; - kfree(f->lsm_str); - security_audit_rule_free(f->lsm_rule); - } + for (i = 0; i < erule->field_count; i++) + audit_free_lsm_field(&erule->fields[i]); kfree(erule->fields); kfree(erule->filterkey); kfree(e); @@ -422,10 +437,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->type = data->fields[i]; f->val = data->values[i]; - f->uid = INVALID_UID; - f->gid = INVALID_GID; - f->lsm_str = NULL; - f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { -- cgit v1.2.3 From 54e05eddbe507d54f1df18c2680d4f614af9e133 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 21 Aug 2014 13:40:41 -0400 Subject: audit: set nlmsg_len for multicast messages. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Report: Looking at your example code in http://people.redhat.com/rbriggs/audit-multicast-listen/audit-multicast-listen.c, it seems that nlmsg_len field in the received messages is supposed to contain the length of the header + payload, but it is always set to the size of the header only, i.e. 16. The example program works, because the printf format specifies the minimum width, not "precision", so it simply prints out the payload until the first zero byte. This isn't too much of a problem, but precludes the use of recvmmsg, iiuc? (gdb) p *(struct nlmsghdr*)nlh $14 = {nlmsg_len = 16, nlmsg_type = 1100, nlmsg_flags = 0, nlmsg_seq = 0, nlmsg_pid = 9910} The only time nlmsg_len would have been updated was at audit_buffer_alloc() inside audit_log_start() and never updated after. 
It should arguably be done in audit_log_vformat(), but would be more efficient in audit_log_end(). Reported-by: Zbigniew Jędrzejewski-Szmek Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7aef7cbd7bcf..d20f00ff7bb5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1949,6 +1949,7 @@ void audit_log_end(struct audit_buffer *ab) } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); + nlh->nlmsg_len = ab->skb->len; kauditd_send_multicast_skb(ab->skb); /* @@ -1960,7 +1961,7 @@ void audit_log_end(struct audit_buffer *ab) * protocol between the kaudit kernel subsystem and the auditd * userspace code. */ - nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; + nlh->nlmsg_len -= NLMSG_HDRLEN; if (audit_pid) { skb_queue_tail(&audit_skb_queue, ab->skb); -- cgit v1.2.3 From 9ef91514774a140e468f99d73d7593521e6d25dc Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sun, 24 Aug 2014 20:37:52 -0400 Subject: audit: correct AUDIT_GET_FEATURE return message type When an AUDIT_GET_FEATURE message is sent from userspace to the kernel, it should reply with a message tagged as an AUDIT_GET_FEATURE type with a struct audit_feature. The current reply is a message tagged as an AUDIT_GET type with a struct audit_feature. This appears to have been a cut-and-paste-eo in commit b0fed40.
Reported-by: Steve Grubb Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d20f00ff7bb5..3a80abb6eaa1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -724,7 +724,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af)); return 0; } -- cgit v1.2.3 From f874738e8c178b19479f7b143211a1df00367988 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 15 Sep 2014 16:17:37 -0400 Subject: audit: remove open_arg() function that is never used open_arg() was added in commit 55669bfa "audit: AUDIT_PERM support" and never used. Remove it. Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4e17443fd1ef..63a74a703c97 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -126,14 +126,6 @@ struct audit_tree_refs { struct audit_chunk *c[31]; }; -static inline int open_arg(int flags, int mask) -{ - int n = ACC_MODE(flags); - if (flags & (O_TRUNC | O_CREAT)) - n |= AUDIT_PERM_WRITE; - return n & mask; -} - static int audit_match_perm(struct audit_context *ctx, int mask) { unsigned n; -- cgit v1.2.3 From 9eab339b197a6903043d272295dcb716ff739b21 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Sat, 15 Mar 2014 18:42:34 -0400 Subject: audit: get comm using lock to avoid race in string printing When task->comm is passed directly to audit_log_untrustedstring() without getting a copy or using the task_lock, there is a race that could happen that would output a NULL (\0) in the output string that would effectively truncate the rest of the report text after the comm= field in the audit, losing fields. 
Use get_task_comm() to get a copy while acquiring the task_lock to prevent this and to prevent the result from being a mixture of old and new values of comm. Signed-off-by: Tetsuo Handa Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 5 ++--- kernel/auditsc.c | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3a80abb6eaa1..53bb39bf79e2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1850,7 +1850,7 @@ EXPORT_SYMBOL(audit_log_task_context); void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; - char name[sizeof(tsk->comm)]; + char comm[sizeof(tsk->comm)]; struct mm_struct *mm = tsk->mm; char *tty; @@ -1884,9 +1884,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) from_kgid(&init_user_ns, cred->fsgid), tty, audit_get_sessionid(tsk)); - get_task_comm(name, tsk); audit_log_format(ab, " comm="); - audit_log_untrustedstring(ab, name); + audit_log_untrustedstring(ab, get_task_comm(comm, tsk)); if (mm) { down_read(&mm->mmap_sem); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 63a74a703c97..89335723fb2a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2424,6 +2424,7 @@ static void audit_log_task(struct audit_buffer *ab) kgid_t gid; unsigned int sessionid; struct mm_struct *mm = current->mm; + char comm[sizeof(current->comm)]; auid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); @@ -2436,7 +2437,7 @@ static void audit_log_task(struct audit_buffer *ab) sessionid); audit_log_task_context(ab); audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); - audit_log_untrustedstring(ab, current->comm); + audit_log_untrustedstring(ab, get_task_comm(comm, current)); if (mm) { down_read(&mm->mmap_sem); if (mm->exe_file) -- cgit v1.2.3 From be34f0f3e6aed6e828a8059247d169d38da128d7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 21 Sep 2014 21:47:43 +0200 Subject: 
sched/numa: Kill the wrong/dead TASK_DEAD check in task_numa_fault() current->state == TASK_DEAD means that the task is doing its last schedule(), page fault is obviously impossible at this stage. Signed-off-by: Oleg Nesterov Acked-by: Mel Gorman Acked-by: Rik van Riel Cc: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140921194743.GA30114@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2a1e6ac6bb32..9ee3d4f6de47 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1817,10 +1817,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; - /* Do not worry about placement if exiting */ - if (p->state == TASK_DEAD) - return; - /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults_memory)) { int size = sizeof(*p->numa_faults_memory) * -- cgit v1.2.3 From a5e7be3b28a235108c59561bea55eea1072b23b0 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Sep 2014 10:22:39 +0100 Subject: sched/deadline: Clear dl_entity params when setscheduling to different class When a task is using SCHED_DEADLINE and the user setschedules it to a different class its sched_dl_entity static parameters are not cleaned up. This causes a bug if the user sets it back to SCHED_DEADLINE with the same parameters again. The problem resides in the check we perform at the very beginning of dl_overflow(): if (new_bw == p->dl.dl_bw) return 0; This condition is met in the case depicted above, so the function returns and dl_b->total_bw is not updated (the p->dl.dl_bw is not added to it). After this, admission control is broken. This patch fixes the thing, properly clearing static parameters for a task that ceases to use SCHED_DEADLINE. 
Reported-by: Daniele Alessandrelli Reported-by: Daniel Wagner Reported-by: Vincent Legout Tested-by: Luca Abeni Tested-by: Daniel Wagner Tested-by: Vincent Legout Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Fabio Checconi Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1411118561-26323-2-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++---- kernel/sched/deadline.c | 2 ++ kernel/sched/sched.h | 3 +++ 3 files changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a2841904f2d5..09bde2ab2a0a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1808,6 +1808,20 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +/* + * This function clears the sched_dl_entity static params. + */ +void __dl_clear_params(struct task_struct *p) +{ + struct sched_dl_entity *dl_se = &p->dl; + + dl_se->dl_runtime = 0; + dl_se->dl_deadline = 0; + dl_se->dl_period = 0; + dl_se->flags = 0; + dl_se->dl_bw = 0; +} + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
@@ -1832,10 +1846,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) RB_CLEAR_NODE(&p->dl.rb_node); hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - p->dl.dl_runtime = p->dl.runtime = 0; - p->dl.dl_deadline = p->dl.deadline = 0; - p->dl.dl_period = 0; - p->dl.flags = 0; + __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aaa5abbff2f1..efb94124420d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1565,6 +1565,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) hrtimer_try_to_cancel(&p->dl.dl_timer); + __dl_clear_params(p); + #ifdef CONFIG_SMP /* * Since this might be the only -deadline task on the rq, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1bc6aad1391a..76f3a38a401c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -130,6 +130,9 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; + +void __dl_clear_params(struct task_struct *p); + /* * To keep the bandwidth of -deadline tasks and groups under control * we need some place where: -- cgit v1.2.3 From 91ec6778ec4f963fcb2c2793610919b572f633b0 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Sep 2014 10:22:41 +0100 Subject: sched/deadline: Fix inter- exclusive cpusets migrations Users can perform clustered scheduling using the cpuset facility. After an exclusive cpuset is created, task migrations happen only between CPUs belonging to the same cpuset. Inter- cpuset migrations can only happen when the user requires so, moving a task between different cpusets. This behaviour is broken in SCHED_DEADLINE, as currently spurious inter- cpuset migration may happen without user intervention. This patch fixes the problem (and shuffles the code a bit to improve clarity).
Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: raistlin@linux.it Cc: michael@amarulasolutions.com Cc: fchecconi@gmail.com Cc: daniel.wagner@bmw-carit.de Cc: vincent@legout.info Cc: luca.abeni@unitn.it Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1411118561-26323-4-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 4 +--- kernel/sched/deadline.c | 7 +++++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index bd95963dae80..539ca3ce071b 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, int best_cpu = -1; const struct sched_dl_entity *dl_se = &p->dl; - if (later_mask && cpumask_and(later_mask, cp->free_cpus, - &p->cpus_allowed) && cpumask_and(later_mask, - later_mask, cpu_active_mask)) { + if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) { best_cpu = cpumask_any(later_mask); goto out; } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index efb94124420d..abfaf3d9a29f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1164,6 +1164,13 @@ static int find_later_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; + /* + * We have to consider system topology and task affinity + * first, then we can look for a suitable cpu. 
+ */ + cpumask_copy(later_mask, task_rq(task)->rd->span); + cpumask_and(later_mask, later_mask, cpu_active_mask); + cpumask_and(later_mask, later_mask, &task->cpus_allowed); best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask); if (best_cpu == -1) -- cgit v1.2.3 From 442bf3aaf55a91ebfec71da46a4ee10a3c905bcc Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 4 Sep 2014 11:32:09 -0400 Subject: sched: Let the scheduler see CPU idle states When the cpu enters idle, it stores the cpuidle state pointer in its struct rq instance which in turn could be used to make a better decision when balancing tasks. As soon as the cpu exits its idle state, the struct rq reference is cleared. There are a couple of situations where the idle state pointer could be changed while it is being consulted: 1. For x86/acpi with dynamic c-states, when a laptop switches from battery to AC that could result in removing the deeper idle state. The acpi driver triggers: 'acpi_processor_cst_has_changed' 'cpuidle_pause_and_lock' 'cpuidle_uninstall_idle_handler' 'kick_all_cpus_sync'. All cpus will exit their idle state and the pointed object will be set to NULL. 2. The cpuidle driver is unloaded. Logically that could happen but not in practice because the drivers are always compiled in and 95% of them are not coded to unregister themselves. In any case, the unloading code must call 'cpuidle_unregister_device', that calls 'cpuidle_pause_and_lock' leading to 'kick_all_cpus_sync' as mentioned above. A race can happen if we use the pointer and then one of these two scenarios occurs at the same moment. In order to be safe, the idle state pointer stored in the rq must be used inside a rcu_read_lock section where we are protected with the 'synchronize_rcu' in the 'cpuidle_uninstall_idle_handler' function. The idle_get_state() and idle_set_state() accessors should be used to that effect.
Signed-off-by: Daniel Lezcano Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra (Intel) Cc: "Rafael J. Wysocki" Cc: linux-pm@vger.kernel.org Cc: linaro-kernel@lists.linaro.org Cc: Daniel Lezcano Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-@git.kernel.org Signed-off-by: Ingo Molnar --- drivers/cpuidle/cpuidle.c | 6 ++++++ kernel/sched/idle.c | 6 ++++++ kernel/sched/sched.h | 30 ++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index d31e04ca8703..125150dc6e81 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -225,6 +225,12 @@ void cpuidle_uninstall_idle_handler(void) initialized = 0; wake_up_all_idle_cpus(); } + + /* + * Make sure external observers (such as the scheduler) + * are done looking at pointed idle states. + */ + synchronize_rcu(); } /** diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 11e7bc434f43..c47fce75e666 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -147,6 +147,9 @@ use_default: clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) goto use_default; + /* Take note of the planned idle state. */ + idle_set_state(this_rq(), &drv->states[next_state]); + /* * Enter the idle state previously returned by the governor decision. * This function will block until an interrupt occurs and will take @@ -154,6 +157,9 @@ use_default: */ entered_state = cpuidle_enter(drv, dev, next_state); + /* The cpu is no longer idle or about to enter idle. 
*/ + idle_set_state(this_rq(), NULL); + if (broadcast) clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 76f3a38a401c..16e1ca9cb7e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -14,6 +14,7 @@ #include "cpuacct.h" struct rq; +struct cpuidle_state; /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 @@ -643,6 +644,11 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif + +#ifdef CONFIG_CPU_IDLE + /* Must be inspected within a rcu lock section */ + struct cpuidle_state *idle_state; +#endif }; static inline int cpu_of(struct rq *rq) @@ -1196,6 +1202,30 @@ static inline void idle_exit_fair(struct rq *rq) { } #endif +#ifdef CONFIG_CPU_IDLE +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ + rq->idle_state = idle_state; +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state; +} +#else +static inline void idle_set_state(struct rq *rq, + struct cpuidle_state *idle_state) +{ +} + +static inline struct cpuidle_state *idle_get_state(struct rq *rq) +{ + return NULL; +} +#endif + extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); -- cgit v1.2.3 From 83a0a96a5f26d974580fd7251043ff70c8f1823d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 4 Sep 2014 11:32:10 -0400 Subject: sched/fair: Leverage the idle state info when choosing the "idlest" cpu The code in find_idlest_cpu() looks for the CPU with the smallest load. However, if multiple CPUs are idle, the first idle CPU is selected irrespective of the depth of its idle state. Among the idle CPUs we should pick the one with the shallowest idle state, or the latest to have gone idle if all idle CPUs are in the same state. The latter applies even when cpuidle is configured out.
This patch doesn't cover the following issues: - The idle exit latency of a CPU might be larger than the time needed to migrate the waking task to an already running CPU with sufficient capacity, and therefore performance would benefit from task packing in such case (in most cases task packing is about power saving). - Some idle states have a non negligible and non abortable entry latency which needs to run to completion before the exit latency can start. A concurrent patch series is making this info available to the cpuidle core. Once available, the entry latency with the idle timestamp could determine when the exit latency may be effective. Those issues will be handled in due course. In the mean time, what is implemented here should improve things already compared to the current state of affairs. Based on an initial patch from Daniel Lezcano. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra (Intel) Cc: Daniel Lezcano Cc: "Rafael J. Wysocki" Cc: Linus Torvalds Cc: linux-pm@vger.kernel.org Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/n/tip-@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9ee3d4f6de47..8cb32f83c9b0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -4415,20 +4416,46 @@ static int find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; - int idlest = -1; + unsigned int min_exit_latency = UINT_MAX; + u64 latest_idle_timestamp = 0; + int least_loaded_cpu = this_cpu; + int shallowest_idle_cpu = -1; int i; /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - load = weighted_cpuload(i); - - if (load < min_load || (load == 
min_load && i == this_cpu)) { - min_load = load; - idlest = i; + if (idle_cpu(i)) { + struct rq *rq = cpu_rq(i); + struct cpuidle_state *idle = idle_get_state(rq); + if (idle && idle->exit_latency < min_exit_latency) { + /* + * We give priority to a CPU whose idle state + * has the smallest exit latency irrespective + * of any idle timestamp. + */ + min_exit_latency = idle->exit_latency; + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } else if ((!idle || idle->exit_latency == min_exit_latency) && + rq->idle_stamp > latest_idle_timestamp) { + /* + * If equal or no active idle state, then + * the most recently idled CPU might have + * a warmer cache. + */ + latest_idle_timestamp = rq->idle_stamp; + shallowest_idle_cpu = i; + } + } else { + load = weighted_cpuload(i); + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + least_loaded_cpu = i; + } } } - return idlest; + return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } /* -- cgit v1.2.3 From 8651c65844e93af44554272b7e0d2b142837b244 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 21 Sep 2014 21:33:36 +0200 Subject: sched: Fix the task-group check in tg_has_rt_tasks() tg_has_rt_tasks() wants to find an RT task in this task_group, but task_rq(p)->rt.tg wrongly checks the root rt_rq. 
Signed-off-by: Oleg Nesterov Reviewed-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20140921193336.GA28618@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 09bde2ab2a0a..0abfb7ec9e62 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7441,7 +7441,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg) struct task_struct *g, *p; for_each_process_thread(g, p) { - if (rt_task(p) && task_rq(p)->rt.tg == tg) + if (rt_task(p) && task_group(p) == tg) return 1; } -- cgit v1.2.3 From 3472eaa1f12e217e2b8b0ef658ff861b2308cbbd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 21 Sep 2014 21:33:38 +0200 Subject: sched: normalize_rt_tasks(): Don't use _irqsave for tasklist_lock, use task_rq_lock() 1. read_lock(tasklist_lock) does not need to disable irqs. 2. ->mm != NULL is a common mistake, use PF_KTHREAD. 3. The second ->mm check can be simply removed. 4. task_rq_lock() looks better than raw_spin_lock(&p->pi_lock) + __task_rq_lock(). 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140921193338.GA28621@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0abfb7ec9e62..d65566d07fcf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7220,12 +7220,12 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irqsave(&tasklist_lock, flags); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* * Only normalize user tasks: */ - if (!p->mm) + if (p->flags & PF_KTHREAD) continue; p->se.exec_start = 0; @@ -7240,20 +7240,16 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (task_nice(p) < 0 && p->mm) + if (task_nice(p) < 0) set_user_nice(p, 0); continue; } - raw_spin_lock(&p->pi_lock); - rq = __task_rq_lock(p); - + rq = task_rq_lock(p, &flags); normalize_task(rq, p); - - __task_rq_unlock(rq); - raw_spin_unlock(&p->pi_lock); + task_rq_unlock(rq, p, &flags); } - read_unlock_irqrestore(&tasklist_lock, flags); + read_unlock(&tasklist_lock); } #endif /* CONFIG_MAGIC_SYSRQ */ -- cgit v1.2.3 From 5bd96ab6fef66ec6b9f54134364e618fd0f8f2f3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 21 Sep 2014 21:33:41 +0200 Subject: sched: print_rq(): Don't use tasklist_lock read_lock_irqsave(tasklist_lock) in print_rq() looks strange. We do not need to disable irqs, and they are already disabled by the caller. And afaics this lock buys nothing, we can rely on rcu_read_lock(). In this case it makes sense to also move rcu_read_lock/unlock from the caller to print_rq(). 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140921193341.GA28628@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c7fe1ea0e8ab..ce33780d8f20 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - unsigned long flags; SEQ_printf(m, "\nrunnable tasks:\n" @@ -159,14 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) "------------------------------------------------------" "----------------------------------------------------\n"); - read_lock_irqsave(&tasklist_lock, flags); + rcu_read_lock(); for_each_process_thread(g, p) { if (task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); } - read_unlock_irqrestore(&tasklist_lock, flags); + rcu_read_unlock(); } void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) @@ -331,9 +330,7 @@ do { \ print_cfs_stats(m, cpu); print_rt_stats(m, cpu); - rcu_read_lock(); print_rq(m, rq, cpu); - rcu_read_unlock(); spin_unlock_irqrestore(&sched_debug_lock, flags); SEQ_printf(m, "\n"); } -- cgit v1.2.3 From c55f5158f5606f8a62e694b7e009f59b92ac6258 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2014 17:06:41 +0200 Subject: sched, mips, ia64: Remove __ARCH_WANT_UNLOCKED_CTXSW Kirill found that there's a subtle race in the __ARCH_WANT_UNLOCKED_CTXSW code, and instead of fixing it, remove the entire exception because neither arch that uses it seems to actually still require it. Boot tested on mips64el (qemu) only. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kirill Tkhai Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Fenghua Yu Cc: James Hogan Cc: Kees Cook Cc: Linus Torvalds Cc: Paul Burton Cc: Qais Yousef Cc: Ralf Baechle Cc: Tony Luck Cc: oleg@redhat.com Cc: linux@roeck-us.net Cc: linux-ia64@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/20140923150641.GH3312@worktop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/processor.h | 1 - arch/mips/include/asm/processor.h | 6 ------ kernel/sched/core.c | 6 ------ kernel/sched/sched.h | 30 ------------------------------ 4 files changed, 43 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index c7367130ab14..ce53c50d0ba4 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -19,7 +19,6 @@ #include #include -#define __ARCH_WANT_UNLOCKED_CTXSW #define ARCH_HAS_PREFETCH_SWITCH_STACK #define IA64_NUM_PHYS_STACK_REG 96 diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 05f08438a7c4..f1df4cb4a286 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -397,12 +397,6 @@ unsigned long get_wchan(struct task_struct *p); #define ARCH_HAS_PREFETCHW #define prefetchw(x) __builtin_prefetch((x), 1, 1) -/* - * See Documentation/scheduler/sched-arch.txt; prevents deadlock on SMP - * systems. 
- */ -#define __ARCH_WANT_UNLOCKED_CTXSW - #endif #endif /* _ASM_PROCESSOR_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d65566d07fcf..5b0eac9f4e78 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2331,10 +2331,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ post_schedule(rq); -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - /* In this case, finish_task_switch does not reenable preemption */ - preempt_enable(); -#endif if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); } @@ -2377,9 +2373,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ -#ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -#endif context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 16e1ca9cb7e8..6130251de280 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -975,7 +975,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -1013,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) raw_spin_unlock_irq(&rq->lock); } -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. 
- */ - next->on_cpu = 1; -#endif - raw_spin_unlock(&rq->lock); -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif - local_irq_enable(); -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - /* * wake flags */ -- cgit v1.2.3 From 7a96c231ca23f0f5622852307df4209afc502ec3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:12 +0400 Subject: sched/fair: Remove duplicate code from can_migrate_task() Combine two branches which do the same. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140922183612.11015.64200.stgit@localhost Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cb32f83c9b0..10a5a286d8e2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5315,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); - if (migrate_improves_locality(p, env)) { -#ifdef CONFIG_SCHEDSTATS + if (migrate_improves_locality(p, env) || !tsk_cache_hot || + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif - return 1; - } - - if (!tsk_cache_hot || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - - if (tsk_cache_hot) { - schedstat_inc(env->sd, lb_hot_gained[env->idle]); - schedstat_inc(p, se.statistics.nr_forced_migrations); - } - return 1; } -- cgit v1.2.3 From 66339c31bc3978d5fff9c4b4cb590a861def4db2 Mon Sep 17 00:00:00 2001 From: 
Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:24 +0400 Subject: sched: Use dl_bw_of() under RCU read lock dl_bw_of() dereferences rq->rd which has to have RCU read lock held. Probability of use-after-free isn't zero here. Also add lockdep assert into dl_bw_cpus(). Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: # v3.14+ Cc: Paul E. McKenney Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140922183624.11015.71558.stgit@localhost Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b0eac9f4e78..f0adb038170b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2021,6 +2021,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -2029,6 +2031,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; + rcu_lockdep_assert(rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; @@ -7645,6 +7649,8 @@ static int sched_dl_global_constraints(void) int cpu, ret = 0; unsigned long flags; + rcu_read_lock(); + /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in @@ -7666,6 +7672,8 @@ static int sched_dl_global_constraints(void) break; } + rcu_read_unlock(); + return ret; } @@ -7681,6 +7689,7 @@ static void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + rcu_read_lock(); /* * FIXME: As above... 
*/ @@ -7691,6 +7700,7 @@ static void sched_dl_do_global(void) dl_b->bw = new_bw; raw_spin_unlock_irqrestore(&dl_b->lock, flags); } + rcu_read_unlock(); } static int sched_rt_global_validate(void) -- cgit v1.2.3 From 16303ab2fe214635240a8f57cad2cd29792d4e3b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:30 +0400 Subject: sched: cleanup: Rename 'out_unlock' to 'out_free_new_mask' Nothing is locked there, so label's name only confuses a reader. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20140922183630.11015.59500.stgit@localhost Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0adb038170b..316127acefc6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4029,14 +4029,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { rcu_read_unlock(); - goto out_unlock; + goto out_free_new_mask; } rcu_read_unlock(); } retval = security_task_setscheduler(p); if (retval) - goto out_unlock; + goto out_free_new_mask; cpuset_cpus_allowed(p, cpus_allowed); @@ -4054,7 +4054,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { retval = -EBUSY; - goto out_unlock; + goto out_free_new_mask; } } #endif @@ -4073,7 +4073,7 @@ again: goto again; } } -out_unlock: +out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: free_cpumask_var(cpus_allowed); -- cgit v1.2.3 From f1e3a0932f3a9554371792a7daaf1e0eb19f66d5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:36 +0400 Subject: sched: Use rq->rd in sched_setaffinity() under RCU read lock Probability of use-after-free isn't zero in this place. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: # v3.14+ Cc: Paul E. McKenney Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140922183636.11015.83611.stgit@localhost Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 316127acefc6..b5349fee1213 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4049,13 +4049,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) * root_domain. */ #ifdef CONFIG_SMP - if (task_has_dl_policy(p)) { - const struct cpumask *span = task_rq(p)->rd->span; - - if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { + rcu_read_lock(); + if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { retval = -EBUSY; + rcu_read_unlock(); goto out_free_new_mask; } + rcu_read_unlock(); } #endif again: -- cgit v1.2.3 From 8aa6f0ebf41b5fdd186276394bf07e7bd6884d94 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 22 Sep 2014 22:36:43 +0400 Subject: sched/rt: Use resched_curr() in task_tick_rt() Some time ago PREEMPT_NEED_RESCHED was implemented, so the rescheduling technique is a little more complicated now. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140922183642.11015.66039.stgit@localhost Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2e6a7743703e..87ea5bf1b87f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2072,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) for_each_sched_rt_entity(rt_se) { if (rt_se->run_list.prev != rt_se->run_list.next) { requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + resched_curr(rq); return; } } -- cgit v1.2.3 From dc633982ff3f4fd74cdc11b5a6ae53d39a0b2451 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 12 Sep 2014 13:18:26 +0200 Subject: perf: Do not POLLHUP event if it has children Currently we return POLLHUP in event polling if the monitored process is done, but we do not consider possible children that might still be running and producing data. Before returning POLLHUP, make sure that: 1) the monitored task has exited and that 2) we don't have any children to monitor Also add a parent wakeup when the child event is gone. 
Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1410520708-19275-1-git-send-email-jolsa@kernel.org Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Stephane Eranian Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Stephane Eranian Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/events/core.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 733c61636f0d..15e58d4ea035 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3587,6 +3587,19 @@ static int perf_event_read_one(struct perf_event *event, return n * sizeof(u64); } +static bool is_event_hup(struct perf_event *event) +{ + bool no_children; + + if (event->state != PERF_EVENT_STATE_EXIT) + return false; + + mutex_lock(&event->child_mutex); + no_children = list_empty(&event->child_list); + mutex_unlock(&event->child_mutex); + return no_children; +} + /* * Read the performance event - simple non blocking version for now */ @@ -3632,7 +3645,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) poll_wait(file, &event->waitq, wait); - if (event->state == PERF_EVENT_STATE_EXIT) + if (is_event_hup(event)) return events; /* @@ -7579,6 +7592,12 @@ static void sync_child_event(struct perf_event *child_event, list_del_init(&child_event->child_list); mutex_unlock(&parent_event->child_mutex); + /* + * Make sure user/parent get notified, that we just + * lost one event. + */ + perf_event_wakeup(parent_event); + /* * Release the parent event, if this was the last * reference to it. -- cgit v1.2.3 From 1929def9e609d1a8cdb1626d85eda3da66921a7d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 12 Sep 2014 13:18:27 +0200 Subject: perf: Fix child event initial state setup Currently we initialize the child event based on the original parent state. 
This is wrong, because the original parent event (and its state) is not related to current fork and also could be already gone. We need to initialize the child state based on the immediate parent event state. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Stephane Eranian Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410520708-19275-2-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 15e58d4ea035..132524c8b340 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7818,6 +7818,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event *group_leader, struct perf_event_context *child_ctx) { + enum perf_event_active_state parent_state = parent_event->state; struct perf_event *child_event; unsigned long flags; @@ -7851,7 +7852,7 @@ inherit_event(struct perf_event *parent_event, * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_event_{en, dis}able_family. */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + if (parent_state >= PERF_EVENT_STATE_INACTIVE) child_event->state = PERF_EVENT_STATE_INACTIVE; else child_event->state = PERF_EVENT_STATE_OFF; -- cgit v1.2.3 From 802c8a61d4c9c794db863dcabb0006ab001a651b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 12 Sep 2014 13:18:28 +0200 Subject: Revert "perf: Do not allow optimized switch for non-cloned events" This reverts commit 1f9a7268c67f0290837aada443d28fd953ddca90. With the fix of the initial state for the cloned event we now correctly handle the error described in: 1f9a7268c67f perf: Do not allow optimized switch for non-cloned events so we can revert it. I made an automated test for this, but its not suitable for automated perf tests framework. 
It needs to be customized for each machine (the more cpu the higher numbers for GROUPS/WORKERS/BYTES) and it could take longer time to hit the issue. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Frederic Weisbecker Cc: Stephane Eranian Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140910143535.GD2409@krava.brq.redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 132524c8b340..b164cb07b30d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2375,7 +2375,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ - if (!parent || !next_parent) + if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { -- cgit v1.2.3 From 2aad2a86f6685c10360ec8a5a55eb9ab7059cb72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Sep 2014 13:31:50 -0400 Subject: percpu_ref: add PERCPU_REF_INIT_* flags With the recent addition of percpu_ref_reinit(), percpu_ref now can be used as a persistent switch which can be turned on and off repeatedly where turning off maps to killing the ref and waiting for it to drain; however, there currently isn't a way to initialize a percpu_ref in its off (killed and drained) state, which can be inconvenient for certain persistent switch use cases. Similarly, percpu_ref_switch_to_atomic/percpu() allow dynamic selection of operation mode; however, currently a newly initialized percpu_ref is always in percpu mode making it impossible to avoid the latency overhead of switching to atomic mode. This patch adds @flags to percpu_ref_init() and implements the following flags. 
* PERCPU_REF_INIT_ATOMIC : start ref in atomic mode * PERCPU_REF_INIT_DEAD : start ref killed and drained These flags should be able to serve the above two use cases. v2: target_core_tpg.c conversion was missing. Fixed. Signed-off-by: Tejun Heo Reviewed-by: Kent Overstreet Cc: Jens Axboe Cc: Christoph Hellwig Cc: Johannes Weiner --- block/blk-mq.c | 2 +- drivers/target/target_core_tpg.c | 2 +- fs/aio.c | 4 ++-- include/linux/percpu-refcount.h | 18 +++++++++++++++++- kernel/cgroup.c | 7 ++++--- lib/percpu-refcount.c | 23 ++++++++++++++++++----- 6 files changed, 43 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/block/blk-mq.c b/block/blk-mq.c index 44a78ae3f899..d85fe01c44ef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1796,7 +1796,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) goto err_hctxs; if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, - GFP_KERNEL)) + 0, GFP_KERNEL)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index 4ab6da338585..be783f717f19 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -819,7 +819,7 @@ int core_tpg_add_lun( { int ret; - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, 0, GFP_KERNEL); if (ret < 0) return ret; diff --git a/fs/aio.c b/fs/aio.c index 8d217ed04e6e..84a751005f5b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users, GFP_KERNEL)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, GFP_KERNEL)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct 
kioctx_cpu); diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index cd7e20f0fe47..b0293f268cd2 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -63,6 +63,21 @@ enum { __PERCPU_REF_FLAG_BITS = 2, }; +/* @flags for percpu_ref_init() */ +enum { + /* + * Start w/ ref == 1 in atomic mode. Can be switched to percpu + * operation using percpu_ref_switch_to_percpu(). + */ + PERCPU_REF_INIT_ATOMIC = 1 << 0, + + /* + * Start dead w/ ref == 0 in atomic mode. Must be revived with + * percpu_ref_reinit() before used. Implies INIT_ATOMIC. + */ + PERCPU_REF_INIT_DEAD = 1 << 1, +}; + struct percpu_ref { atomic_long_t count; /* @@ -76,7 +91,8 @@ struct percpu_ref { }; int __must_check percpu_ref_init(struct percpu_ref *ref, - percpu_ref_func_t *release, gfp_t gfp); + percpu_ref_func_t *release, unsigned int flags, + gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a99d504294de..753df01a9831 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1634,7 +1634,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, GFP_KERNEL); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, + GFP_KERNEL); if (ret) goto out; @@ -4510,7 +4511,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release, GFP_KERNEL); + err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; @@ -4583,7 +4584,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release, GFP_KERNEL); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, 
GFP_KERNEL); if (ret) goto out_free_cgrp; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 5a6d43baccc5..ed280fb1e5b5 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -45,27 +45,40 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 + * @flags: PERCPU_REF_INIT_* flags * @gfp: allocation mask to use * - * Initializes the refcount in single atomic counter mode with a refcount of 1; - * analagous to atomic_long_set(ref, 1). + * Initializes @ref. If @flags is zero, @ref starts in percpu mode with a + * refcount of 1; analagous to atomic_long_set(ref, 1). See the + * definitions of PERCPU_REF_INIT_* flags for flag behaviors. * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, - gfp_t gfp) + unsigned int flags, gfp_t gfp) { size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, __alignof__(unsigned long)); - - atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS); + unsigned long start_count = 0; ref->percpu_count_ptr = (unsigned long) __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); if (!ref->percpu_count_ptr) return -ENOMEM; + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + else + start_count += PERCPU_COUNT_BIAS; + + if (flags & PERCPU_REF_INIT_DEAD) + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; + else + start_count++; + + atomic_long_set(&ref->count, start_count); + ref->release = release; return 0; } -- cgit v1.2.3 From 5c4dd348af35a6f6db97b4f2401f74c71f7f3c7d Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 25 Sep 2014 00:53:44 +0200 Subject: Revert "PM / Hibernate: Iterate over set bits instead of PFNs in swsusp_free()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert commit 6efde38f0769 (PM / Hibernate: Iterate over set bits instead of PFNs in swsusp_free()) that introduced a NULL pointer dereference during system resume from hibernation: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] swsusp_free+0x21/0x190 PGD b39c2067 PUD b39c1067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: CPU: 1 PID: 4898 Comm: s2disk Tainted: G C 3.17-rc5-amd64 #1 Debian 3.17~rc5-1~exp1 Hardware name: LENOVO 2776LEG/2776LEG, BIOS 6EET55WW (3.15 ) 12/19/2011 task: ffff88023155ea40 ti: ffff8800b3b14000 task.ti: ffff8800b3b14000 RIP: 0010:[] [] swsusp_free+0x21/0x190 RSP: 0018:ffff8800b3b17ea8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8800b39bab00 RCX: 0000000000000001 RDX: ffff8800b39bab10 RSI: ffff8800b39bab00 RDI: 0000000000000000 RBP: 0000000000000010 R08: 0000000000000000 R09: 0000000000000000 R10: ffff8800b39bab10 R11: 0000000000000246 R12: ffffea0000000000 R13: ffff880232f485a0 R14: ffff88023ac27cd8 R15: ffff880232927590 FS: 00007f406d83b700(0000) GS:ffff88023bc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 00000000b3a62000 CR4: 00000000000007e0 Stack: ffff8800b39bab00 0000000000000010 ffff880232927590 ffffffff810acb4a ffff8800b39bab00 ffffffff811a955a ffff8800b39bab10 0000000000000000 ffff88023155f098 ffffffff81a6b8c0 ffff88023155ea40 0000000000000007 Call Trace: [] ? snapshot_release+0x2a/0xb0 [] ? __fput+0xca/0x1d0 [] ? task_work_run+0x97/0xd0 [] ? do_notify_resume+0x69/0xa0 [] ? 
int_signal+0x12/0x17 Code: 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 41 54 48 8b 05 ba 62 9c 00 49 bc 00 00 00 00 00 ea ff ff 48 8b 3d a1 62 9c 00 55 53 <48> 8b 10 48 89 50 18 48 8b 52 20 48 c7 40 28 00 00 00 00 c7 40 RIP [] swsusp_free+0x21/0x190 RSP CR2: 0000000000000000 ---[ end trace f02be86a1ec0cccb ]--- due to forbidden_pages_map being NULL in swsusp_free(). Fixes: 6efde38f0769 "PM / Hibernate: Iterate over set bits instead of PFNs in swsusp_free()" Reported-by: Bjørn Mork Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 50 +++++++++++++++---------------------------------- 1 file changed, 15 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c4b8093c80b3..f1604d8cf489 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -725,14 +725,6 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) clear_bit(bit, addr); } -static void memory_bm_clear_current(struct memory_bitmap *bm) -{ - int bit; - - bit = max(bm->cur.node_bit - 1, 0); - clear_bit(bit, bm->cur.node->data); -} - static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -1341,35 +1333,23 @@ static struct memory_bitmap copy_bm; void swsusp_free(void) { - unsigned long fb_pfn, fr_pfn; - - memory_bm_position_reset(forbidden_pages_map); - memory_bm_position_reset(free_pages_map); - -loop: - fr_pfn = memory_bm_next_pfn(free_pages_map); - fb_pfn = memory_bm_next_pfn(forbidden_pages_map); - - /* - * Find the next bit set in both bitmaps. This is guaranteed to - * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. 
- */ - do { - if (fb_pfn < fr_pfn) - fb_pfn = memory_bm_next_pfn(forbidden_pages_map); - if (fr_pfn < fb_pfn) - fr_pfn = memory_bm_next_pfn(free_pages_map); - } while (fb_pfn != fr_pfn); - - if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { - struct page *page = pfn_to_page(fr_pfn); + struct zone *zone; + unsigned long pfn, max_zone_pfn; - memory_bm_clear_current(forbidden_pages_map); - memory_bm_clear_current(free_pages_map); - __free_page(page); - goto loop; + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + + if (swsusp_page_is_forbidden(page) && + swsusp_page_is_free(page)) { + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } + } } - nr_copy_pages = 0; nr_meta_pages = 0; restore_pblist = NULL; -- cgit v1.2.3 From 7990da71ebfa887ae6fe4464ab0d99ddeb8efacc Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Wed, 3 Sep 2014 17:49:32 +0200 Subject: PM / QoS: Add PM_QOS_MEMORY_BANDWIDTH class Also adds a class type PM_QOS_SUM that aggregates the values by summing them. It can be used by memory controllers to calculate the optimum clock frequency based on the bandwidth needs of the different memory clients. Signed-off-by: Tomeu Vizoso Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- Documentation/power/pm_qos_interface.txt | 4 +++- include/linux/pm_qos.h | 5 ++++- kernel/power/qos.c | 27 ++++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index a5da5c7e7128..129f7c0e1483 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -5,7 +5,8 @@ performance expectations by drivers, subsystems and user space applications on one of the parameters. 
Two different PM QoS frameworks are available: -1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput. +1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput, +memory_bandwidth. 2. the per-device PM QoS framework provides the API to manage the per-device latency constraints and PM QoS flags. @@ -13,6 +14,7 @@ Each parameters have defined units: * latency: usec * timeout: usec * throughput: kbs (kilo bit / sec) + * memory bandwidth: mbs (mega bit / sec) 1. PM QoS framework diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 9ab4bf7c4646..636e82834506 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -15,6 +15,7 @@ enum { PM_QOS_CPU_DMA_LATENCY, PM_QOS_NETWORK_LATENCY, PM_QOS_NETWORK_THROUGHPUT, + PM_QOS_MEMORY_BANDWIDTH, /* insert new class ID */ PM_QOS_NUM_CLASSES, @@ -32,6 +33,7 @@ enum pm_qos_flags_status { #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 +#define PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE 0 #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) @@ -69,7 +71,8 @@ struct dev_pm_qos_request { enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ - PM_QOS_MIN /* return the smallest value */ + PM_QOS_MIN, /* return the smallest value */ + PM_QOS_SUM /* return the sum */ }; /* diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..5f4c006c4b1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); +static struct pm_qos_constraints memory_bw_constraints = { + .list = PLIST_HEAD_INIT(memory_bw_constraints.list), + .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + 
.default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .type = PM_QOS_SUM, + .notifiers = &memory_bandwidth_notifier, +}; +static struct pm_qos_object memory_bandwidth_pm_qos = { + .constraints = &memory_bw_constraints, + .name = "memory_bandwidth", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { /* unlocked internal variant */ static inline int pm_qos_get_value(struct pm_qos_constraints *c) { + struct plist_node *node; + int total_value = 0; + if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; + case PM_QOS_SUM: + plist_for_each(node, &c->list) + total_value += node->prio; + + return total_value; + default: /* runtime check for not using enum */ BUG(); -- cgit v1.2.3 From 2ad654bc5e2b211e92f66da1d819e47d79a866f0 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Thu, 25 Sep 2014 09:41:02 +0800 Subject: cpuset: PF_SPREAD_PAGE and PF_SPREAD_SLAB should be atomic flags When we change cpuset.memory_spread_{page,slab}, cpuset will flip PF_SPREAD_{PAGE,SLAB} bit of tsk->flags for each task in that cpuset. This should be done using atomic bitops, but currently we don't, which is broken. Tetsuo reported a hard-to-reproduce kernel crash on RHEL6, which happened when one thread tried to clear PF_USED_MATH while at the same time another thread tried to flip PF_SPREAD_PAGE/PF_SPREAD_SLAB. They both operate on the same task. Here's the full report: https://lkml.org/lkml/2014/9/19/230 To fix this, we make PF_SPREAD_PAGE and PF_SPREAD_SLAB atomic flags. 
v4: - updated mm/slab.c. (Fengguang Wu) - updated Documentation. Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Miao Xie Cc: Kees Cook Fixes: 950592f7b991 ("cpusets: update tasks' page/slab spread flags in time") Cc: # 2.6.31+ Reported-by: Tetsuo Handa Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- Documentation/cgroups/cpusets.txt | 6 +++--- include/linux/cpuset.h | 4 ++-- include/linux/sched.h | 13 +++++++++++-- kernel/cpuset.c | 9 +++++---- mm/slab.c | 4 ++-- 5 files changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index 7740038d82bc..3c94ff3f9693 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt @@ -345,14 +345,14 @@ the named feature on. The implementation is simple. Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag -PF_SPREAD_PAGE for each task that is in that cpuset or subsequently +PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently joins that cpuset. The page allocation calls for the page cache -is modified to perform an inline check for this PF_SPREAD_PAGE task +is modified to perform an inline check for this PFA_SPREAD_PAGE task flag, and if set, a call to a new routine cpuset_mem_spread_node() returns the node to prefer for the allocation. Similarly, setting 'cpuset.memory_spread_slab' turns on the flag -PF_SPREAD_SLAB, and appropriately marked slab caches will allocate +PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate pages from the node returned by cpuset_mem_spread_node(). The cpuset_mem_spread_node() routine is also simple. 
It uses the diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index ade2390ffe92..6e39c9bb0dae 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -93,12 +93,12 @@ extern int cpuset_slab_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { - return current->flags & PF_SPREAD_PAGE; + return task_spread_page(current); } static inline int cpuset_do_slab_mem_spread(void) { - return current->flags & PF_SPREAD_SLAB; + return task_spread_slab(current); } extern int current_cpuset_is_being_rebound(void); diff --git a/include/linux/sched.h b/include/linux/sched.h index 5630763956d9..7b1cafefb05e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1903,8 +1903,6 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ -#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ -#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ @@ -1958,6 +1956,9 @@ static inline void memalloc_noio_restore(unsigned int flags) /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. 
*/ +#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ +#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ + #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ @@ -1972,6 +1973,14 @@ static inline void memalloc_noio_restore(unsigned int flags) TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) +TASK_PFA_TEST(SPREAD_PAGE, spread_page) +TASK_PFA_SET(SPREAD_PAGE, spread_page) +TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) + +TASK_PFA_TEST(SPREAD_SLAB, spread_slab) +TASK_PFA_SET(SPREAD_SLAB, spread_slab) +TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) + /* * task->jobctl flags */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 22874d7cf2c0..52cb04c993b7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -365,13 +365,14 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) { if (is_spread_page(cs)) - tsk->flags |= PF_SPREAD_PAGE; + task_set_spread_page(tsk); else - tsk->flags &= ~PF_SPREAD_PAGE; + task_clear_spread_page(tsk); + if (is_spread_slab(cs)) - tsk->flags |= PF_SPREAD_SLAB; + task_set_spread_slab(tsk); else - tsk->flags &= ~PF_SPREAD_SLAB; + task_clear_spread_slab(tsk); } /* diff --git a/mm/slab.c b/mm/slab.c index a467b308c682..881951e67f12 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2994,7 +2994,7 @@ out: #ifdef CONFIG_NUMA /* - * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. + * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set. * * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. 
@@ -3226,7 +3226,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) { void *objp; - if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { + if (current->mempolicy || cpuset_do_slab_mem_spread()) { objp = alternate_node_alloc(cache, flags); if (objp) goto out; -- cgit v1.2.3 From cbbce82209490df8b68da9aec0d642451fe0a668 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 25 Sep 2014 13:55:19 +1000 Subject: SCHED: add some "wait..on_bit...timeout()" interfaces. In commit c1221321b7c25b53204447cff9949a6d5a7ddddc ("sched: Allow wait_on_bit_action() functions to support a timeout") I suggested that a "wait_on_bit_timeout()" interface would not meet my need. This isn't true - I was just over-engineering. Including a 'private' field in wait_bit_key instead of a focused "timeout" field was just premature generalization. If some other use is ever found, it can be generalized or added later. So this patch renames "private" to "timeout" with the meaning "stop waiting when 'jiffies' reaches or passes 'timeout'", and adds two of the many possible wait..bit..timeout() interfaces: wait_on_page_bit_killable_timeout(), which is the one I want to use, and out_of_line_wait_on_bit_timeout() which is a reasonably general example. Others can be added as needed.
Acked-by: Peter Zijlstra (Intel) Signed-off-by: NeilBrown Acked-by: Ingo Molnar Signed-off-by: Trond Myklebust --- include/linux/pagemap.h | 2 ++ include/linux/wait.h | 5 ++++- kernel/sched/wait.c | 36 ++++++++++++++++++++++++++++++++++++ mm/filemap.c | 13 +++++++++++++ 4 files changed, 55 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3df8c7db7a4e..87f9e4230d3a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -502,6 +502,8 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, extern void wait_on_page_bit(struct page *page, int bit_nr); extern int wait_on_page_bit_killable(struct page *page, int bit_nr); +extern int wait_on_page_bit_killable_timeout(struct page *page, + int bit_nr, unsigned long timeout); static inline int wait_on_page_locked_killable(struct page *page) { diff --git a/include/linux/wait.h b/include/linux/wait.h index 6fb1ba5f9b2f..80115bf88671 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -25,7 +25,7 @@ struct wait_bit_key { void *flags; int bit_nr; #define WAIT_ATOMIC_T_BIT_NR -1 - unsigned long private; + unsigned long timeout; }; struct wait_bit_queue { @@ -154,6 +154,7 @@ int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_ac void wake_up_bit(void *, int); void wake_up_atomic_t(atomic_t *); int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned); +int out_of_line_wait_on_bit_timeout(void *, int, wait_bit_action_f *, unsigned, unsigned long); int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned); int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned); wait_queue_head_t *bit_waitqueue(void *, int); @@ -859,6 +860,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); extern int bit_wait(struct wait_bit_key *); extern int bit_wait_io(struct wait_bit_key *); +extern int bit_wait_timeout(struct 
wait_bit_key *); +extern int bit_wait_io_timeout(struct wait_bit_key *); /** * wait_on_bit - wait for a bit to be cleared diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 15cab1a4f84e..5a62915f47a8 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -343,6 +343,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit); +int __sched out_of_line_wait_on_bit_timeout( + void *word, int bit, wait_bit_action_f *action, + unsigned mode, unsigned long timeout) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + wait.key.timeout = jiffies + timeout; + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); + int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) @@ -520,3 +532,27 @@ __sched int bit_wait_io(struct wait_bit_key *word) return 0; } EXPORT_SYMBOL(bit_wait_io); + +__sched int bit_wait_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_timeout); + +__sched int bit_wait_io_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + io_schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/mm/filemap.c b/mm/filemap.c index 90effcdf948d..cbe5a9013f70 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -703,6 +703,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) bit_wait_io, TASK_KILLABLE); } +int wait_on_page_bit_killable_timeout(struct page *page, + int bit_nr, unsigned long timeout) +{ + 
DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + wait.key.timeout = jiffies + timeout; + if (!test_bit(bit_nr, &page->flags)) + return 0; + return __wait_on_bit(page_waitqueue(page), &wait, + bit_wait_io_timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); + /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue * @page: Page defining the wait queue of interest -- cgit v1.2.3 From e756c7b698604f11a979f2781d06eb7b80aba363 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Fri, 26 Sep 2014 12:03:25 +0800 Subject: Revert "cgroup: remove redundant variable in cgroup_mount()" This reverts commit 0c7bf3e8cab7900e17ce7f97104c39927d835469. If there are child cgroups in the cgroupfs and then we umount it, the superblock will be destroyed but the cgroup_root will be kept around. When we mount it again, cgroup_mount() will find this cgroup_root and allocate a new sb for it. So with this commit we will be trapped in a dead loop in the case described above, because kernfs_pin_sb() keeps returning NULL. Currently I don't see how we can avoid using both pinned_sb and new_sb, so just revert it. Cc: Al Viro Reported-by: Andrey Wagin Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5eb20cd1709c..f873c4681316 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1694,6 +1694,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct dentry *dentry; int ret; int i; + bool new_sb; /* * The first time anyone tries to mount a cgroup, enable the list @@ -1784,7 +1785,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * path is super cold. Let's just sleep a bit and retry. 
*/ pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR_OR_NULL(pinned_sb) || + if (IS_ERR(pinned_sb) || !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); if (!IS_ERR_OR_NULL(pinned_sb)) @@ -1830,16 +1831,18 @@ out_free: return ERR_PTR(ret); dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, NULL); - if (IS_ERR(dentry) || pinned_sb) + CGROUP_SUPER_MAGIC, &new_sb); + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); /* * If @pinned_sb, we're reusing an existing root and holding an * extra ref on its sb. Mount is complete. Put the extra ref. */ - if (pinned_sb) + if (pinned_sb) { + WARN_ON(new_sb); deactivate_super(pinned_sb); + } return dentry; } -- cgit v1.2.3 From b63adb979583ef185718d774d8162387db5589c0 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 26 Sep 2014 00:03:16 +0000 Subject: kernel: add support for kernel restart handler call chain Various drivers implement architecture and/or device specific means to restart (reset) the system. Various mechanisms have been implemented to support those schemes. The best known mechanism is arm_pm_restart, which is a function pointer to be set either from platform specific code or from drivers. Another mechanism is to use hardware watchdogs to issue a reset; this mechanism is used if there is no other method available to reset a board or system. Two examples are alim7101_wdt, which currently uses the reboot notifier to trigger a reset, and moxart_wdt, which registers the arm_pm_restart function. The existing mechanisms have a number of drawbacks. Typically only one scheme to restart the system is supported (at least if arm_pm_restart is used). At least in theory there can be multiple means to restart the system, some of which may be less desirable (for example one mechanism may only reset the CPU, while another may reset the entire system). 
Using arm_pm_restart can also be racy if the function pointer is set from a driver, as the driver may be in the process of being unloaded when arm_pm_restart is called. Using the reboot notifier is always racy, as it is unknown if and when other functions using the reboot notifier have completed execution by the time the watchdog fires. Introduce a system restart handler call chain to solve the described problems. This call chain is expected to be executed from the architecture specific machine_restart() function. Drivers providing system restart functionality (such as the watchdog drivers mentioned above) are expected to register with this call chain. By using the priority field in the notifier block, callers can control restart handler execution sequence and thus ensure that the restart handler with the optimal restart capabilities for a given system is called first. Signed-off-by: Guenter Roeck Acked-by: Catalin Marinas Acked-by: Heiko Stuebner Cc: Russell King Cc: Wim Van Sebroeck Cc: Maxime Ripard Cc: Will Deacon Cc: Arnd Bergmann Cc: Jonas Jensen Cc: Randy Dunlap Cc: Steven Rostedt Cc: Ingo Molnar Cc: Dmitry Eremin-Solenikov Cc: David Woodhouse Cc: Tomasz Figa Signed-off-by: Andrew Morton --- include/linux/reboot.h | 3 ++ kernel/reboot.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 48bf152761c7..67fc8fcdc4b0 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -38,6 +38,9 @@ extern int reboot_force; extern int register_reboot_notifier(struct notifier_block *); extern int unregister_reboot_notifier(struct notifier_block *); +extern int register_restart_handler(struct notifier_block *); +extern int unregister_restart_handler(struct notifier_block *); +extern void do_kernel_restart(char *cmd); /* * Architecture-specific implementations of sys_reboot commands. 
diff --git a/kernel/reboot.c b/kernel/reboot.c index a3a9e240fcdb..5925f5ae8dff 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* + * Notifier list for kernel code which wants to be called + * to restart the system. + */ +static ATOMIC_NOTIFIER_HEAD(restart_handler_list); + +/** + * register_restart_handler - Register function to be called to reset + * the system + * @nb: Info about handler function to be called + * @nb->priority: Handler priority. Handlers should follow the + * following guidelines for setting priorities. + * 0: Restart handler of last resort, + * with limited restart capabilities + * 128: Default restart handler; use if no other + * restart handler is expected to be available, + * and/or if restart functionality is + * sufficient to restart the entire system + * 255: Highest priority restart handler, will + * preempt all other restart handlers + * + * Registers a function with code to be called to restart the + * system. + * + * Registered functions will be called from machine_restart as last + * step of the restart sequence (if the architecture specific + * machine_restart function calls do_kernel_restart - see below + * for details). + * Registered functions are expected to restart the system immediately. + * If more than one function is registered, the restart handler priority + * selects which function will be called first. + * + * Restart handlers are expected to be registered from non-architecture + * code, typically from drivers. A typical use case would be a system + * where restart functionality is provided through a watchdog. Multiple + * restart handlers may exist; for example, one restart handler might + * restart the entire system, while another only restarts the CPU. 
+ * In such cases, the restart handler which only restarts part of the + * hardware is expected to register with low priority to ensure that + * it only runs if no other means to restart the system is available. + * + * Currently always returns zero, as atomic_notifier_chain_register() + * always returns zero. + */ +int register_restart_handler(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&restart_handler_list, nb); +} +EXPORT_SYMBOL(register_restart_handler); + +/** + * unregister_restart_handler - Unregister previously registered + * restart handler + * @nb: Hook to be unregistered + * + * Unregisters a previously registered restart handler function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_restart_handler(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&restart_handler_list, nb); +} +EXPORT_SYMBOL(unregister_restart_handler); + +/** + * do_kernel_restart - Execute kernel restart handler call chain + * + * Calls functions registered with register_restart_handler. + * + * Expected to be called from machine_restart as last step of the restart + * sequence. + * + * Restarts the system immediately if a restart handler function has been + * registered. Otherwise does nothing. + */ +void do_kernel_restart(char *cmd) +{ + atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd); +} + void migrate_to_reboot_cpu(void) { /* The boot cpu is always logical cpu 0 */ -- cgit v1.2.3 From 75c349062a666deab57bdca8b5bd0779c9fb0d58 Mon Sep 17 00:00:00 2001 From: Vincent Sanders Date: Thu, 18 Sep 2014 20:39:15 +0100 Subject: ARM: 8153/1: Enable gcov support on the ARM architecture Enable gcov support for ARM based on original patches by David Singleton and George G. Davis Riku - updated the patch to the current mainline kernel. The patch has been submitted in 2010, 2012 - for symmetry, now in 2014 too.
https://lwn.net/Articles/390419/ http://marc.info/?l=linux-arm-kernel&m=133823081813044 v2: remove arch/arm/kernel from gcov disabled files Cc: Andrey Ryabinin Cc: Naresh Kamboju Acked-by: Arnd Bergmann Signed-off-by: Riku Voipio Signed-off-by: Vincent Sanders Signed-off-by: Russell King --- arch/arm/boot/bootp/Makefile | 2 ++ arch/arm/boot/compressed/Makefile | 2 ++ kernel/gcov/Kconfig | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/arm/boot/bootp/Makefile b/arch/arm/boot/bootp/Makefile index c394e305447c..5761f0039133 100644 --- a/arch/arm/boot/bootp/Makefile +++ b/arch/arm/boot/bootp/Makefile @@ -5,6 +5,8 @@ # architecture-specific flags and dependencies. # +GCOV_PROFILE := n + LDFLAGS_bootp :=-p --no-undefined -X \ --defsym initrd_phys=$(INITRD_PHYS) \ --defsym params_phys=$(PARAMS_PHYS) -T diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 76a50ecae1c3..3ea230aa94b7 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -37,6 +37,8 @@ ifeq ($(CONFIG_ARM_VIRT_EXT),y) OBJS += hyp-stub.o endif +GCOV_PROFILE := n + # # Architecture dependencies # diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d04ce8ac4399..cf66c5c8458e 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM default n ---help--- This options activates profiling for the entire kernel. -- cgit v1.2.3 From 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:57 -0700 Subject: bpf: introduce BPF syscall and maps BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. 
Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 39 +++++++++ include/linux/bpf.h | 41 +++++++++ include/uapi/linux/bpf.h | 23 +++++ kernel/bpf/Makefile | 2 +- kernel/bpf/syscall.c | 169 ++++++++++++++++++++++++++++++++++++ 5 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf.h create mode 100644 kernel/bpf/syscall.c (limited to 'kernel') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 014e0319a5c4..4a01d71785e9 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. +eBPF maps +--------- +'maps' is a generic storage of different types for sharing data between kernel +and userspace. 
+ +The maps are accessed from user space via BPF syscall, which has commands: +- create a map with given type and attributes + map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) + using attr->map_type, attr->key_size, attr->value_size, attr->max_entries + returns process-local file descriptor or negative error + +- lookup key in a given map + err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key, attr->value + returns zero and stores found elem into value or negative error + +- create or update key/value pair in a given map + err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key, attr->value + returns zero or negative error + +- find and delete element by key in a given map + err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key + +- to delete map: close(fd) + Exiting process will delete maps automatically + +userspace programs use this syscall to create/access maps that eBPF programs +are concurrently updating. + +maps can have different types: hash, array, bloom filter, radix-tree, etc. + +The map is defined by: + . type + . max number of elements + . key size in bytes + . value size in bytes + Testing ------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h new file mode 100644 index 000000000000..48014a71f0fe --- /dev/null +++ b/include/linux/bpf.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef _LINUX_BPF_H +#define _LINUX_BPF_H 1 + +#include +#include + +struct bpf_map; + +/* map is generic key/value storage optionally accesible by eBPF programs */ +struct bpf_map_ops { + /* funcs callable from userspace (via syscall) */ + struct bpf_map *(*map_alloc)(union bpf_attr *attr); + void (*map_free)(struct bpf_map *); +}; + +struct bpf_map { + atomic_t refcnt; + enum bpf_map_type map_type; + u32 key_size; + u32 value_size; + u32 max_entries; + struct bpf_map_ops *ops; + struct work_struct work; +}; + +struct bpf_map_type_list { + struct list_head list_node; + struct bpf_map_ops *ops; + enum bpf_map_type type; +}; + +void bpf_register_map_type(struct bpf_map_type_list *tl); +void bpf_map_put(struct bpf_map *map); + +#endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 479ed0b6be16..f58a10f9670c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -62,4 +62,27 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; +/* BPF syscall commands */ +enum bpf_cmd { + /* create a map with given type and attributes + * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size) + * returns fd or negative error + * map is deleted when fd is closed + */ + BPF_MAP_CREATE, +}; + +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, +}; + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + }; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a71145e2769..e9f7334ed07a 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1 @@ -obj-y := core.o +obj-y := core.o syscall.o diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..428a0e23adc0 --- 
/dev/null +++ b/kernel/bpf/syscall.c @@ -0,0 +1,169 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include + +static LIST_HEAD(bpf_map_types); + +static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) +{ + struct bpf_map_type_list *tl; + struct bpf_map *map; + + list_for_each_entry(tl, &bpf_map_types, list_node) { + if (tl->type == attr->map_type) { + map = tl->ops->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = tl->ops; + map->map_type = attr->map_type; + return map; + } + } + return ERR_PTR(-EINVAL); +} + +/* boot time registration of different map implementations */ +void bpf_register_map_type(struct bpf_map_type_list *tl) +{ + list_add(&tl->list_node, &bpf_map_types); +} + +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + + /* implementation dependent freeing */ + map->ops->map_free(map); +} + +/* decrement map refcnt and schedule it for freeing via workqueue + * (unrelying map implementation ops->map_free() might sleep) + */ +void bpf_map_put(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->refcnt)) { + INIT_WORK(&map->work, bpf_map_free_deferred); + schedule_work(&map->work); + } +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + struct bpf_map *map = filp->private_data; + + bpf_map_put(map); + return 0; +} + +static const struct file_operations bpf_map_fops = { + .release = bpf_map_release, +}; + +/* 
helper macro to check that unused fields 'union bpf_attr' are zero */ +#define CHECK_ATTR(CMD) \ + memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ + sizeof(attr->CMD##_LAST_FIELD), 0, \ + sizeof(*attr) - \ + offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ + sizeof(attr->CMD##_LAST_FIELD)) != NULL + +#define BPF_MAP_CREATE_LAST_FIELD max_entries +/* called via syscall */ +static int map_create(union bpf_attr *attr) +{ + struct bpf_map *map; + int err; + + err = CHECK_ATTR(BPF_MAP_CREATE); + if (err) + return -EINVAL; + + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ + map = find_and_alloc_map(attr); + if (IS_ERR(map)) + return PTR_ERR(map); + + atomic_set(&map->refcnt, 1); + + err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_map; + + return err; + +free_map: + map->ops->map_free(map); + return err; +} + +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ + union bpf_attr attr = {}; + int err; + + /* the syscall is limited to root temporarily. This restriction will be + * lifted when security audit is clean. Note that eBPF+tracing must have + * this restriction, since it may pass kernel data to user space + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!access_ok(VERIFY_READ, uattr, 1)) + return -EFAULT; + + if (size > PAGE_SIZE) /* silly large */ + return -E2BIG; + + /* If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. 
+ */ + if (size > sizeof(attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + size = sizeof(attr); + } + + /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + if (copy_from_user(&attr, uattr, size) != 0) + return -EFAULT; + + switch (cmd) { + case BPF_MAP_CREATE: + err = map_create(&attr); + break; + default: + err = -EINVAL; + break; + } + + return err; +} -- cgit v1.2.3 From 749730ce42a2121e1c88350d69478bff3994b10a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:58 -0700 Subject: bpf: enable bpf syscall on x64 and i386 done as separate commit to ease conflict resolution Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 3 ++- include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 3 +++ 5 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 028b78168d85..9fe1b5d002f0 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -363,3 +363,4 @@ 354 i386 seccomp sys_seccomp 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 35dd922727b9..281150b539a2 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -327,6 +327,7 @@ 318 common getrandom sys_getrandom 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf # # x32-specific system call numbers start at 512 to avoid cache impact diff --git 
a/include/linux/syscalls.h b/include/linux/syscalls.h index 0f86d85a9ce4..bda9b81357cc 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -65,6 +65,7 @@ struct old_linux_dirent; struct perf_event_attr; struct file_handle; struct sigaltstack; +union bpf_attr; #include #include @@ -875,5 +876,5 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, const char __user *uargs); asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); - +asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 11d11bc5c78f..22749c134117 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -705,9 +705,11 @@ __SYSCALL(__NR_seccomp, sys_seccomp) __SYSCALL(__NR_getrandom, sys_getrandom) #define __NR_memfd_create 279 __SYSCALL(__NR_memfd_create, sys_memfd_create) +#define __NR_bpf 280 +__SYSCALL(__NR_bpf, sys_bpf) #undef __NR_syscalls -#define __NR_syscalls 280 +#define __NR_syscalls 281 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 391d4ddb6f4b..b4b5083f5f5e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -218,3 +218,6 @@ cond_syscall(sys_kcmp); /* operate on Secure Computing state */ cond_syscall(sys_seccomp); + +/* access BPF programs and maps */ +cond_syscall(sys_bpf); -- cgit v1.2.3 From db20fd2b01087bdfbe30bce314a198eefedcc42e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:59 -0700 Subject: bpf: add lookup/update/delete/iterate methods to BPF maps 'maps' is a generic storage of different types for sharing data between kernel and userspace. 
The maps are accessed from user space via BPF syscall, which has commands: - create a map with given type and attributes fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) returns fd or negative error - lookup key in a given map referenced by fd err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero and stores found elem into value or negative error - create or update key/value pair in a given map err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero or negative error - find and delete element by key in a given map err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key - iterate map elements (based on input key return next_key) err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->next_key - close(fd) deletes the map Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 8 ++ include/uapi/linux/bpf.h | 38 ++++++++ kernel/bpf/syscall.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 281 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48014a71f0fe..2887f3f9da59 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -9,6 +9,7 @@ #include #include +#include struct bpf_map; @@ -17,6 +18,12 @@ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_free)(struct bpf_map *); + int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); + + /* funcs callable from userspace and from eBPF programs */ + void *(*map_lookup_elem)(struct bpf_map *map, void *key); + int (*map_update_elem)(struct bpf_map *map, void *key, void *value); + int (*map_delete_elem)(struct bpf_map *map, void *key); }; struct bpf_map { @@ -37,5 +44,6 @@ struct bpf_map_type_list { void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); +struct bpf_map *bpf_map_get(struct fd f); #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f58a10f9670c..395cabd2ca0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -70,6 +70,35 @@ enum bpf_cmd { * map is deleted when fd is closed */ BPF_MAP_CREATE, + + /* lookup key in a given map + * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero and stores found elem into value + * or negative error + */ + BPF_MAP_LOOKUP_ELEM, + + /* create or update key/value pair in a given map + * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero or negative error + */ + BPF_MAP_UPDATE_ELEM, + + /* find and delete elem by key in a given map + * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 
size) + * Using attr->map_fd, attr->key + * returns zero or negative error + */ + BPF_MAP_DELETE_ELEM, + + /* lookup key in a given map and return next key + * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->next_key + * returns zero and stores next key or negative error + */ + BPF_MAP_GET_NEXT_KEY, }; enum bpf_map_type { @@ -83,6 +112,15 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + }; } __attribute__((aligned(8))); #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 428a0e23adc0..f94349ecaf61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -13,6 +13,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -111,6 +112,228 @@ free_map: return err; } +/* if error is returned, fd is released. 
+ * On success caller should complete fd access with matching fdput() + */ +struct bpf_map *bpf_map_get(struct fd f) +{ + struct bpf_map *map; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_map_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + map = f.file->private_data; + + return map; +} + +/* helper to convert user pointers passed inside __aligned_u64 fields */ +static void __user *u64_to_ptr(__u64 val) +{ + return (void __user *) (unsigned long) val; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value + +static int map_lookup_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ESRCH; + rcu_read_lock(); + value = map->ops->map_lookup_elem(map, key); + if (!value) + goto err_unlock; + + err = -EFAULT; + if (copy_to_user(uvalue, value, map->value_size) != 0) + goto err_unlock; + + err = 0; + +err_unlock: + rcu_read_unlock(); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value + +static int map_update_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto 
err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + err = -EFAULT; + if (copy_from_user(value, uvalue, map->value_size) != 0) + goto free_value; + + /* eBPF program that use maps are running under rcu_read_lock(), + * therefore all map accessors rely on this fact, so do the same here + */ + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value); + rcu_read_unlock(); + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_DELETE_ELEM_LAST_FIELD key + +static int map_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key; + int err; + + if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key + +static int map_get_next_key(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *unext_key = u64_to_ptr(attr->next_key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *next_key; + int err; + + if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + 
goto free_key; + + err = -ENOMEM; + next_key = kmalloc(map->key_size, GFP_USER); + if (!next_key) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_get_next_key(map, key, next_key); + rcu_read_unlock(); + if (err) + goto free_next_key; + + err = -EFAULT; + if (copy_to_user(unext_key, next_key, map->key_size) != 0) + goto free_next_key; + + err = 0; + +free_next_key: + kfree(next_key); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -160,6 +383,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_CREATE: err = map_create(&attr); break; + case BPF_MAP_LOOKUP_ELEM: + err = map_lookup_elem(&attr); + break; + case BPF_MAP_UPDATE_ELEM: + err = map_update_elem(&attr); + break; + case BPF_MAP_DELETE_ELEM: + err = map_delete_elem(&attr); + break; + case BPF_MAP_GET_NEXT_KEY: + err = map_get_next_key(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 09756af46893c18839062976c3252e93a1beeba7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:00 -0700 Subject: bpf: expand BPF syscall with program load/unload eBPF programs are similar to kernel modules. They are loaded by the user process and automatically unloaded when process exits. Each eBPF program is a safe run-to-completion set of instructions. eBPF verifier statically determines that the program terminates and is safe to execute. 
The following syscall wrapper can be used to load the program: int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_cnt, const char *license) { union bpf_attr attr = { .prog_type = prog_type, .insns = ptr_to_u64(insns), .insn_cnt = insn_cnt, .license = ptr_to_u64(license), }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } where 'insns' is an array of eBPF instructions and 'license' is a string that must be GPL compatible to call helper functions marked gpl_only Upon successful load the syscall returns prog_fd. Use close(prog_fd) to unload the program. User space tests and examples follow in the later patches Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 38 +++++++++++ include/linux/filter.h | 8 +-- include/uapi/linux/bpf.h | 26 ++++++++ kernel/bpf/core.c | 29 +++++---- kernel/bpf/syscall.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 246 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2887f3f9da59..92979182be81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,4 +46,42 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); struct bpf_map *bpf_map_get(struct fd f); +/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL + * instructions after verifying + */ +struct bpf_func_proto { + u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + bool gpl_only; +}; + +struct bpf_verifier_ops { + /* return eBPF function prototype for verification */ + const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); +}; + +struct bpf_prog_type_list { + struct list_head list_node; + struct bpf_verifier_ops *ops; + enum bpf_prog_type type; +}; + +void bpf_register_prog_type(struct bpf_prog_type_list *tl); + +struct bpf_prog; + +struct
bpf_prog_aux { + atomic_t refcnt; + bool is_gpl_compatible; + enum bpf_prog_type prog_type; + struct bpf_verifier_ops *ops; + struct bpf_map **used_maps; + u32 used_map_cnt; + struct bpf_prog *prog; + struct work_struct work; +}; + +void bpf_prog_put(struct bpf_prog *prog); +struct bpf_prog *bpf_prog_get(u32 ufd); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 1a0bc6d134d7..4ffc0958d85e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -21,6 +21,7 @@ struct sk_buff; struct sock; struct seccomp_data; +struct bpf_prog_aux; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -300,17 +301,12 @@ struct bpf_binary_header { u8 image[]; }; -struct bpf_work_struct { - struct bpf_prog *prog; - struct work_struct work; -}; - struct bpf_prog { u16 pages; /* Number of allocated pages */ bool jited; /* Is our filter JIT'ed? */ u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - struct bpf_work_struct *work; /* Deferred free work struct */ + struct bpf_prog_aux *aux; /* Auxiliary fields */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); /* Instructions for interpreter */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 395cabd2ca0a..424f442016e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -99,12 +99,23 @@ enum bpf_cmd { * returns zero and stores next key or negative error */ BPF_MAP_GET_NEXT_KEY, + + /* verify and load eBPF program + * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size) + * Using attr->prog_type, attr->insns, attr->license + * returns fd or negative error + */ + BPF_PROG_LOAD, }; enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, }; +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* 
one of enum bpf_map_type */ @@ -121,6 +132,21 @@ union bpf_attr { __aligned_u64 next_key; }; }; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + }; } __attribute__((aligned(8))); +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +enum bpf_func_id { + BPF_FUNC_unspec, + __BPF_FUNC_MAX_ID, +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b7002488251..f0c30c59b317 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include #include #include +#include /* Registers */ #define BPF_R0 regs[BPF_REG_0] @@ -71,7 +72,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); @@ -79,14 +80,14 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); - if (ws == NULL) { + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + if (aux == NULL) { vfree(fp); return NULL; } fp->pages = size / PAGE_SIZE; - fp->work = ws; + fp->aux = aux; return fp; } @@ -110,10 +111,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; - /* We keep fp->work from fp_old around in the new + /* We keep fp->aux from fp_old around in the new * reallocated structure. 
*/ - fp_old->work = NULL; + fp_old->aux = NULL; __bpf_prog_free(fp_old); } @@ -123,7 +124,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->work); + kfree(fp->aux); vfree(fp); } EXPORT_SYMBOL_GPL(__bpf_prog_free); @@ -638,19 +639,19 @@ EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static void bpf_prog_free_deferred(struct work_struct *work) { - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; - ws = container_of(work, struct bpf_work_struct, work); - bpf_jit_free(ws->prog); + aux = container_of(work, struct bpf_prog_aux, work); + bpf_jit_free(aux->prog); } /* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - struct bpf_work_struct *ws = fp->work; + struct bpf_prog_aux *aux = fp->aux; - INIT_WORK(&ws->work, bpf_prog_free_deferred); - ws->prog = fp; - schedule_work(&ws->work); + INIT_WORK(&aux->work, bpf_prog_free_deferred); + aux->prog = fp; + schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f94349ecaf61..0afb4eaa1887 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include static LIST_HEAD(bpf_map_types); @@ -334,6 +336,166 @@ err_put: return err; } +static LIST_HEAD(bpf_prog_types); + +static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +{ + struct bpf_prog_type_list *tl; + + list_for_each_entry(tl, &bpf_prog_types, list_node) { + if (tl->type == type) { + prog->aux->ops = tl->ops; + prog->aux->prog_type = type; + return 0; + } + } + return -EINVAL; +} + +void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ + list_add(&tl->list_node, &bpf_prog_types); +} + +/* drop refcnt on maps used by eBPF program and free auxilary data */ +static void free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + + kfree(aux->used_maps); +} + +void 
bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + free_used_maps(prog->aux); + bpf_prog_free(prog); + } +} + +static int bpf_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_prog_fops = { + .release = bpf_prog_release, +}; + +static struct bpf_prog *get_prog(struct fd f) +{ + struct bpf_prog *prog; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_prog_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + prog = f.file->private_data; + + return prog; +} + +/* called by sockets/tracing/seccomp before attaching program to an event + * pairs with bpf_prog_put() + */ +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_prog *prog; + + prog = get_prog(f); + + if (IS_ERR(prog)) + return prog; + + atomic_inc(&prog->aux->refcnt); + fdput(f); + return prog; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_PROG_LOAD_LAST_FIELD license + +static int bpf_prog_load(union bpf_attr *attr) +{ + enum bpf_prog_type type = attr->prog_type; + struct bpf_prog *prog; + int err; + char license[128]; + bool is_gpl; + + if (CHECK_ATTR(BPF_PROG_LOAD)) + return -EINVAL; + + /* copy eBPF program license from user space */ + if (strncpy_from_user(license, u64_to_ptr(attr->license), + sizeof(license) - 1) < 0) + return -EFAULT; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + is_gpl = license_is_gpl_compatible(license); + + if (attr->insn_cnt >= BPF_MAXINSNS) + return -EINVAL; + + /* plain bpf_prog allocation */ + prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); + if (!prog) + return -ENOMEM; + + prog->len = attr->insn_cnt; + + err = -EFAULT; + if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + prog->len * sizeof(struct bpf_insn)) != 0) + goto free_prog; + + 
prog->orig_prog = NULL; + prog->jited = false; + + atomic_set(&prog->aux->refcnt, 1); + prog->aux->is_gpl_compatible = is_gpl; + + /* find program type: socket_filter vs tracing_filter */ + err = find_prog_type(type, prog); + if (err < 0) + goto free_prog; + + /* run eBPF verifier */ + /* err = bpf_check(prog, tb); */ + + if (err < 0) + goto free_used_maps; + + /* eBPF program is ready to be JITed */ + bpf_prog_select_runtime(prog); + + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_used_maps; + + return err; + +free_used_maps: + free_used_maps(prog->aux); +free_prog: + bpf_prog_free(prog); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -395,6 +557,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_PROG_LOAD: + err = bpf_prog_load(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 0a542a86d73b1577e7d4f55fc95dcffd3fe62643 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:01 -0700 Subject: bpf: handle pseudo BPF_CALL insn in native eBPF programs userspace is using pseudo BPF_CALL instructions which encode one of 'enum bpf_func_id' inside insn->imm field. Verifier checks that program using correct function arguments to given func_id. If all checks passed, kernel needs to fixup BPF_CALL->imm fields by replacing func_id with in-kernel function pointer. eBPF interpreter just calls the function. In-kernel eBPF users continue to use generic BPF_CALL. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/syscall.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0afb4eaa1887..b513659d120f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -357,6 +357,40 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl) list_add(&tl->list_node, &bpf_prog_types); } +/* fixup insn->imm field of bpf_call instructions: + * if (insn->imm == BPF_FUNC_map_lookup_elem) + * insn->imm = bpf_map_lookup_elem - __bpf_call_base; + * else if (insn->imm == BPF_FUNC_map_update_elem) + * insn->imm = bpf_map_update_elem - __bpf_call_base; + * else ... + * + * this function is called after eBPF program passed verification + */ +static void fixup_bpf_calls(struct bpf_prog *prog) +{ + const struct bpf_func_proto *fn; + int i; + + for (i = 0; i < prog->len; i++) { + struct bpf_insn *insn = &prog->insnsi[i]; + + if (insn->code == (BPF_JMP | BPF_CALL)) { + /* we reach here when program has bpf_call instructions + * and it passed bpf_check(), means that + * ops->get_func_proto must have been supplied, check it + */ + BUG_ON(!prog->aux->ops->get_func_proto); + + fn = prog->aux->ops->get_func_proto(insn->imm); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + BUG_ON(!fn->func); + insn->imm = fn->func - __bpf_call_base; + } + } +} + /* drop refcnt on maps used by eBPF program and free auxilary data */ static void free_used_maps(struct bpf_prog_aux *aux) { @@ -478,6 +512,9 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; + /* fixup BPF_CALL->imm field */ + fixup_bpf_calls(prog); + /* eBPF program is ready to be JITed */ bpf_prog_select_runtime(prog); -- cgit v1.2.3 From 51580e798cb61b0fc63fa3aa6c5c975375aa0550 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:02 -0700 Subject: bpf: verifier (add docs) this patch adds 
all of eBPF verifier documentation and empty bpf_check() The end goal for the verifier is to statically check safety of the program. Verifier will catch: - loops - out of range jumps - unreachable instructions - invalid instructions - uninitialized register access - uninitialized stack access - misaligned stack access - out of range stack access - invalid calling convention More details in Documentation/networking/filter.txt Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 224 ++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 2 + kernel/bpf/Makefile | 2 +- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 133 +++++++++++++++++++++ 5 files changed, 361 insertions(+), 2 deletions(-) create mode 100644 kernel/bpf/verifier.c (limited to 'kernel') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 4a01d71785e9..5ce4d07406a5 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1001,6 +1001,99 @@ instruction that loads 64-bit immediate value into a dst_reg. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. +eBPF verifier +------------- +The safety of the eBPF program is determined in two steps. + +First step does DAG check to disallow loops and other CFG validation. +In particular it will detect programs that have unreachable instructions. +(though classic BPF checker allows them) + +Second step starts from the first insn and descends all possible paths. +It simulates execution of every insn and observes the state change of +registers and stack. + +At the start of the program the register R1 contains a pointer to context +and has type PTR_TO_CTX. +If verifier sees an insn that does R2=R1, then R2 has now type +PTR_TO_CTX as well and can be used on the right hand side of expression.
+If R1=PTR_TO_CTX and insn is R2=R1+R1, then R2=UNKNOWN_VALUE, +since addition of two valid pointers makes invalid pointer. +(In 'secure' mode verifier will reject any type of pointer arithmetic to make +sure that kernel addresses don't leak to unprivileged users) + +If register was never written to, it's not readable: + bpf_mov R0 = R2 + bpf_exit +will be rejected, since R2 is unreadable at the start of the program. + +After kernel function call, R1-R5 are reset to unreadable and +R0 has a return type of the function. + +Since R6-R9 are callee saved, their state is preserved across the call. + bpf_mov R6 = 1 + bpf_call foo + bpf_mov R0 = R6 + bpf_exit +is a correct program. If there was R1 instead of R6, it would have +been rejected. + +load/store instructions are allowed only with registers of valid types, which +are PTR_TO_CTX, PTR_TO_MAP, FRAME_PTR. They are bounds and alignment checked. +For example: + bpf_mov R1 = 1 + bpf_mov R2 = 2 + bpf_xadd *(u32 *)(R1 + 3) += R2 + bpf_exit +will be rejected, since R1 doesn't have a valid pointer type at the time of +execution of instruction bpf_xadd. + +At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context') +A callback is used to customize verifier to restrict eBPF program access to only +certain fields within ctx structure with specified size and alignment. + +For example, the following insn: + bpf_ld R0 = *(u32 *)(R6 + 8) +intends to load a word from address R6 + 8 and store it into R0 +If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know +that offset 8 of size 4 bytes can be accessed for reading, otherwise +the verifier will reject the program. +If R6=FRAME_PTR, then access should be aligned and be within +stack bounds, which are [-MAX_BPF_STACK, 0). In this example offset is 8, +so it will fail verification, since it's out of bounds. + +The verifier will allow eBPF program to read data from stack only after +it wrote into it. 
+Classic BPF verifier does similar check with M[0-15] memory slots. +For example: + bpf_ld R0 = *(u32 *)(R10 - 4) + bpf_exit +is invalid program. +Though R10 is correct read-only register and has type FRAME_PTR +and R10 - 4 is within stack bounds, there were no stores into that location. + +Pointer register spill/fill is tracked as well, since four (R6-R9) +callee saved registers may not be enough for some programs. + +Allowed function calls are customized with bpf_verifier_ops->get_func_proto() +The eBPF verifier will check that registers match argument constraints. +After the call register R0 will be set to return type of the function. + +Function calls is a main mechanism to extend functionality of eBPF programs. +Socket filters may let programs to call one set of functions, whereas tracing +filters may allow completely different set. + +If a function made accessible to eBPF program, it needs to be thought through +from safety point of view. The verifier will guarantee that the function is +called with valid arguments. + +seccomp vs socket filters have different security restrictions for classic BPF. +Seccomp solves this by two stage verifier: classic BPF verifier is followed +by seccomp verifier. In case of eBPF one configurable verifier is shared for +all use cases. + +See details of eBPF verifier in kernel/bpf/verifier.c + eBPF maps --------- 'maps' is a generic storage of different types for sharing data between kernel @@ -1040,6 +1133,137 @@ The map is defined by: . key size in bytes . 
value size in bytes +Understanding eBPF verifier messages +------------------------------------ + +The following are few examples of invalid eBPF programs and verifier error +messages as seen in the log: + +Program with unreachable instructions: +static struct bpf_insn prog[] = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), +}; +Error: + unreachable insn 1 + +Program that reads uninitialized register: + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), +Error: + 0: (bf) r0 = r2 + R2 !read_ok + +Program that doesn't initialize R0 before exiting: + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), +Error: + 0: (bf) r2 = r1 + 1: (95) exit + R0 !read_ok + +Program that accesses stack out of bounds: + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 +8) = 0 + invalid stack off=8 size=8 + +Program that doesn't initialize stack before passing its address into function: + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), +Error: + 0: (bf) r2 = r10 + 1: (07) r2 += -8 + 2: (b7) r1 = 0x0 + 3: (85) call 1 + invalid indirect read from stack off -8+0 size 8 + +Program that uses invalid map_fd=0 while calling to map_lookup_elem() function: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + fd 0 is not pointing to valid bpf_map + +Program that doesn't check return value of map_lookup_elem() before accessing +map element: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + 
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 0x0 + 4: (85) call 1 + 5: (7a) *(u64 *)(r0 +0) = 0 + R0 invalid mem access 'map_value_or_null' + +Program that correctly checks map_lookup_elem() returned value for NULL, but +accesses the memory with incorrect alignment: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+1 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +4) = 0 + misaligned access off 4 size 8 + +Program that correctly checks map_lookup_elem() returned value for NULL and +accesses memory with correct alignment in one side of 'if' branch, but fails +to do so in the other side of 'if' branch: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), +Error: + 0: (7a) *(u64 *)(r10 -8) = 0 + 1: (bf) r2 = r10 + 2: (07) r2 += -8 + 3: (b7) r1 = 1 + 4: (85) call 1 + 5: (15) if r0 == 0x0 goto pc+2 + R0=map_ptr R10=fp + 6: (7a) *(u64 *)(r0 +0) = 0 + 7: (95) exit + + from 5 to 8: R0=imm0 R10=fp + 8: (7a) *(u64 *)(r0 +0) = 1 + R0 invalid mem access 'imm' + Testing ------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 92979182be81..9dfeb36f8971 100644 --- 
a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -83,5 +83,7 @@ struct bpf_prog_aux { void bpf_prog_put(struct bpf_prog *prog); struct bpf_prog *bpf_prog_get(u32 ufd); +/* verify correctness of eBPF program */ +int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e9f7334ed07a..3c726b0995b7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1 @@ -obj-y := core.o syscall.o +obj-y := core.o syscall.o verifier.o diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b513659d120f..74b3628c5fdb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -507,7 +507,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - /* err = bpf_check(prog, tb); */ + err = bpf_check(prog, attr); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c new file mode 100644 index 000000000000..d6f9c3d6b4d7 --- /dev/null +++ b/kernel/bpf/verifier.c @@ -0,0 +1,133 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* bpf_check() is a static code analyzer that walks eBPF program + * instruction by instruction and updates register/stack state. + * All paths of conditional branches are analyzed until 'bpf_exit' insn. + * + * The first pass is depth-first-search to check that the program is a DAG. 
+ * It rejects the following programs: + * - larger than BPF_MAXINSNS insns + * - if loop is present (detected via back-edge) + * - unreachable insns exist (shouldn't be a forest. program = one function) + * - out of bounds or malformed jumps + * The second pass is all possible path descent from the 1st insn. + * Since it's analyzing all pathes through the program, the length of the + * analysis is limited to 32k insn, which may be hit even if total number of + * insn is less then 4K, but there are too many branches that change stack/regs. + * Number of 'branches to be analyzed' is limited to 1k + * + * On entry to each instruction, each register has a type, and the instruction + * changes the types of the registers depending on instruction semantics. + * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is + * copied to R1. + * + * All registers are 64-bit. + * R0 - return register + * R1-R5 argument passing registers + * R6-R9 callee saved registers + * R10 - frame pointer read-only + * + * At the start of BPF program the register R1 contains a pointer to bpf_context + * and has type PTR_TO_CTX. + * + * Verifier tracks arithmetic operations on pointers in case: + * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), + * 1st insn copies R10 (which has FRAME_PTR) type into R1 + * and 2nd arithmetic instruction is pattern matched to recognize + * that it wants to construct a pointer to some element within stack. + * So after 2nd insn, the register R1 has type PTR_TO_STACK + * (and -20 constant is saved for further stack bounds checking). + * Meaning that this reg is a pointer to stack plus known immediate constant. + * + * Most of the time the registers have UNKNOWN_VALUE type, which + * means the register has some value, but it's not a valid pointer. 
+ * (like pointer plus pointer becomes UNKNOWN_VALUE type) + * + * When verifier sees load or store instructions the type of base register + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer + * types recognized by check_mem_access() function. + * + * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' + * and the range of [ptr, ptr + map's value_size) is accessible. + * + * registers used to pass values to function calls are checked against + * function argument constraints. + * + * ARG_PTR_TO_MAP_KEY is one of such argument constraints. + * It means that the register type passed to this function must be + * PTR_TO_STACK and it will be used inside the function as + * 'pointer to map element key' + * + * For example the argument constraints for bpf_map_lookup_elem(): + * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + * .arg1_type = ARG_CONST_MAP_PTR, + * .arg2_type = ARG_PTR_TO_MAP_KEY, + * + * ret_type says that this function returns 'pointer to map elem value or null' + * function expects 1st argument to be a const pointer to 'struct bpf_map' and + * 2nd argument should be a pointer to stack, which will be used inside + * the helper function as a pointer to map element key. + * + * On the kernel side the helper function looks like: + * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) + * { + * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + * void *key = (void *) (unsigned long) r2; + * void *value; + * + * here kernel can access 'key' and 'map' pointers safely, knowing that + * [key, key + map->key_size) bytes are valid and were initialized on + * the stack of eBPF program. 
+ * } + * + * Corresponding eBPF program may look like: + * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK + * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP + * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + * here verifier looks at prototype of map_lookup_elem() and sees: + * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, + * Now verifier knows that this map has key of R1->map_ptr->key_size bytes + * + * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, + * Now verifier checks that [R2, R2 + map's key_size) are within stack limits + * and were initialized prior to this call. + * If it's ok, then verifier allows this BPF_CALL insn and looks at + * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets + * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function + * returns either pointer to map value or NULL. + * + * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' + * insn, the register holding that pointer in the true branch changes state to + * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false + * branch. See check_cond_jmp_op(). + * + * After the call R0 is set to return type of the function and registers R1-R5 + * are set to NOT_INIT to indicate that they are no longer readable. + */ + +int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +{ + int ret = -EINVAL; + + return ret; +} -- cgit v1.2.3 From cbd357008604925355ae7b54a09137dabb81b580 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:03 -0700 Subject: bpf: verifier (add ability to receive verification log) add optional attributes for BPF_PROG_LOAD syscall: union bpf_attr { struct { ... 
__u32 log_level; /* verbosity level of eBPF verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied 'char *buffer' */ }; }; when log_level > 0 the verifier will return its verification log in the user supplied buffer 'log_buf' which can be used by program author to analyze why verifier rejected given program. 'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt provides several examples of these messages, like the program: BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), BPF_EXIT_INSN(), will be rejected with the following multi-line message in log_buf: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (b7) r1 = 0 4: (85) call 1 5: (15) if r0 == 0x0 goto pc+1 R0=map_ptr R10=fp 6: (7a) *(u64 *)(r0 +4) = 0 misaligned access off 4 size 8 The format of the output can change at any time as verifier evolves. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/uapi/linux/bpf.h | 3 + kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 239 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 424f442016e7..31b0ac208a52 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -138,6 +138,9 @@ union bpf_attr { __u32 insn_cnt; __aligned_u64 insns; __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 74b3628c5fdb..ba61c8c16032 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -458,7 +458,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD license +#define BPF_PROG_LOAD_LAST_FIELD log_buf static int bpf_prog_load(union bpf_attr *attr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6f9c3d6b4d7..871edc1f2e1f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,9 +125,244 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ +/* single container for all structs + * one verifier_env per bpf_check() call + */ +struct verifier_env { +}; + +/* verbose verifier prints what it's seeing + * bpf_check() is called under lock, so no race to access these global vars + */ +static u32 log_level, log_size, log_len; +static char *log_buf; + +static DEFINE_MUTEX(bpf_verifier_lock); + +/* log_level controls verbosity level of eBPF verifier. + * verbose() is used to dump the verification trace to the log, so the user + * can figure out what's wrong with the program + */ +static void verbose(const char *fmt, ...) 
+{ + va_list args; + + if (log_level == 0 || log_len >= log_size - 1) + return; + + va_start(args, fmt); + log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + va_end(args); +} + +static const char *const bpf_class_string[] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +static const char *const bpf_alu_string[] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JGE >> 4] = ">=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSGE >> 4] = "s>=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_insn(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_SRC(insn->code) == BPF_X) + verbose("(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + else + verbose("(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? 
"(u32) " : "", + insn->imm); + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose("BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_st_%02x\n", insn->code); + return; + } + verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_ldx_%02x\n", insn->code); + return; + } + verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose("(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM) { + verbose("(%02x) r%d = 0x%x\n", + insn->code, insn->dst_reg, insn->imm); + } else { + verbose("BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose("(%02x) call %d\n", insn->code, insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose("(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose("(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == 
BPF_X) { + verbose("(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + } +} + int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) { + char __user *log_ubuf = NULL; + struct verifier_env *env; int ret = -EINVAL; + if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + return -E2BIG; + + /* 'struct verifier_env' can be global, but since it's not small, + * allocate/free it every time bpf_check() is called + */ + env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + if (attr->log_level || attr->log_buf || attr->log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log_level = attr->log_level; + log_ubuf = (char __user *) (unsigned long) attr->log_buf; + log_size = attr->log_size; + log_len = 0; + + ret = -EINVAL; + /* log_* values have to be sane */ + if (log_size < 128 || log_size > UINT_MAX >> 8 || + log_level == 0 || log_ubuf == NULL) + goto free_env; + + ret = -ENOMEM; + log_buf = vmalloc(log_size); + if (!log_buf) + goto free_env; + } else { + log_level = 0; + } + + /* ret = do_check(env); */ + + if (log_level && log_len >= log_size - 1) { + BUG_ON(log_len >= log_size); + /* verifier log exceeded user supplied buffer */ + ret = -ENOSPC; + /* fall through to return what was recorded */ + } + + /* copy verifier log back to user space including trailing zero */ + if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + ret = -EFAULT; + goto free_log_buf; + } + + +free_log_buf: + if (log_level) + vfree(log_buf); +free_env: + kfree(env); + 
mutex_unlock(&bpf_verifier_lock); return ret; } -- cgit v1.2.3 From 0246e64d9a5fcd4805198de59b9b5cf1f974eb41 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:04 -0700 Subject: bpf: handle pseudo BPF_LD_IMM64 insn eBPF programs passed from userspace are using pseudo BPF_LD_IMM64 instructions to refer to process-local map_fd. Scan the program for such instructions and if FDs are valid, convert them to 'struct bpf_map' pointers which will be used by verifier to check access to maps in bpf_map_lookup/update() calls. If program passes verifier, convert pseudo BPF_LD_IMM64 into generic by dropping BPF_PSEUDO_MAP_FD flag. Note that eBPF interpreter is generic and knows nothing about pseudo insns. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 6 ++ kernel/bpf/verifier.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4ffc0958d85e..ca95abd2bed1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -145,6 +145,12 @@ struct bpf_prog_aux; .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) +#define BPF_PSEUDO_MAP_FD 1 + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 871edc1f2e1f..7227543e474b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,10 +125,15 @@ * are set to NOT_INIT to indicate that they are no longer readable. 
*/ +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + /* single container for all structs * one verifier_env per bpf_check() call */ struct verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ }; /* verbose verifier prints what it's seeing @@ -300,6 +305,115 @@ static void print_bpf_insn(struct bpf_insn *insn) } } +/* return the map pointer stored inside BPF_LD_IMM64 instruction */ +static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) +{ + u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; + + return (struct bpf_map *) (unsigned long) imm64; +} + +/* look for pseudo eBPF instructions that access map FDs and + * replace them with actual map pointers + */ +static int replace_map_fd_with_map_ptr(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_map *map; + struct fd f; + + if (i == insn_cnt - 1 || insn[1].code != 0 || + insn[1].dst_reg != 0 || insn[1].src_reg != 0 || + insn[1].off != 0) { + verbose("invalid bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + if (insn->src_reg == 0) + /* valid generic load 64-bit imm */ + goto next_insn; + + if (insn->src_reg != BPF_PSEUDO_MAP_FD) { + verbose("unrecognized bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + f = fdget(insn->imm); + + map = bpf_map_get(f); + if (IS_ERR(map)) { + verbose("fd %d is not pointing to valid bpf_map\n", + insn->imm); + fdput(f); + return PTR_ERR(map); + } + + /* store map pointer inside BPF_LD_IMM64 instruction */ + insn[0].imm = (u32) (unsigned long) map; + insn[1].imm = ((u64) (unsigned long) map) >> 32; + + /* check whether we recorded this map already */ + for (j = 0; j < env->used_map_cnt; j++) + if 
(env->used_maps[j] == map) { + fdput(f); + goto next_insn; + } + + if (env->used_map_cnt >= MAX_USED_MAPS) { + fdput(f); + return -E2BIG; + } + + /* remember this map */ + env->used_maps[env->used_map_cnt++] = map; + + /* hold the map. If the program is rejected by verifier, + * the map will be released by release_maps() or it + * will be used by the valid program until it's unloaded + * and all maps are released in free_bpf_prog_info() + */ + atomic_inc(&map->refcnt); + + fdput(f); +next_insn: + insn++; + i++; + } + } + + /* now all pseudo BPF_LD_IMM64 instructions load valid + * 'struct bpf_map *' into a register instead of user map_fd. + * These pointers will be used later by verifier to validate map access. + */ + return 0; +} + +/* drop refcnt of maps used by the rejected program */ +static void release_maps(struct verifier_env *env) +{ + int i; + + for (i = 0; i < env->used_map_cnt; i++) + bpf_map_put(env->used_maps[i]); +} + +/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ +static void convert_pseudo_ld_imm64(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) + if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) + insn->src_reg = 0; +} + int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; @@ -316,6 +430,8 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; + env->prog = prog; + /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -342,8 +458,14 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) log_level = 0; } + ret = replace_map_fd_with_map_ptr(env); + if (ret < 0) + goto skip_full_check; + /* ret = do_check(env); */ +skip_full_check: + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -357,11 +479,36 @@ int bpf_check(struct bpf_prog *prog, 
union bpf_attr *attr) goto free_log_buf; } + if (ret == 0 && env->used_map_cnt) { + /* if program passed verifier, update used_maps in bpf_prog_info */ + prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); + + if (!prog->aux->used_maps) { + ret = -ENOMEM; + goto free_log_buf; + } + + memcpy(prog->aux->used_maps, env->used_maps, + sizeof(env->used_maps[0]) * env->used_map_cnt); + prog->aux->used_map_cnt = env->used_map_cnt; + + /* program is valid. Convert pseudo bpf_ld_imm64 into generic + * bpf_ld_imm64 instructions + */ + convert_pseudo_ld_imm64(env); + } free_log_buf: if (log_level) vfree(log_buf); free_env: + if (!prog->aux->used_maps) + /* if we didn't copy map pointers into bpf_prog_info, release + * them now. Otherwise free_bpf_prog_info() will release them. + */ + release_maps(env); kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; -- cgit v1.2.3 From 475fb78fbf48592ce541627c60a7b331060e31f5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:05 -0700 Subject: bpf: verifier (add branch/goto checks) check that control flow graph of eBPF program is a directed acyclic graph check_cfg() does: - detect loops - detect unreachable instructions - check that program terminates with BPF_EXIT insn - check that all branches are within program boundary Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7227543e474b..c689ab8e2713 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -313,6 +313,191 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) return (struct bpf_map *) (unsigned long) imm64; } +/* non-recursive DFS pseudo code + * 1 procedure DFS-iterative(G,v): + * 2 label v as discovered + * 3 let S be a stack + * 4 S.push(v) + * 5 while S is not empty + * 6 t <- S.pop() + * 7 if t is what we're looking for: + * 8 return t + * 9 for all edges e in G.adjacentEdges(t) do + * 10 if edge e is already labelled + * 11 continue with the next edge + * 12 w <- G.adjacentVertex(t,e) + * 13 if vertex w is not discovered and not explored + * 14 label e as tree-edge + * 15 label w as discovered + * 16 S.push(w) + * 17 continue at 5 + * 18 else if vertex w is discovered + * 19 label e as back-edge + * 20 else + * 21 // vertex w is explored + * 22 label e as forward- or cross-edge + * 23 label t as explored + * 24 S.pop() + * + * convention: + * 0x10 - discovered + * 0x11 - discovered and fall-through edge labelled + * 0x12 - discovered and fall-through and branch edges labelled + * 0x20 - explored + */ + +enum { + DISCOVERED = 0x10, + EXPLORED = 0x20, + FALLTHROUGH = 1, + BRANCH = 2, +}; + +static int *insn_stack; /* stack of insns to process */ +static int cur_stack; /* current stack index */ +static int *insn_state; + +/* t, w, e - match pseudo-code above: + * t - index of current instruction + * w - next instruction + * e - edge + */ +static int push_insn(int t, int w, int e, struct verifier_env *env) +{ + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) + return 0; + + if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) + return 0; + + if (w < 0 || w >= env->prog->len) { + verbose("jump out of range from 
insn %d to %d\n", t, w); + return -EINVAL; + } + + if (insn_state[w] == 0) { + /* tree-edge */ + insn_state[t] = DISCOVERED | e; + insn_state[w] = DISCOVERED; + if (cur_stack >= env->prog->len) + return -E2BIG; + insn_stack[cur_stack++] = w; + return 1; + } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + verbose("back-edge from insn %d to %d\n", t, w); + return -EINVAL; + } else if (insn_state[w] == EXPLORED) { + /* forward- or cross-edge */ + insn_state[t] = DISCOVERED | e; + } else { + verbose("insn state internal bug\n"); + return -EFAULT; + } + return 0; +} + +/* non-recursive depth-first-search to detect loops in BPF program + * loop == back-edge in directed graph + */ +static int check_cfg(struct verifier_env *env) +{ + struct bpf_insn *insns = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret = 0; + int i, t; + + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_state) + return -ENOMEM; + + insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_stack) { + kfree(insn_state); + return -ENOMEM; + } + + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ + insn_stack[0] = 0; /* 0 is the first instruction */ + cur_stack = 1; + +peek_stack: + if (cur_stack == 0) + goto check_state; + t = insn_stack[cur_stack - 1]; + + if (BPF_CLASS(insns[t].code) == BPF_JMP) { + u8 opcode = BPF_OP(insns[t].code); + + if (opcode == BPF_EXIT) { + goto mark_explored; + } else if (opcode == BPF_CALL) { + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } else if (opcode == BPF_JA) { + if (BPF_SRC(insns[t].code) != BPF_K) { + ret = -EINVAL; + goto err_free; + } + /* unconditional jump with single edge */ + ret = push_insn(t, t + insns[t].off + 1, + FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } else { + /* conditional jump with two edges */ + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto 
peek_stack; + else if (ret < 0) + goto err_free; + + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } + } else { + /* all other non-branch instructions with single + * fall-through edge + */ + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } + +mark_explored: + insn_state[t] = EXPLORED; + if (cur_stack-- <= 0) { + verbose("pop stack internal bug\n"); + ret = -EFAULT; + goto err_free; + } + goto peek_stack; + +check_state: + for (i = 0; i < insn_cnt; i++) { + if (insn_state[i] != EXPLORED) { + verbose("unreachable insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + } + ret = 0; /* cfg looks good */ + +err_free: + kfree(insn_state); + kfree(insn_stack); + return ret; +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -462,6 +647,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + /* ret = do_check(env); */ skip_full_check: -- cgit v1.2.3 From 17a5267067f3c372fec9ffb798d6eaba6b5e6a4c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:06 -0700 Subject: bpf: verifier (add verifier core) This patch adds verifier core which simulates execution of every insn and records the state of registers and program stack. Every branch instruction seen during simulation is pushed into state stack. When verifier reaches BPF_EXIT, it pops the state from the stack and continues until it reaches BPF_EXIT again. For program: 1: bpf_mov r1, xxx 2: if (r1 == 0) goto 5 3: bpf_mov r0, 1 4: goto 6 5: bpf_mov r0, 2 6: bpf_exit The verifier will walk insns: 1, 2, 3, 4, 6 then it will pop the state recorded at insn#2 and will continue: 5, 6 This way it walks all possible paths through the program and checks all possible values of registers. 
While doing so, it checks for: - invalid instructions - uninitialized register access - uninitialized stack access - misaligned stack access - out of range stack access - invalid calling convention - instruction encoding is not using reserved fields Kernel subsystem configures the verifier with two callbacks: - bool (*is_valid_access)(int off, int size, enum bpf_access_type type); that provides information to the verifier which fields of 'ctx' are accessible (remember 'ctx' is the first argument to eBPF program) - const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); returns argument constraints of kernel helper functions that eBPF program may call, so that verifier can check that R1-R5 types match the prototype More details in Documentation/networking/filter.txt and in kernel/bpf/verifier.c Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 47 +++ kernel/bpf/verifier.c | 1075 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 1121 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9dfeb36f8971..3cf91754a957 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,6 +46,31 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); struct bpf_map *bpf_map_get(struct fd f); +/* function argument constraints */ +enum bpf_arg_type { + ARG_ANYTHING = 0, /* any argument is ok */ + + /* the following constraints used to prototype + * bpf_map_lookup/update/delete_elem() functions + */ + ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ + ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ + ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ + + /* the following constraints used to prototype bpf_memcmp() and other + * functions that access data on eBPF program stack + */ + ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ + ARG_CONST_STACK_SIZE, 
/* number of bytes accessed from stack */ +}; + +/* type of values returned from helper functions */ +enum bpf_return_type { + RET_INTEGER, /* function returns integer */ + RET_VOID, /* function doesn't return anything */ + RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ +}; + /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL * instructions after verifying @@ -53,11 +78,33 @@ struct bpf_map *bpf_map_get(struct fd f); struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; + enum bpf_return_type ret_type; + enum bpf_arg_type arg1_type; + enum bpf_arg_type arg2_type; + enum bpf_arg_type arg3_type; + enum bpf_arg_type arg4_type; + enum bpf_arg_type arg5_type; +}; + +/* bpf_context is intentionally undefined structure. Pointer to bpf_context is + * the first argument to eBPF programs. + * For socket filters: 'struct bpf_context *' == 'struct sk_buff *' + */ +struct bpf_context; + +enum bpf_access_type { + BPF_READ = 1, + BPF_WRITE = 2 }; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); + + /* return true if 'size' wide access at offset 'off' within bpf_context + * with 'type' (read or write) is allowed + */ + bool (*is_valid_access)(int off, int size, enum bpf_access_type type); }; struct bpf_prog_type_list { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c689ab8e2713..a086dd3210a8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,6 +125,70 @@ * are set to NOT_INIT to indicate that they are no longer readable. 
*/ +/* types of values stored in eBPF registers */ +enum bpf_reg_type { + NOT_INIT = 0, /* nothing was written into register */ + UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ + PTR_TO_CTX, /* reg points to bpf_context */ + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ + PTR_TO_MAP_VALUE, /* reg points to map element value */ + PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + FRAME_PTR, /* reg == frame_pointer */ + PTR_TO_STACK, /* reg == frame_pointer + imm */ + CONST_IMM, /* constant integer value */ +}; + +struct reg_state { + enum bpf_reg_type type; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK */ + int imm; + + /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | + * PTR_TO_MAP_VALUE_OR_NULL + */ + struct bpf_map *map_ptr; + }; +}; + +enum bpf_stack_slot_type { + STACK_INVALID, /* nothing was stored in this stack slot */ + STACK_SPILL, /* 1st byte of register spilled into stack */ + STACK_SPILL_PART, /* other 7 bytes of register spill */ + STACK_MISC /* BPF program wrote some data into this slot */ +}; + +struct bpf_stack_slot { + enum bpf_stack_slot_type stype; + struct reg_state reg_st; +}; + +/* state of the program: + * type of all registers and stack info + */ +struct verifier_state { + struct reg_state regs[MAX_BPF_REG]; + struct bpf_stack_slot stack[MAX_BPF_STACK]; +}; + +/* linked list of verifier states used to prune search */ +struct verifier_state_list { + struct verifier_state state; + struct verifier_state_list *next; +}; + +/* verifier_state + insn_idx are pushed to stack when branch is encountered */ +struct verifier_stack_elem { + /* verifier state is 'st' + * before processing instruction 'insn_idx' + * and after processing instruction 'prev_insn_idx' + */ + struct verifier_state st; + int insn_idx; + int prev_insn_idx; + struct verifier_stack_elem *next; +}; + #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ /* single container for all structs @@ -132,6 +196,9 
@@ */ struct verifier_env { struct bpf_prog *prog; /* eBPF program being verified */ + struct verifier_stack_elem *head; /* stack of verifier states to be processed */ + int stack_size; /* number of states to be processed */ + struct verifier_state cur_state; /* current verifier state */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ }; @@ -160,6 +227,45 @@ static void verbose(const char *fmt, ...) va_end(args); } +/* string representation of 'enum bpf_reg_type' */ +static const char * const reg_type_str[] = { + [NOT_INIT] = "?", + [UNKNOWN_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [FRAME_PTR] = "fp", + [PTR_TO_STACK] = "fp", + [CONST_IMM] = "imm", +}; + +static void print_verifier_state(struct verifier_env *env) +{ + enum bpf_reg_type t; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + t = env->cur_state.regs[i].type; + if (t == NOT_INIT) + continue; + verbose(" R%d=%s", i, reg_type_str[t]); + if (t == CONST_IMM || t == PTR_TO_STACK) + verbose("%d", env->cur_state.regs[i].imm); + else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || + t == PTR_TO_MAP_VALUE_OR_NULL) + verbose("(ks=%d,vs=%d)", + env->cur_state.regs[i].map_ptr->key_size, + env->cur_state.regs[i].map_ptr->value_size); + } + for (i = 0; i < MAX_BPF_STACK; i++) { + if (env->cur_state.stack[i].stype == STACK_SPILL) + verbose(" fp%d=%s", -MAX_BPF_STACK + i, + reg_type_str[env->cur_state.stack[i].reg_st.type]); + } + verbose("\n"); +} + static const char *const bpf_class_string[] = { [BPF_LD] = "ld", [BPF_LDX] = "ldx", @@ -305,6 +411,735 @@ static void print_bpf_insn(struct bpf_insn *insn) } } +static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +{ + struct verifier_stack_elem *elem; + int insn_idx; + + if (env->head == NULL) + return -1; + + memcpy(&env->cur_state, &env->head->st, 
sizeof(env->cur_state)); + insn_idx = env->head->insn_idx; + if (prev_insn_idx) + *prev_insn_idx = env->head->prev_insn_idx; + elem = env->head->next; + kfree(env->head); + env->head = elem; + env->stack_size--; + return insn_idx; +} + +static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, + int prev_insn_idx) +{ + struct verifier_stack_elem *elem; + + elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); + if (!elem) + goto err; + + memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + elem->insn_idx = insn_idx; + elem->prev_insn_idx = prev_insn_idx; + elem->next = env->head; + env->head = elem; + env->stack_size++; + if (env->stack_size > 1024) { + verbose("BPF program is too complex\n"); + goto err; + } + return &elem->st; +err: + /* pop all elements and return */ + while (pop_stack(env, NULL) >= 0); + return NULL; +} + +#define CALLER_SAVED_REGS 6 +static const int caller_saved[CALLER_SAVED_REGS] = { + BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 +}; + +static void init_reg_state(struct reg_state *regs) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + regs[i].type = NOT_INIT; + regs[i].imm = 0; + regs[i].map_ptr = NULL; + } + + /* frame pointer */ + regs[BPF_REG_FP].type = FRAME_PTR; + + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; +} + +static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + regs[regno].type = UNKNOWN_VALUE; + regs[regno].imm = 0; + regs[regno].map_ptr = NULL; +} + +enum reg_arg_type { + SRC_OP, /* register is used as source operand */ + DST_OP, /* register is used as destination operand */ + DST_OP_NO_MARK /* same as above, check only, don't mark */ +}; + +static int check_reg_arg(struct reg_state *regs, u32 regno, + enum reg_arg_type t) +{ + if (regno >= MAX_BPF_REG) { + verbose("R%d is invalid\n", regno); + return -EINVAL; + } + + if (t == SRC_OP) { + /* check whether register used as source operand 
can be read */ + if (regs[regno].type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + } else { + /* check whether register used as dest operand can be written to */ + if (regno == BPF_REG_FP) { + verbose("frame pointer is read only\n"); + return -EACCES; + } + if (t == DST_OP) + mark_reg_unknown_value(regs, regno); + } + return 0; +} + +static int bpf_size_to_bytes(int bpf_size) +{ + if (bpf_size == BPF_W) + return 4; + else if (bpf_size == BPF_H) + return 2; + else if (bpf_size == BPF_B) + return 1; + else if (bpf_size == BPF_DW) + return 8; + else + return -EINVAL; +} + +/* check_stack_read/write functions track spill/fill of registers, + * stack boundary and alignment are checked in check_mem_access() + */ +static int check_stack_write(struct verifier_state *state, int off, int size, + int value_regno) +{ + struct bpf_stack_slot *slot; + int i; + + if (value_regno >= 0 && + (state->regs[value_regno].type == PTR_TO_MAP_VALUE || + state->regs[value_regno].type == PTR_TO_STACK || + state->regs[value_regno].type == PTR_TO_CTX)) { + + /* register containing pointer is being spilled into stack */ + if (size != 8) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + + slot = &state->stack[MAX_BPF_STACK + off]; + slot->stype = STACK_SPILL; + /* save register state */ + slot->reg_st = state->regs[value_regno]; + for (i = 1; i < 8; i++) { + slot = &state->stack[MAX_BPF_STACK + off + i]; + slot->stype = STACK_SPILL_PART; + slot->reg_st.type = UNKNOWN_VALUE; + slot->reg_st.map_ptr = NULL; + } + } else { + + /* regular write of data into stack */ + for (i = 0; i < size; i++) { + slot = &state->stack[MAX_BPF_STACK + off + i]; + slot->stype = STACK_MISC; + slot->reg_st.type = UNKNOWN_VALUE; + slot->reg_st.map_ptr = NULL; + } + } + return 0; +} + +static int check_stack_read(struct verifier_state *state, int off, int size, + int value_regno) +{ + int i; + struct bpf_stack_slot *slot; + + slot = &state->stack[MAX_BPF_STACK + off]; + 
+ if (slot->stype == STACK_SPILL) { + if (size != 8) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + for (i = 1; i < 8; i++) { + if (state->stack[MAX_BPF_STACK + off + i].stype != + STACK_SPILL_PART) { + verbose("corrupted spill memory\n"); + return -EACCES; + } + } + + if (value_regno >= 0) + /* restore register state from stack */ + state->regs[value_regno] = slot->reg_st; + return 0; + } else { + for (i = 0; i < size; i++) { + if (state->stack[MAX_BPF_STACK + off + i].stype != + STACK_MISC) { + verbose("invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + } + if (value_regno >= 0) + /* have read misc data from the stack */ + mark_reg_unknown_value(state->regs, value_regno); + return 0; + } +} + +/* check read/write into map element returned by bpf_map_lookup_elem() */ +static int check_map_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + + if (off < 0 || off + size > map->value_size) { + verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + return 0; +} + +/* check access to 'struct bpf_context' fields */ +static int check_ctx_access(struct verifier_env *env, int off, int size, + enum bpf_access_type t) +{ + if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t)) + return 0; + + verbose("invalid bpf_context access off=%d size=%d\n", off, size); + return -EACCES; +} + +/* check whether memory at (regno + off) is accessible for t = (read | write) + * if t==write, value_regno is a register which value is stored into memory + * if t==read, value_regno is a register which will receive the value from memory + * if t==write && value_regno==-1, some unknown value is stored into memory + * if t==read && value_regno==-1, don't care what we read from memory + */ +static int check_mem_access(struct verifier_env *env, u32 
regno, int off, + int bpf_size, enum bpf_access_type t, + int value_regno) +{ + struct verifier_state *state = &env->cur_state; + int size, err = 0; + + size = bpf_size_to_bytes(bpf_size); + if (size < 0) + return size; + + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } + + if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + err = check_map_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == PTR_TO_CTX) { + err = check_ctx_access(env, off, size, t); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == FRAME_PTR) { + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose("invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + if (t == BPF_WRITE) + err = check_stack_write(state, off, size, value_regno); + else + err = check_stack_read(state, off, size, value_regno); + } else { + verbose("R%d invalid mem access '%s'\n", + regno, reg_type_str[state->regs[regno].type]); + return -EACCES; + } + return err; +} + +static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || + insn->imm != 0) { + verbose("BPF_XADD uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check whether atomic_add can read the memory */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, -1); + if (err) + return err; + + /* check whether atomic_add can write into the same memory */ + return check_mem_access(env, insn->dst_reg, 
insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1); +} + +/* when register 'regno' is passed into function that will read 'access_size' + * bytes from that pointer, make sure that it's within stack boundary + * and all elements of stack are initialized + */ +static int check_stack_boundary(struct verifier_env *env, + int regno, int access_size) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs; + int off, i; + + if (regs[regno].type != PTR_TO_STACK) + return -EACCES; + + off = regs[regno].imm; + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size <= 0) { + verbose("invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + return -EACCES; + } + + for (i = 0; i < access_size; i++) { + if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { + verbose("invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; + } + } + return 0; +} + +static int check_func_arg(struct verifier_env *env, u32 regno, + enum bpf_arg_type arg_type, struct bpf_map **mapp) +{ + struct reg_state *reg = env->cur_state.regs + regno; + enum bpf_reg_type expected_type; + int err = 0; + + if (arg_type == ARG_ANYTHING) + return 0; + + if (reg->type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + arg_type == ARG_PTR_TO_MAP_VALUE) { + expected_type = PTR_TO_STACK; + } else if (arg_type == ARG_CONST_STACK_SIZE) { + expected_type = CONST_IMM; + } else if (arg_type == ARG_CONST_MAP_PTR) { + expected_type = CONST_PTR_TO_MAP; + } else { + verbose("unsupported arg_type %d\n", arg_type); + return -EFAULT; + } + + if (reg->type != expected_type) { + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], reg_type_str[expected_type]); + return -EACCES; + } + + if (arg_type == ARG_CONST_MAP_PTR) { + /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + *mapp = 
reg->map_ptr; + + } else if (arg_type == ARG_PTR_TO_MAP_KEY) { + /* bpf_map_xxx(..., map_ptr, ..., key) call: + * check that [key, key + map->key_size) are within + * stack limits and initialized + */ + if (!*mapp) { + /* in function declaration map_ptr must come before + * map_key, so that it's verified and known before + * we have to check map_key here. Otherwise it means + * that kernel subsystem misconfigured verifier + */ + verbose("invalid map_ptr to access map->key\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->key_size); + + } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { + /* bpf_map_xxx(..., map_ptr, ..., value) call: + * check [value, value + map->value_size) validity + */ + if (!*mapp) { + /* kernel subsystem misconfigured verifier */ + verbose("invalid map_ptr to access map->value\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->value_size); + + } else if (arg_type == ARG_CONST_STACK_SIZE) { + /* bpf_xxx(..., buf, len) call will access 'len' bytes + * from stack pointer 'buf'. 
Check it + * note: regno == len, regno - 1 == buf + */ + if (regno == 0) { + /* kernel subsystem misconfigured verifier */ + verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno - 1, reg->imm); + } + + return err; +} + +static int check_call(struct verifier_env *env, int func_id) +{ + struct verifier_state *state = &env->cur_state; + const struct bpf_func_proto *fn = NULL; + struct reg_state *regs = state->regs; + struct bpf_map *map = NULL; + struct reg_state *reg; + int i, err; + + /* find function prototype */ + if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { + verbose("invalid func %d\n", func_id); + return -EINVAL; + } + + if (env->prog->aux->ops->get_func_proto) + fn = env->prog->aux->ops->get_func_proto(func_id); + + if (!fn) { + verbose("unknown func %d\n", func_id); + return -EINVAL; + } + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { + verbose("cannot call GPL only function from proprietary program\n"); + return -EINVAL; + } + + /* check args */ + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + if (err) + return err; + + /* reset caller saved regs */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* update return register */ + if (fn->ret_type == RET_INTEGER) { + regs[BPF_REG_0].type = UNKNOWN_VALUE; + } else if (fn->ret_type == RET_VOID) { + regs[BPF_REG_0].type = NOT_INIT; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + regs[BPF_REG_0].type = 
PTR_TO_MAP_VALUE_OR_NULL; + /* remember map_ptr, so that check_map_access() + * can check 'value_size' boundary of memory access + * to map element returned from bpf_map_lookup_elem() + */ + if (map == NULL) { + verbose("kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + regs[BPF_REG_0].map_ptr = map; + } else { + verbose("unknown return type %d of func %d\n", + fn->ret_type, func_id); + return -EINVAL; + } + return 0; +} + +/* check validity of 32-bit and 64-bit arithmetic operations */ +static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +{ + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode == BPF_END || opcode == BPF_NEG) { + if (opcode == BPF_NEG) { + if (BPF_SRC(insn->code) != 0 || + insn->src_reg != BPF_REG_0 || + insn->off != 0 || insn->imm != 0) { + verbose("BPF_NEG uses reserved fields\n"); + return -EINVAL; + } + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0 || + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + verbose("BPF_END uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + } else if (opcode == BPF_MOV) { + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (BPF_SRC(insn->code) == BPF_X) { + if (BPF_CLASS(insn->code) == BPF_ALU64) { + /* case: R1 = R2 + * copy register state to dest reg + */ + regs[insn->dst_reg] = 
regs[insn->src_reg]; + } else { + regs[insn->dst_reg].type = UNKNOWN_VALUE; + regs[insn->dst_reg].map_ptr = NULL; + } + } else { + /* case: R = imm + * remember the value we stored into this reg + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + + } else if (opcode > BPF_END) { + verbose("invalid BPF_ALU opcode %x\n", opcode); + return -EINVAL; + + } else { /* all other ALU ops: and, sub, xor, add, ... */ + + bool stack_relative = false; + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + if ((opcode == BPF_MOD || opcode == BPF_DIV) && + BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { + verbose("div by zero\n"); + return -EINVAL; + } + + /* pattern match 'bpf_add Rx, imm' instruction */ + if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && + regs[insn->dst_reg].type == FRAME_PTR && + BPF_SRC(insn->code) == BPF_K) + stack_relative = true; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (stack_relative) { + regs[insn->dst_reg].type = PTR_TO_STACK; + regs[insn->dst_reg].imm = insn->imm; + } + } + + return 0; +} + +static int check_cond_jmp_op(struct verifier_env *env, + struct bpf_insn *insn, int *insn_idx) +{ + struct reg_state *regs = env->cur_state.regs; + struct verifier_state *other_branch; + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode > BPF_EXIT) { + verbose("invalid BPF_JMP opcode %x\n", opcode); + return -EINVAL; + } + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0) { + 
verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* detect if R == 0 where R was initialized to zero earlier */ + if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE) && + regs[insn->dst_reg].type == CONST_IMM && + regs[insn->dst_reg].imm == insn->imm) { + if (opcode == BPF_JEQ) { + /* if (imm == imm) goto pc+off; + * only follow the goto, ignore fall-through + */ + *insn_idx += insn->off; + return 0; + } else { + /* if (imm != imm) goto pc+off; + * only follow fall-through branch, since + * that's where the program will go + */ + return 0; + } + } + + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + if (!other_branch) + return -EFAULT; + + /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + if (BPF_SRC(insn->code) == BPF_K && + insn->imm == 0 && (opcode == BPF_JEQ || + opcode == BPF_JNE) && + regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + if (opcode == BPF_JEQ) { + /* next fallthrough insn can access memory via + * this register + */ + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + /* branch target cannot access it, since reg == 0 */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = 0; + } else { + other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = 0; + } + } else if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE)) { + + if (opcode == BPF_JEQ) { + /* detect if (R == imm) goto + * and in the target state recognize that R = imm + */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + 
other_branch->regs[insn->dst_reg].imm = insn->imm; + } else { + /* detect if (R != imm) goto + * and in the fall-through state recognize that R = imm + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + } + if (log_level) + print_verifier_state(env); + return 0; +} + /* return the map pointer stored inside BPF_LD_IMM64 instruction */ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) { @@ -313,6 +1148,37 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) return (struct bpf_map *) (unsigned long) imm64; } +/* verify BPF_LD_IMM64 instruction */ +static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if (BPF_SIZE(insn->code) != BPF_DW) { + verbose("invalid BPF_LD_IMM insn\n"); + return -EINVAL; + } + if (insn->off != 0) { + verbose("BPF_LD_IMM64 uses reserved fields\n"); + return -EINVAL; + } + + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (insn->src_reg == 0) + /* generic move 64-bit immediate into a register */ + return 0; + + /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ + BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -498,6 +1364,212 @@ err_free: return ret; } +static int do_check(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct bpf_insn *insns = env->prog->insnsi; + struct reg_state *regs = state->regs; + int insn_cnt = env->prog->len; + int insn_idx, prev_insn_idx = 0; + int insn_processed = 0; + bool do_print_state = false; + + init_reg_state(regs); + insn_idx = 0; + for (;;) { + struct bpf_insn *insn; + u8 class; + int err; + + if (insn_idx >= insn_cnt) { + verbose("invalid insn idx %d insn_cnt 
%d\n", + insn_idx, insn_cnt); + return -EFAULT; + } + + insn = &insns[insn_idx]; + class = BPF_CLASS(insn->code); + + if (++insn_processed > 32768) { + verbose("BPF program is too large. Proccessed %d insn\n", + insn_processed); + return -E2BIG; + } + + if (log_level && do_print_state) { + verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); + print_verifier_state(env); + do_print_state = false; + } + + if (log_level) { + verbose("%d: ", insn_idx); + print_bpf_insn(insn); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(regs, insn); + if (err) + return err; + + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + /* check that memory (src_reg + off) is readable, + * the state of dst_reg will be updated by this func + */ + err = check_mem_access(env, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, + insn->dst_reg); + if (err) + return err; + + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_XADD) { + err = check_xadd(env, insn); + if (err) + return err; + insn_idx++; + continue; + } + + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + insn->src_reg); + if (err) + return err; + + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->src_reg != BPF_REG_0) { + 
verbose("BPF_ST uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + -1); + if (err) + return err; + + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + if (BPF_SRC(insn->code) != BPF_K || + insn->off != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + + err = check_call(env, insn->imm); + if (err) + return err; + + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_JA uses reserved fields\n"); + return -EINVAL; + } + + insn_idx += insn->off + 1; + continue; + + } else if (opcode == BPF_EXIT) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + + /* eBPF calling convention is such that R0 is used + * to return the value from eBPF program. 
+ * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(regs, BPF_REG_0, SRC_OP); + if (err) + return err; + + insn_idx = pop_stack(env, &prev_insn_idx); + if (insn_idx < 0) { + break; + } else { + do_print_state = true; + continue; + } + } else { + err = check_cond_jmp_op(env, insn, &insn_idx); + if (err) + return err; + } + } else if (class == BPF_LD) { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + verbose("LD_ABS is not supported yet\n"); + return -EINVAL; + } else if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; + + insn_idx++; + } else { + verbose("invalid BPF_LD mode\n"); + return -EINVAL; + } + } else { + verbose("unknown insn class %d\n", class); + return -EINVAL; + } + + insn_idx++; + } + + return 0; +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -651,9 +1723,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - /* ret = do_check(env); */ + ret = do_check(env); skip_full_check: + while (pop_stack(env, NULL) >= 0); if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); -- cgit v1.2.3 From 3c731eba48e1b0650decfc91a839b80f0e05ce8f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:07 -0700 Subject: bpf: mini eBPF library, test stubs and verifier testsuite 1. 
the library includes a trivial set of BPF syscall wrappers: int bpf_create_map(int key_size, int value_size, int max_entries); int bpf_update_elem(int fd, void *key, void *value); int bpf_lookup_elem(int fd, void *key, void *value); int bpf_delete_elem(int fd, void *key); int bpf_get_next_key(int fd, void *key, void *next_key); int bpf_prog_load(enum bpf_prog_type prog_type, const struct sock_filter_int *insns, int insn_len, const char *license); bpf_prog_load() stores verifier log into global bpf_log_buf[] array and BPF_*() macros to build instructions 2. test stubs configure eBPF infra with 'unspec' map and program types. These are fake types used by user space testsuite only. 3. verifier tests valid and invalid programs and expects predefined error log messages from kernel. 40 tests so far. $ sudo ./test_verifier #0 add+sub+mul OK #1 unreachable OK #2 unreachable2 OK #3 out of range jump OK #4 out of range jump2 OK #5 test1 ld_imm64 OK ... Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/Makefile | 4 + kernel/bpf/test_stub.c | 116 ++++++++++ lib/Kconfig.debug | 3 +- samples/bpf/Makefile | 12 + samples/bpf/libbpf.c | 94 ++++++++ samples/bpf/libbpf.h | 172 ++++++++++++++ samples/bpf/test_verifier.c | 548 ++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 948 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/test_stub.c create mode 100644 samples/bpf/Makefile create mode 100644 samples/bpf/libbpf.c create mode 100644 samples/bpf/libbpf.h create mode 100644 samples/bpf/test_verifier.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3c726b0995b7..45427239f375 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1,5 @@ obj-y := core.o syscall.o verifier.o + +ifdef CONFIG_TEST_BPF +obj-y += test_stub.o +endif diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c new file mode 100644 index 000000000000..fcaddff4003e --- /dev/null +++ b/kernel/bpf/test_stub.c @@ -0,0 +1,116 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include + +/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC + * to be used by user space verifier testsuite + */ +struct bpf_context { + u64 arg1; + u64 arg2; +}; + +static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + +static struct bpf_func_proto test_funcs[] = { + [BPF_FUNC_unspec] = { + .func = test_func, + .gpl_only = true, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + }, +}; + +static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) +{ + if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) + return NULL; + return &test_funcs[func_id]; +} + +static const struct bpf_context_access { + int size; + enum bpf_access_type type; +} test_ctx_access[] = { + [offsetof(struct bpf_context, arg1)] = { + FIELD_SIZEOF(struct bpf_context, arg1), + BPF_READ + }, + [offsetof(struct bpf_context, arg2)] = { + FIELD_SIZEOF(struct bpf_context, arg2), + BPF_READ + }, +}; + +static bool test_is_valid_access(int off, int size, enum bpf_access_type type) +{ + const struct bpf_context_access *access; + + if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) + return false; + + access = &test_ctx_access[off]; + if (access->size == size && (access->type & type)) + return true; + + return false; +} + +static struct bpf_verifier_ops test_ops = { + .get_func_proto = test_func_proto, + .is_valid_access = test_is_valid_access, +}; + +static struct bpf_prog_type_list tl_prog = { + .ops = &test_ops, + .type = BPF_PROG_TYPE_UNSPEC, +}; + +static struct bpf_map *test_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + map = kzalloc(sizeof(*map), GFP_USER); + if (!map) + return ERR_PTR(-ENOMEM); + + map->key_size = attr->key_size; + map->value_size = attr->value_size; + map->max_entries = attr->max_entries; + return map; +} + +static void test_map_free(struct bpf_map *map) +{ + kfree(map); +} + +static struct 
bpf_map_ops test_map_ops = { + .map_alloc = test_map_alloc, + .map_free = test_map_free, +}; + +static struct bpf_map_type_list tl_map = { + .ops = &test_map_ops, + .type = BPF_MAP_TYPE_UNSPEC, +}; + +static int __init register_test_ops(void) +{ + bpf_register_map_type(&tl_map); + bpf_register_prog_type(&tl_prog); + return 0; +} +late_initcall(register_test_ops); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a28590083622..3ac43f34437b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1672,7 +1672,8 @@ config TEST_BPF against the BPF interpreter or BPF JIT compiler depending on the current setting. This is in particular useful for BPF JIT compiler development, but also to run regression tests against changes in - the interpreter code. + the interpreter code. It also enables test stubs for eBPF maps and + verifier used by user space verifier testsuite. If unsure, say N. diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile new file mode 100644 index 000000000000..634391797856 --- /dev/null +++ b/samples/bpf/Makefile @@ -0,0 +1,12 @@ +# kbuild trick to avoid linker error. Can be omitted if a module is built. 
+obj- := dummy.o + +# List of programs to build +hostprogs-y := test_verifier + +test_verifier-objs := test_verifier.o libbpf.o + +# Tell kbuild to always build the programs +always := $(hostprogs-y) + +HOSTCFLAGS += -I$(objtree)/usr/include diff --git a/samples/bpf/libbpf.c b/samples/bpf/libbpf.c new file mode 100644 index 000000000000..ff6504420738 --- /dev/null +++ b/samples/bpf/libbpf.c @@ -0,0 +1,94 @@ +/* eBPF mini library */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" + +static __u64 ptr_to_u64(void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, + int max_entries) +{ + union bpf_attr attr = { + .map_type = map_type, + .key_size = key_size, + .value_size = value_size, + .max_entries = max_entries + }; + + return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); +} + +int bpf_update_elem(int fd, void *key, void *value) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + .value = ptr_to_u64(value), + }; + + return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); +} + +int bpf_lookup_elem(int fd, void *key, void *value) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + .value = ptr_to_u64(value), + }; + + return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); +} + +int bpf_delete_elem(int fd, void *key) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + }; + + return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); +} + +int bpf_get_next_key(int fd, void *key, void *next_key) +{ + union bpf_attr attr = { + .map_fd = fd, + .key = ptr_to_u64(key), + .next_key = ptr_to_u64(next_key), + }; + + return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); +} + +#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u)) + +char bpf_log_buf[LOG_BUF_SIZE]; + +int bpf_prog_load(enum bpf_prog_type prog_type, + const struct 
bpf_insn *insns, int prog_len, + const char *license) +{ + union bpf_attr attr = { + .prog_type = prog_type, + .insns = ptr_to_u64((void *) insns), + .insn_cnt = prog_len / sizeof(struct bpf_insn), + .license = ptr_to_u64((void *) license), + .log_buf = ptr_to_u64(bpf_log_buf), + .log_size = LOG_BUF_SIZE, + .log_level = 1, + }; + + bpf_log_buf[0] = 0; + + return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); +} diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h new file mode 100644 index 000000000000..8a31babeca5d --- /dev/null +++ b/samples/bpf/libbpf.h @@ -0,0 +1,172 @@ +/* eBPF mini library */ +#ifndef __LIBBPF_H +#define __LIBBPF_H + +struct bpf_insn; + +int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, + int max_entries); +int bpf_update_elem(int fd, void *key, void *value); +int bpf_lookup_elem(int fd, void *key, void *value); +int bpf_delete_elem(int fd, void *key); +int bpf_get_next_key(int fd, void *key, void *next_key); + +int bpf_prog_load(enum bpf_prog_type prog_type, + const struct bpf_insn *insns, int insn_len, + const char *license); + +#define LOG_BUF_SIZE 8192 +extern char bpf_log_buf[LOG_BUF_SIZE]; + +/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ + +#define BPF_ALU64_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_ALU32_REG(OP, DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ 
+ .imm = IMM }) + +/* Short form of mov, dst_reg = src_reg */ + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +/* Short form of mov, dst_reg = imm32 */ + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ +#define BPF_LD_IMM64(DST, IMM) \ + BPF_LD_IMM64_RAW(DST, 0, IMM) + +#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = (__u32) (IMM) }), \ + ((struct bpf_insn) { \ + .code = 0, /* zero is reserved opcode */ \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((__u64) (IMM)) >> 32 }) + +#define BPF_PSEUDO_MAP_FD 1 + +/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) + + +/* Memory load, dst_reg = *(uint *) (src_reg + off16) */ + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = src_reg */ + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Memory store, *(uint *) (dst_reg + off16) = imm32 */ + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ + +#define BPF_JMP_REG(OP, DST, SRC, 
OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +/* Raw code statement block */ + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +/* Program exit */ + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#endif diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c new file mode 100644 index 000000000000..d10992e2740e --- /dev/null +++ b/samples/bpf/test_verifier.c @@ -0,0 +1,548 @@ +/* + * Testsuite for eBPF verifier + * + * Copyright (c) 2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" + +#define MAX_INSNS 512 +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + +struct bpf_test { + const char *descr; + struct bpf_insn insns[MAX_INSNS]; + int fixup[32]; + const char *errstr; + enum { + ACCEPT, + REJECT + } result; +}; + +static struct bpf_test tests[] = { + { + "add+sub+mul", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 2), + BPF_MOV64_IMM(BPF_REG_2, 3), + BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -1), + BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 3), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "unreachable", + .insns = { + BPF_EXIT_INSN(), + BPF_EXIT_INSN(), + }, + .errstr = "unreachable", + .result = REJECT, + }, + { + "unreachable2", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_JMP_IMM(BPF_JA, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "unreachable", + .result = REJECT, + }, + { + "out of range jump", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_EXIT_INSN(), + }, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "out of range jump2", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, -2), + BPF_EXIT_INSN(), + }, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "test1 ld_imm64", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_LD_IMM insn", + .result = REJECT, + }, + { + "test2 ld_imm64", + .insns = { + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_LD_IMM insn", + .result = REJECT, + }, + { + "test3 ld_imm64", + .insns = { + BPF_JMP_IMM(BPF_JEQ, 
BPF_REG_1, 0, 1), + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 0), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_ld_imm64 insn", + .result = REJECT, + }, + { + "test4 ld_imm64", + .insns = { + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_ld_imm64 insn", + .result = REJECT, + }, + { + "test5 ld_imm64", + .insns = { + BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0), + }, + .errstr = "invalid bpf_ld_imm64 insn", + .result = REJECT, + }, + { + "no bpf_exit", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2), + }, + .errstr = "jump out of range", + .result = REJECT, + }, + { + "loop (back-edge)", + .insns = { + BPF_JMP_IMM(BPF_JA, 0, 0, -1), + BPF_EXIT_INSN(), + }, + .errstr = "back-edge", + .result = REJECT, + }, + { + "loop2 (back-edge)", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_JMP_IMM(BPF_JA, 0, 0, -4), + BPF_EXIT_INSN(), + }, + .errstr = "back-edge", + .result = REJECT, + }, + { + "conditional loop", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3), + BPF_EXIT_INSN(), + }, + .errstr = "back-edge", + .result = REJECT, + }, + { + "read uninitialized register", + .insns = { + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "read invalid register", + .insns = { + BPF_MOV64_REG(BPF_REG_0, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R15 is invalid", + .result = REJECT, + }, + { + "program doesn't init R0 before exit", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R0 !read_ok", + .result = REJECT, + }, + { + "stack out of bounds", + .insns = { 
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid stack", + .result = REJECT, + }, + { + "invalid call insn1", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_CALL uses reserved", + .result = REJECT, + }, + { + "invalid call insn2", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 1, 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_CALL uses reserved", + .result = REJECT, + }, + { + "invalid function call", + .insns = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 1234567), + BPF_EXIT_INSN(), + }, + .errstr = "invalid func 1234567", + .result = REJECT, + }, + { + "uninitialized stack1", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_EXIT_INSN(), + }, + .fixup = {2}, + .errstr = "invalid indirect read from stack", + .result = REJECT, + }, + { + "uninitialized stack2", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -8), + BPF_EXIT_INSN(), + }, + .errstr = "invalid read from stack", + .result = REJECT, + }, + { + "check valid spill/fill", + .insns = { + /* spill R1(ctx) into stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + + /* fill it back into R2 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8), + + /* should be able to access R0 = *(R2 + 8) */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "check corrupted spill/fill", + .insns = { + /* spill R1(ctx) into stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + + /* mess up with R1 pointer on stack */ + BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23), + + /* fill back into R0 should fail */ + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), + + BPF_EXIT_INSN(), + }, + .errstr = "corrupted spill", + .result = REJECT, + }, + { + "invalid src register in STX", + 
.insns = { + BPF_STX_MEM(BPF_B, BPF_REG_10, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R15 is invalid", + .result = REJECT, + }, + { + "invalid dst register in STX", + .insns = { + BPF_STX_MEM(BPF_B, 14, BPF_REG_10, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R14 is invalid", + .result = REJECT, + }, + { + "invalid dst register in ST", + .insns = { + BPF_ST_MEM(BPF_B, 14, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "R14 is invalid", + .result = REJECT, + }, + { + "invalid src register in LDX", + .insns = { + BPF_LDX_MEM(BPF_B, BPF_REG_0, 12, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R12 is invalid", + .result = REJECT, + }, + { + "invalid dst register in LDX", + .insns = { + BPF_LDX_MEM(BPF_B, 11, BPF_REG_1, 0), + BPF_EXIT_INSN(), + }, + .errstr = "R11 is invalid", + .result = REJECT, + }, + { + "junk insn", + .insns = { + BPF_RAW_INSN(0, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_LD_IMM", + .result = REJECT, + }, + { + "junk insn2", + .insns = { + BPF_RAW_INSN(1, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_LDX uses reserved fields", + .result = REJECT, + }, + { + "junk insn3", + .insns = { + BPF_RAW_INSN(-1, 0, 0, 0, 0), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_ALU opcode f0", + .result = REJECT, + }, + { + "junk insn4", + .insns = { + BPF_RAW_INSN(-1, -1, -1, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "invalid BPF_ALU opcode f0", + .result = REJECT, + }, + { + "junk insn5", + .insns = { + BPF_RAW_INSN(0x7f, -1, -1, -1, -1), + BPF_EXIT_INSN(), + }, + .errstr = "BPF_ALU uses reserved fields", + .result = REJECT, + }, + { + "misaligned read from stack", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -4), + BPF_EXIT_INSN(), + }, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "invalid map_fd for function call", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + 
BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_EXIT_INSN(), + }, + .errstr = "fd 0 is not pointing to valid bpf_map", + .result = REJECT, + }, + { + "don't check return value before access", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup = {3}, + .errstr = "R0 invalid mem access 'map_value_or_null'", + .result = REJECT, + }, + { + "access memory with incorrect alignment", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), + BPF_EXIT_INSN(), + }, + .fixup = {3}, + .errstr = "misaligned access", + .result = REJECT, + }, + { + "sometimes access memory with incorrect alignment", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_unspec), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), + BPF_EXIT_INSN(), + BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + }, + .fixup = {3}, + .errstr = "R0 invalid mem access", + .result = REJECT, + }, +}; + +static int probe_filter_length(struct bpf_insn *fp) +{ + int len = 0; + + for (len = MAX_INSNS - 1; len > 0; --len) + if (fp[len].code != 0 || fp[len].imm != 0) + break; + + return len + 1; +} + +static int create_map(void) +{ + long long key, value = 0; + int map_fd; + + map_fd = bpf_create_map(BPF_MAP_TYPE_UNSPEC, sizeof(key), sizeof(value), 1024); + if (map_fd < 0) { + 
printf("failed to create map '%s'\n", strerror(errno)); + } + + return map_fd; +} + +static int test(void) +{ + int prog_fd, i; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + struct bpf_insn *prog = tests[i].insns; + int prog_len = probe_filter_length(prog); + int *fixup = tests[i].fixup; + int map_fd = -1; + + if (*fixup) { + map_fd = create_map(); + + do { + prog[*fixup].imm = map_fd; + fixup++; + } while (*fixup); + } + printf("#%d %s ", i, tests[i].descr); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog, + prog_len * sizeof(struct bpf_insn), + "GPL"); + + if (tests[i].result == ACCEPT) { + if (prog_fd < 0) { + printf("FAIL\nfailed to load prog '%s'\n", + strerror(errno)); + printf("%s", bpf_log_buf); + goto fail; + } + } else { + if (prog_fd >= 0) { + printf("FAIL\nunexpected success to load\n"); + printf("%s", bpf_log_buf); + goto fail; + } + if (strstr(bpf_log_buf, tests[i].errstr) == 0) { + printf("FAIL\nunexpected error message: %s", + bpf_log_buf); + goto fail; + } + } + + printf("OK\n"); +fail: + if (map_fd >= 0) + close(map_fd); + close(prog_fd); + + } + + return 0; +} + +int main(void) +{ + return test(); +} -- cgit v1.2.3 From 7bced397510ab569d31de4c70b39e13355046387 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 30 Dec 2013 12:37:29 -0800 Subject: net_dma: simple removal Per commit "77873803363c net_dma: mark broken" net_dma is no longer used and there is no plan to fix it. This is the mechanical removal of bits in CONFIG_NET_DMA ifdef guards. Reverting the remainder of the net_dma induced changes is deferred to subsequent patches. Marked for stable due to Roman's report of a memory leak in dma_pin_iovec_pages(): https://lkml.org/lkml/2014/9/3/177 Cc: Dave Jiang Cc: Vinod Koul Cc: David Whipple Cc: Alexander Duyck Cc: Reported-by: Roman Gushchin Acked-by: David S. 
Miller Signed-off-by: Dan Williams --- Documentation/ABI/removed/net_dma | 8 + Documentation/networking/ip-sysctl.txt | 6 - drivers/dma/Kconfig | 12 -- drivers/dma/Makefile | 1 - drivers/dma/dmaengine.c | 104 ------------ drivers/dma/ioat/dma.c | 1 - drivers/dma/ioat/dma.h | 7 - drivers/dma/ioat/dma_v2.c | 1 - drivers/dma/ioat/dma_v3.c | 1 - drivers/dma/iovlock.c | 280 --------------------------------- include/linux/dmaengine.h | 22 +-- include/linux/skbuff.h | 8 +- include/linux/tcp.h | 8 - include/net/netdma.h | 32 ---- include/net/sock.h | 19 +-- include/net/tcp.h | 8 - kernel/sysctl_binary.c | 1 - net/core/Makefile | 1 - net/core/dev.c | 10 -- net/core/sock.c | 6 - net/core/user_dma.c | 131 --------------- net/dccp/proto.c | 4 +- net/ipv4/sysctl_net_ipv4.c | 9 -- net/ipv4/tcp.c | 147 ++--------------- net/ipv4/tcp_input.c | 61 ------- net/ipv4/tcp_ipv4.c | 18 +-- net/ipv6/tcp_ipv6.c | 13 +- net/llc/af_llc.c | 10 +- 28 files changed, 35 insertions(+), 894 deletions(-) create mode 100644 Documentation/ABI/removed/net_dma delete mode 100644 drivers/dma/iovlock.c delete mode 100644 include/net/netdma.h delete mode 100644 net/core/user_dma.c (limited to 'kernel') diff --git a/Documentation/ABI/removed/net_dma b/Documentation/ABI/removed/net_dma new file mode 100644 index 000000000000..a173aecc2f18 --- /dev/null +++ b/Documentation/ABI/removed/net_dma @@ -0,0 +1,8 @@ +What: tcp_dma_copybreak sysctl +Date: Removed in kernel v3.13 +Contact: Dan Williams +Description: + Formerly the lower limit, in bytes, of the size of socket reads + that will be offloaded to a DMA copy engine. Removed due to + coherency issues of the cpu potentially touching the buffers + while dma is in flight. 
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ab42c95f9985..ea8f3b182e70 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -582,12 +582,6 @@ tcp_workaround_signed_windows - BOOLEAN not receive a window scaling option from them. Default: 0 -tcp_dma_copybreak - INTEGER - Lower limit, in bytes, of the size of socket reads that will be - offloaded to a DMA copy engine, if one is present in the system - and CONFIG_NET_DMA is enabled. - Default: 4096 - tcp_thin_linear_timeouts - BOOLEAN Enable dynamic triggering of linear timeouts for thin streams. If set, a check is performed upon retransmission by timeout to diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 605b016bcea4..6b5f37e01a70 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -368,18 +368,6 @@ config DMA_OF comment "DMA Clients" depends on DMA_ENGINE -config NET_DMA - bool "Network: TCP receive copy offload" - depends on DMA_ENGINE && NET - default (INTEL_IOATDMA || FSL_DMA) - depends on BROKEN - help - This enables the use of DMA engines in the network stack to - offload receive copy-to-user operations, freeing CPU cycles. - - Say Y here if you enabled INTEL_IOATDMA or FSL_DMA, otherwise - say N. 
- config ASYNC_TX_DMA bool "Async_tx: Offload support for the async_tx api" depends on DMA_ENGINE diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index a029d0f4a1be..0c9dc7549327 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -6,7 +6,6 @@ obj-$(CONFIG_DMA_VIRTUAL_CHANNELS) += virt-dma.o obj-$(CONFIG_DMA_ACPI) += acpi-dma.o obj-$(CONFIG_DMA_OF) += of-dma.o -obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_MID_DMAC) += intel_mid_dma.o obj-$(CONFIG_DMATEST) += dmatest.o obj-$(CONFIG_INTEL_IOATDMA) += ioat/ diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index ed610b497518..268de183b519 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -1084,110 +1084,6 @@ dmaengine_get_unmap_data(struct device *dev, int nr, gfp_t flags) } EXPORT_SYMBOL(dmaengine_get_unmap_data); -/** - * dma_async_memcpy_pg_to_pg - offloaded copy from page to page - * @chan: DMA channel to offload copy to - * @dest_pg: destination page - * @dest_off: offset in page to copy to - * @src_pg: source page - * @src_off: offset in page to copy from - * @len: length - * - * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus - * address according to the DMA mapping API rules for streaming mappings. - * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident - * (kernel memory or locked user space pages). 
- */ -dma_cookie_t -dma_async_memcpy_pg_to_pg(struct dma_chan *chan, struct page *dest_pg, - unsigned int dest_off, struct page *src_pg, unsigned int src_off, - size_t len) -{ - struct dma_device *dev = chan->device; - struct dma_async_tx_descriptor *tx; - struct dmaengine_unmap_data *unmap; - dma_cookie_t cookie; - unsigned long flags; - - unmap = dmaengine_get_unmap_data(dev->dev, 2, GFP_NOWAIT); - if (!unmap) - return -ENOMEM; - - unmap->to_cnt = 1; - unmap->from_cnt = 1; - unmap->addr[0] = dma_map_page(dev->dev, src_pg, src_off, len, - DMA_TO_DEVICE); - unmap->addr[1] = dma_map_page(dev->dev, dest_pg, dest_off, len, - DMA_FROM_DEVICE); - unmap->len = len; - flags = DMA_CTRL_ACK; - tx = dev->device_prep_dma_memcpy(chan, unmap->addr[1], unmap->addr[0], - len, flags); - - if (!tx) { - dmaengine_unmap_put(unmap); - return -ENOMEM; - } - - dma_set_unmap(tx, unmap); - cookie = tx->tx_submit(tx); - dmaengine_unmap_put(unmap); - - preempt_disable(); - __this_cpu_add(chan->local->bytes_transferred, len); - __this_cpu_inc(chan->local->memcpy_count); - preempt_enable(); - - return cookie; -} -EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); - -/** - * dma_async_memcpy_buf_to_buf - offloaded copy between virtual addresses - * @chan: DMA channel to offload copy to - * @dest: destination address (virtual) - * @src: source address (virtual) - * @len: length - * - * Both @dest and @src must be mappable to a bus address according to the - * DMA mapping API rules for streaming mappings. - * Both @dest and @src must stay memory resident (kernel memory or locked - * user space pages). 
- */ -dma_cookie_t -dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, - void *src, size_t len) -{ - return dma_async_memcpy_pg_to_pg(chan, virt_to_page(dest), - (unsigned long) dest & ~PAGE_MASK, - virt_to_page(src), - (unsigned long) src & ~PAGE_MASK, len); -} -EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); - -/** - * dma_async_memcpy_buf_to_pg - offloaded copy from address to page - * @chan: DMA channel to offload copy to - * @page: destination page - * @offset: offset in page to copy to - * @kdata: source address (virtual) - * @len: length - * - * Both @page/@offset and @kdata must be mappable to a bus address according - * to the DMA mapping API rules for streaming mappings. - * Both @page/@offset and @kdata must stay memory resident (kernel memory or - * locked user space pages) - */ -dma_cookie_t -dma_async_memcpy_buf_to_pg(struct dma_chan *chan, struct page *page, - unsigned int offset, void *kdata, size_t len) -{ - return dma_async_memcpy_pg_to_pg(chan, page, offset, - virt_to_page(kdata), - (unsigned long) kdata & ~PAGE_MASK, len); -} -EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); - void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, struct dma_chan *chan) { diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index b76c1485933b..940c1502a8b5 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -1222,7 +1222,6 @@ int ioat1_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(4096); err = ioat_register(device); if (err) return err; diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index e982f00a9843..d63f68b1aa35 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -214,13 +214,6 @@ __dump_desc_dbg(struct ioat_chan_common *chan, struct ioat_dma_descriptor *hw, #define dump_desc_dbg(c, d) \ ({ if (d) __dump_desc_dbg(&c->base, d->hw, &d->txd, desc_id(d)); 0; }) -static inline void ioat_set_tcp_copy_break(unsigned long 
copybreak) -{ - #ifdef CONFIG_NET_DMA - sysctl_tcp_dma_copybreak = copybreak; - #endif -} - static inline struct ioat_chan_common * ioat_chan_by_index(struct ioatdma_device *device, int index) { diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index 2ce9be498608..695483e6be32 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -900,7 +900,6 @@ int ioat2_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(2048); list_for_each_entry(c, &dma->channels, device_node) { chan = to_chan_common(c); diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c index 85971d6e9646..895f869d6c2c 100644 --- a/drivers/dma/ioat/dma_v3.c +++ b/drivers/dma/ioat/dma_v3.c @@ -1655,7 +1655,6 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca) err = ioat_probe(device); if (err) return err; - ioat_set_tcp_copy_break(262144); list_for_each_entry(c, &dma->channels, device_node) { chan = to_chan_common(c); diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c deleted file mode 100644 index bb48a57c2fc1..000000000000 --- a/drivers/dma/iovlock.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
- * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include -#include -#include -#include /* for memcpy_toiovec */ -#include -#include - -static int num_pages_spanned(struct iovec *iov) -{ - return - ((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - - ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT); -} - -/* - * Pin down all the iovec pages needed for len bytes. - * Return a struct dma_pinned_list to keep track of pages pinned down. - * - * We are allocating a single chunk of memory, and then carving it up into - * 3 sections, the latter 2 whose size depends on the number of iovecs and the - * total number of pages, respectively. 
- */ -struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len) -{ - struct dma_pinned_list *local_list; - struct page **pages; - int i; - int ret; - int nr_iovecs = 0; - int iovec_len_used = 0; - int iovec_pages_used = 0; - - /* don't pin down non-user-based iovecs */ - if (segment_eq(get_fs(), KERNEL_DS)) - return NULL; - - /* determine how many iovecs/pages there are, up front */ - do { - iovec_len_used += iov[nr_iovecs].iov_len; - iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]); - nr_iovecs++; - } while (iovec_len_used < len); - - /* single kmalloc for pinned list, page_list[], and the page arrays */ - local_list = kmalloc(sizeof(*local_list) - + (nr_iovecs * sizeof (struct dma_page_list)) - + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL); - if (!local_list) - goto out; - - /* list of pages starts right after the page list array */ - pages = (struct page **) &local_list->page_list[nr_iovecs]; - - local_list->nr_iovecs = 0; - - for (i = 0; i < nr_iovecs; i++) { - struct dma_page_list *page_list = &local_list->page_list[i]; - - len -= iov[i].iov_len; - - if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) - goto unpin; - - page_list->nr_pages = num_pages_spanned(&iov[i]); - page_list->base_address = iov[i].iov_base; - - page_list->pages = pages; - pages += page_list->nr_pages; - - /* pin pages down */ - down_read(&current->mm->mmap_sem); - ret = get_user_pages( - current, - current->mm, - (unsigned long) iov[i].iov_base, - page_list->nr_pages, - 1, /* write */ - 0, /* force */ - page_list->pages, - NULL); - up_read(&current->mm->mmap_sem); - - if (ret != page_list->nr_pages) - goto unpin; - - local_list->nr_iovecs = i + 1; - } - - return local_list; - -unpin: - dma_unpin_iovec_pages(local_list); -out: - return NULL; -} - -void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list) -{ - int i, j; - - if (!pinned_list) - return; - - for (i = 0; i < pinned_list->nr_iovecs; i++) { - struct dma_page_list *page_list = 
&pinned_list->page_list[i]; - for (j = 0; j < page_list->nr_pages; j++) { - set_page_dirty_lock(page_list->pages[j]); - page_cache_release(page_list->pages[j]); - } - } - - kfree(pinned_list); -} - - -/* - * We have already pinned down the pages we will be using in the iovecs. - * Each entry in iov array has corresponding entry in pinned_list->page_list. - * Using array indexing to keep iov[] and page_list[] in sync. - * Initial elements in iov array's iov->iov_len will be 0 if already copied into - * by another call. - * iov array length remaining guaranteed to be bigger than len. - */ -dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len) -{ - int iov_byte_offset; - int copy; - dma_cookie_t dma_cookie = 0; - int iovec_idx; - int page_idx; - - if (!chan) - return memcpy_toiovec(iov, kdata, len); - - iovec_idx = 0; - while (iovec_idx < pinned_list->nr_iovecs) { - struct dma_page_list *page_list; - - /* skip already used-up iovecs */ - while (!iov[iovec_idx].iov_len) - iovec_idx++; - - page_list = &pinned_list->page_list[iovec_idx]; - - iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); - page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) - - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; - - /* break up copies to not cross page boundary */ - while (iov[iovec_idx].iov_len) { - copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); - copy = min_t(int, copy, iov[iovec_idx].iov_len); - - dma_cookie = dma_async_memcpy_buf_to_pg(chan, - page_list->pages[page_idx], - iov_byte_offset, - kdata, - copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } - - len -= copy; - iov[iovec_idx].iov_len -= copy; - iov[iovec_idx].iov_base += copy; - - if (!len) - return dma_cookie; - - kdata += copy; - iov_byte_offset = 0; - page_idx++; - } - iovec_idx++; - } - - /* 
really bad if we ever run out of iovecs */ - BUG(); - return -EFAULT; -} - -dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, - struct dma_pinned_list *pinned_list, struct page *page, - unsigned int offset, size_t len) -{ - int iov_byte_offset; - int copy; - dma_cookie_t dma_cookie = 0; - int iovec_idx; - int page_idx; - int err; - - /* this needs as-yet-unimplemented buf-to-buff, so punt. */ - /* TODO: use dma for this */ - if (!chan || !pinned_list) { - u8 *vaddr = kmap(page); - err = memcpy_toiovec(iov, vaddr + offset, len); - kunmap(page); - return err; - } - - iovec_idx = 0; - while (iovec_idx < pinned_list->nr_iovecs) { - struct dma_page_list *page_list; - - /* skip already used-up iovecs */ - while (!iov[iovec_idx].iov_len) - iovec_idx++; - - page_list = &pinned_list->page_list[iovec_idx]; - - iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); - page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) - - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; - - /* break up copies to not cross page boundary */ - while (iov[iovec_idx].iov_len) { - copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); - copy = min_t(int, copy, iov[iovec_idx].iov_len); - - dma_cookie = dma_async_memcpy_pg_to_pg(chan, - page_list->pages[page_idx], - iov_byte_offset, - page, - offset, - copy); - /* poll for a descriptor slot */ - if (unlikely(dma_cookie < 0)) { - dma_async_issue_pending(chan); - continue; - } - - len -= copy; - iov[iovec_idx].iov_len -= copy; - iov[iovec_idx].iov_base += copy; - - if (!len) - return dma_cookie; - - offset += copy; - iov_byte_offset = 0; - page_idx++; - } - iovec_idx++; - } - - /* really bad if we ever run out of iovecs */ - BUG(); - return -EFAULT; -} diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index c5c92d59e531..3e382ecd1927 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -903,18 +903,6 @@ static inline void 
dmaengine_put(void) } #endif -#ifdef CONFIG_NET_DMA -#define net_dmaengine_get() dmaengine_get() -#define net_dmaengine_put() dmaengine_put() -#else -static inline void net_dmaengine_get(void) -{ -} -static inline void net_dmaengine_put(void) -{ -} -#endif - #ifdef CONFIG_ASYNC_TX_DMA #define async_dmaengine_get() dmaengine_get() #define async_dmaengine_put() dmaengine_put() @@ -936,16 +924,8 @@ async_dma_find_channel(enum dma_transaction_type type) return NULL; } #endif /* CONFIG_ASYNC_TX_DMA */ - -dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, - void *dest, void *src, size_t len); -dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, - struct page *page, unsigned int offset, void *kdata, size_t len); -dma_cookie_t dma_async_memcpy_pg_to_pg(struct dma_chan *chan, - struct page *dest_pg, unsigned int dest_off, struct page *src_pg, - unsigned int src_off, size_t len); void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, - struct dma_chan *chan); + struct dma_chan *chan); static inline void async_tx_ack(struct dma_async_tx_descriptor *tx) { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5e1e6f2d98c2..bdbf7afad6b7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -515,11 +514,8 @@ struct sk_buff { /* 6/8 bit hole (depending on ndisc_nodetype presence) */ kmemcheck_bitfield_end(flags2); -#if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL - union { - unsigned int napi_id; - dma_cookie_t dma_cookie; - }; +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4ad0706d40eb..90895b8dc7f2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -169,13 +168,6 @@ struct tcp_sock { struct iovec *iov; int memory; int len; 
-#ifdef CONFIG_NET_DMA - /* members for async copy */ - struct dma_chan *dma_chan; - int wakeup; - struct dma_pinned_list *pinned_list; - dma_cookie_t dma_cookie; -#endif } ucopy; u32 snd_wl1; /* Sequence for window update */ diff --git a/include/net/netdma.h b/include/net/netdma.h deleted file mode 100644 index 8ba8ce284eeb..000000000000 --- a/include/net/netdma.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. 
- */ -#ifndef NETDMA_H -#define NETDMA_H -#ifdef CONFIG_NET_DMA -#include -#include - -int dma_skb_copy_datagram_iovec(struct dma_chan* chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list); - -#endif /* CONFIG_NET_DMA */ -#endif /* NETDMA_H */ diff --git a/include/net/sock.h b/include/net/sock.h index b9586a137cad..3353b47f3d40 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -231,7 +231,6 @@ struct cg_proto; * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_write_queue: Packet sending queue - * @sk_async_wait_queue: DMA copied packets * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward @@ -354,10 +353,6 @@ struct sock { struct sk_filter __rcu *sk_filter; struct socket_wq __rcu *sk_wq; -#ifdef CONFIG_NET_DMA - struct sk_buff_head sk_async_wait_queue; -#endif - #ifdef CONFIG_XFRM struct xfrm_policy *sk_policy[2]; #endif @@ -2214,27 +2209,15 @@ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags); * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from * @skb: socket buffer to eat - * @copied_early: flag indicating whether DMA operations copied this data early * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. 
*/ -#ifdef CONFIG_NET_DMA -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, bool copied_early) -{ - __skb_unlink(skb, &sk->sk_receive_queue); - if (!copied_early) - __kfree_skb(skb); - else - __skb_queue_tail(&sk->sk_async_wait_queue, skb); -} -#else -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, bool copied_early) +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } -#endif static inline struct net *sock_net(const struct sock *sk) diff --git a/include/net/tcp.h b/include/net/tcp.h index 8c4dd63134d4..2c2f24ffa383 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -267,7 +266,6 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; -extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1023,12 +1021,6 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp) tp->ucopy.len = 0; tp->ucopy.memory = 0; skb_queue_head_init(&tp->ucopy.prequeue); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - tp->ucopy.wakeup = 0; - tp->ucopy.pinned_list = NULL; - tp->ucopy.dma_cookie = 0; -#endif } bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd9e7ad..d457005acedf 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -390,7 +390,6 @@ static const struct bin_table bin_net_ipv4_table[] = { { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" 
}, { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, diff --git a/net/core/Makefile b/net/core/Makefile index 9628c20acff6..5038f1ea0349 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o -obj-$(CONFIG_NET_DMA) += user_dma.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o diff --git a/net/core/dev.c b/net/core/dev.c index b1b0c8d4d7df..5e37e9abe8c5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1266,7 +1266,6 @@ static int __dev_open(struct net_device *dev) clear_bit(__LINK_STATE_START, &dev->state); else { dev->flags |= IFF_UP; - net_dmaengine_get(); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1342,7 +1341,6 @@ static int __dev_close_many(struct list_head *head) ops->ndo_stop(dev); dev->flags &= ~IFF_UP; - net_dmaengine_put(); } return 0; @@ -4405,14 +4403,6 @@ static void net_rx_action(struct softirq_action *h) out: net_rps_action_and_irq_enable(sd); -#ifdef CONFIG_NET_DMA - /* - * There may not be any more sk_buffs coming right now, so push - * any pending DMA copies to hardware - */ - dma_issue_pending_all(); -#endif - return; softnet_break: diff --git a/net/core/sock.c b/net/core/sock.c index c0fc6bdad1e3..2f143c3b190a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1452,9 +1452,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&newsk->sk_async_wait_queue); -#endif spin_lock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -2265,9 +2262,6 @@ void 
sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); -#ifdef CONFIG_NET_DMA - skb_queue_head_init(&sk->sk_async_wait_queue); -#endif sk->sk_send_head = NULL; diff --git a/net/core/user_dma.c b/net/core/user_dma.c deleted file mode 100644 index 1b5fefdb8198..000000000000 --- a/net/core/user_dma.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. - * Portions based on net/core/datagram.c and copyrighted by their authors. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * The full GNU General Public License is included in this distribution in the - * file called COPYING. - */ - -/* - * This code allows the net stack to make use of a DMA engine for - * skb to iovec copies. - */ - -#include -#include -#include -#include -#include - -#define NET_DMA_DEFAULT_COPYBREAK 4096 - -int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; -EXPORT_SYMBOL(sysctl_tcp_dma_copybreak); - -/** - * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. 
- * @skb - buffer to copy - * @offset - offset in the buffer to start copying from - * @iovec - io vector to copy to - * @len - amount of data to copy from buffer to iovec - * @pinned_list - locked iovec buffer data - * - * Note: the iovec is modified during the copy. - */ -int dma_skb_copy_datagram_iovec(struct dma_chan *chan, - struct sk_buff *skb, int offset, struct iovec *to, - size_t len, struct dma_pinned_list *pinned_list) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - dma_cookie_t cookie = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_memcpy_to_iovec(chan, to, pinned_list, - skb->data + offset, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - copy = end - offset; - if (copy > 0) { - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - - cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, - frag->page_offset + offset - start, copy); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - copy = end - offset; - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, - offset - start, - to, copy, - pinned_list); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; - } - -end: - if (!len) { - skb->dma_cookie = cookie; - return cookie; - } - -fault: - return -EFAULT; -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 
eb892b4f4814..f9076f295b13 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -848,7 +848,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, default: dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { @@ -905,7 +905,7 @@ verify_sock_status: len = skb->len; found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); break; } while (1); out: diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 44eba052b43d..c3d2a48481f1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -635,15 +635,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -#ifdef CONFIG_NET_DMA - { - .procname = "tcp_dma_copybreak", - .data = &sysctl_tcp_dma_copybreak, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif { .procname = "tcp_slow_start_after_idle", .data = &sysctl_tcp_slow_start_after_idle, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 97c8f5620c43..28595a364f09 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -274,7 +274,6 @@ #include #include #include -#include #include #include @@ -1454,39 +1453,6 @@ static void tcp_prequeue_process(struct sock *sk) tp->ucopy.memory = 0; } -#ifdef CONFIG_NET_DMA -static void tcp_service_net_dma(struct sock *sk, bool wait) -{ - dma_cookie_t done, used; - dma_cookie_t last_issued; - struct tcp_sock *tp = tcp_sk(sk); - - if (!tp->ucopy.dma_chan) - return; - - last_issued = tp->ucopy.dma_cookie; - dma_async_issue_pending(tp->ucopy.dma_chan); - - do { - if (dma_async_is_tx_complete(tp->ucopy.dma_chan, - last_issued, &done, - &used) == DMA_COMPLETE) { - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - break; - } else { - struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - 
(dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_COMPLETE)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } - } while (wait); -} -#endif - static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1504,7 +1470,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); } return NULL; } @@ -1570,11 +1536,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } if (tcp_hdr(skb)->fin) { - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); ++seq; break; } - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); if (!desc->count) break; tp->copied_seq = seq; @@ -1612,7 +1578,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; - bool copied_early = false; struct sk_buff *skb; u32 urg_hole = 0; @@ -1655,28 +1620,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -#ifdef CONFIG_NET_DMA - tp->ucopy.dma_chan = NULL; - preempt_disable(); - skb = skb_peek_tail(&sk->sk_receive_queue); - { - int available = 0; - - if (skb) - available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); - if ((available < target) && - (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && - net_dma_find_channel()) { - preempt_enable(); - tp->ucopy.pinned_list = - dma_pin_iovec_pages(msg->msg_iov, len); - } else { - preempt_enable(); - } - } -#endif - do { u32 offset; @@ -1807,16 +1750,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Set realtime policy in scheduler __ */ } -#ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - if (tp->rcv_wnd == 0 && - !skb_queue_empty(&sk->sk_async_wait_queue)) 
{ - tcp_service_net_dma(sk, true); - tcp_cleanup_rbuf(sk, copied); - } else - dma_async_issue_pending(tp->ucopy.dma_chan); - } -#endif if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1824,11 +1757,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, false); /* Don't block */ - tp->ucopy.wakeup = 0; -#endif - if (user_recv) { int chunk; @@ -1886,43 +1814,13 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { -#ifdef CONFIG_NET_DMA - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan) { - tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( - tp->ucopy.dma_chan, skb, offset, - msg->msg_iov, used, - tp->ucopy.pinned_list); - - if (tp->ucopy.dma_cookie < 0) { - - pr_alert("%s: dma_cookie < 0\n", - __func__); - - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } - - dma_async_issue_pending(tp->ucopy.dma_chan); - - if ((offset + used) == skb->len) - copied_early = true; - - } else -#endif - { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; - } + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; } } @@ -1942,19 +1840,15 @@ skip_copy: if (tcp_hdr(skb)->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. 
*/ ++*seq; - if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb, copied_early); - copied_early = false; - } + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); break; } while (len > 0); @@ -1977,16 +1871,6 @@ skip_copy: tp->ucopy.len = 0; } -#ifdef CONFIG_NET_DMA - tcp_service_net_dma(sk, true); /* Wait for queue to drain */ - tp->ucopy.dma_chan = NULL; - - if (tp->ucopy.pinned_list) { - dma_unpin_iovec_pages(tp->ucopy.pinned_list); - tp->ucopy.pinned_list = NULL; - } -#endif - /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -2330,9 +2214,6 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); -#ifdef CONFIG_NET_DMA - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif inet->inet_dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eeaac399420d..1342e9851f97 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -73,7 +73,6 @@ #include #include #include -#include int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -4970,53 +4969,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, __tcp_checksum_complete_user(sk, skb); } -#ifdef CONFIG_NET_DMA -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, - int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int dma_cookie; - bool copied_early = false; - - if (tp->ucopy.wakeup) - return false; - - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { - - dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, - skb, hlen, - tp->ucopy.iov, chunk, - tp->ucopy.pinned_list); - - if (dma_cookie < 0) - goto out; - - tp->ucopy.dma_cookie = dma_cookie; - copied_early = true; - - tp->ucopy.len -= chunk; - 
tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - - if ((tp->ucopy.len == 0) || - (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || - (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } - } else if (chunk > 0) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } -out: - return copied_early; -} -#endif /* CONFIG_NET_DMA */ - /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5201,14 +5153,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tp->copied_seq == tp->rcv_nxt && len - tcp_header_len <= tp->ucopy.len) { -#ifdef CONFIG_NET_DMA - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && - tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { - copied_early = 1; - eaten = 1; - } -#endif if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { __set_current_state(TASK_RUNNING); @@ -5274,11 +5218,6 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (!copied_early || tp->rcv_nxt != tp->rcv_wup) __tcp_ack_snd_check(sk, 0); no_ack: -#ifdef CONFIG_NET_DMA - if (copied_early) - __skb_queue_tail(&sk->sk_async_wait_queue, skb); - else -#endif if (eaten) kfree_skb_partial(skb, fragstolen); sk->sk_data_ready(sk, 0); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3cf976510497..737c2e270ee3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -72,7 +72,6 @@ #include #include #include -#include #include #include #include @@ -1999,18 +1998,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); - } } else if 
(unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); @@ -2169,11 +2158,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif -#ifdef CONFIG_NET_DMA - /* Cleans up our sk_async_wait_queue */ - __skb_queue_purge(&sk->sk_async_wait_queue); -#endif - /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 889079b2ea85..cb21fccf2089 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -1520,18 +1519,8 @@ process: bh_lock_sock_nested(sk); ret = 0; if (!sock_owned_by_user(sk)) { -#ifdef CONFIG_NET_DMA - struct tcp_sock *tp = tcp_sk(sk); - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - if (tp->ucopy.dma_chan) + if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); - else -#endif - { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); - } } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 0080d2b0a8ae..bb9cbc17d926 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -839,7 +839,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (!(flags & MSG_PEEK)) { spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); + sk_eat_skb(sk, skb); spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); *seq = 0; } @@ -861,10 +861,10 @@ copy_uaddr: llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { - spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); - sk_eat_skb(sk, skb, false); - spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); - *seq = 0; + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); + sk_eat_skb(sk, skb); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); + *seq = 0; } goto out; -- cgit v1.2.3 
From e34191fad8e5d9fe4e76f6d03b5e29e3eae7535a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 29 Sep 2014 06:14:23 -0700 Subject: locktorture: Support rwlocks Add a "rw_lock" torture test to stress kernel rwlocks and their irq variant. Reader critical regions are 5x longer than writers. As such a similar ratio of lock acquisitions is seen in the statistics. In the case of massive contention, both hold the lock for 1/10 of a second. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- Documentation/locking/locktorture.txt | 5 + kernel/locking/locktorture.c | 115 ++++++++++++++++++++- .../selftests/rcutorture/configs/lock/CFLIST | 1 + .../selftests/rcutorture/configs/lock/LOCK04 | 6 ++ .../selftests/rcutorture/configs/lock/LOCK04.boot | 1 + 5 files changed, 125 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK04 create mode 100644 tools/testing/selftests/rcutorture/configs/lock/LOCK04.boot (limited to 'kernel') diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt index be715015e0f7..619f2bb136a5 100644 --- a/Documentation/locking/locktorture.txt +++ b/Documentation/locking/locktorture.txt @@ -45,6 +45,11 @@ torture_type Type of lock to torture. By default, only spinlocks will o "spin_lock_irq": spin_lock_irq() and spin_unlock_irq() pairs. + o "rw_lock": read/write lock() and unlock() rwlock pairs. + + o "rw_lock_irq": read/write lock_irq() and unlock_irq() + rwlock pairs. + o "mutex_lock": mutex_lock() and mutex_unlock() pairs. o "rwsem_lock": read/write down() and up() semaphore pairs. 
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 540d5dfe1112..0762b25b4110 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -229,6 +230,110 @@ static struct lock_torture_ops spin_lock_irq_ops = { .name = "spin_lock_irq" }; +static DEFINE_RWLOCK(torture_rwlock); + +static int torture_rwlock_write_lock(void) __acquires(torture_rwlock) +{ + write_lock(&torture_rwlock); + return 0; +} + +static void torture_rwlock_write_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_ms = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + else + udelay(shortdelay_us); +} + +static void torture_rwlock_write_unlock(void) __releases(torture_rwlock) +{ + write_unlock(&torture_rwlock); +} + +static int torture_rwlock_read_lock(void) __acquires(torture_rwlock) +{ + read_lock(&torture_rwlock); + return 0; +} + +static void torture_rwlock_read_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 10; + const unsigned long longdelay_ms = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. 
+ */ + if (!(torture_random(trsp) % + (cxt.nrealreaders_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + else + udelay(shortdelay_us); +} + +static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) +{ + read_unlock(&torture_rwlock); +} + +static struct lock_torture_ops rw_lock_ops = { + .writelock = torture_rwlock_write_lock, + .write_delay = torture_rwlock_write_delay, + .writeunlock = torture_rwlock_write_unlock, + .readlock = torture_rwlock_read_lock, + .read_delay = torture_rwlock_read_delay, + .readunlock = torture_rwlock_read_unlock, + .name = "rw_lock" +}; + +static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock) +{ + unsigned long flags; + + write_lock_irqsave(&torture_rwlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_rwlock_write_unlock_irq(void) +__releases(torture_rwlock) +{ + write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); +} + +static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock) +{ + unsigned long flags; + + read_lock_irqsave(&torture_rwlock, flags); + cxt.cur_ops->flags = flags; + return 0; +} + +static void torture_rwlock_read_unlock_irq(void) +__releases(torture_rwlock) +{ + write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags); +} + +static struct lock_torture_ops rw_lock_irq_ops = { + .writelock = torture_rwlock_write_lock_irq, + .write_delay = torture_rwlock_write_delay, + .writeunlock = torture_rwlock_write_unlock_irq, + .readlock = torture_rwlock_read_lock_irq, + .read_delay = torture_rwlock_read_delay, + .readunlock = torture_rwlock_read_unlock_irq, + .name = "rw_lock_irq" +}; + static DEFINE_MUTEX(torture_mutex); static int torture_mutex_lock(void) __acquires(torture_mutex) @@ -535,8 +640,11 @@ static int __init lock_torture_init(void) int i, j; int firsterr = 0; static struct lock_torture_ops *torture_ops[] = { - &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, - &mutex_lock_ops, &rwsem_lock_ops, + &lock_busted_ops, + 
&spin_lock_ops, &spin_lock_irq_ops, + &rw_lock_ops, &rw_lock_irq_ops, + &mutex_lock_ops, + &rwsem_lock_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) @@ -571,7 +679,8 @@ static int __init lock_torture_init(void) cxt.debug_lock = true; #endif #ifdef CONFIG_DEBUG_SPINLOCK - if (strncmp(torture_type, "spin", 4) == 0) + if ((strncmp(torture_type, "spin", 4) == 0) || + (strncmp(torture_type, "rw_lock", 7) == 0)) cxt.debug_lock = true; #endif diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST index 6108137da770..6910b7370761 100644 --- a/tools/testing/selftests/rcutorture/configs/lock/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST @@ -1,3 +1,4 @@ LOCK01 LOCK02 LOCK03 +LOCK04 \ No newline at end of file diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK04 b/tools/testing/selftests/rcutorture/configs/lock/LOCK04 new file mode 100644 index 000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK04 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK04.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK04.boot new file mode 100644 index 000000000000..48c04fe47fb4 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK04.boot @@ -0,0 +1 @@ +locktorture.torture_type=rw_lock -- cgit v1.2.3 From 219f800f99db6f4e43a582cb9e0d98931f13c012 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 29 Sep 2014 06:14:24 -0700 Subject: locktorture: Fix __acquire annotation for spinlock irq It's quite easy to get mixed up with the names -- 'torture_spinlock_irq' is not actually a valid spinlock name. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. 
McKenney --- kernel/locking/locktorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 0762b25b4110..9e9cd111fb0f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -205,7 +205,7 @@ static struct lock_torture_ops spin_lock_ops = { }; static int torture_spin_lock_write_lock_irq(void) -__acquires(torture_spinlock_irq) +__acquires(torture_spinlock) { unsigned long flags; -- cgit v1.2.3 From a1229491006a3d55cc0d7e6d496be39915ccefdd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 29 Sep 2014 06:14:25 -0700 Subject: locktorture: Cannot hold read and write lock ... trigger an error if so. Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9e9cd111fb0f..b05dc46c4297 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -453,14 +453,19 @@ static int lock_torture_writer(void *arg) do { if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); + cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = 1; + if (WARN_ON_ONCE(lock_is_read_held)) + lwsp->n_lock_fail++; /* rare, but... */ + lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); lock_is_write_held = 0; cxt.cur_ops->writeunlock(); + stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); torture_kthread_stopping("lock_torture_writer"); @@ -482,12 +487,17 @@ static int lock_torture_reader(void *arg) do { if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); + cxt.cur_ops->readlock(); lock_is_read_held = 1; + if (WARN_ON_ONCE(lock_is_write_held)) + lrsp->n_lock_fail++; /* rare, but... 
*/ + lrsp->n_lock_acquired++; cxt.cur_ops->read_delay(&rand); lock_is_read_held = 0; cxt.cur_ops->readunlock(); + stutter_wait("lock_torture_reader"); } while (!torture_must_stop()); torture_kthread_stopping("lock_torture_reader"); -- cgit v1.2.3 From c98fed9fc6a7449affd941d8a8e9fcb0c72977d6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 29 Sep 2014 06:14:26 -0700 Subject: locktorture: Cleanup header usage Remove some unnecessary ones and explicitly include rwsem.h Signed-off-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index b05dc46c4297..ec8cce259779 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -20,32 +20,20 @@ * Author: Paul E. McKenney * Based on kernel/rcu/torture.c. */ -#include #include -#include #include #include -#include #include #include #include +#include #include #include #include #include -#include -#include #include -#include -#include -#include -#include -#include #include -#include #include -#include -#include #include MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 2a8a8ce651d3a88fdf83e2ed15633c8d19292108 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 30 Sep 2014 02:21:34 +0200 Subject: PM / sleep: Export dpm_suspend_late/noirq() and dpm_resume_early/noirq() Subsequent change sets will add platform-related operations between dpm_suspend_late() and dpm_suspend_noirq() as well as between dpm_resume_noirq() and dpm_resume_early() in suspend_enter(), so export these functions for suspend_enter() to be able to call them separately and split the invocations of dpm_suspend_end() and dpm_resume_start() in there accordingly. Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/main.c | 8 ++++---- include/linux/pm.h | 4 ++++ kernel/power/suspend.c | 14 +++++++++++--- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index b67d9aef9fe4..44973196d3fd 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -540,7 +540,7 @@ static void async_resume_noirq(void *data, async_cookie_t cookie) * Call the "noirq" resume handlers for all devices in dpm_noirq_list and * enable device drivers to receive interrupts. */ -static void dpm_resume_noirq(pm_message_t state) +void dpm_resume_noirq(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); @@ -662,7 +662,7 @@ static void async_resume_early(void *data, async_cookie_t cookie) * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. */ -static void dpm_resume_early(pm_message_t state) +void dpm_resume_early(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); @@ -1093,7 +1093,7 @@ static int device_suspend_noirq(struct device *dev) * Prevent device drivers from receiving interrupts and call the "noirq" suspend * handlers for all non-sysdev devices. */ -static int dpm_suspend_noirq(pm_message_t state) +int dpm_suspend_noirq(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; @@ -1232,7 +1232,7 @@ static int device_suspend_late(struct device *dev) * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. 
*/ -static int dpm_suspend_late(pm_message_t state) +int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; diff --git a/include/linux/pm.h b/include/linux/pm.h index 72c0fe098a27..e1c00b7ee913 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -679,12 +679,16 @@ struct dev_pm_domain { extern void device_pm_lock(void); extern void dpm_resume_start(pm_message_t state); extern void dpm_resume_end(pm_message_t state); +extern void dpm_resume_noirq(pm_message_t state); +extern void dpm_resume_early(pm_message_t state); extern void dpm_resume(pm_message_t state); extern void dpm_complete(pm_message_t state); extern void device_pm_unlock(void); extern int dpm_suspend_end(pm_message_t state); extern int dpm_suspend_start(pm_message_t state); +extern int dpm_suspend_noirq(pm_message_t state); +extern int dpm_suspend_late(pm_message_t state); extern int dpm_suspend(pm_message_t state); extern int dpm_prepare(pm_message_t state); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index e837dd6783c6..58ae98b7dc2b 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -265,11 +265,16 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (error) goto Platform_finish; - error = dpm_suspend_end(PMSG_SUSPEND); + error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down\n"); + printk(KERN_ERR "PM: late suspend of devices failed\n"); goto Platform_finish; } + error = dpm_suspend_noirq(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + goto Devices_early_resume; + } error = platform_suspend_prepare_late(state); if (error) goto Platform_wake; @@ -319,7 +324,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) Platform_wake: platform_suspend_wake(state); - dpm_resume_start(PMSG_RESUME); + dpm_resume_noirq(PMSG_RESUME); + + Devices_early_resume: + dpm_resume_early(PMSG_RESUME); Platform_finish: 
platform_suspend_finish(state); -- cgit v1.2.3 From ebc3e41e371620bae6c315c9174bcb2d6c4e9ae7 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 30 Sep 2014 02:22:24 +0200 Subject: PM / sleep: Rename platform suspend/resume functions in suspend.c Rename several local functions related to platform handling during system suspend resume in suspend.c so that their names better reflect their roles. Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 58ae98b7dc2b..a25e768d92b5 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -144,19 +144,19 @@ static int platform_suspend_prepare(suspend_state_t state) suspend_ops->prepare() : 0; } -static int platform_suspend_prepare_late(suspend_state_t state) +static int platform_suspend_prepare_noirq(suspend_state_t state) { return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? 
suspend_ops->prepare_late() : 0; } -static void platform_suspend_wake(suspend_state_t state) +static void platform_resume_noirq(suspend_state_t state) { if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) suspend_ops->wake(); } -static void platform_suspend_finish(suspend_state_t state) +static void platform_resume_finish(suspend_state_t state) { if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) suspend_ops->finish(); @@ -172,7 +172,7 @@ static int platform_suspend_begin(suspend_state_t state) return 0; } -static void platform_suspend_end(suspend_state_t state) +static void platform_resume_end(suspend_state_t state) { if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) freeze_ops->end(); @@ -180,7 +180,7 @@ static void platform_suspend_end(suspend_state_t state) suspend_ops->end(); } -static void platform_suspend_recover(suspend_state_t state) +static void platform_recover(suspend_state_t state) { if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) suspend_ops->recover(); @@ -275,7 +275,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) printk(KERN_ERR "PM: noirq suspend of devices failed\n"); goto Devices_early_resume; } - error = platform_suspend_prepare_late(state); + error = platform_suspend_prepare_noirq(state); if (error) goto Platform_wake; @@ -323,14 +323,14 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) enable_nonboot_cpus(); Platform_wake: - platform_suspend_wake(state); + platform_resume_noirq(state); dpm_resume_noirq(PMSG_RESUME); Devices_early_resume: dpm_resume_early(PMSG_RESUME); Platform_finish: - platform_suspend_finish(state); + platform_resume_finish(state); return error; } @@ -374,11 +374,11 @@ int suspend_devices_and_enter(suspend_state_t state) trace_suspend_resume(TPS("resume_console"), state, false); Close: - platform_suspend_end(state); + platform_resume_end(state); return error; Recover_platform: - platform_suspend_recover(state); + platform_recover(state); goto Resume_devices; } -- 
cgit v1.2.3 From a8d46b9e4e487301affe84fa53de40b890898604 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 30 Sep 2014 02:29:01 +0200 Subject: ACPI / sleep: Rework the handling of ACPI GPE wakeup from suspend-to-idle The ACPI GPE wakeup from suspend-to-idle is currently based on using the IRQF_NO_SUSPEND flag for the ACPI SCI, but that is problematic for a couple of reasons. First, in principle the ACPI SCI may be shared and IRQF_NO_SUSPEND does not really work well with shared interrupts. Second, it may require the ACPI subsystem to special-case the handling of device notifications depending on whether or not they are received during suspend-to-idle in some places which would lead to fragile code. Finally, it's better the handle ACPI wakeup interrupts consistently with wakeup interrupts from other sources. For this reason, remove the IRQF_NO_SUSPEND flag from the ACPI SCI and use enable_irq_wake()/disable_irq_wake() with it instead, which requires two additional platform hooks to be added to struct platform_freeze_ops. Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/osl.c | 2 +- drivers/acpi/sleep.c | 16 ++++++++++++++++ include/linux/suspend.h | 2 ++ kernel/power/suspend.c | 21 ++++++++++++++++++++- 4 files changed, 39 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 3abe9b223ba7..5ca29b5af8d1 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -825,7 +825,7 @@ acpi_os_install_interrupt_handler(u32 gsi, acpi_osd_handler handler, acpi_irq_handler = handler; acpi_irq_context = context; - if (request_irq(irq, acpi_irq, IRQF_SHARED | IRQF_NO_SUSPEND, "acpi", acpi_irq)) { + if (request_irq(irq, acpi_irq, IRQF_SHARED, "acpi", acpi_irq)) { printk(KERN_ERR PREFIX "SCI (IRQ%d) allocation failed\n", irq); acpi_irq_handler = NULL; return AE_NOT_ACQUIRED; diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 54da4a3fe65e..05a31b573fc3 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -626,6 +627,19 @@ static int acpi_freeze_begin(void) return 0; } +static int acpi_freeze_prepare(void) +{ + acpi_enable_all_wakeup_gpes(); + enable_irq_wake(acpi_gbl_FADT.sci_interrupt); + return 0; +} + +static void acpi_freeze_restore(void) +{ + disable_irq_wake(acpi_gbl_FADT.sci_interrupt); + acpi_enable_all_runtime_gpes(); +} + static void acpi_freeze_end(void) { acpi_scan_lock_release(); @@ -633,6 +647,8 @@ static void acpi_freeze_end(void) static const struct platform_freeze_ops acpi_freeze_ops = { .begin = acpi_freeze_begin, + .prepare = acpi_freeze_prepare, + .restore = acpi_freeze_restore, .end = acpi_freeze_end, }; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 06a9910827c2..3388c1b6f7d8 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -189,6 +189,8 @@ struct platform_suspend_ops { struct platform_freeze_ops { int (*begin)(void); + int (*prepare)(void); + void (*restore)(void); void (*end)(void); }; diff --git 
a/kernel/power/suspend.c b/kernel/power/suspend.c index a25e768d92b5..4ca9a33ff620 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -144,6 +144,12 @@ static int platform_suspend_prepare(suspend_state_t state) suspend_ops->prepare() : 0; } +static int platform_suspend_prepare_late(suspend_state_t state) +{ + return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ? + freeze_ops->prepare() : 0; +} + static int platform_suspend_prepare_noirq(suspend_state_t state) { return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? @@ -156,6 +162,12 @@ static void platform_resume_noirq(suspend_state_t state) suspend_ops->wake(); } +static void platform_resume_early(suspend_state_t state) +{ + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore) + freeze_ops->restore(); +} + static void platform_resume_finish(suspend_state_t state) { if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) @@ -270,10 +282,14 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) printk(KERN_ERR "PM: late suspend of devices failed\n"); goto Platform_finish; } + error = platform_suspend_prepare_late(state); + if (error) + goto Devices_early_resume; + error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: noirq suspend of devices failed\n"); - goto Devices_early_resume; + goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); if (error) @@ -326,6 +342,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) platform_resume_noirq(state); dpm_resume_noirq(PMSG_RESUME); + Platform_early_resume: + platform_resume_early(state); + Devices_early_resume: dpm_resume_early(PMSG_RESUME); -- cgit v1.2.3 From fdd64ed54eeba6b8619b36dcc7cb6442f2c6da0c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 30 Sep 2014 13:31:29 +0200 Subject: PM / hibernate: Iterate over set bits instead of PFNs in swsusp_free() The existing implementation of swsusp_free iterates over all pfns in the system and checks every bit in the two 
memory bitmaps. This doesn't scale very well with large numbers of pfns, especially when the bitmaps are not populated very densly. Change the algorithm to iterate over the set bits in the bitmaps instead to make it scale better in large memory configurations. Also add a memory_bm_clear_current() helper function that clears the bit for the last position returned from the memory bitmap. This new version adds a !NULL check for the memory bitmaps before they are walked. Not doing so causes a kernel crash when the bitmaps are NULL. Signed-off-by: Joerg Roedel Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 54 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f1604d8cf489..791a61892bb5 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -725,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) clear_bit(bit, addr); } +static void memory_bm_clear_current(struct memory_bitmap *bm) +{ + int bit; + + bit = max(bm->cur.node_bit - 1, 0); + clear_bit(bit, bm->cur.node->data); +} + static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; @@ -1333,23 +1341,39 @@ static struct memory_bitmap copy_bm; void swsusp_free(void) { - struct zone *zone; - unsigned long pfn, max_zone_pfn; + unsigned long fb_pfn, fr_pfn; - for_each_populated_zone(zone) { - max_zone_pfn = zone_end_pfn(zone); - for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - - if (swsusp_page_is_forbidden(page) && - swsusp_page_is_free(page)) { - swsusp_unset_page_forbidden(page); - swsusp_unset_page_free(page); - __free_page(page); - } - } + if (!forbidden_pages_map || !free_pages_map) + goto out; + + memory_bm_position_reset(forbidden_pages_map); + memory_bm_position_reset(free_pages_map); + +loop: + fr_pfn = 
memory_bm_next_pfn(free_pages_map); + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + + /* + * Find the next bit set in both bitmaps. This is guaranteed to + * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. + */ + do { + if (fb_pfn < fr_pfn) + fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + if (fr_pfn < fb_pfn) + fr_pfn = memory_bm_next_pfn(free_pages_map); + } while (fb_pfn != fr_pfn); + + if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { + struct page *page = pfn_to_page(fr_pfn); + + memory_bm_clear_current(forbidden_pages_map); + memory_bm_clear_current(free_pages_map); + __free_page(page); + goto loop; } + +out: nr_copy_pages = 0; nr_meta_pages = 0; restore_pblist = NULL; -- cgit v1.2.3 From f1bca824dabba4ffe8582f87ca587780befce7ad Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 29 Sep 2014 18:50:01 -0700 Subject: bpf: add search pruning optimization to verifier consider C program represented in eBPF: int filter(int arg) { int a, b, c, *ptr; if (arg == 1) ptr = &a; else if (arg == 2) ptr = &b; else ptr = &c; *ptr = 0; return 0; } eBPF verifier has to follow all possible paths through the program to recognize that '*ptr = 0' instruction would be safe to execute in all situations. It's doing it by picking a path towards the end and observes changes to registers and stack at every insn until it reaches bpf_exit. Then it comes back to one of the previous branches and goes towards the end again with potentially different values in registers. When program has a lot of branches, the number of possible combinations of branches is huge, so verifer has a hard limit of walking no more than 32k instructions. This limit can be reached and complex (but valid) programs could be rejected. Therefore it's important to recognize equivalent verifier states to prune this depth first search. Basic idea can be illustrated by the program (where .. are some eBPF insns): 1: .. 2: if (rX == rY) goto 4 3: .. 4: .. 5: .. 
6: bpf_exit In the first pass towards bpf_exit the verifier will walk insns: 1, 2, 3, 4, 5, 6 Since insn#2 is a branch the verifier will remember its state in verifier stack to come back to it later. Since insn#4 is marked as 'branch target', the verifier will remember its state in explored_states[4] linked list. Once it reaches insn#6 successfully it will pop the state recorded at insn#2 and will continue. Without search pruning optimization verifier would have to walk 4, 5, 6 again, effectively simulating execution of insns 1, 2, 4, 5, 6 With search pruning it will check whether state at #4 after jumping from #2 is equivalent to one recorded in explored_states[4] during first pass. If there is an equivalent state, verifier can prune the search at #4 and declare this path to be safe as well. In other words two states at #4 are equivalent if execution of 1, 2, 3, 4 insns and 1, 2, 4 insns produces equivalent registers and stack. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a086dd3210a8..801f5f3b9307 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -199,6 +199,7 @@ struct verifier_env { struct verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ struct verifier_state cur_state; /* current verifier state */ + struct verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ }; @@ -1219,6 +1220,8 @@ enum { BRANCH = 2, }; +#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) + static int *insn_stack; /* stack of insns to process */ static int cur_stack; /* current stack index */ static int *insn_state; @@ 
-1241,6 +1244,10 @@ static int push_insn(int t, int w, int e, struct verifier_env *env) return -EINVAL; } + if (e == BRANCH) + /* mark branch target for state pruning */ + env->explored_states[w] = STATE_LIST_MARK; + if (insn_state[w] == 0) { /* tree-edge */ insn_state[t] = DISCOVERED | e; @@ -1314,6 +1321,10 @@ peek_stack: goto peek_stack; else if (ret < 0) goto err_free; + /* tell verifier to check for equivalent states + * after every call and jump + */ + env->explored_states[t + 1] = STATE_LIST_MARK; } else { /* conditional jump with two edges */ ret = push_insn(t, t + 1, FALLTHROUGH, env); @@ -1364,6 +1375,95 @@ err_free: return ret; } +/* compare two verifier states + * + * all states stored in state_list are known to be valid, since + * verifier reached 'bpf_exit' instruction through them + * + * this function is called when verifier exploring different branches of + * execution popped from the state stack. If it sees an old state that has + * more strict register state and more strict stack state then this execution + * branch doesn't need to be explored further, since verifier already + * concluded that more strict state leads to valid finish. + * + * Therefore two states are equivalent if register state is more conservative + * and explored stack state is more conservative than the current one. + * Example: + * explored current + * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) + * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) + * + * In other words if current stack state (one being explored) has more + * valid slots than old one that already passed validation, it means + * the verifier can stop exploring and conclude that current state is valid too + * + * Similarly with registers. 
If explored state has register type as invalid + * whereas register type in current state is meaningful, it means that + * the current state will reach 'bpf_exit' instruction safely + */ +static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + if (memcmp(&old->regs[i], &cur->regs[i], + sizeof(old->regs[0])) != 0) { + if (old->regs[i].type == NOT_INIT || + old->regs[i].type == UNKNOWN_VALUE) + continue; + return false; + } + } + + for (i = 0; i < MAX_BPF_STACK; i++) { + if (memcmp(&old->stack[i], &cur->stack[i], + sizeof(old->stack[0])) != 0) { + if (old->stack[i].stype == STACK_INVALID) + continue; + return false; + } + } + return true; +} + +static int is_state_visited(struct verifier_env *env, int insn_idx) +{ + struct verifier_state_list *new_sl; + struct verifier_state_list *sl; + + sl = env->explored_states[insn_idx]; + if (!sl) + /* this 'insn_idx' instruction wasn't marked, so we will not + * be doing state search here + */ + return 0; + + while (sl != STATE_LIST_MARK) { + if (states_equal(&sl->state, &env->cur_state)) + /* reached equivalent register/stack state, + * prune the search + */ + return 1; + sl = sl->next; + } + + /* there were no equivalent states, remember current one. + * technically the current state is not proven to be safe yet, + * but it will either reach bpf_exit (which means it's safe) or + * it will be rejected. 
Since there are no loops, we won't be + * seeing this 'insn_idx' instruction again on the way to bpf_exit + */ + new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); + if (!new_sl) + return -ENOMEM; + + /* add new state to the head of linked list */ + memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + new_sl->next = env->explored_states[insn_idx]; + env->explored_states[insn_idx] = new_sl; + return 0; +} + static int do_check(struct verifier_env *env) { struct verifier_state *state = &env->cur_state; @@ -1396,6 +1496,21 @@ static int do_check(struct verifier_env *env) return -E2BIG; } + err = is_state_visited(env, insn_idx); + if (err < 0) + return err; + if (err == 1) { + /* found equivalent state, can prune the search */ + if (log_level) { + if (do_print_state) + verbose("\nfrom %d to %d: safe\n", + prev_insn_idx, insn_idx); + else + verbose("%d: safe\n", insn_idx); + } + goto process_bpf_exit; + } + if (log_level && do_print_state) { verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); print_verifier_state(env); @@ -1531,6 +1646,7 @@ static int do_check(struct verifier_env *env) if (err) return err; +process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { break; @@ -1671,6 +1787,28 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } +static void free_states(struct verifier_env *env) +{ + struct verifier_state_list *sl, *sln; + int i; + + if (!env->explored_states) + return; + + for (i = 0; i < env->prog->len; i++) { + sl = env->explored_states[i]; + + if (sl) + while (sl != STATE_LIST_MARK) { + sln = sl->next; + kfree(sl); + sl = sln; + } + } + + kfree(env->explored_states); +} + int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; @@ -1719,6 +1857,13 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; + env->explored_states = kcalloc(prog->len, + sizeof(struct verifier_state_list *), + 
GFP_USER); + ret = -ENOMEM; + if (!env->explored_states) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; @@ -1727,6 +1872,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) skip_full_check: while (pop_stack(env, NULL) >= 0); + free_states(env); if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); -- cgit v1.2.3 From 6c34f1f5424395994c125f8c68bed395920ecc58 Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Tue, 16 Sep 2014 22:37:18 +0100 Subject: aarch64: filter $x from kallsyms Similar to ARM, AArch64 is generating $x and $d syms... which isn't terribly helpful when looking at %pF output and the like. Filter those out in kallsyms, modpost and when looking at module symbols. Seems simplest since none of these check EM_ARM anyway, to just add it to the strchr used, rather than trying to make things overly complicated. initcall_debug improves: dmesg_before.txt: initcall $x+0x0/0x154 [sg] returned 0 after 26331 usecs dmesg_after.txt: initcall init_sg+0x0/0x154 [sg] returned 0 after 15461 usecs Signed-off-by: Kyle McMartin Acked-by: Rusty Russell Signed-off-by: Catalin Marinas --- kernel/module.c | 2 +- scripts/kallsyms.c | 2 +- scripts/mod/modpost.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 03214bd288e9..3d52936031cc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3388,7 +3388,7 @@ static inline int is_arm_mapping_symbol(const char *str) { if (str[0] == '.' 
&& str[1] == 'L') return true; - return str[0] == '$' && strchr("atd", str[1]) + return str[0] == '$' && strchr("axtd", str[1]) && (str[2] == '\0' || str[2] == '.'); } diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index dc7aa45e80ce..c6d33bd15b04 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -84,7 +84,7 @@ static void usage(void) */ static inline int is_arm_mapping_symbol(const char *str) { - return str[0] == '$' && strchr("atd", str[1]) + return str[0] == '$' && strchr("axtd", str[1]) && (str[2] == '\0' || str[2] == '.'); } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 091d90573b63..3017ec20e9f8 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1146,7 +1146,7 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, static inline int is_arm_mapping_symbol(const char *str) { - return str[0] == '$' && strchr("atd", str[1]) + return str[0] == '$' && strchr("axtd", str[1]) && (str[2] == '\0' || str[2] == '.'); } -- cgit v1.2.3 From 24607f114fd14f2f37e3e0cb3d47bce96e81e848 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 2 Oct 2014 16:51:18 -0400 Subject: ring-buffer: Fix infinite spin in reading buffer Commit 651e22f2701b "ring-buffer: Always reset iterator to reader page" fixed one bug but in the process caused another one. The reset is to update the header page, but that fix also changed the way the cached reads were updated. The cache reads are used to test if an iterator needs to be updated or not. A ring buffer iterator, when created, disables writes to the ring buffer but does not stop other readers or consuming reads from happening. Although all readers are synchronized via a lock, they are only synchronized when in the ring buffer functions. Those functions may be called by any number of readers. The iterator continues down when its not interrupted by a consuming reader. If a consuming read occurs, the iterator starts from the beginning of the buffer. 
The way the iterator sees that a consuming read has happened since its last read is by checking the reader "cache". The cache holds the last counts of the read and the reader page itself. Commit 651e22f2701b changed what was saved by the cache_read when the rb_iter_reset() occurred, making the iterator never match the cache. Then if the iterator calls rb_iter_reset(), it will go into an infinite loop by checking if the cache doesn't match, doing the reset and retrying, just to see that the cache still doesn't match! Which should never happen as the reset is supposed to set the cache to the current value and there are locks that keep a consuming reader from having access to the data. Fixes: 651e22f2701b "ring-buffer: Always reset iterator to reader page" Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b38fb2b9e237..2d75c94ae87d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3359,7 +3359,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->head = cpu_buffer->reader_page->read; iter->cache_reader_page = iter->head_page; - iter->cache_read = iter->head; + iter->cache_read = cpu_buffer->read; if (iter->head) iter->read_stamp = cpu_buffer->read_stamp; -- cgit v1.2.3 From 6c72e3501d0d62fc064d3680e5234f3463ec5a86 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Oct 2014 16:17:02 -0700 Subject: perf: fix perf bug in fork() Oleg noticed that a cleanup by Sylvain actually uncovered a bug; by calling perf_event_free_task() when failing sched_fork() we will not yet have done the memset() on ->perf_event_ctxp[] and will therefore try and 'free' the inherited contexts, which are still in use by the parent process. This is bad.. 
Suggested-by: Oleg Nesterov Reported-by: Oleg Nesterov Reported-by: Sylvain 'ythier' Hitier Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/core.c | 4 +++- kernel/fork.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d640a8b4dcbc..963bf139e2b2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7948,8 +7948,10 @@ int perf_event_init_task(struct task_struct *child) for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); - if (ret) + if (ret) { + perf_event_free_task(child); return ret; + } } return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb6e491..a91e47d86de2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1360,7 +1360,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); retval = copy_semundo(clone_flags, p); @@ -1566,8 +1566,9 @@ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); -bad_fork_cleanup_policy: +bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: -- cgit v1.2.3 From 211de6eba8960521e2be450a7d07db85fba4604c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Sep 2014 19:23:08 +0200 Subject: perf: Fix unclone_ctx() vs. locking The idiot who did 4a1c0f262f88 ("perf: Fix lockdep warning on process exit") forgot to pay attention and fix all similar cases. Do so now. In particular, unclone_ctx() must be called while holding ctx->lock, therefore all such sites are broken for the same reason. Pull the put_ctx() call out from under ctx->lock. 
Reported-by: Sasha Levin Probably-also-reported-by: Vince Weaver Fixes: 4a1c0f262f88 ("perf: Fix lockdep warning on process exit") Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Sasha Levin Cc: Cong Wang Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140930172308.GI4241@worktop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 54 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d640a8b4dcbc..afdd9e1d7144 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -902,13 +902,23 @@ static void put_ctx(struct perf_event_context *ctx) } } -static void unclone_ctx(struct perf_event_context *ctx) +/* + * This must be done under the ctx->lock, such as to serialize against + * context_equiv(), therefore we cannot call put_ctx() since that might end up + * calling scheduler related locks and ctx->lock nests inside those. 
+ */ +static __must_check struct perf_event_context * +unclone_ctx(struct perf_event_context *ctx) { - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); + struct perf_event_context *parent_ctx = ctx->parent_ctx; + + lockdep_assert_held(&ctx->lock); + + if (parent_ctx) ctx->parent_ctx = NULL; - } ctx->generation++; + + return parent_ctx; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -2210,6 +2220,9 @@ static void ctx_sched_out(struct perf_event_context *ctx, static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { + lockdep_assert_held(&ctx1->lock); + lockdep_assert_held(&ctx2->lock); + /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; @@ -2943,6 +2956,7 @@ static int event_enable_on_exec(struct perf_event *event, */ static void perf_event_enable_on_exec(struct perf_event_context *ctx) { + struct perf_event_context *clone_ctx = NULL; struct perf_event *event; unsigned long flags; int enabled = 0; @@ -2974,7 +2988,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * Unclone this context if we enabled any event. 
*/ if (enabled) - unclone_ctx(ctx); + clone_ctx = unclone_ctx(ctx); raw_spin_unlock(&ctx->lock); @@ -2984,6 +2998,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); + + if (clone_ctx) + put_ctx(clone_ctx); } void perf_event_exec(void) @@ -3135,7 +3152,7 @@ errout: static struct perf_event_context * find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) { - struct perf_event_context *ctx; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; unsigned long flags; int ctxn, err; @@ -3169,9 +3186,12 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { - unclone_ctx(ctx); + clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); + + if (clone_ctx) + put_ctx(clone_ctx); } else { ctx = alloc_perf_context(pmu, task); err = -ENOMEM; @@ -7523,7 +7543,7 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *next; - struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *child_ctx, *clone_ctx = NULL; unsigned long flags; if (likely(!child->perf_event_ctxp[ctxn])) { @@ -7549,29 +7569,17 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) task_ctx_sched_out(child_ctx); child->perf_event_ctxp[ctxn] = NULL; - /* - * In order to avoid freeing: child_ctx->parent_ctx->task - * under perf_event_context::lock, grab another reference. - */ - parent_ctx = child_ctx->parent_ctx; - if (parent_ctx) - get_ctx(parent_ctx); - /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all * the events from it. 
*/ - unclone_ctx(child_ctx); + clone_ctx = unclone_ctx(child_ctx); update_context_time(child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - /* - * Now that we no longer hold perf_event_context::lock, drop - * our extra child_ctx->parent_ctx reference. - */ - if (parent_ctx) - put_ctx(parent_ctx); + if (clone_ctx) + put_ctx(clone_ctx); /* * Report the task dead after unscheduling the events so that we -- cgit v1.2.3 From 9c2b9d30e28559a78c9e431cdd7f2c6bf5a9ee67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 29 Sep 2014 12:12:01 +0200 Subject: perf: Fix perf bug in fork() Oleg noticed that a cleanup by Sylvain actually uncovered a bug; by calling perf_event_free_task() when failing sched_fork() we will not yet have done the memset() on ->perf_event_ctxp[] and will therefore try and 'free' the inherited contexts, which are still in use by the parent process. This is bad and might explain some outstanding fuzzer failures ... Suggested-by: Oleg Nesterov Reported-by: Oleg Nesterov Reported-by: Sylvain 'ythier' Hitier Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Tomlin Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Daeseok Youn Cc: David Rientjes Cc: Kees Cook Cc: Linus Torvalds Cc: Paul Mackerras Cc: Rik van Riel Cc: Vladimir Davydov Cc: Link: http://lkml.kernel.org/r/20140929101201.GE5430@worktop Signed-off-by: Ingo Molnar --- kernel/events/core.c | 4 +++- kernel/fork.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index afdd9e1d7144..658f232af04c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7956,8 +7956,10 @@ int perf_event_init_task(struct task_struct *child) for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); - if (ret) + if (ret) { + perf_event_free_task(child); return ret; + } } return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 0cf9cdb6e491..a91e47d86de2 100644 --- a/kernel/fork.c +++ 
b/kernel/fork.c @@ -1360,7 +1360,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_policy; retval = audit_alloc(p); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); retval = copy_semundo(clone_flags, p); @@ -1566,8 +1566,9 @@ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_audit: audit_free(p); -bad_fork_cleanup_policy: +bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: -- cgit v1.2.3 From 43f4d66637bc752e93a77ff2536474a5a3888442 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 1 Oct 2014 15:38:55 +0200 Subject: sched: Improve sysbench performance by fixing spurious active migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit caeb178c60f4 ("sched/fair: Make update_sd_pick_busiest() ...") sd_pick_busiest returns a group that can be neither imbalanced nor overloaded but is only more loaded than others. This change has been introduced to ensure a better load balance in system that are not overloaded but as a side effect, it can also generate useless active migration between groups. Let take the example of 3 tasks on a quad cores system. We will always have an idle core so the load balance will find a busiest group (core) whenever an ILB is triggered and it will force an active migration (once above nr_balance_failed threshold) so the idle core becomes busy but another core will become idle. With the next ILB, the freshly idle core will try to pull the task of a busy CPU. The number of spurious active migration is not so huge in quad core system because the ILB is not triggered so much. 
But it becomes significant as soon as you have more than one sched_domain level like on a dual cluster of quad cores where the ILB is triggered every tick when you have more than 1 busy_cpu. We need to ensure that the migration generate a real improvement and will not only move the avg_load imbalance on another CPU. Before caeb178c60f4f93f1b45c0bc056b5cf6d217b67f, the filtering of such use case was ensured by the following test in f_b_g: if ((local->idle_cpus < busiest->idle_cpus) && busiest->sum_nr_running <= busiest->group_weight) This patch modified the condition to take into account situation where busiest group is not overloaded: If the diff between the number of idle cpus in 2 groups is less than or equal to 1 and the busiest group is not overloaded, moving a task will not improve the load balance but just move it. A test with sysbench on a dual clusters of quad cores gives the following results: command: sysbench --test=cpu --num-threads=5 --max-time=5 run The HZ is 200 which means that 1000 ticks has fired during the test. With Mainline, perf gives the following figures: Samples: 727 of event 'sched:sched_migrate_task' Event count (approx.): 727 Overhead Command Shared Object Symbol ........ ............... ............. .............. 12.52% migration/1 [unknown] [.] 00000000 12.52% migration/5 [unknown] [.] 00000000 12.52% migration/7 [unknown] [.] 00000000 12.10% migration/6 [unknown] [.] 00000000 11.83% migration/0 [unknown] [.] 00000000 11.83% migration/3 [unknown] [.] 00000000 11.14% migration/4 [unknown] [.] 00000000 10.87% migration/2 [unknown] [.] 00000000 2.75% sysbench [unknown] [.] 00000000 0.83% swapper [unknown] [.] 00000000 0.55% ktps65090charge [unknown] [.] 00000000 0.41% mmcqd/1 [unknown] [.] 00000000 0.14% perf [unknown] [.] 00000000 With this patch, perf gives the following figures Samples: 20 of event 'sched:sched_migrate_task' Event count (approx.): 20 Overhead Command Shared Object Symbol ........ ............... ............. 
.............. 80.00% sysbench [unknown] [.] 00000000 10.00% swapper [unknown] [.] 00000000 5.00% ktps65090charge [unknown] [.] 00000000 5.00% migration/1 [unknown] [.] 00000000 Signed-off-by: Vincent Guittot Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1412170735-5356-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10a5a286d8e2..dfdcbfde2c5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6436,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (env->idle == CPU_IDLE) { /* - * This cpu is idle. If the busiest group load doesn't - * have more tasks than the number of available cpu's and - * there is no imbalance between this and busiest group - * wrt to idle cpu's, it is balanced. + * This cpu is idle. If the busiest group is not overloaded + * and there is no imbalance between this and busiest group + * wrt idle cpus, it is balanced. The imbalance becomes + * significant if the diff is greater than 1 otherwise we + * might end up to just move the imbalance on another group */ - if ((local->idle_cpus < busiest->idle_cpus) && - busiest->sum_nr_running <= busiest->group_weight) + if ((busiest->group_type != group_overloaded) && + (local->idle_cpus <= (busiest->idle_cpus + 1))) goto out_balanced; } else { /* -- cgit v1.2.3 From 347abad981c1ef815ea5ba861adba6a8c6aa1580 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 30 Sep 2014 15:59:47 -0400 Subject: sched, time: Fix build error with 64 bit cputime_t on 32 bit systems On 32 bit systems cmpxchg cannot handle 64 bit values, so some additional magic is required to allow a 32 bit system with CONFIG_VIRT_CPU_ACCOUNTING_GEN=y enabled to build. 
Make sure the correct cmpxchg function is used when doing an atomic swap of a cputime_t. Reported-by: Arnd Bergmann Signed-off-by: Rik van Riel Acked-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Cc: umgwanakikbuti@gmail.com Cc: fweisbec@gmail.com Cc: srao@redhat.com Cc: lwoodman@redhat.com Cc: atheurer@redhat.com Cc: oleg@redhat.com Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: linux390@de.ibm.com Cc: linux-arch@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Link: http://lkml.kernel.org/r/20140930155947.070cdb1f@annuminas.surriel.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/cputime.h | 2 ++ arch/s390/include/asm/cputime.h | 2 ++ include/asm-generic/cputime_jiffies.h | 2 ++ include/asm-generic/cputime_nsecs.h | 2 ++ kernel/sched/cputime.c | 29 +++++++++++++++++++---------- 5 files changed, 27 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 607559ab271f..6c840ceab820 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -32,6 +32,8 @@ static inline void setup_cputime_one_jiffy(void) { } typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) + #ifdef __KERNEL__ /* diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index f65bd3634519..3001887f94b7 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -18,6 +18,8 @@ typedef unsigned long long __nocast cputime_t; typedef unsigned long long __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) + static inline unsigned long __div(unsigned long long n, unsigned long base) { #ifndef CONFIG_64BIT diff --git a/include/asm-generic/cputime_jiffies.h 
b/include/asm-generic/cputime_jiffies.h index d5cb78f53986..fe386fc6e85e 100644 --- a/include/asm-generic/cputime_jiffies.h +++ b/include/asm-generic/cputime_jiffies.h @@ -3,6 +3,8 @@ typedef unsigned long __nocast cputime_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) + #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) #define cputime_to_scaled(__ct) (__ct) diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index 4e817606c549..0419485891f2 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h @@ -21,6 +21,8 @@ typedef u64 __nocast cputime_t; typedef u64 __nocast cputime64_t; +#define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) + #define cputime_one_jiffy jiffies_to_cputime(1) #define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 64492dff8a81..8394b1ee600c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -554,6 +554,23 @@ drop_precision: return (__force cputime_t) scaled; } +/* + * Atomically advance counter to the new value. Interrupts, vcpu + * scheduling, and scaling inaccuracies can cause cputime_advance + * to be occasionally called with a new value smaller than counter. + * Let's enforce atomicity. + * + * Normally a caller will only go through this loop once, or not + * at all in case a previous caller updated counter the same jiffy. + */ +static void cputime_advance(cputime_t *counter, cputime_t new) +{ + cputime_t old; + + while (new > (old = ACCESS_ONCE(*counter))) + cmpxchg_cputime(counter, old, new); +} + /* * Adjust tick based cputime random precision against scheduler * runtime accounting. 
@@ -599,16 +616,8 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime - stime; } - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - * Atomic exchange protects against concurrent cputime_adjust(). - */ - while (stime > (rtime = ACCESS_ONCE(prev->stime))) - cmpxchg(&prev->stime, rtime, stime); - while (utime > (rtime = ACCESS_ONCE(prev->utime))) - cmpxchg(&prev->utime, rtime, utime); + cputime_advance(&prev->stime, stime); + cputime_advance(&prev->utime, utime); out: *ut = prev->utime; -- cgit v1.2.3 From 10a12983b3d437a6998b3845870e52c1c752c101 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 1 Oct 2014 01:04:44 +0400 Subject: sched/fair: Delete resched_cpu() from idle_balance() We already reschedule env.dst_cpu in attach_tasks()->check_preempt_curr() if this is necessary. Furthermore, a higher priority class task may be current on dest rq, we shouldn't disturb it. Signed-off-by: Kirill Tkhai Cc: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/20140930210441.5258.55054.stgit@localhost Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dfdcbfde2c5b..bd61cff8ee4f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6701,12 +6701,6 @@ more_balance: local_irq_restore(flags); - /* - * some other cpu did the load balance for us. 
- */ - if (cur_ld_moved && env.dst_cpu != smp_processor_id()) - resched_cpu(env.dst_cpu); - if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; goto more_balance; -- cgit v1.2.3 From f10e00f4bf360c36edbe6bf18a6c75b171cbe012 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 30 Sep 2014 12:23:37 +0400 Subject: sched/dl: Use dl_bw_of() under rcu_read_lock_sched() rq->rd is freed using call_rcu_sched(), so rcu_read_lock() to access it is not enough. We should use either rcu_read_lock_sched() or preempt_disable(). Reported-by: Sasha Levin Suggested-by: Peter Zijlstra Signed-off-by: Kirill Tkhai Fixes: 66339c31bc39 "sched: Use dl_bw_of() under RCU read lock" Link: http://lkml.kernel.org/r/1412065417.20287.24.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b5349fee1213..c84bdc098656 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5264,6 +5264,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb, { unsigned long flags; long cpu = (long)hcpu; + struct dl_bw *dl_b; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: @@ -5271,15 +5272,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb, /* explicitly allow suspend */ if (!(action & CPU_TASKS_FROZEN)) { - struct dl_bw *dl_b = dl_bw_of(cpu); bool overflow; int cpus; + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + raw_spin_lock_irqsave(&dl_b->lock, flags); cpus = dl_bw_cpus(cpu); overflow = __dl_overflow(dl_b, cpus, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (overflow) return notifier_from_errno(-EBUSY); } @@ -7647,11 +7652,10 @@ static int sched_dl_global_constraints(void) u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + struct dl_bw *dl_b; int cpu, ret = 0; unsigned long flags; - 
rcu_read_lock(); - /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in @@ -7662,25 +7666,27 @@ static int sched_dl_global_constraints(void) * solutions is welcome! */ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + rcu_read_unlock_sched(); + if (ret) break; } - rcu_read_unlock(); - return ret; } static void sched_dl_do_global(void) { u64 new_bw = -1; + struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -7690,18 +7696,19 @@ static void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - rcu_read_lock(); /* * FIXME: As above... */ for_each_possible_cpu(cpu) { - struct dl_bw *dl_b = dl_bw_of(cpu); + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); } - rcu_read_unlock(); } static int sched_rt_global_validate(void) -- cgit v1.2.3 From debfab74e453f079cd8b12b0604387a8c510ef3a Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 16 Sep 2014 17:16:57 -0700 Subject: locking/rwsem: Avoid double checking before try acquiring write lock Commit 9b0fc9c09f1b ("rwsem: skip initial trylock in rwsem_down_write_failed") checks for if there are known active lockers in order to avoid write trylocking using expensive cmpxchg() when it likely wouldn't get the lock. However, a subsequent patch was added such that we directly check for sem->count == RWSEM_WAITING_BIAS right before trying that cmpxchg(). Thus, commit 9b0fc9c09f1b now just adds overhead. This patch modifies it so that we only do a check for if count == RWSEM_WAITING_BIAS. 
Also, add a comment on why we do an "extra check" of count before the cmpxchg(). Signed-off-by: Jason Low Acked-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Aswin Chandramouleeswaran Cc: Chegu Vinod Cc: Peter Hurley Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1410913017.2447.22.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 12166ec9b7e7..7628c3fc37ca 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -250,16 +250,18 @@ EXPORT_SYMBOL(rwsem_down_read_failed); static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* try acquiring the write lock */ - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, - RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { - if (!list_is_singular(&sem->wait_list)) - rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); - return true; - } + /* + * Try acquiring the write lock. Check count first in order + * to reduce unnecessary expensive cmpxchg() operations. + */ + if (count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { + if (!list_is_singular(&sem->wait_list)) + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + return true; } + return false; } -- cgit v1.2.3 From 8acd91e8620836a56ff62028ed28ba629f2881a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Sep 2014 15:26:00 +0200 Subject: locking/lockdep: Revert qrwlock recusive stuff Commit f0bab73cb539 ("locking/lockdep: Restrict the use of recursive read_lock() with qrwlock") changed lockdep to try and conform to the qrwlock semantics which differ from the traditional rwlock semantics. 
In particular qrwlock is fair outside of interrupt context, but in interrupt context readers will ignore all fairness. The problem modeling this is that read and write side have different lock state (interrupts) semantics but we only have a single representation of these. Therefore lockdep will get confused, thinking the lock can cause interrupt lock inversions. So revert it for now; the old rwlock semantics were already imperfectly modeled and the qrwlock extra won't fit either. If we want to properly fix this, I think we need to resurrect the work by Gautham did a few years ago that split the read and write state of locks: http://lwn.net/Articles/332801/ FWIW the locking selftest that would've failed (and was reported by Borislav earlier) is something like: RL(X1); /* IRQ-ON */ LOCK(A); UNLOCK(A); RU(X1); IRQ_ENTER(); RL(X1); /* IN-IRQ */ RU(X1); IRQ_EXIT(); At which point it would report that because A is an IRQ-unsafe lock we can suffer the following inversion: CPU0 CPU1 lock(A) lock(X1) lock(A) lock(X1) And this is 'wrong' because X1 can recurse (assuming the above lock are in fact read-lock) but lockdep doesn't know about this. Signed-off-by: Peter Zijlstra (Intel) Cc: Waiman Long Cc: ego@linux.vnet.ibm.com Cc: bp@alien8.de Cc: Linus Torvalds Cc: Paul E. 
McKenney Link: http://lkml.kernel.org/r/20140930132600.GA7444@worktop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 10 +-------- kernel/locking/lockdep.c | 6 ------ lib/locking-selftest.c | 56 ++++++------------------------------------------ 3 files changed, 8 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b5a84b62fb84..f388481201cd 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -478,24 +478,16 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ -/* - * Read states in the 2-bit held_lock:read field: - * 0: Exclusive lock - * 1: Shareable lock, cannot be recursively called - * 2: Shareable lock, can be recursively called - * 3: Shareable lock, cannot be recursively called except in interrupt context - */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) -#define lock_acquire_shared_irecursive(l, s, t, n, i) lock_acquire(l, s, t, 3, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, n, i) lock_release(l, n, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) -#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_irecursive(l, s, t, NULL, i) +#define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define rwlock_release(l, n, i) lock_release(l, n, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 420ba685c4e5..88d0d4420ad2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ 
-3597,12 +3597,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, raw_local_irq_save(flags); check_flags(flags); - /* - * An interrupt recursive read in interrupt context can be considered - * to be the same as a recursive read from checking perspective. - */ - if ((read == 3) && in_interrupt()) - read = 2; current->lockdep_recursion = 1; trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 62af709b2083..872a15a2a637 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -267,46 +267,19 @@ GENERATE_TESTCASE(AA_rsem) #undef E /* - * Special-case for read-locking, they are not allowed to - * recurse on the same lock class except under interrupt context: + * Special-case for read-locking, they are + * allowed to recurse on the same lock class: */ static void rlock_AA1(void) { RL(X1); - RL(X1); // this one should fail + RL(X1); // this one should NOT fail } static void rlock_AA1B(void) { RL(X1); - RL(X2); // this one should fail -} - -static void rlock_AHA1(void) -{ - RL(X1); - HARDIRQ_ENTER(); - RL(X1); // this one should NOT fail - HARDIRQ_EXIT(); -} - -static void rlock_AHA1B(void) -{ - RL(X1); - HARDIRQ_ENTER(); - RL(X2); // this one should NOT fail - HARDIRQ_EXIT(); -} - -static void rlock_ASAHA1(void) -{ - RL(X1); - SOFTIRQ_ENTER(); - RL(X1); // this one should NOT fail - HARDIRQ_ENTER(); - RL(X1); // this one should NOT fail - HARDIRQ_EXIT(); - SOFTIRQ_EXIT(); + RL(X2); // this one should NOT fail } static void rsem_AA1(void) @@ -1096,7 +1069,7 @@ static inline void print_testname(const char *testname) print_testname(desc); \ dotest(name##_spin, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock, FAILURE, LOCKTYPE_RWLOCK); \ - dotest(name##_rlock, FAILURE, LOCKTYPE_RWLOCK); \ + dotest(name##_rlock, SUCCESS, LOCKTYPE_RWLOCK); \ dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, 
FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ @@ -1857,14 +1830,14 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); print_testname("recursive read-lock"); printk(" |"); - dotest(rlock_AA1, FAILURE, LOCKTYPE_RWLOCK); + dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK); printk(" |"); dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM); printk("\n"); print_testname("recursive read-lock #2"); printk(" |"); - dotest(rlock_AA1B, FAILURE, LOCKTYPE_RWLOCK); + dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK); printk(" |"); dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM); printk("\n"); @@ -1883,21 +1856,6 @@ void locking_selftest(void) dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM); printk("\n"); - print_testname("recursive rlock with interrupt"); - printk(" |"); - dotest(rlock_AHA1, SUCCESS, LOCKTYPE_RWLOCK); - printk("\n"); - - print_testname("recursive rlock with interrupt #2"); - printk(" |"); - dotest(rlock_AHA1B, SUCCESS, LOCKTYPE_RWLOCK); - printk("\n"); - - print_testname("recursive rlock with interrupt #3"); - printk(" |"); - dotest(rlock_ASAHA1, SUCCESS, LOCKTYPE_RWLOCK); - printk("\n"); - printk(" --------------------------------------------------------------------------\n"); /* -- cgit v1.2.3 From 789cbbeca4eb7141cbd748ee93772471101b507b Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Sun, 5 Oct 2014 13:24:21 -0400 Subject: workqueue: Add quiescent state between work items Similar to the stop_machine deadlock scenario on !PREEMPT kernels addressed in b22ce2785d97 "workqueue: cond_resched() after processing each work item", kworker threads requeueing back-to-back with zero jiffy delay can stall RCU. The cond_resched call introduced in that fix will yield only iff there are other higher priority tasks to run, so force a quiescent RCU state between work items. 
Signed-off-by: Joe Lawrence Link: https://lkml.kernel.org/r/20140926105227.01325697@jlaw-desktop.mno.stratus.com Link: https://lkml.kernel.org/r/20140929115445.40221d8e@jlaw-desktop.mno.stratus.com Fixes: b22ce2785d97 ("workqueue: cond_resched() after processing each work item") Cc: Acked-by: Tejun Heo Signed-off-by: Paul E. McKenney --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5dbe22aa3efd..345bec95e708 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2043,8 +2043,10 @@ __acquires(&pool->lock) * kernels, where a requeueing work item waiting for something to * happen could deadlock with stop_machine as such work item could * indefinitely requeue itself while all other CPUs are trapped in - * stop_machine. + * stop_machine. At the same time, report a quiescent RCU state so + * the same condition doesn't freeze RCU. */ + rcu_note_voluntary_context_switch(current); cond_resched(); spin_lock_irq(&pool->lock); -- cgit v1.2.3 From 3e28e377204badfc3c4119ff2abda473127ee0ff Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Sun, 5 Oct 2014 13:24:22 -0400 Subject: workqueue: Use cond_resched_rcu_qs macro Tidy up and use cond_resched_rcu_qs when calling cond_resched and reporting potential quiescent state to RCU. Splitting this change in this way allows easy backporting to -stable for kernel versions not having cond_resched_rcu_qs(). Signed-off-by: Joe Lawrence Acked-by: Tejun Heo Signed-off-by: Paul E. McKenney --- kernel/workqueue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 345bec95e708..09b685daee3d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2046,8 +2046,7 @@ __acquires(&pool->lock) * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. 
*/ - rcu_note_voluntary_context_switch(current); - cond_resched(); + cond_resched_rcu_qs(); spin_lock_irq(&pool->lock); -- cgit v1.2.3 From fe0e01c77dd9f7a60916aec2149d8a1182baf63c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Oct 2014 18:51:10 +0200 Subject: tracing: Robustify wait loop The pending nested sleep debugging triggered on the potential stale TASK_INTERRUPTIBLE in this code. While there, fix the loop such that we won't revert to a while(1) yield() 'spin' loop if we ever get a spurious wakeup. And fix the actual issue by properly terminating the 'wait' loop by setting TASK_RUNNING. Link: http://lkml.kernel.org/p/20141008165110.GA14547@worktop.programming.kicks-ass.net Reported-by: Fengguang Wu Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ef06ce7e9cf8..0cc51edde3a8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2513,8 +2513,11 @@ static __init int event_test_thread(void *unused) kfree(test_malloc); set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) + while (!kthread_should_stop()) { schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); return 0; } -- cgit v1.2.3 From 849f3127bb46ef75a66dffc1b9b0d3f5f43fa395 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Aug 2014 12:23:53 -0400 Subject: switch /dev/kmsg to ->write_iter() Signed-off-by: Al Viro --- kernel/printk/printk.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1ce770687ea8..7a6e69441f75 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -519,14 +519,13 @@ struct devkmsg_user { char buf[8192]; }; -static ssize_t devkmsg_writev(struct kiocb *iocb, const 
struct iovec *iv, - unsigned long count, loff_t pos) +static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ - size_t len = iov_length(iv, count); + size_t len = iocb->ki_nbytes; ssize_t ret = len; if (len > LOG_LINE_MAX) @@ -535,13 +534,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, if (buf == NULL) return -ENOMEM; - line = buf; - for (i = 0; i < count; i++) { - if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { - ret = -EFAULT; - goto out; - } - line += iv[i].iov_len; + buf[len] = '\0'; + if (copy_from_iter(buf, len, from) != len) { + kfree(buf); + return -EFAULT; } /* @@ -567,10 +563,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, line = endp; } } - line[len] = '\0'; printk_emit(facility, level, NULL, 0, "%s", line); -out: kfree(buf); return ret; } @@ -802,7 +796,7 @@ static int devkmsg_release(struct inode *inode, struct file *file) const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, - .aio_write = devkmsg_writev, + .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, -- cgit v1.2.3 From fe0f49768d807a8fe6336b097feb8c4441951710 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 30 Sep 2014 17:37:52 +0200 Subject: s390/nohz: use a per-cpu flag for arch_needs_cpu Move the nohz_delay bit from the s390_idle data structure to the per-cpu flags. Clear the nohz delay flag in __cpu_disable and remove the cpu hotplug notifier that used to do this. 
Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/cputime.h | 8 -------- arch/s390/include/asm/processor.h | 4 ++++ arch/s390/kernel/irq.c | 2 +- arch/s390/kernel/smp.c | 1 + arch/s390/kernel/vtime.c | 19 +------------------ drivers/s390/cio/airq.c | 2 +- drivers/s390/cio/cio.c | 2 +- include/linux/tick.h | 2 +- kernel/time/tick-sched.c | 2 +- 9 files changed, 11 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index f65bd3634519..01887b1fade5 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -166,7 +166,6 @@ static inline clock_t cputime64_to_clock_t(cputime64_t cputime) } struct s390_idle_data { - int nohz_delay; unsigned int sequence; unsigned long long idle_count; unsigned long long idle_time; @@ -182,11 +181,4 @@ cputime64_t s390_get_idle_time(int cpu); #define arch_idle_time(cpu) s390_get_idle_time(cpu) -static inline int s390_nohz_delay(int cpu) -{ - return __get_cpu_var(s390_idle).nohz_delay != 0; -} - -#define arch_needs_cpu(cpu) s390_nohz_delay(cpu) - #endif /* _S390_CPUTIME_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index e568fc8a7250..bc796d73129b 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -13,9 +13,11 @@ #define CIF_MCCK_PENDING 0 /* machine check handling is pending */ #define CIF_ASCE 1 /* user asce needs fixup / uaccess */ +#define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */ #define _CIF_MCCK_PENDING (1<int_code; if (ext_code.code != EXT_IRQ_CLK_COMP) - __get_cpu_var(s390_idle).nohz_delay = 1; + set_cpu_flag(CIF_NOHZ_DELAY); index = ext_hash(ext_code.code); rcu_read_lock(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index abec97b4ddbf..46317d6951c4 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -720,6 +720,7 @@ int __cpu_disable(void) cregs[6] &= ~0xff000000UL; /* disable all I/O 
interrupts */ cregs[14] &= ~0x1f000000UL; /* disable most machine checks */ __ctl_load(cregs, 0, 15); + clear_cpu_flag(CIF_NOHZ_DELAY); return 0; } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 8c34363d6f1e..40709821abde 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -163,7 +163,7 @@ void __kprobes vtime_stop_cpu(void) /* Wait for external, I/O or machine check interrupt. */ psw_mask = PSW_KERNEL_BITS | PSW_MASK_WAIT | PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; - idle->nohz_delay = 0; + clear_cpu_flag(CIF_NOHZ_DELAY); /* Call the assembler magic in entry.S */ psw_idle(idle, psw_mask); @@ -378,25 +378,8 @@ void init_cpu_vtimer(void) set_vtimer(VTIMER_MAX_SLICE); } -static int s390_nohz_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - struct s390_idle_data *idle; - long cpu = (long) hcpu; - - idle = &per_cpu(s390_idle, cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DYING: - idle->nohz_delay = 0; - default: - break; - } - return NOTIFY_OK; -} - void __init vtime_init(void) { /* Enable cpu timer interrupts on the boot cpu. 
*/ init_cpu_vtimer(); - cpu_notifier(s390_nohz_notify, 0); } diff --git a/drivers/s390/cio/airq.c b/drivers/s390/cio/airq.c index 00bfbee0af9e..56eb4ee4deba 100644 --- a/drivers/s390/cio/airq.c +++ b/drivers/s390/cio/airq.c @@ -87,7 +87,7 @@ static irqreturn_t do_airq_interrupt(int irq, void *dummy) struct airq_struct *airq; struct hlist_head *head; - __this_cpu_write(s390_idle.nohz_delay, 1); + set_cpu_flag(CIF_NOHZ_DELAY); tpi_info = (struct tpi_info *) &get_irq_regs()->int_code; head = &airq_lists[tpi_info->isc]; rcu_read_lock(); diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 2905d8b0ec95..d5a6f287d2fe 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -561,7 +561,7 @@ static irqreturn_t do_cio_interrupt(int irq, void *dummy) struct subchannel *sch; struct irb *irb; - __this_cpu_write(s390_idle.nohz_delay, 1); + set_cpu_flag(CIF_NOHZ_DELAY); tpi_info = (struct tpi_info *) &get_irq_regs()->int_code; irb = &__get_cpu_var(cio_irb); sch = (struct subchannel *)(unsigned long) tpi_info->intparm; diff --git a/include/linux/tick.h b/include/linux/tick.h index 9a82c7dc3fdd..e5832d03da19 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -108,7 +108,7 @@ extern struct tick_sched *tick_get_tick_sched(int cpu); extern void tick_irq_enter(void); extern int tick_oneshot_mode_active(void); # ifndef arch_needs_cpu -# define arch_needs_cpu(cpu) (0) +# define arch_needs_cpu() (0) # endif # else static inline void tick_clock_notify(void) { } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f654a8a298fa..01d512fd45f1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -572,7 +572,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&jiffies_lock, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || - arch_needs_cpu(cpu) || irq_work_needs_cpu()) { + arch_needs_cpu() || irq_work_needs_cpu()) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { 
-- cgit v1.2.3 From addff1feb02b03cb766b9a611c6b2cebf29bc285 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 8 Oct 2014 13:52:16 -0400 Subject: tracing: Clean up scheduling in trace_wakeup_test_thread() Peter's new debugging tool triggers when tasks exit with !TASK_RUNNING. The code in trace_wakeup_test_thread() also has a single schedule() call that should be encompassed by a loop. This cleans up the code a little to make it a bit more robust and also makes the return exit properly with TASK_RUNNING. Link: http://lkml.kernel.org/p/20141008135216.76142204@gandalf.local.home Reported-by: Peter Zijlstra Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 47 +++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5ef60499dc8e..593f52b73551 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1025,6 +1025,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) #endif #ifdef CONFIG_SCHED_TRACER + +struct wakeup_test_data { + struct completion is_ready; + int go; +}; + static int trace_wakeup_test_thread(void *data) { /* Make this a -deadline thread */ @@ -1034,51 +1040,56 @@ static int trace_wakeup_test_thread(void *data) .sched_deadline = 10000000ULL, .sched_period = 10000000ULL }; - struct completion *x = data; + struct wakeup_test_data *x = data; sched_setattr(current, &attr); /* Make it know we have a new prio */ - complete(x); + complete(&x->is_ready); /* now go to sleep and let the test wake us up */ set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!x->go) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } - complete(x); + complete(&x->is_ready); + + set_current_state(TASK_INTERRUPTIBLE); /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { - /* - * This will likely be the system 
top priority - * task, do short sleeps to let others run. - */ - msleep(100); + schedule(); + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); + return 0; } - int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { unsigned long save_max = tr->max_latency; struct task_struct *p; - struct completion is_ready; + struct wakeup_test_data data; unsigned long count; int ret; - init_completion(&is_ready); + memset(&data, 0, sizeof(data)); + + init_completion(&data.is_ready); /* create a -deadline thread */ - p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); + p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test"); if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } /* make sure the thread is running at -deadline policy */ - wait_for_completion(&is_ready); + wait_for_completion(&data.is_ready); /* start the tracing */ ret = tracer_init(trace, tr); @@ -1099,18 +1110,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) msleep(100); } - init_completion(&is_ready); + init_completion(&data.is_ready); + + data.go = 1; + /* memory barrier is in the wake_up_process() */ wake_up_process(p); /* Wait for the task to wake up */ - wait_for_completion(&is_ready); + wait_for_completion(&data.is_ready); /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ ret = trace_test_buffer(&tr->trace_buffer, NULL); - printk("ret = %d\n", ret); if (!ret) ret = trace_test_buffer(&tr->max_buffer, &count); -- cgit v1.2.3 From b1a8de1f534337b398c7778578a56ec4f018cb27 Mon Sep 17 00:00:00 2001 From: chai wen Date: Thu, 9 Oct 2014 15:25:17 -0700 Subject: softlockup: make detector be aware of task switch of processes hogging cpu For now, soft lockup detector warns once for each case of process softlockup. 
But the thread 'watchdog/n' may not always get the cpu at the time slot between the task switch of two processes hogging that cpu to reset soft_watchdog_warn. An example would be two processes hogging the cpu. Process A causes the softlockup warning and is killed manually by a user. Process B immediately becomes the new process hogging the cpu preventing the softlockup code from resetting the soft_watchdog_warn variable. This case is a false negative of "warn only once for a process", as there may be a different process that is going to hog the cpu. Resolve this by saving/checking the task pointer of the hogging process and use that to reset soft_watchdog_warn too. [dzickus@redhat.com: update comment] Signed-off-by: chai wen Signed-off-by: Don Zickus Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a8d6914030fe..7b223b212683 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -47,6 +47,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); +static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); #ifdef CONFIG_HARDLOCKUP_DETECTOR static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); @@ -333,8 +334,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; /* only warn once */ - if (__this_cpu_read(soft_watchdog_warn) == true) + if (__this_cpu_read(soft_watchdog_warn) == true) { + /* + * When multiple processes are causing softlockups the + * softlockup detector only warns on the first one + * because the code relies on a full quiet cycle to + * re-arm. 
The second process prevents the quiet cycle + * and never gets reported. Use task pointers to detect + * this. + */ + if (__this_cpu_read(softlockup_task_ptr_saved) != + current) { + __this_cpu_write(soft_watchdog_warn, false); + __touch_watchdog(); + } return HRTIMER_RESTART; + } if (softlockup_all_cpu_backtrace) { /* Prevent multiple soft-lockup reports if one cpu is already @@ -350,6 +365,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); + __this_cpu_write(softlockup_task_ptr_saved, current); print_modules(); print_irqtrace_events(current); if (regs) -- cgit v1.2.3 From 109228389a943edd7e5c6ae94a7fda119691baec Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 9 Oct 2014 15:26:18 -0700 Subject: kernel/kthread.c: partial revert of 81c98869faa5 ("kthread: ensure locality of task_struct allocations") After discussions with Tejun, we don't want to spread the use of cpu_to_mem() (and thus knowledge of allocators/NUMA topology details) into callers, but would rather ensure the callees correctly handle memoryless nodes. With the previous patches ("topology: add support for node_to_mem_node() to determine the fallback node" and "slub: fallback to node_to_mem_node() node if allocating on memoryless node") adding and using node_to_mem_node(), we can safely undo part of the change to the kthread logic from 81c98869faa5. 
Signed-off-by: Nishanth Aravamudan Cc: Joonsoo Kim Cc: David Rientjes Cc: Han Pingtian Cc: Pekka Enberg Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Anton Blanchard Cc: Christoph Lameter Cc: Wanpeng Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index ef483220e855..10e489c448fe 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), { struct task_struct *p; - p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, + p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; -- cgit v1.2.3 From 8764b338b37524ab1a78aee527318ebee9762487 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 9 Oct 2014 15:27:32 -0700 Subject: mm: use may_adjust_brk helper Signed-off-by: Cyrill Gorcunov Cc: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Eric W. Biederman Cc: H. 
Peter Anvin Acked-by: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 11 ++++------- mm/mmap.c | 7 +++---- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a26..7879729bd3bd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1693,7 +1693,6 @@ exit: static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { - unsigned long rlim = rlimit(RLIMIT_DATA); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int error; @@ -1733,9 +1732,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (addr <= mm->end_data) goto out; - if (rlim < RLIM_INFINITY && - (mm->brk - addr) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, + mm->end_data, mm->start_data)) goto out; mm->start_brk = addr; @@ -1745,9 +1743,8 @@ static int prctl_set_mm(int opt, unsigned long addr, if (addr <= mm->end_data) goto out; - if (rlim < RLIM_INFINITY && - (addr - mm->start_brk) + - (mm->end_data - mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, + mm->end_data, mm->start_data)) goto out; mm->brk = addr; diff --git a/mm/mmap.c b/mm/mmap.c index 2814189f501e..7ff38f1a66ec 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -268,7 +268,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len); SYSCALL_DEFINE1(brk, unsigned long, brk) { - unsigned long rlim, retval; + unsigned long retval; unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; unsigned long min_brk; @@ -298,9 +298,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ - rlim = rlimit(RLIMIT_DATA); - if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + - (mm->end_data - 
mm->start_data) > rlim) + if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, + mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); -- cgit v1.2.3 From 71fe97e185040c5dac3216cd54e186dfa534efa0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 9 Oct 2014 15:27:34 -0700 Subject: prctl: PR_SET_MM -- factor out mmap_sem when updating mm::exe_file Instead of taking mm->mmap_sem inside prctl_set_mm_exe_file() move it out and rename the helper to prctl_set_mm_exe_file_locked(). This will allow to reuse this function in a next patch. Signed-off-by: Cyrill Gorcunov Cc: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Eric W. Biederman Cc: H. Peter Anvin Acked-by: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 7879729bd3bd..14222a1699c0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1628,12 +1628,14 @@ SYSCALL_DEFINE1(umask, int, mask) return mask; } -static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) { struct fd exe; struct inode *inode; int err; + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + exe = fdget(fd); if (!exe.file) return -EBADF; @@ -1654,8 +1656,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; - down_write(&mm->mmap_sem); - /* * Forbid mm->exe_file change if old file still mapped. 
*/ @@ -1667,7 +1667,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (vma->vm_file && path_equal(&vma->vm_file->f_path, &mm->exe_file->f_path)) - goto exit_unlock; + goto exit; } /* @@ -1678,13 +1678,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) */ err = -EPERM; if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit_unlock; + goto exit; err = 0; set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ -exit_unlock: - up_write(&mm->mmap_sem); - exit: fdput(exe); return err; @@ -1703,8 +1700,12 @@ static int prctl_set_mm(int opt, unsigned long addr, if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (opt == PR_SET_MM_EXE_FILE) - return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (opt == PR_SET_MM_EXE_FILE) { + down_write(&mm->mmap_sem); + error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); + up_write(&mm->mmap_sem); + return error; + } if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; -- cgit v1.2.3 From f606b77f1a9e362451aca8f81d8f36a3a112139e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 9 Oct 2014 15:27:37 -0700 Subject: prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation During development of c/r we've noticed that in case if we need to support user namespaces we face a problem with capabilities in prctl(PR_SET_MM, ...) call, in particular once new user namespace is created capable(CAP_SYS_RESOURCE) no longer passes. A approach is to eliminate CAP_SYS_RESOURCE check but pass all new values in one bundle, which would allow the kernel to make more intensive test for sanity of values and same time allow us to support checkpoint/restore of user namespaces. Thus a new command PR_SET_MM_MAP introduced. It takes a pointer of prctl_mm_map structure which carries all the members to be updated. 
prctl(PR_SET_MM, PR_SET_MM_MAP, struct prctl_mm_map *, size) struct prctl_mm_map { __u64 start_code; __u64 end_code; __u64 start_data; __u64 end_data; __u64 start_brk; __u64 brk; __u64 start_stack; __u64 arg_start; __u64 arg_end; __u64 env_start; __u64 env_end; __u64 *auxv; __u32 auxv_size; __u32 exe_fd; }; All members except @exe_fd correspond to ones of struct mm_struct. To figure out which available values these members may take here are meanings of the members. - start_code, end_code: represent bounds of executable code area - start_data, end_data: represent bounds of data area - start_brk, brk: used to calculate bounds for brk() syscall - start_stack: used when accounting space needed for command line arguments, environment and shmat() syscall - arg_start, arg_end, env_start, env_end: represent memory area supplied for command line arguments and environment variables - auxv, auxv_size: carries auxiliary vector, Elf format specifics - exe_fd: file descriptor number for executable link (/proc/self/exe) Thus we apply the following requirements to the values 1) Any member except @auxv, @auxv_size, @exe_fd is rather an address in user space thus it must lie inside [mmap_min_addr, mmap_max_addr) interval. 2) While @[start|end]_code and @[start|end]_data may point to nonexistent VMAs (say a program maps own new .text and .data segments during execution) the rest of members should belong to VMA which must exist. 3) Addresses must be ordered, i.e. @start_ member must not be greater or equal to appropriate @end_ member. 4) As in regular Elf loading procedure we require that @start_brk and @brk be greater than @end_data. 5) If RLIMIT_DATA rlimit is set to non-infinity new values should not exceed existing limit. Same applies to RLIMIT_STACK. 6) Auxiliary vector size must not exceed existing one (which is predefined as AT_VECTOR_SIZE and depends on architecture).
7) File descriptor passed in @exe_fd should be pointing to executable file (because we use existing prctl_set_mm_exe_file_locked helper it ensures that the file we are going to use as exe link has all required permission granted). Now about where these members are involved inside kernel code: - @start_code and @end_code are used in /proc/$pid/[stat|statm] output; - @start_data and @end_data are used in /proc/$pid/[stat|statm] output, also they are considered if there is enough space for brk() syscall result if RLIMIT_DATA is set; - @start_brk shown in /proc/$pid/stat output and accounted in brk() syscall if RLIMIT_DATA is set; also this member is tested to find a symbolic name of mmap event for perf system (we choose if event is generated for "heap" area); one more application is selinux -- we test if a process has PROCESS__EXECHEAP permission if trying to make heap area being executable with mprotect() syscall; - @brk is a current value for brk() syscall which lies inside heap area, it's shown in /proc/$pid/stat. When syscall brk() successfully provides new memory area to a user space upon brk() completion the mm::brk is updated to carry new value; Both @start_brk and @brk are actively used in /proc/$pid/maps and /proc/$pid/smaps output to find a symbolic name "heap" for VMA being scanned; - @start_stack is printed out in /proc/$pid/stat and used to find a symbolic name "stack" for task and threads in /proc/$pid/maps and /proc/$pid/smaps output, and as the same as with @start_brk -- perf system uses it for event naming. Also the kernel treats this member as a start address of where to map vDSO pages and to check if there is enough space for shmat() syscall; - @arg_start, @arg_end, @env_start and @env_end are printed out in /proc/$pid/stat. Another access to the data these members represent is to read /proc/$pid/environ or /proc/$pid/cmdline.
Any attempt to read these areas kernel tests with access_process_vm helper so a user must have enough rights for this action; - @auxv and @auxv_size may be read from /proc/$pid/auxv. Strictly speaking kernel doesn't care much about which exactly data is sitting there because it is solely for userspace; - @exe_fd is referred from /proc/$pid/exe and when generating coredump. We use prctl_set_mm_exe_file_locked helper to update this member, so exe-file link modification remains one-shot action. Still note that updating exe-file link now doesn't require sys-resource capability anymore, after all there is not much profit in preventing setup own file link (there are a number of ways to execute own code -- ptrace, ld-preload, so that the only reliable way to find which exactly code is executed is to inspect running program memory). Still we require the caller to be at least user-namespace root user. I believe the old interface should be deprecated and ripped off in a couple of kernel releases if no one is against it. To test if new interface is implemented in the kernel one can pass PR_SET_MM_MAP_SIZE opcode and the kernel returns the size of currently supported struct prctl_mm_map. [akpm@linux-foundation.org: fix 80-col wordwrap in macro definitions] Signed-off-by: Cyrill Gorcunov Cc: Kees Cook Cc: Tejun Heo Acked-by: Andrew Vagin Tested-by: Andrew Vagin Cc: Eric W. Biederman Cc: H.
Peter Anvin Acked-by: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/prctl.h | 27 +++++++ kernel/sys.c | 190 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 216 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 58afc04c107e..513df75d0fc9 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -1,6 +1,8 @@ #ifndef _LINUX_PRCTL_H #define _LINUX_PRCTL_H +#include + /* Values to pass as first argument to prctl() */ #define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ @@ -119,6 +121,31 @@ # define PR_SET_MM_ENV_END 11 # define PR_SET_MM_AUXV 12 # define PR_SET_MM_EXE_FILE 13 +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +/* + * This structure provides new memory descriptor + * map which mostly modifies /proc/pid/stat[m] + * output for a task. This mostly done in a + * sake of checkpoint/restore functionality. + */ +struct prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack; /* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + __u64 *auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; /* * Set specific pid that is allowed to ptrace the current task. 
diff --git a/kernel/sys.c b/kernel/sys.c index 14222a1699c0..f7030b060018 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1687,6 +1687,187 @@ exit: return err; } +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * WARNING: we don't require any capability here so be very careful + * in what is allowed for modification from userspace. + */ +static int validate_prctl_map(struct prctl_mm_map *prctl_map) +{ + unsigned long mmap_max_addr = TASK_SIZE; + struct mm_struct *mm = current->mm; + int error = -EINVAL, i; + + static const unsigned char offsets[] = { + offsetof(struct prctl_mm_map, start_code), + offsetof(struct prctl_mm_map, end_code), + offsetof(struct prctl_mm_map, start_data), + offsetof(struct prctl_mm_map, end_data), + offsetof(struct prctl_mm_map, start_brk), + offsetof(struct prctl_mm_map, brk), + offsetof(struct prctl_mm_map, start_stack), + offsetof(struct prctl_mm_map, arg_start), + offsetof(struct prctl_mm_map, arg_end), + offsetof(struct prctl_mm_map, env_start), + offsetof(struct prctl_mm_map, env_end), + }; + + /* + * Make sure the members are not somewhere outside + * of allowed address space. + */ + for (i = 0; i < ARRAY_SIZE(offsets); i++) { + u64 val = *(u64 *)((char *)prctl_map + offsets[i]); + + if ((unsigned long)val >= mmap_max_addr || + (unsigned long)val < mmap_min_addr) + goto out; + } + + /* + * Make sure the pairs are ordered. + */ +#define __prctl_check_order(__m1, __op, __m2) \ + ((unsigned long)prctl_map->__m1 __op \ + (unsigned long)prctl_map->__m2) ? 0 : -EINVAL + error = __prctl_check_order(start_code, <, end_code); + error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_brk, <=, brk); + error |= __prctl_check_order(arg_start, <=, arg_end); + error |= __prctl_check_order(env_start, <=, env_end); + if (error) + goto out; +#undef __prctl_check_order + + error = -EINVAL; + + /* + * @brk should be after @end_data in traditional maps. 
+ */ + if (prctl_map->start_brk <= prctl_map->end_data || + prctl_map->brk <= prctl_map->end_data) + goto out; + + /* + * Neither we should allow to override limits if they set. + */ + if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, + prctl_map->start_brk, prctl_map->end_data, + prctl_map->start_data)) + goto out; + + /* + * Someone is trying to cheat the auxv vector. + */ + if (prctl_map->auxv_size) { + if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) + goto out; + } + + /* + * Finally, make sure the caller has the rights to + * change /proc/pid/exe link: only local root should + * be allowed to. + */ + if (prctl_map->exe_fd != (u32)-1) { + struct user_namespace *ns = current_user_ns(); + const struct cred *cred = current_cred(); + + if (!uid_eq(cred->uid, make_kuid(ns, 0)) || + !gid_eq(cred->gid, make_kgid(ns, 0))) + goto out; + } + + error = 0; +out: + return error; +} + +static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) +{ + struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; + unsigned long user_auxv[AT_VECTOR_SIZE]; + struct mm_struct *mm = current->mm; + int error; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); + + if (opt == PR_SET_MM_MAP_SIZE) + return put_user((unsigned int)sizeof(prctl_map), + (unsigned int __user *)addr); + + if (data_size != sizeof(prctl_map)) + return -EINVAL; + + if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) + return -EFAULT; + + error = validate_prctl_map(&prctl_map); + if (error) + return error; + + if (prctl_map.auxv_size) { + memset(user_auxv, 0, sizeof(user_auxv)); + if (copy_from_user(user_auxv, + (const void __user *)prctl_map.auxv, + prctl_map.auxv_size)) + return -EFAULT; + + /* Last entry must be AT_NULL as specification requires */ + user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; + user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; + } + + down_write(&mm->mmap_sem); + if (prctl_map.exe_fd 
!= (u32)-1) + error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); + downgrade_write(&mm->mmap_sem); + if (error) + goto out; + + /* + * We don't validate if these members are pointing to + * real present VMAs because application may have correspond + * VMAs already unmapped and kernel uses these members for statistics + * output in procfs mostly, except + * + * - @start_brk/@brk which are used in do_brk but kernel lookups + * for VMAs when updating these memvers so anything wrong written + * here cause kernel to swear at userspace program but won't lead + * to any problem in kernel itself + */ + + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + + /* + * Note this update of @saved_auxv is lockless thus + * if someone reads this member in procfs while we're + * updating -- it may get partly updated results. It's + * known and acceptable trade off: we leave it as is to + * not introduce additional locks here making the kernel + * more complex. 
+ */ + if (prctl_map.auxv_size) + memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); + + error = 0; +out: + up_read(&mm->mmap_sem); + return error; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { @@ -1694,9 +1875,16 @@ static int prctl_set_mm(int opt, unsigned long addr, struct vm_area_struct *vma; int error; - if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) + if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && + opt != PR_SET_MM_MAP && + opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) + return prctl_set_mm_map(opt, (const void __user *)addr, arg4); +#endif + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; -- cgit v1.2.3 From 1f13ae399c58af5a05b5cee61da864e1f4071de4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 9 Oct 2014 15:27:39 -0700 Subject: mm: remove noisy remainder of the scan_unevictable interface The deprecation warnings for the scan_unevictable interface are triggered by scripts doing `sysctl -a | grep something else'. This is annoying and not helpful. The interface has been defunct since 264e56d8247e ("mm: disable user interface to manually rescue unevictable pages"), which was in 2011, and there haven't been any reports of usecases for it, only reports that the deprecation warnings are annoying. It's unlikely that anybody is using this interface specifically at this point, so remove it. 
Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/stable/sysfs-devices-node | 8 ---- drivers/base/node.c | 3 -- include/linux/swap.h | 16 -------- kernel/sysctl.c | 7 ---- mm/vmscan.c | 63 ----------------------------- 5 files changed, 97 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index ce259c13c36a..5b2d0f08867c 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -85,14 +85,6 @@ Description: will be compacted. When it completes, memory will be freed into blocks which have as many contiguous pages as possible -What: /sys/devices/system/node/nodeX/scan_unevictable_pages -Date: October 2008 -Contact: Lee Schermerhorn -Description: - When set, it triggers scanning the node's unevictable lists - and move any pages that have become evictable onto the respective - zone's inactive list. 
See mm/vmscan.c - What: /sys/devices/system/node/nodeX/hugepages/hugepages-/ Date: December 2009 Contact: Lee Schermerhorn diff --git a/drivers/base/node.c b/drivers/base/node.c index d51c49c9bafa..472168cd0c97 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -289,8 +289,6 @@ static int register_node(struct node *node, int num, struct node *parent) device_create_file(&node->dev, &dev_attr_distance); device_create_file(&node->dev, &dev_attr_vmstat); - scan_unevictable_register_node(node); - hugetlb_register_node(node); compaction_register_node(node); @@ -314,7 +312,6 @@ void unregister_node(struct node *node) device_remove_file(&node->dev, &dev_attr_distance); device_remove_file(&node->dev, &dev_attr_vmstat); - scan_unevictable_unregister_node(node); hugetlb_unregister_node(node); /* no-op, if memoryless node */ device_unregister(&node->dev); diff --git a/include/linux/swap.h b/include/linux/swap.h index 1b72060f093a..ea4f926e6b9b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -354,22 +354,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) extern int page_evictable(struct page *page); extern void check_move_unevictable_pages(struct page **, int nr_pages); -extern unsigned long scan_unevictable_pages; -extern int scan_unevictable_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -#ifdef CONFIG_NUMA -extern int scan_unevictable_register_node(struct node *node); -extern void scan_unevictable_unregister_node(struct node *node); -#else -static inline int scan_unevictable_register_node(struct node *node) -{ - return 0; -} -static inline void scan_unevictable_unregister_node(struct node *node) -{ -} -#endif - extern int kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_MEMCG diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75875a741b5e..91180987e40e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1460,13 +1460,6 @@ static struct ctl_table vm_table[] = { .extra2 
= &one, }, #endif - { - .procname = "scan_unevictable_pages", - .data = &scan_unevictable_pages, - .maxlen = sizeof(scan_unevictable_pages), - .mode = 0644, - .proc_handler = scan_unevictable_handler, - }, #ifdef CONFIG_MEMORY_FAILURE { .procname = "memory_failure_early_kill", diff --git a/mm/vmscan.c b/mm/vmscan.c index 1a71b8b1ea34..af72fe8e8d74 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3797,66 +3797,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) } } #endif /* CONFIG_SHMEM */ - -static void warn_scan_unevictable_pages(void) -{ - printk_once(KERN_WARNING - "%s: The scan_unevictable_pages sysctl/node-interface has been " - "disabled for lack of a legitimate use case. If you have " - "one, please send an email to linux-mm@kvack.org.\n", - current->comm); -} - -/* - * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of - * all nodes' unevictable lists for evictable pages - */ -unsigned long scan_unevictable_pages; - -int scan_unevictable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - warn_scan_unevictable_pages(); - proc_doulongvec_minmax(table, write, buffer, length, ppos); - scan_unevictable_pages = 0; - return 0; -} - -#ifdef CONFIG_NUMA -/* - * per node 'scan_unevictable_pages' attribute. On demand re-scan of - * a specified node's per zone unevictable lists for evictable pages. - */ - -static ssize_t read_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - warn_scan_unevictable_pages(); - return sprintf(buf, "0\n"); /* always zero; should fit... 
*/ -} - -static ssize_t write_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - warn_scan_unevictable_pages(); - return 1; -} - - -static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, - read_scan_unevictable_node, - write_scan_unevictable_node); - -int scan_unevictable_register_node(struct node *node) -{ - return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); -} - -void scan_unevictable_unregister_node(struct node *node) -{ - device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); -} -#endif -- cgit v1.2.3 From 6b6482bbf64ef6f6dbc8b52f7a7cf88a0498bd51 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 9 Oct 2014 15:27:48 -0700 Subject: mempolicy: remove the "task" arg of vma_policy_mof() and simplify it 1. vma_policy_mof(task) is simply not safe unless task == current, it can race with do_exit()->mpol_put(). Remove this arg and update its single caller. 2. vma can not be NULL, remove this check and simplify the code. Signed-off-by: Oleg Nesterov Cc: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Alexander Viro Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: "Kirill A. 
Shutemov" Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Andi Kleen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 +- kernel/sched/fair.c | 2 +- mm/mempolicy.c | 25 +++++++++++-------------- 3 files changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index f230a978e6ba..5e4bfcedd2ce 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -136,7 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_vma_policy(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long addr); -bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma); +bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfa3c86d0d68..82088b29704e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1946,7 +1946,7 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) + if (!vma_migratable(vma) || !vma_policy_mof(vma)) continue; /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b86b08e77b8d..ad27bbc757bf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1646,27 +1646,24 @@ struct mempolicy *get_vma_policy(struct task_struct *task, return pol; } -bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) +bool vma_policy_mof(struct vm_area_struct *vma) { - struct mempolicy *pol = NULL; - - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - bool ret = false; + struct mempolicy *pol; - pol = vma->vm_ops->get_policy(vma, vma->vm_start); - if (pol && (pol->flags & MPOL_F_MOF)) - ret = true; - mpol_cond_put(pol); + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; - return ret; - } + pol = 
vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); - pol = vma->vm_policy; + return ret; } + pol = vma->vm_policy; if (!pol) - pol = get_task_policy(task); + pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } -- cgit v1.2.3 From 96dad67ff244e797c4bc3e4f7f0fdaa0cfdf0a7d Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 9 Oct 2014 15:28:39 -0700 Subject: mm: use VM_BUG_ON_MM where possible Dump the contents of the relevant struct_mm when we hit the bug condition. Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +-- kernel/sys.c | 2 +- mm/huge_memory.c | 2 +- mm/mlock.c | 2 +- mm/mmap.c | 7 ++++--- mm/pagewalk.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a91e47d86de2..8c162d102740 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -601,9 +601,8 @@ static void check_mm(struct mm_struct *mm) printk(KERN_ALERT "BUG: Bad rss-counter state " "mm:%p idx:%d val:%ld\n", mm, i, x); } - #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON(mm->pmd_huge_pte); + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } diff --git a/kernel/sys.c b/kernel/sys.c index f7030b060018..df692fbf1e79 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1634,7 +1634,7 @@ static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) struct inode *inode; int err; - VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); exe = fdget(fd); if (!exe.file) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c13148cc745f..74c78aa8bc2f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2048,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm) return -ENOMEM; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON(khugepaged_test_exit(mm)); + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); 
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return 0; diff --git a/mm/mlock.c b/mm/mlock.c index d5d09d0786ec..03aa8512723b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -235,7 +235,7 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end & ~PAGE_MASK); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* diff --git a/mm/mmap.c b/mm/mmap.c index c9bc285df255..16d19b48e2ad 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -410,8 +410,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); - BUG_ON(vma != ignore && - vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); + VM_BUG_ON_VMA(vma != ignore && + vma->rb_subtree_gap != vma_compute_subtree_gap(vma), + vma); } } @@ -448,7 +449,7 @@ static void validate_mm(struct mm_struct *mm) pr_emerg("map_count %d rb %d\n", mm->map_count, i); bug = 1; } - BUG_ON(bug); + VM_BUG_ON_MM(bug, mm); } #else #define validate_mm_rb(root, ignore) do { } while (0) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2beeabf502c5..ad83195521f2 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end, if (!walk->mm) return -EINVAL; - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); pgd = pgd_offset(walk->mm, addr); do { -- cgit v1.2.3 From 27fb10edcacbb70ac4e97fe1506006d732421210 Mon Sep 17 00:00:00 2001 From: Ionut Alexa Date: Thu, 9 Oct 2014 15:30:19 -0700 Subject: kernel/async.c: switch to pr_foo() Signed-off-by: Ionut Alexa Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/async.c | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 61f023ce0228..4c3773c0bf63 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk(KERN_DEBUG "calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state == SYSTEM_BOOTING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); @@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain ktime_t uninitialized_var(starttime), delta, endtime; if (initcall_debug && system_state == SYSTEM_BOOTING) { - printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); + pr_debug("async_waiting @ %i\n", task_pid_nr(current)); starttime = ktime_get(); } @@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain endtime = ktime_get(); delta = ktime_sub(endtime, starttime); - printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", + pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current), (long long)ktime_to_ns(delta) >> 10); } -- cgit v1.2.3 From 067b722faf98adbe1e94581f39c06a7c82b58676 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Thu, 9 Oct 2014 15:30:21 -0700 Subject: acct: eliminate compile warning If ACCT_VERSION is not defined to 3, below warning appears: CC kernel/acct.o kernel/acct.c: In function `do_acct_process': kernel/acct.c:475:24: warning: unused variable `ns' 
[-Wunused-variable] [akpm@linux-foundation.org: retain the local for code size improvements] Signed-off-by: Ying Xue Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index b4c667d22e79..33738ef972f3 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct) acct_t ac; unsigned long flim; const struct cred *orig_cred; - struct pid_namespace *ns = acct->ns; struct file *file = acct->file; /* @@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct) ac.ac_gid16 = ac.ac_gid; #endif #if ACCT_VERSION == 3 - ac.ac_pid = task_tgid_nr_ns(current, ns); - rcu_read_lock(); - ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); - rcu_read_unlock(); + { + struct pid_namespace *ns = acct->ns; + + ac.ac_pid = task_tgid_nr_ns(current, ns); + rcu_read_lock(); + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), + ns); + rcu_read_unlock(); + } #endif /* * Get freeze protection. If the fs is frozen, just skip the write -- cgit v1.2.3 From ec94fc3d59b54561da03a0e433d93217b08c1481 Mon Sep 17 00:00:00 2001 From: "vishnu.ps" Date: Thu, 9 Oct 2014 15:30:23 -0700 Subject: kernel/sys.c: whitespace fixes Fix minor errors and warning messages in kernel/sys.c. These errors were reported by checkpatch while working with some modifications in sys.c file. Fixing this first will help me to improve my further patches. ERROR: trailing whitespace - 9 ERROR: do not use assignment in if condition - 4 ERROR: spaces required around that '?' (ctx:VxO) - 10 ERROR: switch and case should be at the same indent - 3 total 26 errors & 3 warnings fixed. 
Signed-off-by: vishnu.ps Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 265 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 137 insertions(+), 128 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index df692fbf1e79..037fd76bdc76 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -62,28 +62,28 @@ #include #ifndef SET_UNALIGN_CTL -# define SET_UNALIGN_CTL(a,b) (-EINVAL) +# define SET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef GET_UNALIGN_CTL -# define GET_UNALIGN_CTL(a,b) (-EINVAL) +# define GET_UNALIGN_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEMU_CTL -# define SET_FPEMU_CTL(a,b) (-EINVAL) +# define SET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEMU_CTL -# define GET_FPEMU_CTL(a,b) (-EINVAL) +# define GET_FPEMU_CTL(a, b) (-EINVAL) #endif #ifndef SET_FPEXC_CTL -# define SET_FPEXC_CTL(a,b) (-EINVAL) +# define SET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_FPEXC_CTL -# define GET_FPEXC_CTL(a,b) (-EINVAL) +# define GET_FPEXC_CTL(a, b) (-EINVAL) #endif #ifndef GET_ENDIAN -# define GET_ENDIAN(a,b) (-EINVAL) +# define GET_ENDIAN(a, b) (-EINVAL) #endif #ifndef SET_ENDIAN -# define SET_ENDIAN(a,b) (-EINVAL) +# define SET_ENDIAN(a, b) (-EINVAL) #endif #ifndef GET_TSC_CTL # define GET_TSC_CTL(a) (-EINVAL) @@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) - error = set_one_prio(p, niceval, error); - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - error = set_one_prio(p, niceval, error); - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = 
find_user(uid))) + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) - error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* For find_user() */ - break; + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) + error = set_one_prio(p, niceval, error); + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* For find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { - case PRIO_PROCESS: - if (who) - p = find_task_by_vpid(who); - else - p = current; - if (p) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + niceval = nice_to_rlimit(task_nice(p)); + if (niceval > retval) + retval = niceval; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + uid = make_kuid(cred->user_ns, who); + user = cred->user; + if (!who) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid)) { + user = find_user(uid); + if (!user) + goto 
out_unlock; /* No processes for this user */ + } + do_each_thread(g, p) { + if (uid_eq(task_uid(p), uid)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - break; - case PRIO_PGRP: - if (who) - pgrp = find_vpid(who); - else - pgrp = task_pgrp(current); - do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { - niceval = nice_to_rlimit(task_nice(p)); - if (niceval > retval) - retval = niceval; - } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); - break; - case PRIO_USER: - uid = make_kuid(cred->user_ns, who); - user = cred->user; - if (!who) - uid = cred->uid; - else if (!uid_eq(uid, cred->uid) && - !(user = find_user(uid))) - goto out_unlock; /* No processes for this user */ - - do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { - niceval = nice_to_rlimit(task_nice(p)); - if (niceval > retval) - retval = niceval; - } - } while_each_thread(g, p); - if (!uid_eq(uid, cred->uid)) - free_uid(user); /* for find_user() */ - break; + } while_each_thread(g, p); + if (!uid_eq(uid, cred->uid)) + free_uid(user); /* for find_user() */ + break; } out_unlock: read_unlock(&tasklist_lock); @@ -306,7 +308,7 @@ out_unlock: * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). @@ -364,7 +366,7 @@ error: } /* - * setgid() is implemented like SysV w/ SAVED_IDS + * setgid() is implemented like SysV w/ SAVED_IDS * * SMP: Same implicit races as above. */ @@ -442,7 +444,7 @@ static int set_user(struct cred *new) * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. A program which uses just setuid() will be - * 100% compatible with POSIX with saved IDs. + * 100% compatible with POSIX with saved IDs. 
*/ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { @@ -503,17 +505,17 @@ error: abort_creds(new); return retval; } - + /* - * setuid() is implemented like SysV with SAVED_IDS - * + * setuid() is implemented like SysV with SAVED_IDS + * * Note that SAVED_ID's is deficient in that a setuid root program - * like sendmail, for example, cannot set its uid to be a normal + * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to - * regain them by swapping the real and effective uid. + * regain them by swapping the real and effective uid. */ SYSCALL_DEFINE1(setuid, uid_t, uid) { @@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ euid = from_kuid_munged(cred->user_ns, cred->euid); suid = from_kuid_munged(cred->user_ns, cred->suid); - if (!(retval = put_user(ruid, ruidp)) && - !(retval = put_user(euid, euidp))) - retval = put_user(suid, suidp); - + retval = put_user(ruid, ruidp); + if (!retval) { + retval = put_user(euid, euidp); + if (!retval) + return put_user(suid, suidp); + } return retval; } @@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ egid = from_kgid_munged(cred->user_ns, cred->egid); sgid = from_kgid_munged(cred->user_ns, cred->sgid); - if (!(retval = put_user(rgid, rgidp)) && - !(retval = put_user(egid, egidp))) - retval = put_user(sgid, sgidp); + retval = put_user(rgid, rgidp); + if (!retval) { + retval = put_user(egid, egidp); + if (!retval) + retval = put_user(sgid, sgidp); + } return retval; } @@ -1284,7 +1291,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) /* * Back compatibility for getrlimit. Needed for some apps. 
*/ - SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { @@ -1299,7 +1305,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, x.rlim_cur = 0x7FFFFFFF; if (x.rlim_max > 0x7FFFFFFF) x.rlim_max = 0x7FFFFFFF; - return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; + return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; } #endif @@ -1527,7 +1533,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) cputime_t tgutime, tgstime, utime, stime; unsigned long maxrss = 0; - memset((char *) r, 0, sizeof *r); + memset((char *)r, 0, sizeof (*r)); utime = stime = 0; if (who == RUSAGE_THREAD) { @@ -1541,41 +1547,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) return; switch (who) { - case RUSAGE_BOTH: - case RUSAGE_CHILDREN: - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - r->ru_inblock = p->signal->cinblock; - r->ru_oublock = p->signal->coublock; - maxrss = p->signal->cmaxrss; - - if (who == RUSAGE_CHILDREN) - break; - - case RUSAGE_SELF: - thread_group_cputime_adjusted(p, &tgutime, &tgstime); - utime += tgutime; - stime += tgstime; - r->ru_nvcsw += p->signal->nvcsw; - r->ru_nivcsw += p->signal->nivcsw; - r->ru_minflt += p->signal->min_flt; - r->ru_majflt += p->signal->maj_flt; - r->ru_inblock += p->signal->inblock; - r->ru_oublock += p->signal->oublock; - if (maxrss < p->signal->maxrss) - maxrss = p->signal->maxrss; - t = p; - do { - accumulate_thread_rusage(t, r); - } while_each_thread(p, t); + case RUSAGE_BOTH: + case RUSAGE_CHILDREN: + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; + maxrss = 
p->signal->cmaxrss; + + if (who == RUSAGE_CHILDREN) break; - default: - BUG(); + case RUSAGE_SELF: + thread_group_cputime_adjusted(p, &tgutime, &tgstime); + utime += tgutime; + stime += tgstime; + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; + t = p; + do { + accumulate_thread_rusage(t, r); + } while_each_thread(p, t); + break; + + default: + BUG(); } unlock_task_sighand(p, &flags); @@ -1585,6 +1591,7 @@ out: if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); + if (mm) { setmax_mm_hiwater_rss(&maxrss, mm); mmput(mm); @@ -1596,6 +1603,7 @@ out: int getrusage(struct task_struct *p, int who, struct rusage __user *ru) { struct rusage r; + k_getrusage(p, who, &r); return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; } @@ -2209,6 +2217,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, { int err = 0; int cpu = raw_smp_processor_id(); + if (cpup) err |= put_user(cpu, cpup); if (nodep) -- cgit v1.2.3 From 0baae41ea8365a7b5a34c6474a77d7eb1126f6b2 Mon Sep 17 00:00:00 2001 From: Scotty Bauer Date: Thu, 9 Oct 2014 15:30:26 -0700 Subject: kernel/sys.c: compat sysinfo syscall: fix undefined behavior Fix undefined behavior and compiler warning by replacing right shift 32 with upper_32_bits macro Signed-off-by: Scotty Bauer Cc: Clemens Ladisch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 037fd76bdc76..dfce4debd138 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2330,7 +2330,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) /* Check to see if any memory value is too large for 32-bit and scale * down if needed */ - if 
((s.totalram >> 32) || (s.totalswap >> 32)) { + if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { int bitcount = 0; while (s.mem_unit < PAGE_SIZE) { -- cgit v1.2.3 From 3639f17068ed40e4e208a6e218481d49817bbd56 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 2 Oct 2014 22:05:18 -0400 Subject: audit: put rule existence check in canonical order Use same rule existence check order as audit_make_tree(), audit_to_watch(), update_lsm_rule() for legibility. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 40ed9813d4b2..4a11697cf5b8 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -163,7 +163,7 @@ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { if (krule->listnr != AUDIT_FILTER_EXIT || - krule->watch || krule->inode_f || krule->tree || + krule->inode_f || krule->watch || krule->tree || (f->op != Audit_equal && f->op != Audit_not_equal)) return -EINVAL; -- cgit v1.2.3 From 739c95038e68d364b01c0fc6f8fb8e47b1c1e979 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 10 Oct 2014 15:05:21 -0400 Subject: audit: WARN if audit_rule_change called illegally Signed-off-by: Eric Paris --- kernel/auditfilter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4a11697cf5b8..4419d1fbcad1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1085,7 +1085,8 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, audit_free_rule(entry); break; default: - return -EINVAL; + err = -EINVAL; + WARN_ON(1); } return err; -- cgit v1.2.3 From e85322d21cfebeac64f58a204e9adc0bc5c1e46f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 2 Oct 2014 22:05:19 -0400 Subject: audit: cull redundancy in audit_rule_change Re-factor 
audit_rule_change() to reduce the amount of code redundancy and simplify the logic. Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/auditfilter.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 4419d1fbcad1..d214cd073a58 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1064,31 +1064,27 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, int err = 0; struct audit_entry *entry; + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); + switch (type) { case AUDIT_ADD_RULE: - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); - if (err) - audit_free_rule(entry); break; case AUDIT_DEL_RULE: - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); - audit_free_rule(entry); break; default: err = -EINVAL; WARN_ON(1); } + if (err || type == AUDIT_DEL_RULE) + audit_free_rule(entry); + return err; } -- cgit v1.2.3 From 2991dd2b0117e864f394c826af6df144206ce0db Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 2 Oct 2014 22:05:24 -0400 Subject: audit: rename audit_log_remove_rule to disambiguate for trees Rename audit_log_remove_rule() to audit_tree_log_remove_rule() to avoid confusion with watch and mark rule removal/changes. 
Signed-off-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index bd418c486e9a..e242e3a9864a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -449,7 +449,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static void audit_log_remove_rule(struct audit_krule *rule) +static void audit_tree_log_remove_rule(struct audit_krule *rule) { struct audit_buffer *ab; @@ -476,7 +476,7 @@ static void kill_rules(struct audit_tree *tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - audit_log_remove_rule(rule); + audit_tree_log_remove_rule(rule); rule->tree = NULL; list_del_rcu(&entry->list); list_del(&entry->rule.list); -- cgit v1.2.3 From 2240a31db67582468e2f7a5a5962b7d0ffaaa6a4 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 13 Oct 2014 15:51:11 -0700 Subject: printk: don't bother using LOG_CPU_MAX_BUF_SHIFT on !SMP When configuring a uniprocessor kernel, don't bother the user with an irrelevant LOG_CPU_MAX_BUF_SHIFT question, and don't build the unused code. Signed-off-by: Geert Uytterhoeven Acked-by: Luis R. 
Rodriguez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 1 + kernel/printk/printk.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 1c505e090422..3ee28ae02cc8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -838,6 +838,7 @@ config LOG_BUF_SHIFT config LOG_CPU_MAX_BUF_SHIFT int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + depends on SMP range 0 21 default 12 if !BASE_SMALL default 0 if BASE_SMALL diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7a6e69441f75..a4436b0cf769 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -267,7 +267,6 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) -#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -852,6 +851,9 @@ static int __init log_buf_len_setup(char *str) } early_param("log_buf_len", log_buf_len_setup); +#ifdef CONFIG_SMP +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) + static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; @@ -878,6 +880,9 @@ static void __init log_buf_add_cpu(void) log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } +#else /* !CONFIG_SMP */ +static inline void log_buf_add_cpu(void) {} +#endif /* CONFIG_SMP */ void __init setup_log_buf(int early) { -- cgit v1.2.3 From 98e35f5894cf208084688ec0c7bb7b713efc997f Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Mon, 13 Oct 2014 15:51:13 -0700 Subject: printk: git rid of [sched_delayed] message for printk_deferred Commit 458df9fd4815 ("printk: remove separate printk_sched buffers and use printk buf instead") hardcodes printk_deferred() to KERN_WARNING and inserts the string "[sched_delayed] " before the actual message. 
However it doesn't take into account the KERN_* prefix of the message, that now ends up in the middle of the output: [sched_delayed] ^a4CE: hpet increased min_delta_ns to 20115 nsec Fix this by just getting rid of the "[sched_delayed] " scnprintf(). The prefix is useless since 458df9fd4815 anyway since from that moment printk_deferred() inserts the message into the kernel printk buffer immediately. So if the message eventually gets printed to console, it is printed in the correct order with other messages and there's no need for any special prefix. And if the kernel crashes before the message makes it to console, then prefix in the printk buffer doesn't make the situation any better. Link: http://lkml.org/lkml/2014/9/14/4 Signed-off-by: Markus Trippelsdorf Acked-by: Jan Kara Acked-by: Steven Rostedt Cc: Geert Uytterhoeven Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a4436b0cf769..e3962d63e368 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1679,12 +1679,7 @@ asmlinkage int vprintk_emit(int facility, int level, * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. */ - if (in_sched) - text_len = scnprintf(text, sizeof(textbuf), - KERN_WARNING "[sched_delayed] "); - - text_len += vscnprintf(text + text_len, - sizeof(textbuf) - text_len, fmt, args); + text_len = vscnprintf(text, sizeof(textbuf), fmt, args); /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { -- cgit v1.2.3 From 0049f26ae0ad00016d8e237a6d712bff155cedc5 Mon Sep 17 00:00:00 2001 From: Rob Jones Date: Mon, 13 Oct 2014 15:52:10 -0700 Subject: kernel/kallsyms.c: use __seq_open_private() Reduce boilerplate code by using __seq_open_private() instead of seq_open() in kallsyms_open(). 
Signed-off-by: Rob Jones Cc: Gideon Israel Dsouza Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ae5167087845..5c5987f10819 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file) * using get_symbol_offset for every symbol. */ struct kallsym_iter *iter; - int ret; - - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter)); if (!iter) return -ENOMEM; reset_iter(iter, 0); - ret = seq_open(file, &kallsyms_op); - if (ret == 0) - ((struct seq_file *)file->private_data)->private = iter; - else - kfree(iter); - return ret; + return 0; } #ifdef CONFIG_KGDB_KDB -- cgit v1.2.3 From 669280a152ce5144321c0e511498877383f34393 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 13 Oct 2014 15:53:40 -0700 Subject: kexec: take the segment adding out of locate_mem_hole functions In the locate_mem_hole functions, a memory hole is located and added as a kexec_segment. But judging from the name locate_mem_hole, these functions should only be responsible for searching for an available memory hole that can contain data of a specified size. So this patch adds a new field 'mem' to kexec_buf, then takes the kexec segment adding code out of locate_mem_hole_top_down and locate_mem_hole_bottom_up. This makes the functionality of locate_mem_hole match what its name declares. With this change, locate_mem_hole_callback could also be used later if anyone wants to locate a memory hole for another use. Meanwhile, Vivek suggested open-coding the function __kexec_add_segment(); that way we only have to retrieve the ksegment pointer once and the code is easy to read. So just do it in this patch and remove __kexec_add_segment(), since no one uses it anymore. Signed-off-by: Baoquan He Acked-by: Vivek Goyal Cc: Eric W.
Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 1 + kernel/kexec.c | 29 ++++++++--------------------- 2 files changed, 9 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 4b2a0e11cc5b..9d957b7ae095 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -178,6 +178,7 @@ struct kexec_buf { struct kimage *image; char *buffer; unsigned long bufsz; + unsigned long mem; unsigned long memsz; unsigned long buf_align; unsigned long buf_min; diff --git a/kernel/kexec.c b/kernel/kexec.c index 2bee072268d9..63bc3cdfb629 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -2016,22 +2016,6 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); #ifdef CONFIG_KEXEC_FILE -static int __kexec_add_segment(struct kimage *image, char *buf, - unsigned long bufsz, unsigned long mem, - unsigned long memsz) -{ - struct kexec_segment *ksegment; - - ksegment = &image->segment[image->nr_segments]; - ksegment->kbuf = buf; - ksegment->bufsz = bufsz; - ksegment->mem = mem; - ksegment->memsz = memsz; - image->nr_segments++; - - return 0; -} - static int locate_mem_hole_top_down(unsigned long start, unsigned long end, struct kexec_buf *kbuf) { @@ -2064,8 +2048,7 @@ static int locate_mem_hole_top_down(unsigned long start, unsigned long end, } while (1); /* If we are here, we found a suitable memory range */ - __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, - kbuf->memsz); + kbuf->mem = temp_start; /* Success, stop navigating through remaining System RAM ranges */ return 1; @@ -2099,8 +2082,7 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, } while (1); /* If we are here, we found a suitable memory range */ - __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, - kbuf->memsz); + kbuf->mem = temp_start; /* Success, stop navigating through remaining System RAM ranges */ 
return 1; @@ -2187,7 +2169,12 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, } /* Found a suitable memory range */ - ksegment = &image->segment[image->nr_segments - 1]; + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = kbuf->buffer; + ksegment->bufsz = kbuf->bufsz; + ksegment->mem = kbuf->mem; + ksegment->memsz = kbuf->memsz; + image->nr_segments++; *load_addr = ksegment->mem; return 0; } -- cgit v1.2.3 From 36f3f500efe6a19b7ce1e1205c105a2cbb2124d9 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 13 Oct 2014 15:53:44 -0700 Subject: kexec: remove the unused function parameter This is a cleanup. In function parse_crashkernel_suffix, the parameter crash_base is not used. So here remove it. Signed-off-by: Baoquan He Acked-by: Vivek Goyal Cc: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 63bc3cdfb629..2abf9f6e9a61 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1759,7 +1759,6 @@ static __initdata char *suffix_tbl[] = { */ static int __init parse_crashkernel_suffix(char *cmdline, unsigned long long *crash_size, - unsigned long long *crash_base, const char *suffix) { char *cur = cmdline; @@ -1848,7 +1847,7 @@ static int __init __parse_crashkernel(char *cmdline, if (suffix) return parse_crashkernel_suffix(ck_cmdline, crash_size, - crash_base, suffix); + suffix); /* * if the commandline contains a ':', then that's the extended * syntax -- if not, it must be the classic syntax -- cgit v1.2.3 From 67cf13ceed89e2c1a967719e98624a20c48dfb5a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 13 Oct 2014 15:54:03 -0700 Subject: x86: optimize resource lookups for ioremap We have a large university system in the UK that is experiencing very long delays modprobing the driver for a specific I/O device. 
The delay is from 8-10 minutes per device and there are 31 devices in the system. This 4 to 5 hour delay in starting up those I/O devices is very much a burden on the customer. There are two causes for requiring a restart/reload of the drivers. First is periodic preventive maintenance (PM) and the second is if any of the devices experience a fatal error. Both of these trigger this excessively long delay in bringing the system back up to full capability. The problem was tracked down to a very slow IOREMAP operation and the excessively long ioresource lookup to ensure that the user is not attempting to ioremap RAM. These patches provide a speedup to that function. The modprobe time appears to be affected quite a bit by previous activity on the ioresource list, which I suspect is due to cache preloading. While the overall improvement is impacted by other overhead of starting the devices, this drastically improves the modprobe time. Also, our system is considerably smaller so the percentages gained will not be the same. Best case improvement with the modprobe on our 20 device smallish system was from 'real 5m51.913s' to 'real 0m18.275s'. This patch (of 2): Since the ioremap operation is verifying that the specified address range is NOT RAM, it will search the entire ioresource list if the condition is true. To make matters worse, it does this one 4k page at a time. For a 128M BAR region this is 32K (32768) passes to determine the entire region does not contain any RAM addresses. This patch provides another resource lookup function, region_is_ram, that searches for the entire region specified, verifying that it is completely contained within the resource region. If it is found, then it is checked to be RAM or not, within a single pass. The return value is -1 if no resource fully containing the region was found, 1 if one was found and it is RAM, and 0 if one was found and it is not RAM. This allows the caller to fall back to the previous page-by-page search if it was not found.
[akpm@linux-foundation.org: fix spellos and typos in comment] Signed-off-by: Mike Travis Acked-by: Alex Thorlton Reviewed-by: Cliff Wickman Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Mark Salter Cc: Dave Young Cc: Rik van Riel Cc: Peter Zijlstra Cc: Mel Gorman Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + kernel/resource.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index fa0d74e06428..4cd45cb95e6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -347,6 +347,7 @@ static inline int put_page_unless_one(struct page *page) } extern int page_is_ram(unsigned long pfn); +extern int region_is_ram(resource_size_t phys_addr, unsigned long size); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); diff --git a/kernel/resource.c b/kernel/resource.c index 46322019ab7d..0bcebffc4e77 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -491,6 +491,42 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); +/* + * Search for a resouce entry that fully contains the specified region. + * If found, return 1 if it is RAM, 0 if not. + * If not found, or region is not fully contained, return -1 + * + * Used by the ioremap functions to ensure the user is not remapping RAM and is + * a vast speed up over walking through the resource table page by page. 
+ */ +int region_is_ram(resource_size_t start, unsigned long size) +{ + struct resource *p; + resource_size_t end = start + size - 1; + int flags = IORESOURCE_MEM | IORESOURCE_BUSY; + const char *name = "System RAM"; + int ret = -1; + + read_lock(&resource_lock); + for (p = iomem_resource.child; p ; p = p->sibling) { + if (end < p->start) + continue; + + if (p->start <= start && end <= p->end) { + /* resource fully contains region */ + if ((p->flags != flags) || strcmp(p->name, name)) + ret = 0; + else + ret = 1; + break; + } + if (p->end < start) + break; /* not found */ + } + read_unlock(&resource_lock); + return ret; +} + void __weak arch_remove_reservations(struct resource *avail) { } -- cgit v1.2.3 From f9f2bac27ca587dc3eb4737880ca4a8e5d92bd93 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 13 Oct 2014 15:55:03 -0700 Subject: kdb: replace strnicmp with strncasecmp The kernel used to contain two functions for length-delimited, case-insensitive string comparison, strnicmp with correct semantics and a slightly buggy strncasecmp. The latter is the POSIX name, so strnicmp was renamed to strncasecmp, and strnicmp made into a wrapper for the new strncasecmp to avoid breaking existing users. To allow the compat wrapper strnicmp to be removed at some point in the future, and to avoid the extra indirection cost, do s/strnicmp/strncasecmp/g. 
Signed-off-by: Rasmus Villemoes Cc: Jason Wessel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/debug/kdb/kdb_bp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 70a504601dc3..b20d544f20c2 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) bp->bph_length = 1; if ((argc + 1) != nextarg) { - if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) + if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0) bp->bp_type = BP_ACCESS_WATCHPOINT; - else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) + else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) bp->bp_type = BP_WRITE_WATCHPOINT; - else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) + else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0) bp->bp_type = BP_HARDWARE_BREAKPOINT; else return KDB_ARGCOUNT; -- cgit v1.2.3 From 6e7458a6f074c71e74cda31c483114e65ea0f570 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Mon, 13 Oct 2014 15:55:35 -0700 Subject: kernel/watchdog.c: control hard lockup detection default In some cases we don't want hard lockup detection enabled by default. An example is when running as a guest. Introduce watchdog_enable_hardlockup_detector(bool) allowing those cases to disable hard lockup detection. This must be executed early by the boot processor from e.g. smp_prepare_boot_cpu, in order to allow kernel command line arguments to override it, as well as to avoid hard lockup detection being enabled before we've had a chance to indicate that it's unwanted. 
In summary, initial boot: default=enabled smp_prepare_boot_cpu watchdog_enable_hardlockup_detector(false): default=disabled cmdline has 'nmi_watchdog=1': default=enabled The running kernel still has the ability to enable/disable at any time with /proc/sys/kernel/nmi_watchdog as usual. However, even when the default has been overridden, /proc/sys/kernel/nmi_watchdog will initially show '1'. To truly turn it on, one must disable/enable it, i.e. echo 0 > /proc/sys/kernel/nmi_watchdog echo 1 > /proc/sys/kernel/nmi_watchdog This patch will be immediately useful for KVM with the next patch of this series. Other hypervisor guest types may find it useful as well. [akpm@linux-foundation.org: fix build] [dzickus@redhat.com: fix compile issues on sparc] Signed-off-by: Ulrich Obergfell Signed-off-by: Andrew Jones Signed-off-by: Don Zickus Signed-off-by: Don Zickus Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 13 +++++++++++++ kernel/watchdog.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 1d2a6ab6b8bb..9b2022ab4d85 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -24,6 +24,19 @@ static inline void touch_nmi_watchdog(void) } #endif +#if defined(CONFIG_HARDLOCKUP_DETECTOR) +extern void watchdog_enable_hardlockup_detector(bool val); +extern bool watchdog_hardlockup_detector_is_enabled(void); +#else +static inline void watchdog_enable_hardlockup_detector(bool val) +{ +} +static inline bool watchdog_hardlockup_detector_is_enabled(void) +{ + return true; +} +#endif + /* * Create trigger_all_cpu_backtrace() out of the arch-provided * base function.
Return whether such support was available, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ff7fd80bef99..49e9537f3673 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -59,6 +59,25 @@ static unsigned long soft_lockup_nmi_warn; static int hardlockup_panic = CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; +static bool hardlockup_detector_enabled = true; +/* + * We may not want to enable hard lockup detection by default in all cases, + * for example when running the kernel as a guest on a hypervisor. In these + * cases this function can be called to disable hard lockup detection. This + * function should only be executed once by the boot processor before the + * kernel command line parameters are parsed, because otherwise it is not + * possible to override this in hardlockup_panic_setup(). + */ +void watchdog_enable_hardlockup_detector(bool val) +{ + hardlockup_detector_enabled = val; +} + +bool watchdog_hardlockup_detector_is_enabled(void) +{ + return hardlockup_detector_enabled; +} + static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) @@ -67,6 +86,14 @@ static int __init hardlockup_panic_setup(char *str) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_user_enabled = 0; + else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { + /* + * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) + * has the same effect. + */ + watchdog_user_enabled = 1; + watchdog_enable_hardlockup_detector(true); + } return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -465,6 +492,15 @@ static int watchdog_nmi_enable(unsigned int cpu) struct perf_event_attr *wd_attr; struct perf_event *event = per_cpu(watchdog_ev, cpu); + /* + * Some kernels need to default hard lockup detection to + * 'disabled', for example a guest on a hypervisor. + */ + if (!watchdog_hardlockup_detector_is_enabled()) { + event = ERR_PTR(-ENOENT); + goto handle_err; + } + /* is it already setup and enabled? 
*/ if (event && event->state > PERF_EVENT_STATE_OFF) goto out; @@ -479,6 +515,7 @@ static int watchdog_nmi_enable(unsigned int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); +handle_err: /* save cpu0 error for future comparision */ if (cpu == 0 && IS_ERR(event)) cpu0_err = PTR_ERR(event); @@ -624,11 +661,13 @@ int proc_dowatchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int err, old_thresh, old_enabled; + bool old_hardlockup; static DEFINE_MUTEX(watchdog_proc_mutex); mutex_lock(&watchdog_proc_mutex); old_thresh = ACCESS_ONCE(watchdog_thresh); old_enabled = ACCESS_ONCE(watchdog_user_enabled); + old_hardlockup = watchdog_hardlockup_detector_is_enabled(); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err || !write) @@ -640,15 +679,22 @@ int proc_dowatchdog(struct ctl_table *table, int write, * disabled. The 'watchdog_running' variable check in * watchdog_*_all_cpus() function takes care of this. */ - if (watchdog_user_enabled && watchdog_thresh) + if (watchdog_user_enabled && watchdog_thresh) { + /* + * Prevent a change in watchdog_thresh accidentally overriding + * the enablement of the hardlockup detector. 
+ */ + if (watchdog_user_enabled != old_enabled) + watchdog_enable_hardlockup_detector(true); err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); - else + } else watchdog_disable_all_cpus(); /* Restore old values on failure */ if (err) { watchdog_thresh = old_thresh; watchdog_user_enabled = old_enabled; + watchdog_enable_hardlockup_detector(old_hardlockup); } out: mutex_unlock(&watchdog_proc_mutex); -- cgit v1.2.3 From 63a12d9d01831208a47f5c0fbbf93f503d1fb162 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 13 Oct 2014 15:55:44 -0700 Subject: kernel/param: consolidate __{start,stop}___param[] in Consolidate the various external const and non-const declarations of __start___param[] and __stop___param in . This requires making a few struct kernel_param pointers in kernel/params.c const. Signed-off-by: Geert Uytterhoeven Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/moduleparam.h | 2 ++ init/main.c | 2 -- kernel/params.c | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index b43f4752304e..1c9effa25e26 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -78,6 +78,8 @@ struct kernel_param { }; }; +extern const struct kernel_param __start___param[], __stop___param[]; + /* Special one for strings we want to copy into */ struct kparam_string { unsigned int maxlen; diff --git a/init/main.c b/init/main.c index 89ec862da2d4..800a0daede7e 100644 --- a/init/main.c +++ b/init/main.c @@ -501,7 +501,6 @@ asmlinkage __visible void __init start_kernel(void) { char *command_line; char *after_dashes; - extern const struct kernel_param __start___param[], __stop___param[]; /* * Need to run as early as possible, to initialize the @@ -844,7 +843,6 @@ static char *initcall_level_names[] __initdata = { static void __init do_initcall_level(int level) { - extern const struct kernel_param 
__start___param[], __stop___param[]; initcall_t *fn; strcpy(initcall_command_line, saved_command_line); diff --git a/kernel/params.c b/kernel/params.c index 041b5899d5e2..db97b791390f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -513,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string); #define to_module_attr(n) container_of(n, struct module_attribute, attr) #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) -extern struct kernel_param __start___param[], __stop___param[]; - struct param_attribute { struct module_attribute mattr; @@ -774,7 +773,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name) } static void __init kernel_add_sysfs_param(const char *name, - struct kernel_param *kparam, + const struct kernel_param *kparam, unsigned int name_skip) { struct module_kobject *mk; @@ -809,7 +808,7 @@ static void __init kernel_add_sysfs_param(const char *name, */ static void __init param_sysfs_builtin(void) { - struct kernel_param *kp; + const struct kernel_param *kp; unsigned int name_len; char modname[MODULE_NAME_LEN]; -- cgit v1.2.3 From d3051b489aa81ca9ba62af366149ef42b8dae97c Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 14 Oct 2014 02:51:39 +1030 Subject: modules, lock around setting of MODULE_STATE_UNFORMED A panic was seen in the following situation. There are two threads running on the system. The first thread is a system monitoring thread that is reading /proc/modules. The second thread is loading and unloading a module (in this example I'm using my simple dummy-module.ko). Note, in the "real world" this occurred with the qlogic driver module. When doing this, the following panic occurred: ------------[ cut here ]------------ kernel BUG at kernel/module.c:3739!
invalid opcode: 0000 [#1] SMP Modules linked in: binfmt_misc sg nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache intel_powerclamp coretemp kvm_intel kvm crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel lrw igb gf128mul glue_helper iTCO_wdt iTCO_vendor_support ablk_helper ptp sb_edac cryptd pps_core edac_core shpchp i2c_i801 pcspkr wmi lpc_ich ioatdma mfd_core dca ipmi_si nfsd ipmi_msghandler auth_rpcgss nfs_acl lockd sunrpc xfs libcrc32c sr_mod cdrom sd_mod crc_t10dif crct10dif_common mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ttm isci drm libsas ahci libahci scsi_transport_sas libata i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: dummy_module] CPU: 37 PID: 186343 Comm: cat Tainted: GF O-------------- 3.10.0+ #7 Hardware name: Intel Corporation S2600CP/S2600CP, BIOS RMLSDP.86I.00.29.D696.1311111329 11/11/2013 task: ffff8807fd2d8000 ti: ffff88080fa7c000 task.ti: ffff88080fa7c000 RIP: 0010:[] [] module_flags+0xb5/0xc0 RSP: 0018:ffff88080fa7fe18 EFLAGS: 00010246 RAX: 0000000000000003 RBX: ffffffffa03b5200 RCX: 0000000000000000 RDX: 0000000000001000 RSI: ffff88080fa7fe38 RDI: ffffffffa03b5000 RBP: ffff88080fa7fe28 R08: 0000000000000010 R09: 0000000000000000 R10: 0000000000000000 R11: 000000000000000f R12: ffffffffa03b5000 R13: ffffffffa03b5008 R14: ffffffffa03b5200 R15: ffffffffa03b5000 FS: 00007f6ae57ef740(0000) GS:ffff88101e7a0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000404f70 CR3: 0000000ffed48000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Stack: ffffffffa03b5200 ffff8810101e4800 ffff88080fa7fe70 ffffffff810d666c ffff88081e807300 000000002e0f2fbf 0000000000000000 ffff88100f257b00 ffffffffa03b5008 ffff88080fa7ff48 ffff8810101e4800 ffff88080fa7fee0 Call Trace: [] m_show+0x19c/0x1e0 [] seq_read+0x16e/0x3b0 [] proc_reg_read+0x3d/0x80 [] 
vfs_read+0x9c/0x170 [] SyS_read+0x58/0xb0 [] system_call_fastpath+0x16/0x1b Code: 48 63 c2 83 c2 01 c6 04 03 29 48 63 d2 eb d9 0f 1f 80 00 00 00 00 48 63 d2 c6 04 13 2d 41 8b 0c 24 8d 50 02 83 f9 01 75 b2 eb cb <0f> 0b 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 RIP [] module_flags+0xb5/0xc0 RSP Consider the two processes running on the system. CPU 0 (/proc/modules reader) CPU 1 (loading/unloading module) CPU 0 opens /proc/modules, and starts displaying data for each module by traversing the modules list via fs/seq_file.c:seq_open() and fs/seq_file.c:seq_read(). For each module in the modules list, seq_read does op->start() <-- this is a pointer to m_start() op->show() <- this is a pointer to m_show() op->stop() <-- this is a pointer to m_stop() The m_start(), m_show(), and m_stop() module functions are defined in kernel/module.c. The m_start() and m_stop() functions acquire and release the module_mutex respectively. ie) When reading /proc/modules, the module_mutex is acquired and released for each module. m_show() is called with the module_mutex held. It accesses the module struct data and attempts to write out module data. It is in this code path that the above BUG_ON() warning is encountered, specifically m_show() calls static char *module_flags(struct module *mod, char *buf) { int bx = 0; BUG_ON(mod->state == MODULE_STATE_UNFORMED); ... The other thread, CPU 1, in unloading the module calls the syscall delete_module() defined in kernel/module.c. The module_mutex is acquired for a short time, and then released. free_module() is called without the module_mutex. free_module() then sets mod->state = MODULE_STATE_UNFORMED, also without the module_mutex. 
Some additional code is called and then the module_mutex is reacquired to remove the module from the modules list: /* Now we can delete it from the lists */ mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); mutex_unlock(&module_mutex); This is the sequence of events that leads to the panic. CPU 1 is removing dummy_module via delete_module(). It acquires the module_mutex, and then releases it. CPU 1 has NOT set dummy_module->state to MODULE_STATE_UNFORMED yet. CPU 0, which is reading the /proc/modules, acquires the module_mutex and acquires a pointer to the dummy_module which is still in the modules list. CPU 0 calls m_show for dummy_module. The check in m_show() for MODULE_STATE_UNFORMED passed for dummy_module even though it is being torn down. Meanwhile CPU 1, which has been continuing to remove dummy_module without holding the module_mutex, now calls free_module() and sets dummy_module->state to MODULE_STATE_UNFORMED. CPU 0 now calls module_flags() with dummy_module and ... static char *module_flags(struct module *mod, char *buf) { int bx = 0; BUG_ON(mod->state == MODULE_STATE_UNFORMED); and BOOM. Acquire and release the module_mutex lock around the setting of MODULE_STATE_UNFORMED in the teardown path, which should resolve the problem. Testing: In the unpatched kernel I can panic the system within 1 minute by doing while (true) do insmod dummy_module.ko; rmmod dummy_module.ko; done and while (true) do cat /proc/modules; done in separate terminals. In the patched kernel I was able to run just over one hour without seeing any issues. I also verified the output of panic via sysrq-c and the output of /proc/modules looks correct for all three states for the dummy_module. 
dummy_module 12661 0 - Unloading 0xffffffffa03a5000 (OE-) dummy_module 12661 0 - Live 0xffffffffa03bb000 (OE) dummy_module 14015 1 - Loading 0xffffffffa03a5000 (OE+) Signed-off-by: Prarit Bhargava Reviewed-by: Oleg Nesterov Signed-off-by: Rusty Russell Cc: stable@kernel.org --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 8a0dc91eddbc..138b83e31bd5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1842,7 +1842,9 @@ static void free_module(struct module *mod) /* We leave it in list to prevent duplicate loads, but make sure * that noone uses it while it's being deconstructed. */ + mutex_lock(&module_mutex); mod->state = MODULE_STATE_UNFORMED; + mutex_unlock(&module_mutex); /* Remove dynamic debug info */ ddebug_remove_module(mod->name); -- cgit v1.2.3 From 76835b0ebf8a7fe85beb03c75121419a7dec52f0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 17 Oct 2014 17:38:49 +0100 Subject: futex: Ensure get_futex_key_refs() always implies a barrier Commit b0c29f79ecea (futexes: Avoid taking the hb->lock if there's nothing to wake up) changes the futex code to avoid taking a lock when there are no waiters. This code has been subsequently fixed in commit 11d4616bd07f (futex: revert back to the explicit waiter counting code). Both the original commit and the fix-up rely on get_futex_key_refs() to always imply a barrier. However, for private futexes, none of the cases in the switch statement of get_futex_key_refs() would be hit and the function completes without a memory barrier as required before checking the "waiters" in futex_wake() -> hb_waiters_pending(). The consequence is a race with a thread waiting on a futex on another CPU, allowing the waker thread to read "waiters == 0" while the waiter thread to have read "futex_val == locked" (in kernel). 
Without this fix, the problem (user space deadlocks) can be seen with Android bionic's mutex implementation on an arm64 multi-cluster system. Signed-off-by: Catalin Marinas Reported-by: Matteo Franchin Fixes: b0c29f79ecea (futexes: Avoid taking the hb->lock if there's nothing to wake up) Acked-by: Davidlohr Bueso Tested-by: Mike Galbraith Cc: Cc: Darren Hart Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Paul E. McKenney Signed-off-by: Linus Torvalds --- kernel/futex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 815d7af2ffe8..f3a3a071283c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -343,6 +343,8 @@ static void get_futex_key_refs(union futex_key *key) case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies MB (B) */ break; + default: + smp_mb(); /* explicit MB (B) */ } } -- cgit v1.2.3 From 51fae6da640edf9d266c94f36bc806c63c301991 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 21 Oct 2014 09:27:12 +0200 Subject: freezer: Do not freeze tasks killed by OOM killer Since f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) OOM killer relies on being able to thaw a frozen task to handle OOM situation but a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE) has reorganized the code and stopped clearing freeze flag in __thaw_task. This means that the target task only wakes up and goes into the fridge again because the freezing condition hasn't changed for it. This reintroduces the bug fixed by f660daac474c6f. Fix the issue by checking for TIF_MEMDIE thread flag in freezing_slow_path and exclude the task from freezing completely. If a task was already frozen it would get woken by __thaw_task from OOM killer and get out of freezer after rechecking freezing(). 
Changes since v1 - put TIF_MEMDIE check into freezing_slowpath rather than in __refrigerator as per Oleg - return __thaw_task into oom_scan_process_thread because oom_kill_process will not wake task in the fridge because it is sleeping uninterruptible [mhocko@suse.cz: rewrote the changelog] Fixes: a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE) Cc: 3.3+ # 3.3+ Signed-off-by: Cong Wang Signed-off-by: Michal Hocko Acked-by: Oleg Nesterov Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index aa6a8aadb911..8f9279b9c6d7 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; + if (test_thread_flag(TIF_MEMDIE)) + return false; + if (pm_nosig_freezing || cgroup_freezing(p)) return true; -- cgit v1.2.3 From c05eb32f472fb9f7f474c20ff6fa5bfe0cbedc05 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 21 Oct 2014 09:27:13 +0200 Subject: freezer: remove obsolete comments in __thaw_task() __thaw_task() no longer clears frozen flag since commit a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE). Reviewed-by: Michal Hocko Signed-off-by: Cong Wang Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 8f9279b9c6d7..a8900a3bc27a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -150,12 +150,6 @@ void __thaw_task(struct task_struct *p) { unsigned long flags; - /* - * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to - * be visible to @p as waking up implies wmb. Waking up inside - * freezer_lock also prevents wakeups from leaking outside - * refrigerator. 
- */ spin_lock_irqsave(&freezer_lock, flags); if (frozen(p)) wake_up_process(p); -- cgit v1.2.3 From 5695be142e203167e3cb515ef86a88424f3524eb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 20 Oct 2014 18:12:32 +0200 Subject: OOM, PM: OOM killed task shouldn't escape PM suspend PM freezer relies on having all tasks frozen by the time devices are getting frozen so that no task will touch them while they are getting frozen. But OOM killer is allowed to kill an already frozen task in order to handle OOM situation. In order to protect from late wake ups OOM killer is disabled after all tasks are frozen. This, however, still keeps a window open when a killed task didn't manage to die by the time freeze_processes finishes. Reduce the race window by checking all tasks after OOM killer has been disabled. This is still not race free completely unfortunately because oom_killer_disable cannot stop an already ongoing OOM killer so a task might still wake up from the fridge and get killed without freeze_processes noticing. Full synchronization of OOM and freezer is, however, too heavy weight for this highly unlikely case. Introduce and check oom_kills counter which gets incremented early when the allocator enters __alloc_pages_may_oom path and only check all the tasks if the counter changes during the freezing attempt. The counter is updated so early to reduce the race window since allocator checked oom_killer_disabled which is set by PM-freezing code. A false positive will push the PM-freezer into a slow path but that is not a big deal. Changes since v1 - push the re-check loop out of freeze_processes into check_frozen_processes and invert the condition to make the code more readable as per Rafael Fixes: f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) Cc: 3.2+ # 3.2+ Signed-off-by: Michal Hocko Signed-off-by: Rafael J. 
Wysocki --- include/linux/oom.h | 3 +++ kernel/power/process.c | 40 +++++++++++++++++++++++++++++++++++++++- mm/oom_kill.c | 17 +++++++++++++++++ mm/page_alloc.c | 8 ++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/oom.h b/include/linux/oom.h index 647395a1a550..e8d6e1058723 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -50,6 +50,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); + +extern int oom_kills_count(void); +extern void note_oom_kill(void); extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, nodemask_t *nodemask, diff --git a/kernel/power/process.c b/kernel/power/process.c index 7b323221b9ee..5cc588c1abab 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,6 +108,28 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + struct task_struct *g, *p; + bool ret = true; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p != current && !freezer_should_skip(p) && + !frozen(p)) { + ret = false; + goto done; + } + } +done: + read_unlock(&tasklist_lock); + + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -118,6 +140,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -132,12 +155,27 @@ int freeze_processes(void) pm_wakeup_clear(); printk("Freezing user space processes ... 
"); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + goto done; + } + printk("done."); } +done: printk("\n"); BUG_ON(in_atomic()); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bbf405a3a18f..5340f6b91312 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. + */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 736d8e1b6381..9cd36b822444 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2251,6 +2251,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } + /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. 
+ */ + note_oom_kill(); + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if -- cgit v1.2.3 From a28e785a9f794ba32e603570ab52a262cf963489 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Oct 2014 09:27:15 +0200 Subject: PM: convert do_each_thread to for_each_process_thread as per 0c740d0afc3b (introduce for_each_thread() to replace the buggy while_each_thread()) get rid of do_each_thread { } while_each_thread() construct and replace it by a less error prone for_each_process_thread. This patch doesn't introduce any user visible change. Suggested-by: Oleg Nesterov Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 5cc588c1abab..7f0d4343af1b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) while (true) { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; if (!freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); if (!user_only) { @@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); } } else { @@ -229,11 +229,11 @@ void thaw_processes(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); - } while_each_thread(g, p); + } 
read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); @@ -256,10 +256,10 @@ void thaw_kernel_threads(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); schedule(); -- cgit v1.2.3 From 32bf08a6257b9c7380dcd040af3c0858eee3ef05 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Oct 2014 14:54:57 -0700 Subject: bpf: fix bug in eBPF verifier while comparing for verifier state equivalency the comparison was missing a check for uninitialized register. Make sure it does so and add a testcase. Fixes: f1bca824dabb ("bpf: add search pruning optimization to verifier") Cc: Hannes Frederic Sowa Signed-off-by: Alexei Starovoitov Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 3 ++- samples/bpf/test_verifier.c | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 801f5f3b9307..9f81818f2941 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1409,7 +1409,8 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) if (memcmp(&old->regs[i], &cur->regs[i], sizeof(old->regs[0])) != 0) { if (old->regs[i].type == NOT_INIT || - old->regs[i].type == UNKNOWN_VALUE) + (old->regs[i].type == UNKNOWN_VALUE && + cur->regs[i].type != NOT_INIT)) continue; return false; } diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index f44ef11f65a7..eb4bec0ad8af 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -208,6 +208,17 @@ static struct bpf_test tests[] = { .errstr = "R0 !read_ok", .result = REJECT, }, + { + "program doesn't init R0 before exit in all branches", + .insns = { + BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 1), + 
BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, + .errstr = "R0 !read_ok", + .result = REJECT, + }, { "stack out of bounds", .insns = { -- cgit v1.2.3 From 71be2114a5474a76edad95343d89b8731457fccd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 22 Oct 2014 22:47:32 +0200 Subject: PM / freezer: Clean up code after recent fixes Clean up the code in process.c after recent changes to get rid of unnecessary labels and goto statements. Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 7f0d4343af1b..5a6ec8678b9a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,25 +108,27 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +static bool __check_frozen_processes(void) +{ + struct task_struct *g, *p; + + for_each_process_thread(g, p) + if (p != current && !freezer_should_skip(p) && !frozen(p)) + return false; + + return true; +} + /* * Returns true if all freezable tasks (except for current) are frozen already */ static bool check_frozen_processes(void) { - struct task_struct *g, *p; - bool ret = true; + bool ret; read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - if (p != current && !freezer_should_skip(p) && - !frozen(p)) { - ret = false; - goto done; - } - } -done: + ret = __check_frozen_processes(); read_unlock(&tasklist_lock); - return ret; } @@ -167,15 +169,14 @@ int freeze_processes(void) * on the way out so we have to double check for race. 
*/ if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { + !check_frozen_processes()) { __usermodehelper_set_disable_depth(UMH_ENABLED); printk("OOM in progress."); error = -EBUSY; - goto done; + } else { + printk("done."); } - printk("done."); } -done: printk("\n"); BUG_ON(in_atomic()); -- cgit v1.2.3 From b2c4623dcd07af4b8ae3b56ae5f879e281c7b4f8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Oct 2014 10:00:05 -0700 Subject: rcu: More on deadlock between CPU hotplug and expedited grace periods Commit dd56af42bd82 (rcu: Eliminate deadlock between CPU hotplug and expedited grace periods) was incomplete. Although it did eliminate deadlocks involving synchronize_sched_expedited()'s acquisition of cpu_hotplug.lock via get_online_cpus(), it did nothing about the similar deadlock involving acquisition of this same lock via put_online_cpus(). This deadlock became apparent with testing involving hibernation. This commit therefore changes put_online_cpus() acquisition of this lock to be conditional, and increments a new cpu_hotplug.puts_pending field in case of acquisition failure. Then cpu_hotplug_begin() checks for this new field being non-zero, and applies any changes to cpu_hotplug.refcount. Reported-by: Jiri Kosina Signed-off-by: Paul E. McKenney Tested-by: Jiri Kosina Tested-by: Borislav Petkov --- kernel/cpu.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 356450f09c1f..90a3d017b90c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -64,6 +64,8 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + /* And allows lockless put_online_cpus(). 
*/ + atomic_t puts_pending; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -113,7 +115,11 @@ void put_online_cpus(void) { if (cpu_hotplug.active_writer == current) return; - mutex_lock(&cpu_hotplug.lock); + if (!mutex_trylock(&cpu_hotplug.lock)) { + atomic_inc(&cpu_hotplug.puts_pending); + cpuhp_lock_release(); + return; + } if (WARN_ON(!cpu_hotplug.refcount)) cpu_hotplug.refcount++; /* try to fix things up */ @@ -155,6 +161,12 @@ void cpu_hotplug_begin(void) cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); + if (atomic_read(&cpu_hotplug.puts_pending)) { + int delta; + + delta = atomic_xchg(&cpu_hotplug.puts_pending, 0); + cpu_hotplug.refcount -= delta; + } if (likely(!cpu_hotplug.refcount)) break; __set_current_state(TASK_UNINTERRUPTIBLE); -- cgit v1.2.3 From 8252ecf346474cfe46315bd0a7ca655c293c34a9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 24 Oct 2014 14:56:01 -0400 Subject: ftrace: Set ops->old_hash on modifying what an ops hooks to The code that checks for trampolines when modifying function hooks tests against a modified ops "old_hash". But the ops old_hash pointer is not being updated before the changes are made, making it possible to not find the right hash to the callback and possibly causing ftrace to break in accounting and disable itself. Have the ops set its old_hash before the modifying takes place. 
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fb186b9ddf51..483b8c1b1de0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2293,10 +2293,13 @@ static void ftrace_run_update_code(int command) FTRACE_WARN_ON(ret); } -static void ftrace_run_modify_code(struct ftrace_ops *ops, int command) +static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, + struct ftrace_hash *old_hash) { ops->flags |= FTRACE_OPS_FL_MODIFYING; + ops->old_hash.filter_hash = old_hash; ftrace_run_update_code(command); + ops->old_hash.filter_hash = NULL; ops->flags &= ~FTRACE_OPS_FL_MODIFYING; } @@ -3340,7 +3343,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = static int ftrace_probe_registered; -static void __enable_ftrace_function_probe(void) +static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) { int ret; int i; @@ -3348,7 +3351,8 @@ static void __enable_ftrace_function_probe(void) if (ftrace_probe_registered) { /* still need to update the function call sites */ if (ftrace_enabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + old_hash); return; } @@ -3477,13 +3481,14 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } while_for_each_ftrace_rec(); ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + __enable_ftrace_function_probe(old_hash); + if (!ret) free_ftrace_hash_rcu(old_hash); else count = ret; - __enable_ftrace_function_probe(); - out_unlock: mutex_unlock(&ftrace_lock); out: @@ -3764,10 +3769,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops) +static void ftrace_ops_update_code(struct ftrace_ops *ops, + 
struct ftrace_hash *old_hash) { if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) - ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS); + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); } static int @@ -3813,7 +3819,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, old_hash = *orig_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); if (!ret) { - ftrace_ops_update_code(ops); + ftrace_ops_update_code(ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); @@ -4058,7 +4064,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); if (!ret) { - ftrace_ops_update_code(iter->ops); + ftrace_ops_update_code(iter->ops, old_hash); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); -- cgit v1.2.3 From 4fc409048d5afb1ad853f294b4262ecf2c980a49 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 24 Oct 2014 14:48:35 -0400 Subject: ftrace: Fix checking of trampoline ftrace_ops in finding trampoline When modifying code, ftrace has several checks to make sure things are being done correctly. One of them is to make sure any code it modifies is exactly what it expects it to be before it modifies it. In order to do so with the new trampoline logic, it must be able to find out what trampoline a function is hooked to in order to see if the code that hooks to it is what's expected. The logic to find the trampoline from a record (accounting descriptor for a function that is hooked) needs to only look at the "old_hash" of an ops that is being modified. The old_hash is the list of function an ops is hooked to before its update. Since a record would only be pointing to an ops that is being modified if it was already hooked before. Currently, it can pick a modified ops based on its new functions it will be hooked to, and this picks the wrong trampoline and causes the check to fail, disabling ftrace. 
Signed-off-by: Steven Rostedt ftrace: squash into ordering of ops for modification --- kernel/trace/ftrace.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 483b8c1b1de0..31c90fec4158 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1925,8 +1925,16 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) * when we are adding another op to the rec or removing the * current one. Thus, if the op is being added, we can * ignore it because it hasn't attached itself to the rec - * yet. That means we just need to find the op that has a - * trampoline and is not beeing added. + * yet. + * + * If an ops is being modified (hooking to different functions) + * then we don't care about the new functions that are being + * added, just the old ones (that are probably being removed). + * + * If we are adding an ops to a function that already is using + * a trampoline, it needs to be removed (trampolines are only + * for single ops connected), then an ops that is not being + * modified also needs to be checked. */ do_for_each_ftrace_op(op, ftrace_ops_list) { @@ -1940,17 +1948,23 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) if (op->flags & FTRACE_OPS_FL_ADDING) continue; + /* - * If the ops is not being added and has a trampoline, - * then it must be the one that we want! + * If the ops is being modified and is in the old + * hash, then it is probably being removed from this + * function. */ - if (hash_contains_ip(ip, op->func_hash)) - return op; - - /* If the ops is being modified, it may be in the old hash. */ if ((op->flags & FTRACE_OPS_FL_MODIFYING) && hash_contains_ip(ip, &op->old_hash)) return op; + /* + * If the ops is not being added or modified, and it's + * in its normal filter hash, then this must be the one + * we want! 
+ */ + if (!(op->flags & FTRACE_OPS_FL_MODIFYING) && + hash_contains_ip(ip, op->func_hash)) + return op; } while_for_each_ftrace_op(op); -- cgit v1.2.3 From 6891c4509c792209c44ced55a60f13954cb50ef4 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 4 Oct 2014 23:06:39 +0200 Subject: posix-timers: Fix stack info leak in timer_create() If userland creates a timer without specifying a sigevent info, we'll create one ourself, using a stack local variable. Particularly will we use the timer ID as sival_int. But as sigev_value is a union containing a pointer and an int, that assignment will only partially initialize sigev_value on systems where the size of a pointer is bigger than the size of an int. On such systems we'll copy the uninitialized stack bytes from the timer_create() call to userland when the timer actually fires and we're going to deliver the signal. Initialize sigev_value with 0 to plug the stack info leak. Found in the PaX patch, written by the PaX Team. Fixes: 5a9fa7307285 ("posix-timers: kill ->it_sigev_signo and...") Signed-off-by: Mathias Krause Cc: Oleg Nesterov Cc: Brad Spengler Cc: PaX Team Cc: # v2.6.28+ Link: http://lkml.kernel.org/r/1412456799-32339-1-git-send-email-minipli@googlemail.com Signed-off-by: Thomas Gleixner --- kernel/time/posix-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 42b463ad90f2..31ea01f42e1f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } } else { + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); event.sigev_notify = SIGEV_SIGNAL; event.sigev_signo = SIGALRM; event.sigev_value.sival_int = new_timer->it_id; -- cgit v1.2.3 From 10632008b9e18b76cbff0ffc69c15e948aa548e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 20 Oct 2014 15:07:50 +0400 Subject: clockevents: Prevent shift out of bounds 
Andrey reported that on a kernel with UBSan enabled he found: UBSan: Undefined behaviour in ../kernel/time/clockevents.c:75:34 I guess it should be 1ULL here instead of 1U: (!ismax || evt->mult <= (1U << evt->shift))) That's indeed the correct solution because shift might be 32. Reported-by: Andrey Ryabinin Cc: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9c94c19f1305..55449909f114 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && - (!ismax || evt->mult <= (1U << evt->shift))) + (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); -- cgit v1.2.3 From 993b2ff221999066fcff231590593d0b98f45d32 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 23 Oct 2014 20:27:00 -0700 Subject: futex: Mention key referencing differences between shared and private futexes Update our documentation as of fix 76835b0ebf8 (futex: Ensure get_futex_key_refs() always implies a barrier). Explicitly state that we don't do key referencing for private futexes. Signed-off-by: Davidlohr Bueso Cc: Matteo Franchin Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Darren Hart Cc: Peter Zijlstra Cc: Paul E. 
McKenney Acked-by: Catalin Marinas Link: http://lkml.kernel.org/r/1414121220.817.0.camel@linux-t7sj.site Signed-off-by: Thomas Gleixner --- kernel/futex.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f3a3a071283c..bbf071f325b8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -143,9 +143,8 @@ * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see hb_waiters_inc) and where (B) orders the write - * to futex and the waiters read -- this is done by the barriers in - * get_futex_key_refs(), through either ihold or atomic_inc, depending on the - * futex type. + * to futex and the waiters read -- this is done by the barriers for both + * shared and private futexes in get_futex_key_refs(). * * This yields the following case (where X:=waiters, Y:=futex): * @@ -344,13 +343,20 @@ static void get_futex_key_refs(union futex_key *key) futex_get_mm(key); /* implies MB (B) */ break; default: + /* + * Private futexes do not hold reference on an inode or + * mm, therefore the only purpose of calling get_futex_key_refs + * is because we need the barrier for the lockless waiter check. + */ smp_mb(); /* explicit MB (B) */ } } /* * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. + * The hash bucket spinlock must not be held. This is + * a no-op for private futexes, see comment in the get + * counterpart. */ static void drop_futex_key_refs(union futex_key *key) { -- cgit v1.2.3 From 30a6b8031fe14031ab27c1fa3483cb9780e7f63c Mon Sep 17 00:00:00 2001 From: Brian Silverman Date: Sat, 25 Oct 2014 20:20:37 -0400 Subject: futex: Fix a race condition between REQUEUE_PI and task death free_pi_state and exit_pi_state_list both clean up futex_pi_state's. exit_pi_state_list takes the hb lock first, and most callers of free_pi_state do too. 
requeue_pi doesn't, which means free_pi_state can free the pi_state out from under exit_pi_state_list. For example:

task A                            | task B
exit_pi_state_list                |
  pi_state =                      |
      curr->pi_state_list->next   |
                                  | futex_requeue(requeue_pi=1)
                                  |   // pi_state is the same as
                                  |   // the one in task A
                                  |   free_pi_state(pi_state)
                                  |     list_del_init(&pi_state->list)
                                  |     kfree(pi_state)
  list_del_init(&pi_state->list)  |

Move the free_pi_state calls in requeue_pi to before it drops the hb locks which it's already holding. [ tglx: Removed a pointless free_pi_state() call and the hb->lock held debugging. The latter comes via a separate patch ] Signed-off-by: Brian Silverman Cc: austin.linux@gmail.com Cc: darren@dvhart.com Cc: peterz@infradead.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1414282837-23092-1-git-send-email-bsilver16384@gmail.com Signed-off-by: Thomas Gleixner --- kernel/futex.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bbf071f325b8..63678b573d61 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -647,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +/* + * Must be called with the hb lock held. + */ static void free_pi_state(struct futex_pi_state *pi_state) { + if (!pi_state) + return; + if (!atomic_dec_and_test(&pi_state->refcount)) return; @@ -1527,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } retry: - if (pi_state != NULL) { - /* - * We will have to lookup the pi_state again, so free this one - * to keep the accounting correct.
- */ - free_pi_state(pi_state); - pi_state = NULL; - } - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); if (unlikely(ret != 0)) goto out; @@ -1625,6 +1622,8 @@ retry_private: case 0: break; case -EFAULT: + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1640,6 +1639,8 @@ retry_private: * exit to complete. * - The user space value changed. */ + free_pi_state(pi_state); + pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1716,6 +1717,7 @@ retry_private: } out_unlock: + free_pi_state(pi_state); double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); @@ -1733,8 +1735,6 @@ out_put_keys: out_put_key1: put_futex_key(&key1); out: - if (pi_state != NULL) - free_pi_state(pi_state); return ret ? ret : task_count; } -- cgit v1.2.3 From 94fb823fcb4892614f57e59601bb9d4920f24711 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 Oct 2014 20:29:10 +0300 Subject: PM / Sleep: fix recovery during resuming from hibernation If a device's dev_pm_ops::freeze callback fails during the QUIESCE phase, we don't rollback things correctly calling the thaw and complete callbacks. This could leave some devices in a suspended state in case of an error during resuming from hibernation. Signed-off-by: Imre Deak Cc: All applicable Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a9dfa79b6bab..1f35a3478f3c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode) error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); - dpm_resume_end(PMSG_RECOVER); + /* + * The above should either succeed and jump to the new kernel, + * or return with an error. 
Otherwise things are just + * undefined, so let's be paranoid. + */ + BUG_ON(!error); } + dpm_resume_end(PMSG_RECOVER); pm_restore_gfp_mask(); resume_console(); pm_restore_console(); -- cgit v1.2.3 From f89b7755f517cdbb755d7543eef986ee9d54e654 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 23 Oct 2014 18:41:08 -0700 Subject: bpf: split eBPF out of NET introduce two configs: - hidden CONFIG_BPF to select eBPF interpreter that classic socket filters depend on - visible CONFIG_BPF_SYSCALL (default off) that tracing and sockets can use that solves several problems: - tracing and others that wish to use eBPF don't need to depend on NET. They can use BPF_SYSCALL to allow loading from userspace or select BPF to use it directly from kernel in NET-less configs. - in 3.18 programs cannot be attached to events yet, so don't force it on - when the rest of eBPF infra is there in 3.19+, it's still useful to switch it off to minimize kernel size bloat-o-meter on x64 shows: add/remove: 0/60 grow/shrink: 0/2 up/down: 0/-15601 (-15601) tested with many different config combinations. Hopefully didn't miss anything. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- init/Kconfig | 14 ++++++++++++++ kernel/Makefile | 2 +- kernel/bpf/Makefile | 6 +++--- kernel/bpf/core.c | 9 +++++++++ net/Kconfig | 2 +- 5 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 3ee28ae02cc8..2081a4d3d917 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1341,6 +1341,10 @@ config SYSCTL_ARCH_UNALIGN_ALLOW config HAVE_PCSPKR_PLATFORM bool +# interpreter that classic socket filters depend on +config BPF + bool + menuconfig EXPERT bool "Configure standard kernel features (expert users)" # Unhide debug options, to make the on-by-default options visible @@ -1521,6 +1525,16 @@ config EVENTFD If unsure, say Y. 
+# syscall, maps, verifier +config BPF_SYSCALL + bool "Enable bpf() system call" if EXPERT + select ANON_INODES + select BPF + default n + help + Enable the bpf() system call that allows to manipulate eBPF + programs and maps via file descriptors. + config SHMEM bool "Use full shmem filesystem" if EXPERT default y diff --git a/kernel/Makefile b/kernel/Makefile index dc5c77544fd6..17ea6d4a9a24 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o -obj-$(CONFIG_NET) += bpf/ +obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 45427239f375..0daf7f6ae7df 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ -obj-y := core.o syscall.o verifier.o - +obj-y := core.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o ifdef CONFIG_TEST_BPF -obj-y += test_stub.o +obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f0c30c59b317..d6594e457a25 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -655,3 +655,12 @@ void bpf_prog_free(struct bpf_prog *fp) schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); + +/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call + * skb_copy_bits(), so provide a weak definition of it for NET-less config. + */ +int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, + int len) +{ + return -EFAULT; +} diff --git a/net/Kconfig b/net/Kconfig index 6272420a721b..99815b5454bf 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -6,7 +6,7 @@ menuconfig NET bool "Networking support" select NLATTR select GENERIC_NET_UTILS - select ANON_INODES + select BPF ---help--- Unless you really know what you are doing, you should say Y here. 
The reason is that some programs need kernel networking support even -- cgit v1.2.3 From eeb61e53ea19be0c4015b00b2e8b3b2185436f2b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 27 Oct 2014 14:18:25 +0400 Subject: sched: Fix race between task_group and sched_task_group The race may happen when somebody is changing task_group of a forking task. Child's cgroup is the same as parent's after dup_task_struct() (there just memory copying). Also, cfs_rq and rt_rq are the same as parent's. But if parent changes its task_group before it's called cgroup_post_fork(), we do not reflect this situation on child. Child's cfs_rq and rt_rq remain the same, while child's task_group changes in cgroup_post_fork(). To fix this we introduce fork() method, which calls sched_move_task() directly. This function changes sched_task_group on appropriate (also its logic has no problem with freshly created tasks, so we shouldn't introduce something special; we are able just to use it). Possibly, this decides the Burke Libbey's problem: https://lkml.org/lkml/2014/10/24/456 Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414405105.19914.169.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44999505e1bf..dde8adb7d0c0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7833,6 +7833,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } +static void cpu_cgroup_fork(struct task_struct *task) +{ + sched_move_task(task); +} + static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { @@ -8205,6 +8210,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, .css_offline = cpu_cgroup_css_offline, + .fork = cpu_cgroup_fork, .can_attach = 
cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, -- cgit v1.2.3 From 64be6f1f5f710f5995d41caf8a1767fe6d2b5a87 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 24 Oct 2014 10:16:37 +0100 Subject: sched/deadline: Don't replenish from a !SCHED_DEADLINE entity In the deboost path, right after the dl_boosted flag has been reset, we can currently end up replenishing using -deadline parameters of a !SCHED_DEADLINE entity. This of course causes a bug, as those parameters are empty. In the case depicted above it is safe to simply bail out, as the deboosted task is going to be back to its original scheduling class anyway. Reported-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: vincent@legout.info Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Fabio Checconi Link: http://lkml.kernel.org/r/1414142198-18552-4-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 256e577faf1b..92279eaf0ef2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -847,8 +847,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { pi_se = &pi_task->dl; + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task + * that is going to be deboosted, but exceedes its + * runtime while doing so. No point in replenishing + * it, as it's going to return back to its original + * scheduling class after this. + */ + BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + return; + } /* * If p is throttled, we do nothing. 
In fact, if it exhausted -- cgit v1.2.3 From aee38ea95419c818dfdde52b115aeffe9cbb259b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 24 Oct 2014 10:16:38 +0100 Subject: sched/deadline: Fix races between rt_mutex_setprio() and dl_task_timer() dl_task_timer() is racy against several paths. Daniel noticed that the replenishment timer may experience a race condition against an enqueue_dl_entity() called from rt_mutex_setprio(). In his own words: rt_mutex_setprio() resets p->dl.dl_throttled. So the pattern is: start_dl_timer() throttled = 1, rt_mutex_setprio() throttled = 0, sched_switch() -> enqueue_task(), dl_task_timer() -> enqueue_task() throttled is 0 => BUG_ON(on_dl_rq(dl_se)) fires as the scheduling entity is already enqueued on the -deadline runqueue. As we do for the other races, we just bail out in the replenishment timer code. Reported-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: vincent@legout.info Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Fabio Checconi Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414142198-18552-5-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 92279eaf0ef2..46167899d852 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -518,12 +518,20 @@ again: } /* - * We need to take care of a possible races here. In fact, the - * task might have changed its scheduling policy to something - * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setattr()).
+ * We need to take care of several possible races here: + * + * - the task might have changed its scheduling policy + * to something different than SCHED_DEADLINE + * - the task might have changed its reservation parameters + * (through sched_setattr()) + * - the task might have been boosted by someone else and + * might be in the boosting/deboosting path + * + * In all this cases we bail out, as the task is already + * in the runqueue or is going to be enqueued back anyway. */ - if (!dl_task(p) || dl_se->dl_new) + if (!dl_task(p) || dl_se->dl_new || + dl_se->dl_boosted || !dl_se->dl_throttled) goto unlock; sched_clock_tick(); -- cgit v1.2.3 From 1effd9f19324efb05fccc7421530e11a52db0278 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 22 Oct 2014 11:17:11 +0400 Subject: sched/numa: Fix unsafe get_task_struct() in task_numa_assign() Unlocked access to dst_rq->curr in task_numa_compare() is racy. If the curr task is exiting, this may cause a use-after-free:

task_numa_compare()                    do_exit()
    ...                                    current->flags |= PF_EXITING;
    ...                                    release_task()
    ...                                        ~~delayed_put_task_struct()~~
    ...                                    schedule()
    rcu_read_lock()                        ...
    cur = ACCESS_ONCE(dst_rq->curr)        ...
        ...                                rq->curr = next;
        ...                                    context_switch()
        ...                                        finish_task_switch()
        ...                                            put_task_struct()
        ...                                                __put_task_struct()
        ...                                                    free_task_struct()
    task_numa_assign()                     ...
        get_task_struct()                  ...

As noted by Oleg: <<task_numa_assign() path does get_task_struct(dst_rq->curr) and this is not safe. The task_struct itself can't go away, but rcu_read_lock() can't save us from the final put_task_struct() in finish_task_switch(); this reference goes away without rcu gp>> The patch provides a simple check of the PF_EXITING flag. If it's not set, this guarantees that call_rcu() of the delayed_put_task_struct() callback hasn't happened yet, so we can safely do get_task_struct() in task_numa_assign(). Locked dst_rq->lock protects from concurrency with the last schedule(). Reusing or unmapping of cur's memory may happen without it.
Suggested-by: Oleg Nesterov Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413962231.19914.130.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b069bf3e708..fbc0b8214af0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1164,9 +1164,19 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ + + raw_spin_lock_irq(&dst_rq->lock); + cur = dst_rq->curr; + /* + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). + */ + if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + raw_spin_unlock_irq(&dst_rq->lock); /* * "imp" is the fault differential for the source task between the -- cgit v1.2.3 From 2847c90e1b3ae95379af24894fc4f98e7f2fd705 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 22 Oct 2014 16:04:35 +0900 Subject: sched/fair: Care divide error in update_task_scan_period() While offlining a node by hot-removing memory, the following divide error occurs: divide error: 0000 [#1] SMP [...] Call Trace: [...] handle_mm_fault [...] ? try_to_wake_up [...] ? wake_up_state [...] __do_page_fault [...] ? do_futex [...] ? put_prev_entity [...] ? __switch_to [...] do_page_fault [...] page_fault [...] RIP [] task_numa_fault RSP The issue occurs as follows: 1.
When a page fault occurs and a page is allocated from node 1, task_struct->numa_faults_buffer_memory[] of node 1 is incremented and p->numa_faults_locality[] is also incremented as follows:

  o numa_faults_buffer_memory[]       o numa_faults_locality[]
           NR_NUMA_HINT_FAULT_TYPES
          |      0     |     1     |
  ----------------------------------  ----------------------
   node 0 |      0     |     0     |   remote |      0     |
   node 1 |      0     |     1     |   local  |      1     |
  ----------------------------------  ----------------------

2. Node 1 is offlined by hot-removing memory. 3. When a page fault occurs, fault_types[] is calculated by using p->numa_faults_buffer_memory[] of all online nodes in task_numa_placement(). But node 1 was offlined in step 2. So fault_types[] is calculated by using only p->numa_faults_buffer_memory[] of node 0. So both entries of fault_types[] are set to 0. 4. The values (0) of fault_types[] are passed to update_task_scan_period(). 5. numa_faults_locality[1] is set to 1. So the following division is calculated.

static void update_task_scan_period(struct task_struct *p, unsigned long shared, unsigned long private){
	...
	ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
}

6. But both private and shared are set to 0. So a divide error occurs here. The divide error is a rare case because the trigger is a node going offline. This patch always increments the denominator to avoid the divide error.
Signed-off-by: Yasuaki Ishimatsu Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/54475703.8000505@jp.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fbc0b8214af0..e9abd4e4c5cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1530,7 +1530,7 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } -- cgit v1.2.3 From 6419265899d9bd27e5ff9f8b43db3715407fc2ba Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 16 Oct 2014 14:39:37 +0400 Subject: sched/fair: Fix division by zero sysctl_numa_balancing_scan_size File /proc/sys/kernel/numa_balancing_scan_size_mb allows writing of zero. 
This bash command reproduces problem: $ while :; do echo 0 > /proc/sys/kernel/numa_balancing_scan_size_mb; \ echo 256 > /proc/sys/kernel/numa_balancing_scan_size_mb; done divide error: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 24112 Comm: bash Not tainted 3.17.0+ #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff88013c852600 ti: ffff880037a68000 task.ti: ffff880037a68000 RIP: 0010:[] [] task_scan_min+0x21/0x50 RSP: 0000:ffff880037a6bce0 EFLAGS: 00010246 RAX: 0000000000000a00 RBX: 00000000000003e8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88013c852600 RBP: ffff880037a6bcf0 R08: 0000000000000001 R09: 0000000000015c90 R10: ffff880239bf6c00 R11: 0000000000000016 R12: 0000000000003fff R13: ffff88013c852600 R14: ffffea0008d1b000 R15: 0000000000000003 FS: 00007f12bb048700(0000) GS:ffff88007da00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000001505678 CR3: 0000000234770000 CR4: 00000000000006f0 Stack: ffff88013c852600 0000000000003fff ffff880037a6bd18 ffffffff810741d1 ffff88013c852600 0000000000003fff 000000000002bfff ffff880037a6bda8 ffffffff81077ef7 ffffea0008a56d40 0000000000000001 0000000000000001 Call Trace: [] task_scan_max+0x11/0x40 [] task_numa_fault+0x1f7/0xae0 [] ? migrate_misplaced_page+0x276/0x300 [] handle_mm_fault+0x62d/0xba0 [] __do_page_fault+0x191/0x510 [] ? native_smp_send_reschedule+0x42/0x60 [] ? check_preempt_curr+0x80/0xa0 [] ? wake_up_new_task+0x11c/0x1a0 [] ? do_fork+0x14d/0x340 [] ? get_unused_fd_flags+0x2b/0x30 [] ? __fd_install+0x1f/0x60 [] do_page_fault+0xc/0x10 [] page_fault+0x22/0x30 RIP [] task_scan_min+0x21/0x50 RSP ---[ end trace 9a826d16936c04de ]--- Also fix race in task_scan_min (it depends on compiler behaviour). Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Tomlin Cc: Andrew Morton Cc: Dario Faggioli Cc: David Rientjes Cc: Jens Axboe Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Rik van Riel Link: http://lkml.kernel.org/r/1413455977.24793.78.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- kernel/sysctl.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9abd4e4c5cb..34baa60f8a7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + if (scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / scan_size; floor = 1000 / windows; scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4aada6d9fe74..15f2511a1b7c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_numa_balancing_scan_size, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "numa_balancing", -- cgit v1.2.3 From 009f60e2763568cdcd75bd1cf360c7c7165e2e60 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 5 Oct 2014 22:23:22 +0200 Subject: sched: stop the unbound recursion in preempt_schedule_context() preempt_schedule_context() does preempt_enable_notrace() at the end and this can call the same function again; exception_exit() is heavy and it is quite possible that need-resched is true again. 1. Change this code to dec preempt_count() and check need_resched() by hand. 2. As Linus suggested, we can use the PREEMPT_ACTIVE bit and avoid the enable/disable dance around __schedule(). 
But in this case we need to move it into sched/core.c. 3. Cosmetic, but x86 forgets to declare this function. This doesn't really matter because it is only called by asm helpers; still, it makes sense to add the declaration into asm/preempt.h to match preempt_schedule(). Reported-by: Sasha Levin Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Graf Cc: Andrew Morton Cc: Christoph Lameter Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Steven Rostedt Cc: Peter Anvin Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Chuck Ebbert Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20141005202322.GB27962@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 1 + kernel/context_tracking.c | 40 ---------------------------------------- kernel/sched/core.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 7024c12f7bfe..400873450e33 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -105,6 +105,7 @@ static __always_inline bool should_resched(void) # ifdef CONFIG_CONTEXT_TRACKING extern asmlinkage void ___preempt_schedule_context(void); # define __preempt_schedule_context() asm ("call ___preempt_schedule_context") + extern asmlinkage void preempt_schedule_context(void); # endif #endif diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a0..937ecdfdf258 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -107,46 +107,6 @@ void context_tracking_user_enter(void) } NOKPROBE_SYMBOL(context_tracking_user_enter); -#ifdef CONFIG_PREEMPT -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. 
But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - /* - * Need to disable preemption in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - preempt_disable_notrace(); - prev_ctx = exception_enter(); - preempt_enable_no_resched_notrace(); - - preempt_schedule(); - - preempt_disable_notrace(); - exception_exit(prev_ctx); - preempt_enable_notrace(); -} -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_PREEMPT */ - /** * context_tracking_user_exit - Inform the context tracking that the CPU is * exiting userspace mode and entering the kernel. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dde8adb7d0c0..240157c13ddc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. 
+ * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + do { + __preempt_count_add(PREEMPT_ACTIVE); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + __schedule(); + exception_exit(prev_ctx); + + __preempt_count_sub(PREEMPT_ACTIVE); + barrier(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_CONTEXT_TRACKING */ + #endif /* CONFIG_PREEMPT */ /* -- cgit v1.2.3 From f3a7e1a9c464a32ee186ab91388313c82e7ce018 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 21 Oct 2014 20:35:56 +0400 Subject: sched/dl: Fix preemption checks 1) switched_to_dl() check is wrong. We reschedule only if rq->curr is deadline task, and we do not reschedule if it's a lower priority task. But we must always preempt a task of other classes. 2) dl_task_timer(): Policy does not change in case of priority inheritance. rt_mutex_setprio() changes prio, while policy remains old. So we lose some balancing logic in dl_task_timer() and switched_to_dl() when we check policy instead of priority. Boosted task may be rq->curr. (I didn't change switched_from_dl() because no check is necessary there at all). I've looked at this place(switched_to_dl) several times and even fixed this function, but found just now... I suppose some performance tests may work better after this. 
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413909356.19914.128.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 46167899d852..5285332392d5 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -540,7 +540,7 @@ again: dl_se->dl_yielded = 0; if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (task_has_dl_policy(rq->curr)) + if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); @@ -1626,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ - if (check_resched && task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (check_resched) { + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + } } } -- cgit v1.2.3 From c719f56092add9b3d4192f57c64ce7af11105130 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Oct 2014 11:10:21 +0200 Subject: perf: Fix and clean up initialization of pmu::event_idx Andy reported that the current state of event_idx is rather confused. So remove all but the x86_pmu implementation and change the default to return 0 (the safe option). 
Reported-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Christoph Lameter Cc: Cody P Schafer Cc: Cody P Schafer Cc: Heiko Carstens Cc: Hendrik Brueckner Cc: Himangi Saraogi Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Gortmaker Cc: Paul Mackerras Cc: sukadev@linux.vnet.ibm.com Cc: Thomas Huth Cc: Vince Weaver Cc: linux390@de.ibm.com Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/perf/hv-24x7.c | 6 ------ arch/powerpc/perf/hv-gpci.c | 6 ------ arch/s390/kernel/perf_cpum_sf.c | 6 ------ kernel/events/core.c | 15 +-------------- kernel/events/hw_breakpoint.c | 7 ------- 5 files changed, 1 insertion(+), 39 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 6c8710dd90c9..dba34088da28 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -417,11 +417,6 @@ static int h_24x7_event_add(struct perf_event *event, int flags) return 0; } -static int h_24x7_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu h_24x7_pmu = { .task_ctx_nr = perf_invalid_context, @@ -433,7 +428,6 @@ static struct pmu h_24x7_pmu = { .start = h_24x7_event_start, .stop = h_24x7_event_stop, .read = h_24x7_event_update, - .event_idx = h_24x7_event_idx, }; static int hv_24x7_init(void) diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 15fc76c93022..a051fe946c63 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -246,11 +246,6 @@ static int h_gpci_event_init(struct perf_event *event) return 0; } -static int h_gpci_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu h_gpci_pmu = { .task_ctx_nr = perf_invalid_context, @@ -262,7 +257,6 @@ static struct pmu h_gpci_pmu = { .start = h_gpci_event_start, .stop = h_gpci_event_stop, .read = h_gpci_event_update, - .event_idx = 
h_gpci_event_idx, }; static int hv_gpci_init(void) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 08e761318c17..b878f12a9597 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1411,11 +1411,6 @@ static void cpumsf_pmu_del(struct perf_event *event, int flags) perf_pmu_enable(event->pmu); } -static int cpumsf_pmu_event_idx(struct perf_event *event) -{ - return event->hw.idx; -} - CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC, PERF_EVENT_CPUM_SF); CPUMF_EVENT_ATTR(SF, SF_CYCLES_BASIC_DIAG, PERF_EVENT_CPUM_SF_DIAG); @@ -1458,7 +1453,6 @@ static struct pmu cpumf_sampling = { .stop = cpumsf_pmu_stop, .read = cpumsf_pmu_read, - .event_idx = cpumsf_pmu_event_idx, .attr_groups = cpumsf_pmu_attr_groups, }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1425d07018de..2b02c9fda790 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6071,11 +6071,6 @@ static int perf_swevent_init(struct perf_event *event) return 0; } -static int perf_swevent_event_idx(struct perf_event *event) -{ - return 0; -} - static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -6085,8 +6080,6 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -6204,8 +6197,6 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, - - .event_idx = perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -6431,8 +6422,6 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; /* @@ -6511,8 +6500,6 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, - - .event_idx = perf_swevent_event_idx, }; 
static void perf_pmu_nop_void(struct pmu *pmu) @@ -6542,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) static int perf_event_idx_default(struct perf_event *event) { - return event->hw.idx + 1; + return 0; } /* diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 1559fb0b9296..9803a6600d49 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } -static int hw_breakpoint_event_idx(struct perf_event *bp) -{ - return 0; -} - static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, - - .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) -- cgit v1.2.3 From d7e29933969e5ca7c112ce1368a07911f4485dc2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Oct 2014 09:15:54 -0700 Subject: rcu: Make rcu_barrier() understand about missing rcuo kthreads Commit 35ce7f29a44a (rcu: Create rcuo kthreads only for onlined CPUs) avoids creating rcuo kthreads for CPUs that never come online. This fixes a bug in many instances of firmware: Instead of lying about their age, these systems instead lie about the number of CPUs that they have. Before commit 35ce7f29a44a, this could result in huge numbers of useless rcuo kthreads being created. It appears that experience indicates that I should have told the people suffering from this problem to fix their broken firmware, but I instead produced what turned out to be a partial fix. The missing piece supplied by this commit makes sure that rcu_barrier() knows not to post callbacks for no-CBs CPUs that have not yet come online, because otherwise rcu_barrier() will hang on systems having firmware that lies about the number of CPUs. 
It is tempting to simply have rcu_barrier() refuse to post a callback on any no-CBs CPU that does not have an rcuo kthread. This unfortunately does not work because rcu_barrier() is required to wait for all pending callbacks. It is therefore required to wait even for those callbacks that cannot possibly be invoked. Even if doing so hangs the system. Given that posting a callback to a no-CBs CPU that does not yet have an rcuo kthread can hang rcu_barrier(), it is tempting to report an error in this case. Unfortunately, this will result in false positives at boot time, when it is perfectly legal to post callbacks to the boot CPU before the scheduler has started, in other words, before it is legal to invoke rcu_barrier(). So this commit instead has rcu_barrier() avoid posting callbacks to CPUs having neither rcuo kthread nor pending callbacks, and has it complain bitterly if it finds CPUs having no rcuo kthread but some pending callbacks. And when rcu_barrier() does find CPUs having no rcuo kthread but pending callbacks, as noted earlier, it has no choice but to hang indefinitely. Reported-by: Yanko Kaneti Reported-by: Jay Vosburgh Reported-by: Meelis Roos Reported-by: Eric B Munson Signed-off-by: Paul E. McKenney Tested-by: Eric B Munson Tested-by: Jay Vosburgh Tested-by: Yanko Kaneti Tested-by: Kevin Fenzi Tested-by: Meelis Roos --- include/trace/events/rcu.h | 18 +++++++++--------- kernel/rcu/tree.c | 15 ++++++++++----- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 9b56f37148cf..e335e7d8c6c2 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -660,18 +660,18 @@ TRACE_EVENT(rcu_torture_read, /* * Tracepoint for _rcu_barrier() execution. The string "s" describes * the _rcu_barrier phase: - * "Begin": rcu_barrier_callback() started. 
- * "Check": rcu_barrier_callback() checking for piggybacking. - * "EarlyExit": rcu_barrier_callback() piggybacked, thus early exit. - * "Inc1": rcu_barrier_callback() piggyback check counter incremented. - * "Offline": rcu_barrier_callback() found offline CPU - * "OnlineNoCB": rcu_barrier_callback() found online no-CBs CPU. - * "OnlineQ": rcu_barrier_callback() found online CPU with callbacks. - * "OnlineNQ": rcu_barrier_callback() found online CPU, no callbacks. + * "Begin": _rcu_barrier() started. + * "Check": _rcu_barrier() checking for piggybacking. + * "EarlyExit": _rcu_barrier() piggybacked, thus early exit. + * "Inc1": _rcu_barrier() piggyback check counter incremented. + * "OfflineNoCB": _rcu_barrier() found callback on never-online CPU + * "OnlineNoCB": _rcu_barrier() found online no-CBs CPU. + * "OnlineQ": _rcu_barrier() found online CPU with callbacks. + * "OnlineNQ": _rcu_barrier() found online CPU, no callbacks. * "IRQ": An rcu_barrier_callback() callback posted on remote CPU. * "CB": An rcu_barrier_callback() invoked a callback, not the last. * "LastCB": An rcu_barrier_callback() invoked the last callback. - * "Inc2": rcu_barrier_callback() piggyback check counter incremented. + * "Inc2": _rcu_barrier() piggyback check counter incremented. * The "cpu" argument is the CPU or -1 if meaningless, the "cnt" argument * is the count of remaining callbacks, and "done" is the piggybacking count. 
*/ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 133e47223095..9815447d22e0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3299,11 +3299,16 @@ static void _rcu_barrier(struct rcu_state *rsp) continue; rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - __call_rcu(&rdp->barrier_head, rcu_barrier_callback, - rsp, cpu, 0); + if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { + _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + rsp->n_barrier_done); + } else { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, + rcu_barrier_callback, rsp, cpu, 0); + } } else if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d03764652d91..bbdc45d8d74f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -587,6 +587,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 387dd4599344..c1d7f27bd38f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2049,6 +2049,33 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) } } +/* + * Does the specified CPU need an RCU callback for the specified flavor + * of rcu_barrier()? 
+ */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_head *rhp; + + /* No-CBs CPUs might have callbacks on any of three lists. */ + rhp = ACCESS_ONCE(rdp->nocb_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_gp_head); + if (!rhp) + rhp = ACCESS_ONCE(rdp->nocb_follower_head); + + /* Having no rcuo kthread but CBs after scheduler starts is bad! */ + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { + /* RCU callback enqueued before CPU first came online??? */ + pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", + cpu, rhp->func); + WARN_ON_ONCE(1); + } + + return !!rhp; +} + /* * Enqueue the specified string of rcu_head structures onto the specified * CPU's no-CBs lists. The CPU is specified by rdp, the head of the @@ -2642,6 +2669,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) +{ + WARN_ON_ONCE(1); /* Should be dead code. */ + return false; +} + static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { } -- cgit v1.2.3 From f601de204465048bdf0d5537f630729622ebc3a6 Mon Sep 17 00:00:00 2001 From: Riku Voipio Date: Wed, 29 Oct 2014 14:50:24 -0700 Subject: gcov: add ARM64 to GCOV_PROFILE_ALL Following up the arm testing of gcov, turns out gcov on ARM64 works fine as well. Only change needed is adding ARM64 to Kconfig depends. 
Tested with qemu and mach-virt Signed-off-by: Riku Voipio Acked-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index cf66c5c8458e..3b7408759bdf 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM + depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 default n ---help--- This options activates profiling for the entire kernel. -- cgit v1.2.3 From 0baf2a4dbf75abb7c186fd6c8d55d27aaa354a29 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 29 Oct 2014 14:50:35 -0700 Subject: kernel/kmod: fix use-after-free of the sub_info structure Found this in the message log on a s390 system: BUG kmalloc-192 (Not tainted): Poison overwritten Disabling lock debugging due to kernel taint INFO: 0x00000000684761f4-0x00000000684761f7. First byte 0xff instead of 0x6b INFO: Allocated in call_usermodehelper_setup+0x70/0x128 age=71 cpu=2 pid=648 __slab_alloc.isra.47.constprop.56+0x5f6/0x658 kmem_cache_alloc_trace+0x106/0x408 call_usermodehelper_setup+0x70/0x128 call_usermodehelper+0x62/0x90 cgroup_release_agent+0x178/0x1c0 process_one_work+0x36e/0x680 worker_thread+0x2f0/0x4f8 kthread+0x10a/0x120 kernel_thread_starter+0x6/0xc kernel_thread_starter+0x0/0xc INFO: Freed in call_usermodehelper_exec+0x110/0x1b8 age=71 cpu=2 pid=648 __slab_free+0x94/0x560 kfree+0x364/0x3e0 call_usermodehelper_exec+0x110/0x1b8 cgroup_release_agent+0x178/0x1c0 process_one_work+0x36e/0x680 worker_thread+0x2f0/0x4f8 kthread+0x10a/0x120 kernel_thread_starter+0x6/0xc kernel_thread_starter+0x0/0xc There is a use-after-free bug on the subprocess_info structure allocated by the user mode helper. 
In case do_execve() returns with an error ____call_usermodehelper() stores the error code to sub_info->retval, but sub_info can already have been freed. Regarding UMH_NO_WAIT, the sub_info structure can be freed by __call_usermodehelper() before the worker thread returns from do_execve(), allowing memory corruption when do_execve() failed after exec_mmap() is called. Regarding UMH_WAIT_EXEC, the call to umh_complete() allows call_usermodehelper_exec() to continue which then frees sub_info. To fix this race the code needs to make sure that the call to call_usermodehelper_freeinfo() is always done after the last store to sub_info->retval. Signed-off-by: Martin Schwidefsky Reviewed-by: Oleg Nesterov Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 76 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8637e041a247..80f7a6d00519 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...) EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. 
+ */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + /* * This is the task which runs the usermode application */ static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + int wait = sub_info->wait & ~UMH_KILLABLE; struct cred *new; int retval; @@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data) retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) - goto fail; + goto out; spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); @@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data) retval = sub_info->init(sub_info, new); if (retval) { abort_creds(new); - goto fail; + goto out; } } @@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data) retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ + if (wait != UMH_WAIT_PROC) + umh_complete(sub_info); if (!retval) return 0; - - /* Exec failed? */ -fail: - sub_info->retval = retval; do_exit(0); } @@ -258,26 +281,6 @@ static int call_helper(void *data) return ____call_usermodehelper(data); } -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - /* Keventd can't block, but this (a child) can. 
*/ static int wait_for_helper(void *data) { @@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work) kmod_thread_locker = NULL; } - switch (wait) { - case UMH_NO_WAIT: - call_usermodehelper_freeinfo(sub_info); - break; - - case UMH_WAIT_PROC: - if (pid > 0) - break; - /* FALLTHROUGH */ - case UMH_WAIT_EXEC: - if (pid < 0) - sub_info->retval = pid; + if (pid < 0) { + sub_info->retval = pid; umh_complete(sub_info); } } @@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) goto out; } - sub_info->complete = &done; + /* + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. + */ + sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); -- cgit v1.2.3 From 897f1acbb6702ddaa953e8d8436eee3b12016c7e Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Thu, 30 Oct 2014 11:22:53 -0400 Subject: audit: AUDIT_FEATURE_CHANGE message format missing delimiting space Add a space between subj= and feature= fields to make them parsable. 
Signed-off-by: Richard Guy Briggs Cc: stable@vger.kernel.org Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 53bb39bf79e2..8ee4508f9666 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -739,7 +739,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); audit_log_task_info(ab, current); - audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", + audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", audit_feature_names[which], !!old_feature, !!new_feature, !!old_lock, !!new_lock, res); audit_log_end(ab); -- cgit v1.2.3 From 086ba77a6db00ed858ff07451bedee197df868c9 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Wed, 29 Oct 2014 23:06:58 +0100 Subject: tracing/syscalls: Ignore numbers outside NR_syscalls' range ARM has some private syscalls (for example, set_tls(2)) which lie outside the range of NR_syscalls. If any of these are called while syscall tracing is being performed, out-of-bounds array access will occur in the ftrace and perf sys_{enter,exit} handlers. # trace-cmd record -e raw_syscalls:* true && trace-cmd report ... true-653 [000] 384.675777: sys_enter: NR 192 (0, 1000, 3, 4000022, ffffffff, 0) true-653 [000] 384.675812: sys_exit: NR 192 = 1995915264 true-653 [000] 384.675971: sys_enter: NR 983045 (76f74480, 76f74000, 76f74b28, 76f74480, 76f76f74, 1) true-653 [000] 384.675988: sys_exit: NR 983045 = 0 ... 
# trace-cmd record -e syscalls:* true [ 17.289329] Unable to handle kernel paging request at virtual address aaaaaace [ 17.289590] pgd = 9e71c000 [ 17.289696] [aaaaaace] *pgd=00000000 [ 17.289985] Internal error: Oops: 5 [#1] PREEMPT SMP ARM [ 17.290169] Modules linked in: [ 17.290391] CPU: 0 PID: 704 Comm: true Not tainted 3.18.0-rc2+ #21 [ 17.290585] task: 9f4dab00 ti: 9e710000 task.ti: 9e710000 [ 17.290747] PC is at ftrace_syscall_enter+0x48/0x1f8 [ 17.290866] LR is at syscall_trace_enter+0x124/0x184 Fix this by ignoring out-of-NR_syscalls-bounds syscall numbers. Commit cd0980fc8add "tracing: Check invalid syscall nr while tracing syscalls" added the check for less than zero, but it should have also checked for greater than NR_syscalls. Link: http://lkml.kernel.org/p/1414620418-29472-1-git-send-email-rabin@rab.in Fixes: cd0980fc8add "tracing: Check invalid syscall nr while tracing syscalls" Cc: stable@vger.kernel.org # 2.6.33+ Signed-off-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4dc8b79c5f75..29228c4d5696 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ @@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) int syscall_nr; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ @@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, 
struct pt_regs *regs, long id) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = trace_get_syscall_nr(current, regs); - if (syscall_nr < 0) + if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; -- cgit v1.2.3 From 403b9636fe9f59124d1a437a297b330729061252 Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Sat, 8 Nov 2014 19:17:13 +0300 Subject: PM / sleep: Fix entering suspend-to-IDLE if no freeze_ops is set If no freeze_ops is set, trying to enter suspend-to-IDLE will cause a nice oops in platform_suspend_prepare_late(). Add respective checks to platform_suspend_prepare_late() and platform_resume_early() functions. Fixes: a8d46b9e4e48 (ACPI / sleep: Rework the handling of ACPI GPE wakeup ...) Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4ca9a33ff620..c347e3ce3a55 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -146,7 +146,7 @@ static int platform_suspend_prepare(suspend_state_t state) static int platform_suspend_prepare_late(suspend_state_t state) { - return state == PM_SUSPEND_FREEZE && freeze_ops->prepare ? + return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ? 
freeze_ops->prepare() : 0; } @@ -164,7 +164,7 @@ static void platform_resume_noirq(suspend_state_t state) static void platform_resume_early(suspend_state_t state) { - if (state == PM_SUSPEND_FREEZE && freeze_ops->restore) + if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore) freeze_ops->restore(); } -- cgit v1.2.3 From e30f53aad2202b5526c40c36d8eeac8bf290bde5 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Mon, 10 Nov 2014 19:46:34 +0100 Subject: tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [] ? __wake_up_common+0x90/0x90 [] wait_on_pipe+0x35/0x40 [] tracing_buffers_splice_read+0x2e3/0x3c0 [] ? tracing_stats_read+0x2a0/0x2a0 [] ? _raw_spin_unlock+0x2b/0x40 [] ? do_read_fault+0x21b/0x290 [] ? handle_mm_fault+0x2ba/0xbd0 [] ? trace_event_buffer_lock_reserve+0x40/0x80 [] ? trace_buffer_lock_reserve+0x22/0x60 [] ? trace_event_buffer_lock_reserve+0x40/0x80 [] do_splice_to+0x6d/0x90 [] SyS_splice+0x7c1/0x800 [] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. 
Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 81 ++++++++++++++++++++++++++++++--------------- kernel/trace/trace.c | 23 ++++--------- 3 files changed, 62 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 49a4d6f59108..e2c13cd863bd 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) -int ring_buffer_wait(struct ring_buffer *buffer, int cpu); +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full); int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2d75c94ae87d..a56e07c8d15b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work) * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on + * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. 
*/ -int ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); DEFINE_WAIT(wait); struct rb_irq_work *work; + int ret = 0; /* * Depending on what the caller is waiting for, either any @@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu) } - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + while (true) { + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - work->waiters_pending = true; + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. 
+ * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + work->waiters_pending = true; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) + break; + + if (cpu != RING_BUFFER_ALL_CPUS && + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + if (!pagebusy) + break; + } - if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || - (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) schedule(); + } finish_wait(&work->waiters, &wait); - return 0; + + return ret; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a528392b1f4..15209335888d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static int wait_on_pipe(struct trace_iterator *iter) +static int wait_on_pipe(struct trace_iterator *iter, bool full) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, + full); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&iter->mutex); if (ret) return ret; - - if 
(signal_pending(current)) - return -EINTR; } return 1; @@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&trace_types_lock); if (ret) { size = ret; goto out_unlock; } - if (signal_pending(current)) { - size = -EINTR; - goto out_unlock; - } goto again; } size = 0; @@ -5587,14 +5581,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, true); mutex_lock(&trace_types_lock); if (ret) goto out; - if (signal_pending(current)) { - ret = -EINTR; - goto out; - } + goto again; } -- cgit v1.2.3 From 07906da78810dce5fd35b9449358c9208c693dca Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Thu, 6 Nov 2014 22:26:07 +0100 Subject: tracing: Do not risk busy looping in buffer splice If the read loop in tracing_buffers_splice_read() keeps failing due to memory allocation failures without reading even a single page then this function will keep busy looping. Remove the risk for that by exiting the function if memory allocation failures are seen. 
Link: http://lkml.kernel.org/r/1415309167-2373-2-git-send-email-rabin@rab.in Signed-off-by: Rabin Vincent Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 15209335888d..92f4a6cee172 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5494,7 +5494,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, }; struct buffer_ref *ref; int entries, size, i; - ssize_t ret; + ssize_t ret = 0; mutex_lock(&trace_types_lock); @@ -5532,13 +5532,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int r; ref = kzalloc(sizeof(*ref), GFP_KERNEL); - if (!ref) + if (!ref) { + ret = -ENOMEM; break; + } ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (!ref->page) { + ret = -ENOMEM; kfree(ref); break; } @@ -5576,6 +5579,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { + if (ret) + goto out; + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { ret = -EAGAIN; goto out; -- cgit v1.2.3 From 799b601451b21ebe7af0e6e8f6e2ccd4683c5064 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 4 Nov 2014 11:27:12 +0100 Subject: audit: keep inode pinned Audit rules disappear when an inode they watch is evicted from the cache. This is likely not what we want. The guilty commit is "fsnotify: allow marks to not pin inodes in core", which didn't take into account that audit_tree adds watches with a zero mask. Adding any mask should fix this. 
Fixes: 90b1e7a57880 ("fsnotify: allow marks to not pin inodes in core") Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org # 2.6.36+ Signed-off-by: Paul Moore --- kernel/audit_tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e242e3a9864a..80f29e015570 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count) chunk->owners[i].index = i; } fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + chunk->mark.mask = FS_IN_IGNORED; return chunk; } -- cgit v1.2.3 From bc53a3f46de8f3b2e28d46106216f3a759be8705 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Thu, 13 Nov 2014 15:19:44 -0800 Subject: kernel/panic.c: update comments for print_tainted Commit 69361eef9056 ("panic: add TAINT_SOFTLOCKUP") added the 'L' flag, but failed to update the comments for print_tainted(). So, update the comments. Signed-off-by: Xie XiuQi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index d09dc5c32c67..cf80672b7924 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -244,6 +244,7 @@ static const struct tnt tnts[] = { * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. * 'E' - Unsigned module has been loaded. + * 'L' - A soft lockup has previously occurred. * * The string is overwritten by the next call to print_tainted(). */ -- cgit v1.2.3