diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 3 | ||||
-rw-r--r-- | kernel/fork.c | 6 | ||||
-rw-r--r-- | kernel/kcov.c | 1 | ||||
-rw-r--r-- | kernel/kmod.c | 563 | ||||
-rw-r--r-- | kernel/locking/rtmutex-debug.c | 2 | ||||
-rw-r--r-- | kernel/locking/rtmutex.c | 35 | ||||
-rw-r--r-- | kernel/locking/rtmutex_common.h | 12 | ||||
-rw-r--r-- | kernel/memremap.c | 60 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 2 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 2 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 50 | ||||
-rw-r--r-- | kernel/sched/debug.c | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 39 | ||||
-rw-r--r-- | kernel/sched/sched.h | 9 | ||||
-rw-r--r-- | kernel/sched/topology.c | 2 | ||||
-rw-r--r-- | kernel/smp.c | 2 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_functions_graph.c | 2 | ||||
-rw-r--r-- | kernel/umh.c | 568 |
19 files changed, 696 insertions, 666 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 9c323a6daa46..ed470aac53da 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,12 +5,13 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o user.o \ - signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ + signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o +obj-$(CONFIG_MODULES) += kmod.o obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/fork.c b/kernel/fork.c index 24a4c0be80d5..6f1b0af00bda 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/mmu_notifier.h> +#include <linux/hmm.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/vmacache.h> @@ -824,6 +825,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); + hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; @@ -903,6 +905,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); @@ -1459,8 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - p->pi_waiters = RB_ROOT; - p->pi_waiters_leftmost = NULL; + p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif diff --git a/kernel/kcov.c b/kernel/kcov.c index cd771993f96f..3f693a0f6f3e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -270,6 +270,7 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) static const struct file_operations kcov_fops = { .open = kcov_open, .unlocked_ioctl = kcov_ioctl, + .compat_ioctl = kcov_ioctl, .mmap = kcov_mmap, .release = kcov_close, }; diff --git a/kernel/kmod.c b/kernel/kmod.c index 2f37acde640b..bc6addd9152b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -1,23 +1,6 @@ /* - kmod, the new module loader (replaces kerneld) - Kirk Petersen - - Reorganized not to be a daemon by Adam Richter, with guidance - from Greg Zornetzer. - - Modified to avoid chroot and file sharing problems. - Mikael Pettersson - - Limit the concurrent number of kmod modprobes to catch loops from - "modprobe needs a service that is in a module". - Keith Owens <kaos@ocs.com.au> December 1999 - - Unblock all signals when we exec a usermode process. - Shuu Yamaguchi <shuu@wondernetworkresources.com> December 2000 - - call_usermodehelper wait flag, and remove exec_usermodehelper. - Rusty Russell <rusty@rustcorp.com.au> Jan 2003 -*/ + * kmod - the kernel module loader + */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -45,15 +28,6 @@ #include <trace/events/module.h> -#define CAP_BSET (void *)1 -#define CAP_PI (void *)2 - -static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; -static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; -static DEFINE_SPINLOCK(umh_sysctl_lock); -static DECLARE_RWSEM(umhelper_sem); - -#ifdef CONFIG_MODULES /* * Assuming: * @@ -202,536 +176,3 @@ int __request_module(bool wait, const char *fmt, ...) return ret; } EXPORT_SYMBOL(__request_module); - -#endif /* CONFIG_MODULES */ - -static void call_usermodehelper_freeinfo(struct subprocess_info *info) -{ - if (info->cleanup) - (*info->cleanup)(info); - kfree(info); -} - -static void umh_complete(struct subprocess_info *sub_info) -{ - struct completion *comp = xchg(&sub_info->complete, NULL); - /* - * See call_usermodehelper_exec(). If xchg() returns NULL - * we own sub_info, the UMH_KILLABLE caller has gone away - * or the caller used UMH_NO_WAIT. - */ - if (comp) - complete(comp); - else - call_usermodehelper_freeinfo(sub_info); -} - -/* - * This is the task which runs the usermode application - */ -static int call_usermodehelper_exec_async(void *data) -{ - struct subprocess_info *sub_info = data; - struct cred *new; - int retval; - - spin_lock_irq(¤t->sighand->siglock); - flush_signal_handlers(current, 1); - spin_unlock_irq(¤t->sighand->siglock); - - /* - * Our parent (unbound workqueue) runs with elevated scheduling - * priority. Avoid propagating that into the userspace child. - */ - set_user_nice(current, 0); - - retval = -ENOMEM; - new = prepare_kernel_cred(current); - if (!new) - goto out; - - spin_lock(&umh_sysctl_lock); - new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); - new->cap_inheritable = cap_intersect(usermodehelper_inheritable, - new->cap_inheritable); - spin_unlock(&umh_sysctl_lock); - - if (sub_info->init) { - retval = sub_info->init(sub_info, new); - if (retval) { - abort_creds(new); - goto out; - } - } - - commit_creds(new); - - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); -out: - sub_info->retval = retval; - /* - * call_usermodehelper_exec_sync() will call umh_complete - * if UHM_WAIT_PROC. - */ - if (!(sub_info->wait & UMH_WAIT_PROC)) - umh_complete(sub_info); - if (!retval) - return 0; - do_exit(0); -} - -/* Handles UMH_WAIT_PROC. */ -static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) -{ - pid_t pid; - - /* If SIGCLD is ignored sys_wait4 won't populate the status. */ - kernel_sigaction(SIGCHLD, SIG_DFL); - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - } else { - int ret = -ECHILD; - /* - * Normally it is bogus to call wait4() from in-kernel because - * wait4() wants to write the exit code to a userspace address. - * But call_usermodehelper_exec_sync() always runs as kernel - * thread (workqueue) and put_user() to a kernel address works - * OK for kernel threads, due to their having an mm_segment_t - * which spans the entire address space. - * - * Thus the __user pointer cast is valid here. - */ - sys_wait4(pid, (int __user *)&ret, 0, NULL); - - /* - * If ret is 0, either call_usermodehelper_exec_async failed and - * the real error code is already in sub_info->retval or - * sub_info->retval is 0 anyway, so don't mess with it then. - */ - if (ret) - sub_info->retval = ret; - } - - /* Restore default kernel sig handler */ - kernel_sigaction(SIGCHLD, SIG_IGN); - - umh_complete(sub_info); -} - -/* - * We need to create the usermodehelper kernel thread from a task that is affine - * to an optimized set of CPUs (or nohz housekeeping ones) such that they - * inherit a widest affinity irrespective of call_usermodehelper() callers with - * possibly reduced affinity (eg: per-cpu workqueues). We don't want - * usermodehelper targets to contend a busy CPU. - * - * Unbound workqueues provide such wide affinity and allow to block on - * UMH_WAIT_PROC requests without blocking pending request (up to some limit). - * - * Besides, workqueues provide the privilege level that caller might not have - * to perform the usermodehelper request. - * - */ -static void call_usermodehelper_exec_work(struct work_struct *work) -{ - struct subprocess_info *sub_info = - container_of(work, struct subprocess_info, work); - - if (sub_info->wait & UMH_WAIT_PROC) { - call_usermodehelper_exec_sync(sub_info); - } else { - pid_t pid; - /* - * Use CLONE_PARENT to reparent it to kthreadd; we do not - * want to pollute current->children, and we need a parent - * that always ignores SIGCHLD to ensure auto-reaping. - */ - pid = kernel_thread(call_usermodehelper_exec_async, sub_info, - CLONE_PARENT | SIGCHLD); - if (pid < 0) { - sub_info->retval = pid; - umh_complete(sub_info); - } - } -} - -/* - * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY - * (used for preventing user land processes from being created after the user - * land has been frozen during a system-wide hibernation or suspend operation). - * Should always be manipulated under umhelper_sem acquired for write. - */ -static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; - -/* Number of helpers running */ -static atomic_t running_helpers = ATOMIC_INIT(0); - -/* - * Wait queue head used by usermodehelper_disable() to wait for all running - * helpers to finish. - */ -static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); - -/* - * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled - * to become 'false'. - */ -static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); - -/* - * Time to wait for running_helpers to become zero before the setting of - * usermodehelper_disabled in usermodehelper_disable() fails - */ -#define RUNNING_HELPERS_TIMEOUT (5 * HZ) - -int usermodehelper_read_trylock(void) -{ - DEFINE_WAIT(wait); - int ret = 0; - - down_read(&umhelper_sem); - for (;;) { - prepare_to_wait(&usermodehelper_disabled_waitq, &wait, - TASK_INTERRUPTIBLE); - if (!usermodehelper_disabled) - break; - - if (usermodehelper_disabled == UMH_DISABLED) - ret = -EAGAIN; - - up_read(&umhelper_sem); - - if (ret) - break; - - schedule(); - try_to_freeze(); - - down_read(&umhelper_sem); - } - finish_wait(&usermodehelper_disabled_waitq, &wait); - return ret; -} -EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); - -long usermodehelper_read_lock_wait(long timeout) -{ - DEFINE_WAIT(wait); - - if (timeout < 0) - return -EINVAL; - - down_read(&umhelper_sem); - for (;;) { - prepare_to_wait(&usermodehelper_disabled_waitq, &wait, - TASK_UNINTERRUPTIBLE); - if (!usermodehelper_disabled) - break; - - up_read(&umhelper_sem); - - timeout = schedule_timeout(timeout); - if (!timeout) - break; - - down_read(&umhelper_sem); - } - finish_wait(&usermodehelper_disabled_waitq, &wait); - return timeout; -} -EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); - -void usermodehelper_read_unlock(void) -{ - up_read(&umhelper_sem); -} -EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); - -/** - * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. - * @depth: New value to assign to usermodehelper_disabled. - * - * Change the value of usermodehelper_disabled (under umhelper_sem locked for - * writing) and wakeup tasks waiting for it to change. - */ -void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) -{ - down_write(&umhelper_sem); - usermodehelper_disabled = depth; - wake_up(&usermodehelper_disabled_waitq); - up_write(&umhelper_sem); -} - -/** - * __usermodehelper_disable - Prevent new helpers from being started. - * @depth: New value to assign to usermodehelper_disabled. - * - * Set usermodehelper_disabled to @depth and wait for running helpers to exit. - */ -int __usermodehelper_disable(enum umh_disable_depth depth) -{ - long retval; - - if (!depth) - return -EINVAL; - - down_write(&umhelper_sem); - usermodehelper_disabled = depth; - up_write(&umhelper_sem); - - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, - atomic_read(&running_helpers) == 0, - RUNNING_HELPERS_TIMEOUT); - if (retval) - return 0; - - __usermodehelper_set_disable_depth(UMH_ENABLED); - return -EAGAIN; -} - -static void helper_lock(void) -{ - atomic_inc(&running_helpers); - smp_mb__after_atomic(); -} - -static void helper_unlock(void) -{ - if (atomic_dec_and_test(&running_helpers)) - wake_up(&running_helpers_waitq); -} - -/** - * call_usermodehelper_setup - prepare to call a usermode helper - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @gfp_mask: gfp mask for memory allocation - * @cleanup: a cleanup function - * @init: an init function - * @data: arbitrary context sensitive data - * - * Returns either %NULL on allocation failure, or a subprocess_info - * structure. This should be passed to call_usermodehelper_exec to - * exec the process and free the structure. - * - * The init function is used to customize the helper process prior to - * exec. A non-zero return code causes the process to error out, exit, - * and return the failure to the calling process - * - * The cleanup function is just before ethe subprocess_info is about to - * be freed. This can be used for freeing the argv and envp. The - * Function must be runnable in either a process context or the - * context in which call_usermodehelper_exec is called. - */ -struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, - char **envp, gfp_t gfp_mask, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data) -{ - struct subprocess_info *sub_info; - sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); - if (!sub_info) - goto out; - - INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); - -#ifdef CONFIG_STATIC_USERMODEHELPER - sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; -#else - sub_info->path = path; -#endif - sub_info->argv = argv; - sub_info->envp = envp; - - sub_info->cleanup = cleanup; - sub_info->init = init; - sub_info->data = data; - out: - return sub_info; -} -EXPORT_SYMBOL(call_usermodehelper_setup); - -/** - * call_usermodehelper_exec - start a usermode application - * @sub_info: information about the subprocessa - * @wait: wait for the application to finish and return status. - * when UMH_NO_WAIT don't wait at all, but you get no useful error back - * when the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * Runs a user-space application. The application is started - * asynchronously if wait is not set, and runs as a child of system workqueues. - * (ie. it runs with full root capabilities and optimized affinity). - */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) -{ - DECLARE_COMPLETION_ONSTACK(done); - int retval = 0; - - if (!sub_info->path) { - call_usermodehelper_freeinfo(sub_info); - return -EINVAL; - } - helper_lock(); - if (usermodehelper_disabled) { - retval = -EBUSY; - goto out; - } - - /* - * If there is no binary for us to call, then just return and get out of - * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and - * disable all call_usermodehelper() calls. - */ - if (strlen(sub_info->path) == 0) - goto out; - - /* - * Set the completion pointer only if there is a waiter. - * This makes it possible to use umh_complete to free - * the data structure in case of UMH_NO_WAIT. - */ - sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; - sub_info->wait = wait; - - queue_work(system_unbound_wq, &sub_info->work); - if (wait == UMH_NO_WAIT) /* task has freed sub_info */ - goto unlock; - - if (wait & UMH_KILLABLE) { - retval = wait_for_completion_killable(&done); - if (!retval) - goto wait_done; - - /* umh_complete() will see NULL and free sub_info */ - if (xchg(&sub_info->complete, NULL)) - goto unlock; - /* fallthrough, umh_complete() was already called */ - } - - wait_for_completion(&done); -wait_done: - retval = sub_info->retval; -out: - call_usermodehelper_freeinfo(sub_info); -unlock: - helper_unlock(); - return retval; -} -EXPORT_SYMBOL(call_usermodehelper_exec); - -/** - * call_usermodehelper() - prepare and start a usermode application - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @wait: wait for the application to finish and return status. - * when UMH_NO_WAIT don't wait at all, but you get no useful error back - * when the program couldn't be exec'ed. This makes it safe to call - * from interrupt context. - * - * This function is the equivalent to use call_usermodehelper_setup() and - * call_usermodehelper_exec(). - */ -int call_usermodehelper(const char *path, char **argv, char **envp, int wait) -{ - struct subprocess_info *info; - gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; - - info = call_usermodehelper_setup(path, argv, envp, gfp_mask, - NULL, NULL, NULL); - if (info == NULL) - return -ENOMEM; - - return call_usermodehelper_exec(info, wait); -} -EXPORT_SYMBOL(call_usermodehelper); - -static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; - kernel_cap_t new_cap; - int err, i; - - if (write && (!capable(CAP_SETPCAP) || - !capable(CAP_SYS_MODULE))) - return -EPERM; - - /* - * convert from the global kernel_cap_t to the ulong array to print to - * userspace if this is a read. - */ - spin_lock(&umh_sysctl_lock); - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { - if (table->data == CAP_BSET) - cap_array[i] = usermodehelper_bset.cap[i]; - else if (table->data == CAP_PI) - cap_array[i] = usermodehelper_inheritable.cap[i]; - else - BUG(); - } - spin_unlock(&umh_sysctl_lock); - - t = *table; - t.data = &cap_array; - - /* - * actually read or write and array of ulongs from userspace. Remember - * these are least significant 32 bits first - */ - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - /* - * convert from the sysctl array of ulongs to the kernel_cap_t - * internal representation - */ - for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) - new_cap.cap[i] = cap_array[i]; - - /* - * Drop everything not in the new_cap (but don't add things) - */ - spin_lock(&umh_sysctl_lock); - if (write) { - if (table->data == CAP_BSET) - usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); - if (table->data == CAP_PI) - usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); - } - spin_unlock(&umh_sysctl_lock); - - return 0; -} - -struct ctl_table usermodehelper_table[] = { - { - .procname = "bset", - .data = CAP_BSET, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { - .procname = "inheritable", - .data = CAP_PI, - .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), - .mode = 0600, - .proc_handler = proc_cap_handler, - }, - { } -}; diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index ac35e648b0e5..f4a74e78d467 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -58,7 +58,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 649dc9d3951a..6f3dba6e4e9e 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -271,10 +271,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_node; + struct rb_node **link = &lock->waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -283,15 +283,12 @@ rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - lock->waiters_leftmost = &waiter->tree_entry; - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color(&waiter->tree_entry, &lock->waiters); + rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); } static void @@ -300,20 +297,17 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->tree_entry)) return; - if (lock->waiters_leftmost == &waiter->tree_entry) - lock->waiters_leftmost = rb_next(&waiter->tree_entry); - - rb_erase(&waiter->tree_entry, &lock->waiters); + rb_erase_cached(&waiter->tree_entry, &lock->waiters); RB_CLEAR_NODE(&waiter->tree_entry); } static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node **link = &task->pi_waiters.rb_root.rb_node; struct rb_node *parent = NULL; struct rt_mutex_waiter *entry; - int leftmost = 1; + bool leftmost = true; while (*link) { parent = *link; @@ -322,15 +316,12 @@ rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) - task->pi_waiters_leftmost = &waiter->pi_tree_entry; - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); + rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); } static void @@ -339,10 +330,7 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) return; - if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) - task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); - - rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); RB_CLEAR_NODE(&waiter->pi_tree_entry); } @@ -1657,8 +1645,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT; - lock->waiters_leftmost = NULL; + lock->waiters = RB_ROOT_CACHED; if (name && key) debug_rt_mutex_init(lock, name, key); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 8d039b928d61..7453be0485a5 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -45,7 +45,7 @@ struct rt_mutex_waiter { static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !RB_EMPTY_ROOT(&lock->waiters); + return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * @@ -53,8 +53,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, - tree_entry); + w = rb_entry(lock->waiters.rb_leftmost, + struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); return w; @@ -62,14 +62,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !RB_EMPTY_ROOT(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + return rb_entry(p->pi_waiters.rb_leftmost, + struct rt_mutex_waiter, pi_tree_entry); } #else diff --git a/kernel/memremap.c b/kernel/memremap.c index 066e73c2fcc9..6bcbfbf1a8fd 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,13 +11,14 @@ * General Public License for more details. */ #include <linux/radix-tree.h> -#include <linux/memremap.h> #include <linux/device.h> #include <linux/types.h> #include <linux/pfn_t.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> +#include <linux/swap.h> +#include <linux/swapops.h> #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ @@ -219,6 +220,34 @@ static unsigned long order_at(struct resource *res, unsigned long pgoff) for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ pgoff += 1UL << order, order = order_at((res), pgoff)) +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) +int device_private_entry_fault(struct vm_area_struct *vma, + unsigned long addr, + swp_entry_t entry, + unsigned int flags, + pmd_t *pmdp) +{ + struct page *page = device_private_entry_to_page(entry); + + /* + * The page_fault() callback must migrate page back to system memory + * so that CPU can access it. This might fail for various reasons + * (device issue, device was unsafely unplugged, ...). When such + * error conditions happen, the callback must return VM_FAULT_SIGBUS. + * + * Note that because memory cgroup charges are accounted to the device + * memory, this should never fail because of memory restrictions (but + * allocation of regular system page might still fail because we are + * out of memory). + * + * There is a more in-depth description of what that callback can and + * cannot do, in include/linux/memremap.h + */ + return page->pgmap->page_fault(vma, addr, page, flags, pmdp); +} +EXPORT_SYMBOL(device_private_entry_fault); +#endif /* CONFIG_DEVICE_PRIVATE */ + static void pgmap_radix_release(struct resource *res) { unsigned long pgoff, order; @@ -356,6 +385,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, } pgmap->ref = ref; pgmap->res = &page_map->res; + pgmap->type = MEMORY_DEVICE_HOST; + pgmap->page_fault = NULL; + pgmap->page_free = NULL; + pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; @@ -466,3 +499,28 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } #endif /* CONFIG_ZONE_DEVICE */ + + +#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) +void put_zone_device_private_or_public_page(struct page *page) +{ + int count = page_ref_dec_return(page); + + /* + * If refcount is 1 then page is freed and refcount is stable as nobody + * holds a reference on the page. + */ + if (count == 1) { + /* Clear Active bit in case of parallel mark_page_accessed */ + __ClearPageActive(page); + __ClearPageWaiters(page); + + page->mapping = NULL; + mem_cgroup_uncharge(page); + + page->pgmap->page_free(page, page->pgmap->data); + } else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_zone_device_private_or_public_page); +#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84fe96641b2e..1250e4bd4b85 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4091,7 +4091,7 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; - pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", rcu_fanout_leaf, nr_cpu_ids); /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 55bde94b9572..e012b9be777e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -89,7 +89,7 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_fanout_leaf != RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) - pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_BOOST pr_info("\tRCU priority boosting: priority %d delay %d ms.\n", kthread_prio, CONFIG_RCU_BOOST_DELAY); #endif diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9e38df7649f4..0191ec7667c3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -296,7 +296,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; - return dl_rq->rb_leftmost == &dl_se->rb_node; + return dl_rq->root.rb_leftmost == &dl_se->rb_node; } void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) @@ -320,7 +320,7 @@ void init_dl_bw(struct dl_bw *dl_b) void init_dl_rq(struct dl_rq *dl_rq) { - dl_rq->rb_root = RB_ROOT; + dl_rq->root = RB_ROOT_CACHED; #ifdef CONFIG_SMP /* zero means no -deadline tasks */ @@ -328,7 +328,7 @@ void init_dl_rq(struct dl_rq *dl_rq) dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; - dl_rq->pushable_dl_tasks_root = RB_ROOT; + dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else init_dl_bw(&dl_rq->dl_bw); #endif @@ -410,10 +410,10 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; struct rb_node *parent = NULL; struct task_struct *entry; - int leftmost = 1; + bool leftmost = true; BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); @@ -425,17 +425,16 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) link = &parent->rb_left; else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - if (leftmost) { - dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + if (leftmost) dl_rq->earliest_dl.next = p->dl.deadline; - } rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + rb_insert_color_cached(&p->pushable_dl_tasks, + &dl_rq->pushable_dl_tasks_root, leftmost); } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -445,24 +444,23 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { struct rb_node *next_node; next_node = rb_next(&p->pushable_dl_tasks); - dl_rq->pushable_dl_tasks_leftmost = next_node; if (next_node) { dl_rq->earliest_dl.next = rb_entry(next_node, struct task_struct, pushable_dl_tasks)->dl.deadline; } } - rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } static inline int has_pushable_dl_tasks(struct rq *rq) { - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); } static int push_dl_task(struct rq *rq); @@ -1266,7 +1264,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); } else { - struct rb_node *leftmost = dl_rq->rb_leftmost; + struct rb_node *leftmost = dl_rq->root.rb_leftmost; struct sched_dl_entity *entry; entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); @@ -1313,7 +1311,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node **link = &dl_rq->root.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_dl_entity *entry; int leftmost = 1; @@ -1331,11 +1329,8 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) } } - if (leftmost) - dl_rq->rb_leftmost = &dl_se->rb_node; - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); inc_dl_tasks(dl_se, dl_rq); } @@ -1347,14 +1342,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) if (RB_EMPTY_NODE(&dl_se->rb_node)) return; - if (dl_rq->rb_leftmost == &dl_se->rb_node) { - struct rb_node *next_node; - - next_node = rb_next(&dl_se->rb_node); - dl_rq->rb_leftmost = next_node; - } - - rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + rb_erase_cached(&dl_se->rb_node, &dl_rq->root); RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); @@ -1647,7 +1635,7 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, struct dl_rq *dl_rq) { - struct rb_node *left = dl_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&dl_rq->root); if (!left) return NULL; @@ -1771,7 +1759,7 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) */ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu) { - struct rb_node *next_node = rq->dl.pushable_dl_tasks_leftmost; + struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost; struct task_struct *p = NULL; if (!has_pushable_dl_tasks(rq)) @@ -1945,7 +1933,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) if (!has_pushable_dl_tasks(rq)) return NULL; - p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, struct task_struct, pushable_dl_tasks); BUG_ON(rq->cpu != task_cpu(p)); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4a23bbc3111b..8e536d963652 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -530,7 +530,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) + if (rb_first_cached(&cfs_rq->tasks_timeline)) MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8bc0a883d190..a5d83ed8dd82 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -513,6 +513,7 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); u64 vruntime = cfs_rq->min_vruntime; @@ -523,10 +524,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); + if (leftmost) { /* non-empty tree */ + struct sched_entity *se; + se = rb_entry(leftmost, struct sched_entity, run_node); if (!curr) vruntime = se->vruntime; @@ -547,10 +547,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; - int leftmost = 1; + bool leftmost = true; /* * Find the right place in the rbtree: @@ -566,36 +566,23 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) link = &parent->rb_left; } else { link = &parent->rb_right; - leftmost = 0; + leftmost = false; } } - /* - * Maintain a cache of leftmost tree entries (it is frequently - * used): - */ - if (leftmost) - cfs_rq->rb_leftmost = &se->run_node; - rb_link_node(&se->run_node, parent, link); - rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); + rb_insert_color_cached(&se->run_node, + &cfs_rq->tasks_timeline, leftmost); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) { - struct rb_node *next_node; - - next_node = rb_next(&se->run_node); - cfs_rq->rb_leftmost = next_node; - } - - rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { - struct rb_node *left = cfs_rq->rb_leftmost; + struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); if (!left) return NULL; @@ -616,7 +603,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); if (!last) return NULL; @@ -9312,7 +9299,7 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { - cfs_rq->tasks_timeline = RB_ROOT; + cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6ed7962dc896..746ac78ff492 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -426,8 +426,7 @@ struct cfs_rq { u64 min_vruntime_copy; #endif - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; + struct rb_root_cached tasks_timeline; /* * 'curr' points to currently running entity on this cfs_rq. @@ -550,8 +549,7 @@ struct rt_rq { /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ - struct rb_root rb_root; - struct rb_node *rb_leftmost; + struct rb_root_cached root; unsigned long dl_nr_running; @@ -575,8 +573,7 @@ struct dl_rq { * an rb-tree, ordered by tasks' deadlines, with caching * of the leftmost (earliest deadline) element. */ - struct rb_root pushable_dl_tasks_root; - struct rb_node *pushable_dl_tasks_leftmost; + struct rb_root_cached pushable_dl_tasks_root; #else struct dl_bw dl_bw; #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6f7b43982f73..5d0062cc10cb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -473,7 +473,7 @@ static int __init isolated_cpu_setup(char *str) alloc_bootmem_cpumask_var(&cpu_isolated_map); ret = cpulist_parse(str, cpu_isolated_map); if (ret) { - pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids); return 0; } return 1; diff --git a/kernel/smp.c b/kernel/smp.c index 81cfca9b4cc3..c94dd85c8d41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -550,7 +550,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); /* Setup number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; +unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8ea4fb315719..2cafb49aa65e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2316,7 +2316,7 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } EXPORT_SYMBOL(hardpps); -#endif +#endif /* CONFIG_NTP_PPS */ /** * xtime_update() - advances the timekeeping infrastructure diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d56123cdcc89..b8f1f54731af 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1543,7 +1543,7 @@ fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { - max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { pr_warn("Warning: could not register graph trace events\n"); diff --git a/kernel/umh.c b/kernel/umh.c new file mode 100644 index 000000000000..6ff9905250ff --- /dev/null +++ b/kernel/umh.c @@ -0,0 +1,568 @@ +/* + * umh - the kernel usermode helper + */ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/binfmts.h> +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/kmod.h> +#include <linux/slab.h> +#include <linux/completion.h> +#include <linux/cred.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/workqueue.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/resource.h> +#include <linux/notifier.h> +#include <linux/suspend.h> +#include <linux/rwsem.h> +#include <linux/ptrace.h> +#include <linux/async.h> +#include <linux/uaccess.h> + +#include <trace/events/module.h> + +#define CAP_BSET (void *)1 +#define CAP_PI (void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); +static DECLARE_RWSEM(umhelper_sem); + +static void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} + +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away + * or the caller used UMH_NO_WAIT. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + +/* + * This is the task which runs the usermode application + */ +static int call_usermodehelper_exec_async(void *data) +{ + struct subprocess_info *sub_info = data; + struct cred *new; + int retval; + + spin_lock_irq(¤t->sighand->siglock); + flush_signal_handlers(current, 1); + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Our parent (unbound workqueue) runs with elevated scheduling + * priority. Avoid propagating that into the userspace child. + */ + set_user_nice(current, 0); + + retval = -ENOMEM; + new = prepare_kernel_cred(current); + if (!new) + goto out; + + spin_lock(&umh_sysctl_lock); + new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); + new->cap_inheritable = cap_intersect(usermodehelper_inheritable, + new->cap_inheritable); + spin_unlock(&umh_sysctl_lock); + + if (sub_info->init) { + retval = sub_info->init(sub_info, new); + if (retval) { + abort_creds(new); + goto out; + } + } + + commit_creds(new); + + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); +out: + sub_info->retval = retval; + /* + * call_usermodehelper_exec_sync() will call umh_complete + * if UHM_WAIT_PROC. + */ + if (!(sub_info->wait & UMH_WAIT_PROC)) + umh_complete(sub_info); + if (!retval) + return 0; + do_exit(0); +} + +/* Handles UMH_WAIT_PROC. */ +static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) +{ + pid_t pid; + + /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + kernel_sigaction(SIGCHLD, SIG_DFL); + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + } else { + int ret = -ECHILD; + /* + * Normally it is bogus to call wait4() from in-kernel because + * wait4() wants to write the exit code to a userspace address. + * But call_usermodehelper_exec_sync() always runs as kernel + * thread (workqueue) and put_user() to a kernel address works + * OK for kernel threads, due to their having an mm_segment_t + * which spans the entire address space. + * + * Thus the __user pointer cast is valid here. + */ + sys_wait4(pid, (int __user *)&ret, 0, NULL); + + /* + * If ret is 0, either call_usermodehelper_exec_async failed and + * the real error code is already in sub_info->retval or + * sub_info->retval is 0 anyway, so don't mess with it then. + */ + if (ret) + sub_info->retval = ret; + } + + /* Restore default kernel sig handler */ + kernel_sigaction(SIGCHLD, SIG_IGN); + + umh_complete(sub_info); +} + +/* + * We need to create the usermodehelper kernel thread from a task that is affine + * to an optimized set of CPUs (or nohz housekeeping ones) such that they + * inherit a widest affinity irrespective of call_usermodehelper() callers with + * possibly reduced affinity (eg: per-cpu workqueues). We don't want + * usermodehelper targets to contend a busy CPU. + * + * Unbound workqueues provide such wide affinity and allow to block on + * UMH_WAIT_PROC requests without blocking pending request (up to some limit). + * + * Besides, workqueues provide the privilege level that caller might not have + * to perform the usermodehelper request. + * + */ +static void call_usermodehelper_exec_work(struct work_struct *work) +{ + struct subprocess_info *sub_info = + container_of(work, struct subprocess_info, work); + + if (sub_info->wait & UMH_WAIT_PROC) { + call_usermodehelper_exec_sync(sub_info); + } else { + pid_t pid; + /* + * Use CLONE_PARENT to reparent it to kthreadd; we do not + * want to pollute current->children, and we need a parent + * that always ignores SIGCHLD to ensure auto-reaping. + */ + pid = kernel_thread(call_usermodehelper_exec_async, sub_info, + CLONE_PARENT | SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + umh_complete(sub_info); + } + } +} + +/* + * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY + * (used for preventing user land processes from being created after the user + * land has been frozen during a system-wide hibernation or suspend operation). + * Should always be manipulated under umhelper_sem acquired for write. + */ +static enum umh_disable_depth usermodehelper_disabled = UMH_DISABLED; + +/* Number of helpers running */ +static atomic_t running_helpers = ATOMIC_INIT(0); + +/* + * Wait queue head used by usermodehelper_disable() to wait for all running + * helpers to finish. + */ +static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); + +/* + * Used by usermodehelper_read_lock_wait() to wait for usermodehelper_disabled + * to become 'false'. + */ +static DECLARE_WAIT_QUEUE_HEAD(usermodehelper_disabled_waitq); + +/* + * Time to wait for running_helpers to become zero before the setting of + * usermodehelper_disabled in usermodehelper_disable() fails + */ +#define RUNNING_HELPERS_TIMEOUT (5 * HZ) + +int usermodehelper_read_trylock(void) +{ + DEFINE_WAIT(wait); + int ret = 0; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_INTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + if (usermodehelper_disabled == UMH_DISABLED) + ret = -EAGAIN; + + up_read(&umhelper_sem); + + if (ret) + break; + + schedule(); + try_to_freeze(); + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return ret; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_trylock); + +long usermodehelper_read_lock_wait(long timeout) +{ + DEFINE_WAIT(wait); + + if (timeout < 0) + return -EINVAL; + + down_read(&umhelper_sem); + for (;;) { + prepare_to_wait(&usermodehelper_disabled_waitq, &wait, + TASK_UNINTERRUPTIBLE); + if (!usermodehelper_disabled) + break; + + up_read(&umhelper_sem); + + timeout = schedule_timeout(timeout); + if (!timeout) + break; + + down_read(&umhelper_sem); + } + finish_wait(&usermodehelper_disabled_waitq, &wait); + return timeout; +} +EXPORT_SYMBOL_GPL(usermodehelper_read_lock_wait); + +void usermodehelper_read_unlock(void) +{ + up_read(&umhelper_sem); +} +EXPORT_SYMBOL_GPL(usermodehelper_read_unlock); + +/** + * __usermodehelper_set_disable_depth - Modify usermodehelper_disabled. + * @depth: New value to assign to usermodehelper_disabled. + * + * Change the value of usermodehelper_disabled (under umhelper_sem locked for + * writing) and wakeup tasks waiting for it to change. + */ +void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) +{ + down_write(&umhelper_sem); + usermodehelper_disabled = depth; + wake_up(&usermodehelper_disabled_waitq); + up_write(&umhelper_sem); +} + +/** + * __usermodehelper_disable - Prevent new helpers from being started. + * @depth: New value to assign to usermodehelper_disabled. + * + * Set usermodehelper_disabled to @depth and wait for running helpers to exit. + */ +int __usermodehelper_disable(enum umh_disable_depth depth) +{ + long retval; + + if (!depth) + return -EINVAL; + + down_write(&umhelper_sem); + usermodehelper_disabled = depth; + up_write(&umhelper_sem); + + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). + */ + retval = wait_event_timeout(running_helpers_waitq, + atomic_read(&running_helpers) == 0, + RUNNING_HELPERS_TIMEOUT); + if (retval) + return 0; + + __usermodehelper_set_disable_depth(UMH_ENABLED); + return -EAGAIN; +} + +static void helper_lock(void) +{ + atomic_inc(&running_helpers); + smp_mb__after_atomic(); +} + +static void helper_unlock(void) +{ + if (atomic_dec_and_test(&running_helpers)) + wake_up(&running_helpers_waitq); +} + +/** + * call_usermodehelper_setup - prepare to call a usermode helper + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @gfp_mask: gfp mask for memory allocation + * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data + * + * Returns either %NULL on allocation failure, or a subprocess_info + * structure. This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. + * + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is just before ethe subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * Function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. + */ +struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, + char **envp, gfp_t gfp_mask, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), + void *data) +{ + struct subprocess_info *sub_info; + sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); + if (!sub_info) + goto out; + + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + +#ifdef CONFIG_STATIC_USERMODEHELPER + sub_info->path = CONFIG_STATIC_USERMODEHELPER_PATH; +#else + sub_info->path = path; +#endif + sub_info->argv = argv; + sub_info->envp = envp; + + sub_info->cleanup = cleanup; + sub_info->init = init; + sub_info->data = data; + out: + return sub_info; +} +EXPORT_SYMBOL(call_usermodehelper_setup); + +/** + * call_usermodehelper_exec - start a usermode application + * @sub_info: information about the subprocessa + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * Runs a user-space application. The application is started + * asynchronously if wait is not set, and runs as a child of system workqueues. + * (ie. it runs with full root capabilities and optimized affinity). + */ +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) +{ + DECLARE_COMPLETION_ONSTACK(done); + int retval = 0; + + if (!sub_info->path) { + call_usermodehelper_freeinfo(sub_info); + return -EINVAL; + } + helper_lock(); + if (usermodehelper_disabled) { + retval = -EBUSY; + goto out; + } + + /* + * If there is no binary for us to call, then just return and get out of + * here. This allows us to set STATIC_USERMODEHELPER_PATH to "" and + * disable all call_usermodehelper() calls. + */ + if (strlen(sub_info->path) == 0) + goto out; + + /* + * Set the completion pointer only if there is a waiter. + * This makes it possible to use umh_complete to free + * the data structure in case of UMH_NO_WAIT. + */ + sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; + sub_info->wait = wait; + + queue_work(system_unbound_wq, &sub_info->work); + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ + goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + + wait_for_completion(&done); +wait_done: + retval = sub_info->retval; +out: + call_usermodehelper_freeinfo(sub_info); +unlock: + helper_unlock(); + return retval; +} +EXPORT_SYMBOL(call_usermodehelper_exec); + +/** + * call_usermodehelper() - prepare and start a usermode application + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @wait: wait for the application to finish and return status. + * when UMH_NO_WAIT don't wait at all, but you get no useful error back + * when the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * This function is the equivalent to use call_usermodehelper_setup() and + * call_usermodehelper_exec(). + */ +int call_usermodehelper(const char *path, char **argv, char **envp, int wait) +{ + struct subprocess_info *info; + gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; + + info = call_usermodehelper_setup(path, argv, envp, gfp_mask, + NULL, NULL, NULL); + if (info == NULL) + return -ENOMEM; + + return call_usermodehelper_exec(info, wait); +} +EXPORT_SYMBOL(call_usermodehelper); + +static int proc_cap_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; + kernel_cap_t new_cap; + int err, i; + + if (write && (!capable(CAP_SETPCAP) || + !capable(CAP_SYS_MODULE))) + return -EPERM; + + /* + * convert from the global kernel_cap_t to the ulong array to print to + * userspace if this is a read. + */ + spin_lock(&umh_sysctl_lock); + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { + if (table->data == CAP_BSET) + cap_array[i] = usermodehelper_bset.cap[i]; + else if (table->data == CAP_PI) + cap_array[i] = usermodehelper_inheritable.cap[i]; + else + BUG(); + } + spin_unlock(&umh_sysctl_lock); + + t = *table; + t.data = &cap_array; + + /* + * actually read or write and array of ulongs from userspace. Remember + * these are least significant 32 bits first + */ + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + /* + * convert from the sysctl array of ulongs to the kernel_cap_t + * internal representation + */ + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) + new_cap.cap[i] = cap_array[i]; + + /* + * Drop everything not in the new_cap (but don't add things) + */ + spin_lock(&umh_sysctl_lock); + if (write) { + if (table->data == CAP_BSET) + usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); + if (table->data == CAP_PI) + usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + } + spin_unlock(&umh_sysctl_lock); + + return 0; +} + +struct ctl_table usermodehelper_table[] = { + { + .procname = "bset", + .data = CAP_BSET, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { + .procname = "inheritable", + .data = CAP_PI, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { } +}; |