diff options
author | Ingo Molnar <mingo@kernel.org> | 2012-03-26 17:18:44 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2012-03-26 17:19:03 +0200 |
commit | 7fd52392c56361a40f0c630a82b36b95ca31eac6 (patch) | |
tree | 14091de24c6b28ea4cae9826f98aeedb7be091f5 /kernel | |
parent | b01c3a0010aabadf745f3e7fdb9cab682e0a28a2 (diff) | |
parent | e22057c8599373e5caef0bc42bdb95d2a361ab0d (diff) | |
download | linux-7fd52392c56361a40f0c630a82b36b95ca31eac6.tar.bz2 |
Merge branch 'linus' into perf/urgent
Merge reason: we need to fix a non-trivial merge conflict.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
44 files changed, 1391 insertions, 1385 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 2d9de86b7e76..cb41b9547c9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,7 +27,6 @@ obj-y += power/ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o -obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o diff --git a/kernel/audit.c b/kernel/audit.c index bb0eb5bb9a0a..1c7f2c61416b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct path *path) + const struct path *path) { char *p, *pathname; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..f4ea4b6f3cf1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { - ret = ss->pre_destroy(ss, cgrp); + ret = ss->pre_destroy(cgrp); if (ret) break; } @@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(ss, cgrp); + ss->bind(cgrp); mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { @@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); if (ss->bind) - ss->bind(ss, dummytop); + ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; @@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb) struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - struct dentry *dentry; if (!inode) return -ENOMEM; @@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb) inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); - dentry = d_alloc_root(inode); - if (!dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) return -ENOMEM; - } - sb->s_root = dentry; /* for everything else we want ->d_op set */ sb->s_d_op = &cgroup_dops; return 0; @@ -1763,6 +1759,7 @@ EXPORT_SYMBOL_GPL(cgroup_path); struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; + struct css_set *cg; }; struct cgroup_taskset { @@ -1843,11 +1840,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * will already exist. If not set, this function might sleep, and can fail with * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. */ -static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, - struct task_struct *tsk, bool guarantee) +static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; - struct css_set *newcg; /* * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1857,23 +1853,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; - /* locate or allocate a new css_set for this task. */ - if (guarantee) { - /* we know the css_set we want already exists. */ - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - read_lock(&css_set_lock); - newcg = find_existing_css_set(oldcg, cgrp, template); - BUG_ON(!newcg); - get_css_set(newcg); - read_unlock(&css_set_lock); - } else { - might_sleep(); - /* find_css_set will give us newcg already referenced. */ - newcg = find_css_set(oldcg, cgrp); - if (!newcg) - return -ENOMEM; - } - task_lock(tsk); rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1892,7 +1871,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, put_css_set(oldcg); set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - return 0; } /** @@ -1910,6 +1888,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; struct cgroup_taskset tset = { }; + struct css_set *newcg; /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) @@ -1925,7 +1904,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1939,13 +1918,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); - if (retval) + newcg = find_css_set(tsk->cgroups, cgrp); + if (!newcg) { + retval = -ENOMEM; goto out; + } + + cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } synchronize_rcu(); @@ -1967,7 +1950,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } return retval; @@ -1997,66 +1980,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -/* - * cgroup_attach_proc works in two stages, the first of which prefetches all - * new css_sets needed (to make sure we have enough memory before committing - * to the move) and stores them in a list of entries of the following type. - * TODO: possible optimization: use css_set->rcu_head for chaining instead - */ -struct cg_list_entry { - struct css_set *cg; - struct list_head links; -}; - -static bool css_set_check_fetched(struct cgroup *cgrp, - struct task_struct *tsk, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - read_lock(&css_set_lock); - newcg = find_existing_css_set(cg, cgrp, template); - read_unlock(&css_set_lock); - - /* doesn't exist at all? */ - if (!newcg) - return false; - /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) - if (cg_entry->cg == newcg) - return true; - - /* not found */ - return false; -} - -/* - * Find the new css_set and store it in the list in preparation for moving the - * given task to the given cgroup. Returns 0 or -ENOMEM. - */ -static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - - /* ensure a new css_set will exist for this thread */ - newcg = find_css_set(cg, cgrp); - if (!newcg) - return -ENOMEM; - /* add it to the list */ - cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); - if (!cg_entry) { - put_css_set(newcg); - return -ENOMEM; - } - cg_entry->cg = newcg; - list_add(&cg_entry->links, newcg_list); - return 0; -} - /** * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup * @cgrp: the cgroup to attach to @@ -2070,20 +1993,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; /* guaranteed to be initialized later, but the compiler needs this */ - struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; - /* - * we need to make sure we have css_sets for all the tasks we're - * going to move -before- we actually start moving them, so that in - * case we get an ENOMEM we can bail out before making any changes. - */ - struct list_head newcg_list; - struct cg_list_entry *cg_entry, *temp_nobe; /* * step 0: in order to do expensive, possibly blocking operations for @@ -2102,23 +2017,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (retval) goto out_free_group_list; - /* prevent changes to the threadgroup list while we take a snapshot. */ - read_lock(&tasklist_lock); - if (!thread_group_leader(leader)) { - /* - * a race with de_thread from another thread's exec() may strip - * us of our leadership, making while_each_thread unsafe to use - * on this task. if this happens, there is no choice but to - * throw this task away and try again (from cgroup_procs_write); - * this is "double-double-toil-and-trouble-check locking". - */ - read_unlock(&tasklist_lock); - retval = -EAGAIN; - goto out_free_group_list; - } - tsk = leader; i = 0; + /* + * Prevent freeing of tasks while we take a snapshot. Tasks that are + * already PF_EXITING could be freed from underneath us unless we + * take an rcu_read_lock. + */ + rcu_read_lock(); do { struct task_and_cgroup ent; @@ -2128,24 +2034,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ ent.task = tsk; ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) continue; + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); + rcu_read_unlock(); /* remember the number of threads in the array for later. */ group_size = i; tset.tc_array = group; tset.tc_array_len = group_size; - read_unlock(&tasklist_lock); /* methods shouldn't be called if no task is actually migrating */ retval = 0; @@ -2157,7 +2063,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2169,17 +2075,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 2: make sure css_sets exist for all threads to be migrated. * we use find_css_set, which allocates a new one if necessary. */ - INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - oldcg = tc->task->cgroups; - - /* if we don't already have it in the list get a new one */ - if (!css_set_check_fetched(cgrp, tc->task, oldcg, - &newcg_list)) { - retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - if (retval) - goto out_list_teardown; + tc->cg = find_css_set(tc->task->cgroups, cgrp); + if (!tc->cg) { + retval = -ENOMEM; + goto out_put_css_set_refs; } } @@ -2190,8 +2091,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); - BUG_ON(retval); + cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. */ @@ -2200,7 +2100,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } /* @@ -2209,21 +2109,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) synchronize_rcu(); cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; -out_list_teardown: - /* clean up the list of prefetched css_sets. */ - list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { - list_del(&cg_entry->links); - put_css_set(cg_entry->cg); - kfree(cg_entry); +out_put_css_set_refs: + if (retval) { + for (i = 0; i < group_size; i++) { + tc = flex_array_get(group, i); + if (!tc->cg) + break; + put_css_set(tc->cg); + } } out_cancel_attach: - /* same deal as in cgroup_attach_task */ if (retval) { for_each_subsys(root, ss) { if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } out_free_group_list: @@ -2245,22 +2146,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) if (!cgroup_lock_live_group(cgrp)) return -ENODEV; +retry_find_task: + rcu_read_lock(); if (pid) { - rcu_read_lock(); tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; - } - if (threadgroup) { - /* - * RCU protects this access, since tsk was found in the - * tid map. a race with de_thread may cause group_leader - * to stop being the leader, but cgroup_attach_proc will - * detect it later. - */ - tsk = tsk->group_leader; + ret= -ESRCH; + goto out_unlock_cgroup; } /* * even if we're attaching all tasks in the thread group, we @@ -2271,29 +2164,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); - cgroup_unlock(); - return -EACCES; + ret = -EACCES; + goto out_unlock_cgroup; } - get_task_struct(tsk); - rcu_read_unlock(); - } else { - if (threadgroup) - tsk = current->group_leader; - else - tsk = current; - get_task_struct(tsk); - } - - threadgroup_lock(tsk); + } else + tsk = current; if (threadgroup) + tsk = tsk->group_leader; + get_task_struct(tsk); + rcu_read_unlock(); + + threadgroup_lock(tsk); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + threadgroup_unlock(tsk); + put_task_struct(tsk); + goto retry_find_task; + } ret = cgroup_attach_proc(cgrp, tsk); - else + } else ret = cgroup_attach_task(cgrp, tsk); - threadgroup_unlock(tsk); put_task_struct(tsk); +out_unlock_cgroup: cgroup_unlock(); return ret; } @@ -2305,16 +2207,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) { - int ret; - do { - /* - * attach_proc fails with -EAGAIN if threadgroup leadership - * changes in the middle of the operation, in which case we need - * to find the task_struct for the new leader and start over. - */ - ret = attach_task_by_pid(cgrp, tgid, true); - } while (ret == -EAGAIN); - return ret; + return attach_task_by_pid(cgrp, tgid, true); } /** @@ -2804,15 +2697,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp, * using their cgroups capability, we don't maintain the lists running * through each css_set to its tasks until we see the list actually * used - in other words after the first call to cgroup_iter_start(). - * - * The tasklist_lock is not held here, as do_each_thread() and - * while_each_thread() are protected by RCU. */ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); /* @@ -2824,6 +2722,7 @@ static void cgroup_enable_task_cg_lists(void) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); + read_unlock(&tasklist_lock); write_unlock(&css_set_lock); } @@ -3043,6 +2942,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * */ +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* how many files are using the current array */ + int use_count; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* protects the other fields */ + struct rw_semaphore mutex; +}; + /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. @@ -3827,7 +3758,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); + struct cgroup_subsys_state *css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); @@ -3841,7 +3772,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } /* At error, ->destroy() callback has to free assigned ID. */ if (clone_children(parent) && ss->post_clone) - ss->post_clone(ss, cgrp); + ss->post_clone(cgrp); } cgroup_lock_hierarchy(root); @@ -3875,7 +3806,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); } mutex_unlock(&cgroup_mutex); @@ -4099,7 +4030,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(ss, dummytop); + css = ss->create(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4188,7 +4119,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * no ss->create seems to need anything important in the ss struct, so * this can happen first (i.e. before the rootnode attachment). */ - css = ss->create(ss, dummytop); + css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[i] = NULL; @@ -4206,7 +4137,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) int ret = cgroup_init_idr(ss, css); if (ret) { dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(ss, dummytop); + ss->destroy(dummytop); subsys[i] = NULL; mutex_unlock(&cgroup_mutex); return ret; @@ -4304,7 +4235,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * pointer to find their state. note that this also takes care of * freeing the css_id. */ - ss->destroy(ss, dummytop); + ss->destroy(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4580,7 +4511,7 @@ void cgroup_fork_callbacks(struct task_struct *child) for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) - ss->fork(ss, child); + ss->fork(child); } } } @@ -4596,6 +4527,17 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_post_fork(struct task_struct *child) { + /* + * use_task_css_set_links is set to 1 before we walk the tasklist + * under the tasklist_lock and we read it here after we added the child + * to the tasklist under the tasklist_lock as well. If the child wasn't + * yet in the tasklist when we walked through it from + * cgroup_enable_task_cg_lists(), then use_task_css_set_links value + * should be visible now due to the paired locking and barriers implied + * by LOCK/UNLOCK: it is written before the tasklist_lock unlock + * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock + * lock on fork. + */ if (use_task_css_set_links) { write_lock(&css_set_lock); if (list_empty(&child->cg_list)) { @@ -4682,7 +4624,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); - ss->exit(ss, cgrp, old_cgrp, tsk); + ss->exit(cgrp, old_cgrp, tsk); } } } @@ -4939,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4967,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4985,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4999,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - rwlock_init(&ss->id_lock); + spin_lock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5087,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id, return NULL; BUG_ON(!ss->use_id); + WARN_ON_ONCE(!rcu_read_lock_held()); + /* fill start point for scan */ tmpid = id; while (1) { @@ -5094,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - read_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - read_unlock(&ss->id_lock); - if (!tmp) break; if (tmp->depth >= depth && tmp->stack[depth] == rootid) { @@ -5137,8 +5078,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *debug_create(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5148,7 +5088,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, return css; } -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void debug_destroy(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fc0646b78a64..f86e93920b62 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys; * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ -static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) { struct freezer *freezer; @@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, return &freezer->css; } -static void freezer_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void freezer_destroy(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); @@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task) * a write to that file racing against an attach, and hence the * can_attach() result will remain valid until the attach completes. */ -static int freezer_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, +static int freezer_can_attach(struct cgroup *new_cgroup, struct cgroup_taskset *tset) { struct freezer *freezer; @@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return 0; } -static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..1010cc61931f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; -repeat: /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. @@ -983,45 +982,19 @@ repeat: */ need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* - * ensure checking ->mems_allowed_change_disable after setting all new - * allowed nodes. - * - * the read-side task can see an nodemask with new allowed nodes and - * old allowed nodes. and if it allocates page when cpuset clears newly - * disallowed ones continuous, it can see the new allowed bits. - * - * And if setting all new allowed nodes is after the checking, setting - * all new allowed nodes and clearing newly disallowed ones will be done - * continuous, and the read-side task may find no node to alloc page. - */ - smp_mb(); + if (need_loop) + write_seqcount_begin(&tsk->mems_allowed_seq); - /* - * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. - */ - while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { - task_unlock(tsk); - if (!task_curr(tsk)) - yield(); - goto repeat; - } - - /* - * ensure checking ->mems_allowed_change_disable before clearing all new - * disallowed nodes. - * - * if clearing newly disallowed bits before the checking, the read-side - * task may find no node to alloc page. - */ - smp_mb(); + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + + if (need_loop) + write_seqcount_end(&tsk->mems_allowed_seq); + task_unlock(tsk); } @@ -1399,8 +1372,7 @@ static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; @@ -1436,8 +1408,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct mm_struct *mm; struct task_struct *task; @@ -1833,8 +1804,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) * (and likewise for mems) to the new cgroup. Called with cgroup_mutex * held. */ -static void cpuset_post_clone(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void cpuset_post_clone(struct cgroup *cgroup) { struct cgroup *parent, *child; struct cpuset *cs, *parent_cs; @@ -1857,13 +1827,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, /* * cpuset_create - create a cpuset - * ss: cpuset cgroup subsystem * cont: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_create( - struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) { struct cpuset *cs; struct cpuset *parent; @@ -1902,7 +1869,7 @@ static struct cgroup_subsys_state *cpuset_create( * will call async_rebuild_sched_domains(). */ -static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void cpuset_destroy(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); diff --git a/kernel/cred.c b/kernel/cred.c index 5791612a4045..97b36eeca4c9 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -16,6 +16,7 @@ #include <linux/keyctl.h> #include <linux/init_task.h> #include <linux/security.h> +#include <linux/binfmts.h> #include <linux/cn_proc.h> #if 0 diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0d7c08784efb..3f88a45e6f0a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -41,6 +41,7 @@ #include <linux/delay.h> #include <linux/sched.h> #include <linux/sysrq.h> +#include <linux/reboot.h> #include <linux/init.h> #include <linux/kgdb.h> #include <linux/kdb.h> @@ -75,6 +76,8 @@ static int exception_level; struct kgdb_io *dbg_io_ops; static DEFINE_SPINLOCK(kgdb_registration_lock); +/* Action for the reboot notifiter, a global allow kdb to change it */ +static int kgdbreboot; /* kgdb console driver is loaded */ static int kgdb_con_registered; /* determine if kgdb console output should be used */ @@ -96,6 +99,7 @@ static int __init opt_kgdb_con(char *str) early_param("kgdbcon", opt_kgdb_con); module_param(kgdb_use_con, int, 0644); +module_param(kgdbreboot, int, 0644); /* * Holds information about breakpoints in a kernel. These breakpoints are @@ -784,6 +788,33 @@ void __init dbg_late_init(void) kdb_init(KDB_INIT_FULL); } +static int +dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x) +{ + /* + * Take the following action on reboot notify depending on value: + * 1 == Enter debugger + * 0 == [the default] detatch debug client + * -1 == Do nothing... and use this until the board resets + */ + switch (kgdbreboot) { + case 1: + kgdb_breakpoint(); + case -1: + goto done; + } + if (!dbg_kdb_mode) + gdbstub_exit(code); +done: + return NOTIFY_DONE; +} + +static struct notifier_block dbg_reboot_notifier = { + .notifier_call = dbg_notify_reboot, + .next = NULL, + .priority = INT_MAX, +}; + static void kgdb_register_callbacks(void) { if (!kgdb_io_module_registered) { @@ -791,6 +822,7 @@ static void kgdb_register_callbacks(void) kgdb_arch_init(); if (!dbg_is_early) kgdb_arch_late(); + register_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_register(&panic_notifier_list, &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ @@ -812,6 +844,7 @@ static void kgdb_unregister_callbacks(void) */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; + unregister_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_unregister(&panic_notifier_list, &kgdb_panic_event_nb); kgdb_arch_exit(); diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index c22d8c28ad84..ce615e064482 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1111,6 +1111,13 @@ void gdbstub_exit(int status) unsigned char checksum, ch, buffer[3]; int loop; + if (!kgdb_connected) + return; + kgdb_connected = 0; + + if (!dbg_io_ops || dbg_kdb_mode) + return; + buffer[0] = 'W'; buffer[1] = hex_asc_hi(status); buffer[2] = hex_asc_lo(status); @@ -1129,5 +1136,6 @@ void gdbstub_exit(int status) dbg_io_ops->write_char(hex_asc_lo(checksum)); /* make sure the output is flushed, lest the bootloader clobber it */ - dbg_io_ops->flush(); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); } diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 20059ef4459a..8418c2f8ec5d 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); +#ifdef CONFIG_DEBUG_RODATA + if (!bp->bp_type) { + kdb_printf("Software breakpoints are unavailable.\n" + " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " OR use hw breaks: help bph\n"); + } +#endif return 1; } return 0; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 4802eb5840e1..9b5f17da1c56 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -689,7 +689,7 @@ kdb_printit: if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(kdb_buffer, retlen); } else { - if (!dbg_io_ops->is_console) { + if (dbg_io_ops && !dbg_io_ops->is_console) { len = strlen(kdb_buffer); cp = kdb_buffer; while (len--) { diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 4bca634975c0..118527aa60ea 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -25,6 +25,7 @@ #define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ static int kbd_exists; +static int kbd_last_ret; /* * Check if the keyboard controller has a keypress for us. @@ -90,8 +91,11 @@ int kdb_get_kbd_char(void) return -1; } - if ((scancode & 0x80) != 0) + if ((scancode & 0x80) != 0) { + if (scancode == 0x9c) + kbd_last_ret = 0; return -1; + } scancode &= 0x7f; @@ -178,35 +182,82 @@ int kdb_get_kbd_char(void) return -1; /* ignore unprintables */ } - if ((scancode & 0x7f) == 0x1c) { - /* - * enter key. All done. Absorb the release scancode. - */ + if (scancode == 0x1c) { + kbd_last_ret = 1; + return 13; + } + + return keychar & 0xff; +} +EXPORT_SYMBOL_GPL(kdb_get_kbd_char); + +/* + * Best effort cleanup of ENTER break codes on leaving KDB. Called on + * exiting KDB, when we know we processed an ENTER or KP ENTER scan + * code. + */ +void kdb_kbd_cleanup_state(void) +{ + int scancode, scanstatus; + + /* + * Nothing to clean up, since either + * ENTER was never pressed, or has already + * gotten cleaned up. + */ + if (!kbd_last_ret) + return; + + kbd_last_ret = 0; + /* + * Enter key. Need to absorb the break code here, lest it gets + * leaked out if we exit KDB as the result of processing 'g'. + * + * This has several interesting implications: + * + Need to handle KP ENTER, which has break code 0xe0 0x9c. + * + Need to handle repeat ENTER and repeat KP ENTER. Repeats + * only get a break code at the end of the repeated + * sequence. This means we can't propagate the repeated key + * press, and must swallow it away. + * + Need to handle possible PS/2 mouse input. + * + Need to handle mashed keys. + */ + + while (1) { while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) - ; + cpu_relax(); /* - * Fetch the scancode + * Fetch the scancode. */ scancode = inb(KBD_DATA_REG); scanstatus = inb(KBD_STATUS_REG); - while (scanstatus & KBD_STAT_MOUSE_OBF) { - scancode = inb(KBD_DATA_REG); - scanstatus = inb(KBD_STATUS_REG); - } + /* + * Skip mouse input. + */ + if (scanstatus & KBD_STAT_MOUSE_OBF) + continue; - if (scancode != 0x9c) { - /* - * Wasn't an enter-release, why not? - */ - kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", - scancode, scanstatus); - } + /* + * If we see 0xe0, this is either a break code for KP + * ENTER, or a repeat make for KP ENTER. Either way, + * since the second byte is equivalent to an ENTER, + * skip the 0xe0 and try again. + * + * If we see 0x1c, this must be a repeat ENTER or KP + * ENTER (and we swallowed 0xe0 before). Try again. + * + * We can also see make and break codes for other keys + * mashed before or after pressing ENTER. Thus, if we + * see anything other than 0x9c, we have to try again. + * + * Note, if you held some key as ENTER was depressed, + * that break code would get leaked out. + */ + if (scancode != 0x9c) + continue; - return 13; + return; } - - return keychar & 0xff; } -EXPORT_SYMBOL_GPL(kdb_get_kbd_char); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index e2ae7349437f..67b847dfa2bb 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, if (KDB_STATE(DOING_SS)) KDB_STATE_CLEAR(SSBPT); + /* Clean up any keyboard devices before leaving */ + kdb_kbd_cleanup_state(); + return result; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index e381d105b40b..47c4e56e513b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -246,6 +246,13 @@ extern void debug_kusage(void); extern void kdb_set_current_task(struct task_struct *); extern struct task_struct *kdb_current_task; + +#ifdef CONFIG_KDB_KEYBOARD +extern void kdb_kbd_cleanup_state(void); +#else /* ! CONFIG_KDB_KEYBOARD */ +#define kdb_kbd_cleanup_state() +#endif /* ! CONFIG_KDB_KEYBOARD */ + #ifdef CONFIG_MODULES extern struct list_head *kdb_modules; #endif /* CONFIG_MODULES */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 7d6fb40d2188..d35cc2d3a4cc 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size) if (!pfn_valid(pfn)) return 1; page = pfn_to_page(pfn); - vaddr = kmap_atomic(page, KM_KDB); + vaddr = kmap_atomic(page); memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); - kunmap_atomic(vaddr, KM_KDB); + kunmap_atomic(vaddr); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 3f92a19aa11e..a6a9ec4cd8f5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7154,8 +7154,7 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( - struct cgroup_subsys *ss, struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) { struct perf_cgroup *jc; @@ -7172,8 +7171,7 @@ static struct cgroup_subsys_state *perf_cgroup_create( return &jc->css; } -static void perf_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void perf_cgroup_destroy(struct cgroup *cont) { struct perf_cgroup *jc; jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -7189,8 +7187,7 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7198,8 +7195,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/exit.c b/kernel/exit.c index 752d2c0abd19..3db1909faed9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -52,6 +52,7 @@ #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> +#include <linux/shm.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -424,7 +425,7 @@ void daemonize(const char *name, ...) */ exit_mm(current); /* - * We don't want to have TIF_FREEZE set if the system-wide hibernation + * We don't want to get frozen, in case system-wide hibernation * or suspend transition begins right now. */ current->flags |= (PF_NOFREEZE | PF_KTHREAD); @@ -686,11 +687,11 @@ static void exit_mm(struct task_struct * tsk) } /* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. + * When we die, we re-parent all our children, and try to: + * 1. give them to another thread in our thread group, if such a member exists + * 2. give it to the first ancestor process which prctl'd itself as a + * child_subreaper for its children (like a service manager) + * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father) __releases(&tasklist_lock) @@ -710,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father) if (unlikely(pid_ns->child_reaper == father)) { write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) - panic("Attempted to kill init!"); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: + father->exit_code); + } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); @@ -721,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * forget_original_parent() must move them somewhere. */ pid_ns->child_reaper = init_pid_ns.child_reaper; + } else if (father->signal->has_child_subreaper) { + struct task_struct *reaper; + + /* + * Find the first ancestor marked as child_subreaper. + * Note that the code below checks same_thread_group(reaper, + * pid_ns->child_reaper). This is what we need to DTRT in a + * PID namespace. However we still need the check above, see + * http://marc.info/?l=linux-kernel&m=131385460420380 + */ + for (reaper = father->real_parent; + reaper != &init_task; + reaper = reaper->real_parent) { + if (same_thread_group(reaper, pid_ns->child_reaper)) + break; + if (!reaper->signal->is_child_subreaper) + continue; + thread = reaper; + do { + if (!(thread->flags & PF_EXITING)) + return reaper; + } while_each_thread(reaper, thread); + } } return pid_ns->child_reaper; @@ -818,25 +845,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); - /* Let father know we died - * - * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitrary processes. - * That stops right now. - * - * If the parent exec id doesn't match the exec id we saved - * when we started then we know the parent has changed security - * domain. - * - * If our self_exec id doesn't match our parent_exec_id then - * we have changed execution domain as these two values started - * the same after a fork. - */ - if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id)) - tsk->exit_signal = SIGCHLD; - if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && @@ -953,7 +961,7 @@ void do_exit(long code) acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) - sync_mm_rss(tsk, tsk->mm); + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index 26a7a6707fa7..b9372a0bff18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -355,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; - if (security_vm_enough_memory(len)) + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } @@ -511,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) return NULL; } +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif +} + /* * Allocate and initialize an mm_struct. */ @@ -538,9 +556,7 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON(mm->pmd_huge_pte); -#endif + check_mm(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1035,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; + sig->has_child_subreaper = current->signal->has_child_subreaper || + current->signal->is_child_subreaper; + mutex_init(&sig->cred_guard_mutex); return 0; @@ -1222,6 +1241,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; + seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; @@ -1340,7 +1360,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, clear_all_latency_tracing(p); /* ok, now we should be set up.. */ - p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else if (clone_flags & CLONE_PARENT) + p->exit_signal = current->group_leader->exit_signal; + else + p->exit_signal = (clone_flags & CSIGNAL); + p->pdeath_signal = 0; p->exit_state = 0; diff --git a/kernel/freezer.c b/kernel/freezer.c index 9815b8d1eed5..11f82a4d4eae 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p) * freeze_task - send a freeze request to given task * @p: task to send the request to * - * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE - * flag and either sending a fake signal to it or waking it up, depending - * on whether it has %PF_FREEZER_NOSIG set. + * If @p is freezing, the freeze request is sent either by sending a fake + * signal (if it's not a kernel thread) or waking it up (if it's a kernel + * thread). * * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1f9e26526b69..af48e59bc2ff 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,189 +1,793 @@ +#include <linux/debugfs.h> +#include <linux/hardirq.h> +#include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/irqdesc.h> #include <linux/irqdomain.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/smp.h> +#include <linux/fs.h> + +#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. + * ie. legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ +#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ +#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); +static DEFINE_MUTEX(revmap_trees_mutex); +static unsigned int irq_virq_count = NR_IRQS; +static struct irq_domain *irq_default_domain; + /** - * irq_domain_add() - Register an irq_domain - * @domain: ptr to initialized irq_domain structure + * irq_domain_alloc() - Allocate a new irq_domain data structure + * @of_node: optional device-tree node of the interrupt controller + * @revmap_type: type of reverse mapping to use + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer * - * Registers an irq_domain structure. The irq_domain must at a minimum be - * initialized with an ops structure pointer, and either a ->to_irq hook or - * a valid irq_base value. Everything else is optional. + * Allocates and initialize and irq_domain structure. Caller is expected to + * register allocated irq_domain with irq_domain_register(). Returns pointer + * to IRQ domain, or NULL on failure. */ -void irq_domain_add(struct irq_domain *domain) +static struct irq_domain *irq_domain_alloc(struct device_node *of_node, + unsigned int revmap_type, + const struct irq_domain_ops *ops, + void *host_data) { - struct irq_data *d; - int hwirq, irq; + struct irq_domain *domain; - /* - * This assumes that the irq_domain owner has already allocated - * the irq_descs. This block will be removed when support for dynamic - * allocation of irq_descs is added to irq_domain. - */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - if (!d) { - WARN(1, "error: assigning domain to non existant irq_desc"); - return; - } - if (d->domain) { - /* things are broken; just report, don't clean up */ - WARN(1, "error: irq_desc already assigned to a domain"); - return; + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (WARN_ON(!domain)) + return NULL; + + /* Fill structure */ + domain->revmap_type = revmap_type; + domain->ops = ops; + domain->host_data = host_data; + domain->of_node = of_node_get(of_node); + + return domain; +} + +static void irq_domain_add(struct irq_domain *domain) +{ + mutex_lock(&irq_domain_mutex); + list_add(&domain->link, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); + pr_debug("irq: Allocated domain of type %d @0x%p\n", + domain->revmap_type, domain); +} + +static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; + int size = domain->revmap_data.legacy.size; + + if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) + return 0; + return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; +} + +/** + * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @size: total number of irqs in legacy mapping + * @first_irq: first number of irq block assigned to the domain + * @first_hwirq: first hwirq number to use for the translation. Should normally + * be '0', but a positive integer can be used if the effective + * hwirqs numbering does not begin at zero. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + * + * Note: the map() callback will be called before this function returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). + */ +struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + unsigned int i; + + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + if (!domain) + return NULL; + + domain->revmap_data.legacy.first_irq = first_irq; + domain->revmap_data.legacy.first_hwirq = first_hwirq; + domain->revmap_data.legacy.size = size; + + mutex_lock(&irq_domain_mutex); + /* Verify that all the irqs are available */ + for (i = 0; i < size; i++) { + int irq = first_irq + i; + struct irq_data *irq_data = irq_get_irq_data(irq); + + if (WARN_ON(!irq_data || irq_data->domain)) { + mutex_unlock(&irq_domain_mutex); + of_node_put(domain->of_node); + kfree(domain); + return NULL; } - d->domain = domain; - d->hwirq = hwirq; } - mutex_lock(&irq_domain_mutex); - list_add(&domain->list, &irq_domain_list); + /* Claim all of the irqs before registering a legacy domain */ + for (i = 0; i < size; i++) { + struct irq_data *irq_data = irq_get_irq_data(first_irq + i); + irq_data->hwirq = first_hwirq + i; + irq_data->domain = domain; + } mutex_unlock(&irq_domain_mutex); + + for (i = 0; i < size; i++) { + int irq = first_irq + i; + int hwirq = first_hwirq + i; + + /* IRQ0 gets ignored */ + if (!irq) + continue; + + /* Legacy flags are left to default at this point, + * one can then use irq_create_mapping() to + * explicitly change them + */ + ops->map(domain, irq, hwirq); + + /* Clear norequest flags */ + irq_clear_status_flags(irq, IRQ_NOREQUEST); + } + + irq_domain_add(domain); + return domain; +} + +/** + * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + */ +struct irq_domain *irq_domain_add_linear(struct device_node *of_node, + unsigned int size, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + unsigned int *revmap; + + revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); + if (WARN_ON(!revmap)) + return NULL; + + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); + if (!domain) { + kfree(revmap); + return NULL; + } + domain->revmap_data.linear.size = size; + domain->revmap_data.linear.revmap = revmap; + irq_domain_add(domain); + return domain; +} + +struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_NOMAP, ops, host_data); + if (domain) + irq_domain_add(domain); + return domain; +} + +/** + * irq_domain_add_tree() + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * + * Note: The radix tree will be allocated later during boot automatically + * (the reverse mapping will use the slow path until that happens). + */ +struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_TREE, ops, host_data); + if (domain) { + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); + irq_domain_add(domain); + } + return domain; } /** - * irq_domain_del() - Unregister an irq_domain - * @domain: ptr to registered irq_domain. + * irq_find_host() - Locates a domain for a given device node + * @node: device-tree node of the interrupt controller */ -void irq_domain_del(struct irq_domain *domain) +struct irq_domain *irq_find_host(struct device_node *node) { - struct irq_data *d; - int hwirq, irq; + struct irq_domain *h, *found = NULL; + int rc; + /* We might want to match the legacy controller last since + * it might potentially be set to match all interrupts in + * the absence of a device node. This isn't a problem so far + * yet though... + */ mutex_lock(&irq_domain_mutex); - list_del(&domain->list); + list_for_each_entry(h, &irq_domain_list, link) { + if (h->ops->match) + rc = h->ops->match(h, node); + else + rc = (h->of_node != NULL) && (h->of_node == node); + + if (rc) { + found = h; + break; + } + } mutex_unlock(&irq_domain_mutex); + return found; +} +EXPORT_SYMBOL_GPL(irq_find_host); + +/** + * irq_set_default_host() - Set a "default" irq domain + * @domain: default domain pointer + * + * For convenience, it's possible to set a "default" domain that will be used + * whenever NULL is passed to irq_create_mapping(). It makes life easier for + * platforms that want to manipulate a few hard coded interrupt numbers that + * aren't properly represented in the device-tree. + */ +void irq_set_default_host(struct irq_domain *domain) +{ + pr_debug("irq: Default domain set to @0x%p\n", domain); + + irq_default_domain = domain; +} + +/** + * irq_set_virq_count() - Set the maximum number of linux irqs + * @count: number of linux irqs, capped with NR_IRQS + * + * This is mainly for use by platforms like iSeries who want to program + * the virtual irq number in the controller to avoid the reverse mapping + */ +void irq_set_virq_count(unsigned int count) +{ + pr_debug("irq: Trying to set virq count to %d\n", count); - /* Clear the irq_domain assignments */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - d->domain = NULL; + BUG_ON(count < NUM_ISA_INTERRUPTS); + if (count < NR_IRQS) + irq_virq_count = count; +} + +static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + irq_data->hwirq = hwirq; + irq_data->domain = domain; + if (domain->ops->map(domain, virq, hwirq)) { + pr_debug("irq: -> mapping failed, freeing\n"); + irq_data->domain = NULL; + irq_data->hwirq = 0; + return -1; } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); + + return 0; } -#if defined(CONFIG_OF_IRQ) /** - * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec + * irq_create_direct_mapping() - Allocate an irq for direct mapping + * @domain: domain to allocate the irq for or NULL for default domain * - * Used by the device tree interrupt mapping code to translate a device tree - * interrupt specifier to a valid linux irq number. Returns either a valid - * linux IRQ number or 0. + * This routine is used for irq controllers which can choose the hardware + * interrupt numbers they generate. In such a case it's simplest to use + * the linux irq as the hardware interrupt number. + */ +unsigned int irq_create_direct_mapping(struct irq_domain *domain) +{ + unsigned int virq; + + if (domain == NULL) + domain = irq_default_domain; + + BUG_ON(domain == NULL); + WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + + virq = irq_alloc_desc_from(1, 0); + if (!virq) { + pr_debug("irq: create_direct virq allocation failed\n"); + return 0; + } + if (virq >= irq_virq_count) { + pr_err("ERROR: no free irqs available below %i maximum\n", + irq_virq_count); + irq_free_desc(virq); + return 0; + } + + pr_debug("irq: create_direct obtained virq %d\n", virq); + + if (irq_setup_virq(domain, virq, virq)) { + irq_free_desc(virq); + return 0; + } + + return virq; +} + +/** + * irq_create_mapping() - Map a hardware interrupt into linux irq space + * @domain: domain owning this hardware interrupt or NULL for default domain + * @hwirq: hardware irq number in that domain space * - * When the caller no longer need the irq number returned by this function it - * should arrange to call irq_dispose_mapping(). + * Only one mapping per hardware interrupt is permitted. Returns a linux + * irq number. + * If the sense/trigger is to be specified, set_irq_type() should be called + * on the number returned from that call. */ +unsigned int irq_create_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + unsigned int virq, hint; + + pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); + + /* Look for default domain if nececssary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) { + printk(KERN_WARNING "irq_create_mapping called for" + " NULL domain, hwirq=%lx\n", hwirq); + WARN_ON(1); + return 0; + } + pr_debug("irq: -> using domain @%p\n", domain); + + /* Check if mapping already exists */ + virq = irq_find_mapping(domain, hwirq); + if (virq) { + pr_debug("irq: -> existing mapping on virq %d\n", virq); + return virq; + } + + /* Get a virtual interrupt number */ + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return irq_domain_legacy_revmap(domain, hwirq); + + /* Allocate a virtual interrupt number */ + hint = hwirq % irq_virq_count; + if (hint == 0) + hint++; + virq = irq_alloc_desc_from(hint, 0); + if (!virq) + virq = irq_alloc_desc_from(1, 0); + if (!virq) { + pr_debug("irq: -> virq allocation failed\n"); + return 0; + } + + if (irq_setup_virq(domain, virq, hwirq)) { + if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) + irq_free_desc(virq); + return 0; + } + + pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", + hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); + + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_mapping); + unsigned int irq_create_of_mapping(struct device_node *controller, const u32 *intspec, unsigned int intsize) { struct irq_domain *domain; - unsigned long hwirq; - unsigned int irq, type; - int rc = -EINVAL; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + unsigned int virq; - /* Find a domain which can translate the irq spec */ - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, list) { - if (!domain->ops->dt_translate) - continue; - rc = domain->ops->dt_translate(domain, controller, - intspec, intsize, &hwirq, &type); - if (rc == 0) - break; + domain = controller ? irq_find_host(controller) : irq_default_domain; + if (!domain) { +#ifdef CONFIG_MIPS + /* + * Workaround to avoid breaking interrupt controller drivers + * that don't yet register an irq_domain. This is temporary + * code. ~~~gcl, Feb 24, 2012 + * + * Scheduled for removal in Linux v3.6. That should be enough + * time. + */ + if (intsize > 0) + return intspec[0]; +#endif + printk(KERN_WARNING "irq: no irq domain found for %s !\n", + controller->full_name); + return 0; } - mutex_unlock(&irq_domain_mutex); - if (rc != 0) - return 0; + /* If domain has no translation, then we assume interrupt line */ + if (domain->ops->xlate == NULL) + hwirq = intspec[0]; + else { + if (domain->ops->xlate(domain, controller, intspec, intsize, + &hwirq, &type)) + return 0; + } + + /* Create mapping */ + virq = irq_create_mapping(domain, hwirq); + if (!virq) + return virq; - irq = irq_domain_to_irq(domain, hwirq); - if (type != IRQ_TYPE_NONE) - irq_set_irq_type(irq, type); - pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", - controller->full_name, (int)hwirq, irq, type); - return irq; + /* Set type if specified and different than the current one */ + if (type != IRQ_TYPE_NONE && + type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) + irq_set_irq_type(virq, type); + return virq; } EXPORT_SYMBOL_GPL(irq_create_of_mapping); /** - * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() - * @irq: linux irq number to be discarded + * irq_dispose_mapping() - Unmap an interrupt + * @virq: linux irq number of the interrupt to unmap + */ +void irq_dispose_mapping(unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_domain *domain; + irq_hw_number_t hwirq; + + if (!virq || !irq_data) + return; + + domain = irq_data->domain; + if (WARN_ON(domain == NULL)) + return; + + /* Never unmap legacy interrupts */ + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return; + + irq_set_status_flags(virq, IRQ_NOREQUEST); + + /* remove chip and handler */ + irq_set_chip_and_handler(virq, NULL, NULL); + + /* Make sure it's completed */ + synchronize_irq(virq); + + /* Tell the PIC about it */ + if (domain->ops->unmap) + domain->ops->unmap(domain, virq); + smp_mb(); + + /* Clear reverse map */ + hwirq = irq_data->hwirq; + switch(domain->revmap_type) { + case IRQ_DOMAIN_MAP_LINEAR: + if (hwirq < domain->revmap_data.linear.size) + domain->revmap_data.linear.revmap[hwirq] = 0; + break; + case IRQ_DOMAIN_MAP_TREE: + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_data.tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + break; + } + + irq_free_desc(virq); +} +EXPORT_SYMBOL_GPL(irq_dispose_mapping); + +/** + * irq_find_mapping() - Find a linux irq from an hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space + * + * This is a slow path, for use by generic code. It's expected that an + * irq controller implementation directly calls the appropriate low level + * mapping function. + */ +unsigned int irq_find_mapping(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + unsigned int i; + unsigned int hint = hwirq % irq_virq_count; + + /* Look for default domain if nececssary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) + return 0; + + /* legacy -> bail early */ + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return irq_domain_legacy_revmap(domain, hwirq); + + /* Slow path does a linear search of the map */ + if (hint == 0) + hint = 1; + i = hint; + do { + struct irq_data *data = irq_get_irq_data(i); + if (data && (data->domain == domain) && (data->hwirq == hwirq)) + return i; + i++; + if (i >= irq_virq_count) + i = 1; + } while(i != hint); + return 0; +} +EXPORT_SYMBOL_GPL(irq_find_mapping); + +/** + * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * - * Calling this function indicates the caller no longer needs a reference to - * the linux irq number returned by a prior call to irq_create_of_mapping(). + * This is a fast path, for use by irq controller code that uses radix tree + * revmaps */ -void irq_dispose_mapping(unsigned int irq) +unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, + irq_hw_number_t hwirq) { + struct irq_data *irq_data; + + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return irq_find_mapping(domain, hwirq); + + /* + * Freeing an irq can delete nodes along the path to + * do the lookup via call_rcu. + */ + rcu_read_lock(); + irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); + rcu_read_unlock(); + /* - * nothing yet; will be filled when support for dynamic allocation of - * irq_descs is added to irq_domain + * If found in radix tree, then fine. + * Else fallback to linear lookup - this should not happen in practice + * as it means that we failed to insert the node in the radix tree. */ + return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); } -EXPORT_SYMBOL_GPL(irq_dispose_mapping); -int irq_domain_simple_dt_translate(struct irq_domain *d, - struct device_node *controller, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type) +/** + * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. + * @domain: domain owning this hardware interrupt + * @virq: linux irq number + * @hwirq: hardware irq number in that domain space + * + * This is for use by irq controllers that use a radix tree reverse + * mapping for fast lookup. + */ +void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { - if (d->of_node != controller) - return -EINVAL; - if (intsize < 1) - return -EINVAL; - if (d->nr_irq && ((intspec[0] < d->hwirq_base) || - (intspec[0] >= d->hwirq_base + d->nr_irq))) - return -EINVAL; + struct irq_data *irq_data = irq_get_irq_data(virq); + + if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return; + + if (virq) { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); + } +} + +/** + * irq_linear_revmap() - Find a linux irq from a hw irq number. + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space + * + * This is a fast path, for use by irq controller code that uses linear + * revmaps. It does fallback to the slow path if the revmap doesn't exist + * yet and will create the revmap entry with appropriate locking + */ +unsigned int irq_linear_revmap(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + unsigned int *revmap; + + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) + return irq_find_mapping(domain, hwirq); + + /* Check revmap bounds */ + if (unlikely(hwirq >= domain->revmap_data.linear.size)) + return irq_find_mapping(domain, hwirq); + + /* Check if revmap was allocated */ + revmap = domain->revmap_data.linear.revmap; + if (unlikely(revmap == NULL)) + return irq_find_mapping(domain, hwirq); + + /* Fill up revmap with slow path if no mapping found */ + if (unlikely(!revmap[hwirq])) + revmap[hwirq] = irq_find_mapping(domain, hwirq); + + return revmap[hwirq]; +} + +#ifdef CONFIG_VIRQ_DEBUG +static int virq_debug_show(struct seq_file *m, void *private) +{ + unsigned long flags; + struct irq_desc *desc; + const char *p; + static const char none[] = "none"; + void *data; + int i; + + seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", + "chip name", "chip data", "domain name"); + + for (i = 1; i < nr_irqs; i++) { + desc = irq_to_desc(i); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + if (desc->action && desc->action->handler) { + struct irq_chip *chip; + + seq_printf(m, "%5d ", i); + seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + + chip = irq_desc_get_chip(desc); + if (chip && chip->name) + p = chip->name; + else + p = none; + seq_printf(m, "%-15s ", p); + + data = irq_desc_get_chip_data(desc); + seq_printf(m, "0x%16p ", data); + + if (desc->irq_data.domain->of_node) + p = desc->irq_data.domain->of_node->full_name; + else + p = none; + seq_printf(m, "%s\n", p); + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } + + return 0; +} +static int virq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, virq_debug_show, inode->i_private); +} + +static const struct file_operations virq_debug_fops = { + .open = virq_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init irq_debugfs_init(void) +{ + if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, + NULL, &virq_debug_fops) == NULL) + return -ENOMEM; + + return 0; +} +__initcall(irq_debugfs_init); +#endif /* CONFIG_VIRQ_DEBUG */ + +int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq) +{ + return 0; +} + +/** + * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings + * + * Device Tree IRQ specifier translation function which works with one cell + * bindings where the cell value maps directly to the hwirq number. + */ +int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 1)) + return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; - if (intsize > 1) - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; return 0; } +EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell); /** - * irq_domain_create_simple() - Set up a 'simple' translation range + * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags. */ -void irq_domain_add_simple(struct device_node *controller, int irq_base) +int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) { - struct irq_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) { - WARN_ON(1); - return; - } + if (WARN_ON(intsize < 2)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); - domain->irq_base = irq_base; - domain->of_node = of_node_get(controller); - domain->ops = &irq_domain_simple_ops; - irq_domain_add(domain); +/** + * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings + * + * Device Tree IRQ specifier translation function which works with either one + * or two cell bindings where the cell values map directly to the hwirq number + * and linux irq flags. + * + * Note: don't use this function unless your interrupt controller explicitly + * supports both one and two cell bindings. For the majority of controllers + * the _onecell() or _twocell() variants above should be used. + */ +int irq_domain_xlate_onetwocell(struct irq_domain *d, + struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 1)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; + return 0; } -EXPORT_SYMBOL_GPL(irq_domain_add_simple); +EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); +const struct irq_domain_ops irq_domain_simple_ops = { + .map = irq_domain_simple_map, + .xlate = irq_domain_xlate_onetwocell, +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +#ifdef CONFIG_OF_IRQ void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { struct device_node *node; - pr_info("looking for phys_base=%llx, irq_start=%i\n", + pr_debug("looking for phys_base=%llx, irq_start=%i\n", (unsigned long long) phys_base, (int) irq_start); node = of_find_matching_node_by_address(NULL, match, phys_base); if (node) - irq_domain_add_simple(node, irq_start); - else - pr_info("no node found\n"); + irq_domain_add_legacy(node, 32, irq_start, 0, + &irq_domain_simple_ops, NULL); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif /* CONFIG_OF_IRQ */ - -struct irq_domain_ops irq_domain_simple_ops = { -#ifdef CONFIG_OF_IRQ - .dt_translate = irq_domain_simple_dt_translate, -#endif /* CONFIG_OF_IRQ */ -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); +#endif diff --git a/kernel/kexec.c b/kernel/kexec.c index 7b0886786701..a6a675cb9818 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1546,13 +1546,13 @@ int kernel_kexec(void) if (error) goto Resume_console; /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_noirq(). We *must* call - * dpm_suspend_noirq() now. Otherwise, drivers for + * but *not* dpm_suspend_end(). We *must* call + * dpm_suspend_end() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1579,7 +1579,7 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: diff --git a/kernel/kmod.c b/kernel/kmod.c index a0a88543934e..957a7aab8ebc 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static void free_modprobe_argv(struct subprocess_info *info) +{ + kfree(info->argv[3]); /* check call_modprobe() */ + kfree(info->argv); +} + +static int call_modprobe(char *module_name, int wait) +{ + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + + char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); + if (!argv) + goto out; + + module_name = kstrdup(module_name, GFP_KERNEL); + if (!module_name) + goto free_argv; + + argv[0] = modprobe_path; + argv[1] = "-q"; + argv[2] = "--"; + argv[3] = module_name; /* check free_modprobe_argv() */ + argv[4] = NULL; + + return call_usermodehelper_fns(modprobe_path, argv, envp, + wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); +free_argv: + kfree(argv); +out: + return -ENOMEM; +} + /** * __request_module - try to load a kernel module * @wait: wait (or not) for the operation to complete @@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...) char module_name[MODULE_NAME_LEN]; unsigned int max_modprobes; int ret; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; static atomic_t kmod_concurrent = ATOMIC_INIT(0); #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; @@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_usermodehelper_fns(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, - NULL, NULL, NULL); + ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); return ret; @@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data) /* Exec failed? */ fail: sub_info->retval = retval; - do_exit(0); + return 0; } void call_usermodehelper_freeinfo(struct subprocess_info *info) @@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) } EXPORT_SYMBOL(call_usermodehelper_freeinfo); +static void umh_complete(struct subprocess_info *sub_info) +{ + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); +} + /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -235,7 +278,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + umh_complete(sub_info); return 0; } @@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - enum umh_wait wait = sub_info->wait; + int wait = sub_info->wait & ~UMH_KILLABLE; pid_t pid; /* CLONE_VFORK: wait until the usermode helper has execve'd @@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work) case UMH_WAIT_EXEC: if (pid < 0) sub_info->retval = pid; - complete(sub_info->complete); + umh_complete(sub_info); } } @@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, - enum umh_wait wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); int retval = 0; @@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, queue_work(khelper_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + wait_for_completion(&done); +wait_done: retval = sub_info->retval; - out: call_usermodehelper_freeinfo(sub_info); unlock: diff --git a/kernel/padata.c b/kernel/padata.c index b45259931512..6f10eb285ece 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -29,7 +29,6 @@ #include <linux/sysfs.h> #include <linux/rcupdate.h> -#define MAX_SEQ_NR (INT_MAX - NR_CPUS) #define MAX_OBJ_NUM 1000 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) @@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) return target_cpu; } -static int padata_cpu_hash(struct padata_priv *padata) +static int padata_cpu_hash(struct parallel_data *pd) { int cpu_index; - struct parallel_data *pd; - - pd = padata->pd; /* * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. */ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); + + spin_lock(&pd->seq_lock); + cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); + pd->seq_nr++; + spin_unlock(&pd->seq_lock); return padata_index_to_cpu(pd, cpu_index); } @@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst, padata->pd = pd; padata->cb_cpu = cb_cpu; - if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) - atomic_set(&pd->seq_nr, -1); - - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - - target_cpu = padata_cpu_hash(padata); + target_cpu = padata_cpu_hash(pd); queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); @@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel); static struct padata_priv *padata_get_next(struct parallel_data *pd) { int cpu, num_cpus; - int next_nr, next_index; + unsigned int next_nr, next_index; struct padata_parallel_queue *queue, *next_queue; struct padata_priv *padata; struct padata_list *reorder; @@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) cpu = padata_index_to_cpu(pd, next_index); next_queue = per_cpu_ptr(pd->pqueue, cpu); - if (unlikely(next_nr > pd->max_seq_nr)) { - next_nr = next_nr - pd->max_seq_nr - 1; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); - next_queue = per_cpu_ptr(pd->pqueue, cpu); - pd->processed = 0; - } - padata = NULL; reorder = &next_queue->reorder; @@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) padata = list_entry(reorder->list.next, struct padata_priv, list); - BUG_ON(next_nr != padata->seq_nr); - spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); @@ -230,6 +215,7 @@ out: static void padata_reorder(struct parallel_data *pd) { + int cb_cpu; struct padata_priv *padata; struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; @@ -270,13 +256,14 @@ static void padata_reorder(struct parallel_data *pd) return; } - squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); + cb_cpu = padata->cb_cpu; + squeue = per_cpu_ptr(pd->squeue, cb_cpu); spin_lock(&squeue->serial.lock); list_add_tail(&padata->list, &squeue->serial.list); spin_unlock(&squeue->serial.lock); - queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); + queue_work_on(cb_cpu, pinst->wq, &squeue->work); } spin_unlock_bh(&pd->lock); @@ -400,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd) /* Initialize all percpu queues used by parallel workers */ static void padata_init_pqueues(struct parallel_data *pd) { - int cpu_index, num_cpus, cpu; + int cpu_index, cpu; struct padata_parallel_queue *pqueue; cpu_index = 0; @@ -415,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd) INIT_WORK(&pqueue->work, padata_parallel_worker); atomic_set(&pqueue->num_obj, 0); } - - num_cpus = cpumask_weight(pd->cpumask.pcpu); - pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; } /* Allocate and initialize the internal cpumask dependend resources. */ @@ -444,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = 0; atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; diff --git a/kernel/params.c b/kernel/params.c index 4bc965d8a1fe..47f5bf12434a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -15,7 +15,6 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a8968396046d..17b232869a04 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -168,13 +168,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) while (nr > 0) { rcu_read_lock(); - /* - * Any nested-container's init processes won't ignore the - * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). - */ task = pid_task(find_vpid(nr), PIDTYPE_PID); - if (task) - send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); + if (task && !__fatal_signal_pending(task)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, task); rcu_read_unlock(); diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 07e0e28ffba7..66d808ec5252 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-$(CONFIG_PM) += main.o qos.o +obj-y += qos.o +obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6d6d28870335..0a186cfde788 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image - * and execute the drivers' .thaw_noirq() callbacks. + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. * * Control reappears in this routine after the subsequent restore. */ @@ -254,7 +254,7 @@ static int create_image(int platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -306,7 +306,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - dpm_resume_noirq(in_suspend ? + dpm_resume_start(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode) * successful freezer test. */ freezer_test_done = true; - goto Cleanup; + goto Thaw; } error = dpm_prepare(PMSG_FREEZE); if (error) { dpm_complete(PMSG_RECOVER); - goto Cleanup; + goto Thaw; } suspend_console(); @@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode) platform_end(platform_mode); return error; + Thaw: + thaw_kernel_threads(); Cleanup: swsusp_free(); goto Close; @@ -394,16 +396,16 @@ int hibernation_snapshot(int platform_mode) * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, restore the contents of - * highmem that have not been restored yet from the image and run the low-level - * code that will restore the remaining contents of memory and switch to the - * just restored target kernel. + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. */ static int resume_target_kernel(bool platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_end(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -460,7 +462,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - dpm_resume_noirq(PMSG_RECOVER); + dpm_resume_start(PMSG_RECOVER); return error; } @@ -518,7 +520,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = dpm_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -549,7 +551,7 @@ int hibernation_platform_enter(void) Platform_finish: hibernation_ops->finish(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; @@ -616,7 +618,7 @@ int hibernate(void) /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) - goto Exit; + goto Enable_umh; printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); @@ -624,15 +626,11 @@ int hibernate(void) error = freeze_processes(); if (error) - goto Finish; + goto Free_bitmaps; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (error) - goto Thaw; - if (freezer_test_done) { - freezer_test_done = false; + if (error || freezer_test_done) goto Thaw; - } if (in_suspend) { unsigned int flags = 0; @@ -657,8 +655,13 @@ int hibernate(void) Thaw: thaw_processes(); - Finish: + + /* Don't bother checking whether freezer_test_done is true */ + freezer_test_done = false; + + Free_bitmaps: free_basic_memory_bitmaps(); + Enable_umh: usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); diff --git a/kernel/power/main.c b/kernel/power/main.c index 9824b41e5a18..1c12581f1c62 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", "success", suspend_stats.success, "fail", suspend_stats.fail, "failed_freeze", suspend_stats.failed_freeze, "failed_prepare", suspend_stats.failed_prepare, "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, "failed_suspend_noirq", suspend_stats.failed_suspend_noirq, "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, "failed_resume_noirq", suspend_stats.failed_resume_noirq); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", @@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { + error = pm_suspend(state); break; - } - if (state < PM_SUSPEND_MAX && *s) { - error = enter_state(state); - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else - suspend_stats.success++; + } } #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 21724eee5206..98f3622d7407 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -177,13 +177,11 @@ extern const char *const pm_states[]; extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); -extern int enter_state(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } -static inline int enter_state(suspend_state_t state) { return -ENOSYS; } static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ @@ -234,16 +232,14 @@ static inline int suspend_freeze_processes(void) int error; error = freeze_processes(); - /* * freeze_processes() automatically thaws every task if freezing * fails. So we need not do anything extra upon error. */ if (error) - goto Finish; + return error; error = freeze_kernel_threads(); - /* * freeze_kernel_threads() thaws only kernel threads upon freezing * failure. So we have to thaw the userspace tasks ourselves. @@ -251,7 +247,6 @@ static inline int suspend_freeze_processes(void) if (error) thaw_processes(); - Finish: return error; } diff --git a/kernel/power/process.c b/kernel/power/process.c index 7e426459e60a..0d2aeb226108 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only) * It is "frozen enough". If the task does wake * up, it will immediately call try_to_freeze. * - * Because freeze_task() goes through p's - * scheduler lock after setting TIF_FREEZE, it's - * guaranteed that either we see TASK_RUNNING or - * try_to_stop() after schedule() in ptrace/signal - * stop sees TIF_FREEZE. + * Because freeze_task() goes through p's scheduler lock, it's + * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING + * transition can't race with task state testing here. */ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) @@ -98,13 +96,15 @@ static int try_to_freeze_tasks(bool user_only) elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!wakeup && !freezer_should_skip(p) && - p != current && freezing(p) && !frozen(p)) - sched_show_task(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + if (!wakeup) { + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p != current && !freezer_should_skip(p) + && freezing(p) && !frozen(p)) + sched_show_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } } else { printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, elapsed_csecs % 100); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 995e3bd3417b..d6d6dbd1ecc0 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, static int __init pm_qos_power_init(void) { int ret = 0; + int i; - ret = register_pm_qos_misc(&cpu_dma_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); - return ret; - } - ret = register_pm_qos_misc(&network_lat_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); - return ret; + BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + + for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { + ret = register_pm_qos_misc(pm_qos_array[i]); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: %s setup failed\n", + pm_qos_array[i]->name); + return ret; + } } - ret = register_pm_qos_misc(&network_throughput_pm_qos); - if (ret < 0) - printk(KERN_ERR - "pm_qos_param: network_throughput setup failed\n"); return ret; } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6a768e537001..0de28576807d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm) list_for_each_entry(region, &nosave_regions, list) { unsigned long pfn; - pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", - region->start_pfn << PAGE_SHIFT, - region->end_pfn << PAGE_SHIFT); + pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) if (pfn_valid(pfn)) { @@ -1000,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page, KM_USER0); - dst = kmap_atomic(d_page, KM_USER1); + src = kmap_atomic(s_page); + dst = kmap_atomic(d_page); do_copy_page(dst, src); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst); + kunmap_atomic(src); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page, KM_USER0); + dst = kmap_atomic(d_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); } else { safe_copy_page(page_address(d_page), s_page); } @@ -1728,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2014,9 +2015,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page, KM_USER0); + dst = kmap_atomic(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); last_highmem_page = NULL; } } @@ -2309,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1, KM_USER0); - kaddr2 = kmap_atomic(p2, KM_USER1); + kaddr1 = kmap_atomic(p1); + kaddr2 = kmap_atomic(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2, KM_USER1); - kunmap_atomic(kaddr1, KM_USER0); + kunmap_atomic(kaddr2); + kunmap_atomic(kaddr1); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4fd51beed879..88e5c967370d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = { static const struct platform_suspend_ops *suspend_ops; /** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. + * suspend_set_ops - Set the global suspend method table. + * @ops: Suspend operations to use. */ void suspend_set_ops(const struct platform_suspend_ops *ops) { @@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state) } /** - * suspend_valid_only_mem - generic memory-only valid callback + * suspend_valid_only_mem - Generic memory-only valid callback. * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. + * Platform drivers that implement mem suspend only and only need to check for + * that in their .valid() callback can use this instead of rolling their own + * .valid() callback. */ int suspend_valid_only_mem(suspend_state_t state) { @@ -83,10 +83,11 @@ static int suspend_test(int level) } /** - * suspend_prepare - Do prep work before entering low-power state. + * suspend_prepare - Prepare for entering system sleep state. * - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. + * Common code run for every system sleep state that can be entered (except for + * hibernation). Run suspend notifiers, allocate the "suspend" console and + * freeze processes. */ static int suspend_prepare(void) { @@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) } /** - * suspend_enter - enter the desired system sleep state. - * @state: State to enter - * @wakeup: Returns information that suspend should not be entered again. + * suspend_enter - Make the system enter the given sleep state. + * @state: System sleep state to enter. + * @wakeup: Returns information that the sleep state should not be re-entered. * * This function should be called after devices have been suspended. */ @@ -147,7 +148,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - error = dpm_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; @@ -189,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_ops->wake) suspend_ops->wake(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); Platform_finish: if (suspend_ops->finish) @@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) } /** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. - * @state: state to enter + * suspend_devices_and_enter - Suspend devices and enter system sleep state. + * @state: System sleep state to enter. */ int suspend_devices_and_enter(suspend_state_t state) { @@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state) } /** - * suspend_finish - Do final work before exiting suspend sequence. + * suspend_finish - Clean up before finishing the suspend sequence. * - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. + * Call platform code to clean up, restart processes, and free the console that + * we've allocated. This routine is not called for hibernation. */ static void suspend_finish(void) { @@ -265,16 +265,14 @@ static void suspend_finish(void) } /** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. + * enter_state - Do common work needed to enter system sleep state. + * @state: System sleep state to enter. * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). + * Make sure that no one else is trying to put the system into a sleep state. + * Fail if that's not the case. Otherwise, prepare for system suspend, make the + * system enter the given sleep state and clean up after wakeup. */ -int enter_state(suspend_state_t state) +static int enter_state(suspend_state_t state) { int error; @@ -310,24 +308,26 @@ int enter_state(suspend_state_t state) } /** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. + * pm_suspend - Externally visible function for suspending the system. + * @state: System sleep state to enter. * - * Determine whether or not value is within range, get state - * structure, and enter (above). + * Check if the value of @state represents one of the supported states, + * execute enter_state() and update system suspend statistics. */ int pm_suspend(suspend_state_t state) { - int ret; - if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { - ret = enter_state(state); - if (ret) { - suspend_stats.fail++; - dpm_save_failed_errno(ret); - } else - suspend_stats.success++; - return ret; + int error; + + if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) + return -EINVAL; + + error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else { + suspend_stats.success++; } - return -EINVAL; + return error; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/user.c b/kernel/power/user.c index 3e100075b13c..33c4329205af 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,16 +249,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (error) { - thaw_kernel_threads(); - } else { + if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error && !freezer_test_done) - data->ready = 1; - if (freezer_test_done) { - freezer_test_done = false; - thaw_kernel_threads(); - } + data->ready = !freezer_test_done && !error; + freezer_test_done = false; } break; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 00ab2ca5ed11..ee8d49b9c309 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) } static int ptrace_attach(struct task_struct *task, long request, + unsigned long addr, unsigned long flags) { bool seize = (request == PTRACE_SEIZE); int retval; - /* - * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. SEIZE_DEVEL is used to prevent applications - * expecting full SEIZE behaviors trapping on kernel commits which - * are still in the process of implementing them. - * - * Only test programs for new ptrace behaviors being implemented - * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. - * - * Once SEIZE behaviors are completely implemented, this flag and - * the following test will be removed. - */ retval = -EIO; - if (seize && !(flags & PTRACE_SEIZE_DEVEL)) - goto out; + if (seize) { + if (addr != 0) + goto out; + if (flags & ~(unsigned long)PTRACE_O_MASK) + goto out; + flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); + } else { + flags = PT_PTRACED; + } audit_ptrace(task); @@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request, /* * Protect exec's credential calculations against our interference; - * interference; SUID, SGID and LSM creds get determined differently + * SUID, SGID and LSM creds get determined differently * under ptrace. */ retval = -ERESTARTNOINTR; @@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request, if (task->ptrace) goto unlock_tasklist; - task->ptrace = PT_PTRACED; if (seize) - task->ptrace |= PT_SEIZED; + flags |= PT_SEIZED; if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) - task->ptrace |= PT_PTRACE_CAP; + flags |= PT_PTRACE_CAP; + task->ptrace = flags; __ptrace_link(task, current); @@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { - child->ptrace &= ~PT_TRACE_MASK; + unsigned flags; - if (data & PTRACE_O_TRACESYSGOOD) - child->ptrace |= PT_TRACESYSGOOD; - - if (data & PTRACE_O_TRACEFORK) - child->ptrace |= PT_TRACE_FORK; - - if (data & PTRACE_O_TRACEVFORK) - child->ptrace |= PT_TRACE_VFORK; - - if (data & PTRACE_O_TRACECLONE) - child->ptrace |= PT_TRACE_CLONE; - - if (data & PTRACE_O_TRACEEXEC) - child->ptrace |= PT_TRACE_EXEC; - - if (data & PTRACE_O_TRACEVFORKDONE) - child->ptrace |= PT_TRACE_VFORK_DONE; + if (data & ~(unsigned long)PTRACE_O_MASK) + return -EINVAL; - if (data & PTRACE_O_TRACEEXIT) - child->ptrace |= PT_TRACE_EXIT; + /* Avoid intermediate state when all opts are cleared */ + flags = child->ptrace; + flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); + flags |= (data << PT_OPT_FLAG_SHIFT); + child->ptrace = flags; - return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; + return 0; } static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) @@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. diff --git a/kernel/resource.c b/kernel/resource.c index 7640b3a947d0..7e8ea66a8c01 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t write_unlock(&resource_lock); return result; } +EXPORT_SYMBOL(adjust_resource); static void __init __reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, @@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root, write_unlock(&resource_lock); } -EXPORT_SYMBOL(adjust_resource); - /** * resource_alignment - calculate resource's alignment * @res: resource pointer diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2bd4647586c..503d6426126d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -71,6 +71,7 @@ #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> +#include <linux/binfmts.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -7571,8 +7572,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) struct task_group, css); } -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) { struct task_group *tg, *parent; @@ -7589,15 +7589,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &tg->css; } -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpu_cgroup_destroy(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); sched_destroy_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7615,7 +7614,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7625,8 +7624,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. @@ -7976,8 +7975,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { */ /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) { struct cpuacct *ca; @@ -8007,8 +8005,7 @@ out: } /* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpuacct_destroy(struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); diff --git a/kernel/signal.c b/kernel/signal.c index 8511e39813c7..d523da02dd14 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -58,21 +58,20 @@ static int sig_handler_ignored(void __user *handler, int sig) (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, - int from_ancestor_ns) +static int sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !from_ancestor_ns) + handler == SIG_DFL && !force) return 1; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) +static int sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -82,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, from_ancestor_ns)) + if (!sig_task_ignored(t, sig, force)) return 0; /* @@ -855,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t) * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ -static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) +static int prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -915,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) } } - return !sig_ignored(p, sig, from_ancestor_ns); + return !sig_ignored(p, sig, force); } /* @@ -1059,7 +1058,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, from_ancestor_ns)) + if (!prepare_signal(sig, t, + from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; pending = group ? &t->signal->shared_pending : &t->pending; @@ -1601,7 +1601,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) ret = 1; /* the signal is ignored */ result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, 0)) + if (!prepare_signal(sig, t, false)) goto out; ret = 0; @@ -1652,6 +1652,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + if (sig != SIGCHLD) { + /* + * This is only possible if parent == real_parent. + * Check if it has changed security domain. + */ + if (tsk->parent_exec_id != tsk->parent->self_exec_id) + sig = SIGCHLD; + } + info.si_signo = sig; info.si_errno = 0; /* diff --git a/kernel/sys.c b/kernel/sys.c index 888d227fd195..9eb7fcab8df6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + error = 0; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *) arg2); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f257e05e..d48ff4fd44c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -58,6 +58,7 @@ #include <linux/oom.h> #include <linux/kmod.h> #include <linux/capability.h> +#include <linux/binfmts.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -192,20 +193,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, #endif -static struct ctl_table root_table[]; -static struct ctl_table_root sysctl_table_root; -static struct ctl_table_header root_table_header = { - {{.count = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, - .root = &sysctl_table_root, - .set = &sysctl_table_root.default_set, -}; -static struct ctl_table_root sysctl_table_root = { - .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), -}; - static struct ctl_table kern_table[]; static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; @@ -222,7 +209,7 @@ int sysctl_legacy_va_layout; /* The default sysctl tables: */ -static struct ctl_table root_table[] = { +static struct ctl_table sysctl_base_table[] = { { .procname = "kernel", .mode = 0555, @@ -1559,490 +1546,12 @@ static struct ctl_table dev_table[] = { { } }; -static DEFINE_SPINLOCK(sysctl_lock); - -/* called under sysctl_lock */ -static int use_table(struct ctl_table_header *p) -{ - if (unlikely(p->unregistering)) - return 0; - p->used++; - return 1; -} - -/* called under sysctl_lock */ -static void unuse_table(struct ctl_table_header *p) -{ - if (!--p->used) - if (unlikely(p->unregistering)) - complete(p->unregistering); -} - -/* called under sysctl_lock, will reacquire if has to wait */ -static void start_unregistering(struct ctl_table_header *p) -{ - /* - * if p->used is 0, nobody will ever touch that entry again; - * we'll eliminate all paths to it before dropping sysctl_lock - */ - if (unlikely(p->used)) { - struct completion wait; - init_completion(&wait); - p->unregistering = &wait; - spin_unlock(&sysctl_lock); - wait_for_completion(&wait); - spin_lock(&sysctl_lock); - } else { - /* anything non-NULL; we'll never dereference it */ - p->unregistering = ERR_PTR(-EINVAL); - } - /* - * do not remove from the list until nobody holds it; walking the - * list in do_sysctl() relies on that. - */ - list_del_init(&p->ctl_entry); -} - -void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - if (!--head->count) - kfree_rcu(head, rcu); - spin_unlock(&sysctl_lock); -} - -struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) -{ - if (!head) - BUG(); - spin_lock(&sysctl_lock); - if (!use_table(head)) - head = ERR_PTR(-ENOENT); - spin_unlock(&sysctl_lock); - return head; -} - -void sysctl_head_finish(struct ctl_table_header *head) -{ - if (!head) - return; - spin_lock(&sysctl_lock); - unuse_table(head); - spin_unlock(&sysctl_lock); -} - -static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = &root->default_set; - if (root->lookup) - set = root->lookup(root, namespaces); - return set; -} - -static struct list_head * -lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = lookup_header_set(root, namespaces); - return &set->list; -} - -struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev) -{ - struct ctl_table_root *root; - struct list_head *header_list; - struct ctl_table_header *head; - struct list_head *tmp; - - spin_lock(&sysctl_lock); - if (prev) { - head = prev; - tmp = &prev->ctl_entry; - unuse_table(prev); - goto next; - } - tmp = &root_table_header.ctl_entry; - for (;;) { - head = list_entry(tmp, struct ctl_table_header, ctl_entry); - - if (!use_table(head)) - goto next; - spin_unlock(&sysctl_lock); - return head; - next: - root = head->root; - tmp = tmp->next; - header_list = lookup_header_list(root, namespaces); - if (tmp != header_list) - continue; - - do { - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - if (root == &sysctl_table_root) - goto out; - header_list = lookup_header_list(root, namespaces); - } while (list_empty(header_list)); - tmp = header_list->next; - } -out: - spin_unlock(&sysctl_lock); - return NULL; -} - -struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) -{ - return __sysctl_head_next(current->nsproxy, prev); -} - -void register_sysctl_root(struct ctl_table_root *root) -{ - spin_lock(&sysctl_lock); - list_add_tail(&root->root_list, &sysctl_table_root.root_list); - spin_unlock(&sysctl_lock); -} - -/* - * sysctl_perm does NOT grant the superuser all rights automatically, because - * some sysctl variables are readonly even to root. - */ - -static int test_perm(int mode, int op) -{ - if (!current_euid()) - mode >>= 6; - else if (in_egroup_p(0)) - mode >>= 3; - if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) - return 0; - return -EACCES; -} - -int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) -{ - int mode; - - if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); - else - mode = table->mode; - - return test_perm(mode, op); -} - -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) -{ - for (; table->procname; table++) { - table->parent = parent; - if (table->child) - sysctl_set_parent(table, table->child); - } -} - -static __init int sysctl_init(void) +int __init sysctl_init(void) { - sysctl_set_parent(NULL, root_table); -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - sysctl_check_table(current->nsproxy, root_table); -#endif + register_sysctl_table(sysctl_base_table); return 0; } -core_initcall(sysctl_init); - -static struct ctl_table *is_branch_in(struct ctl_table *branch, - struct ctl_table *table) -{ - struct ctl_table *p; - const char *s = branch->procname; - - /* branch should have named subdirectory as its first element */ - if (!s || !branch->child) - return NULL; - - /* ... and nothing else */ - if (branch[1].procname) - return NULL; - - /* table should contain subdirectory with the same name */ - for (p = table; p->procname; p++) { - if (!p->child) - continue; - if (p->procname && strcmp(p->procname, s) == 0) - return p; - } - return NULL; -} - -/* see if attaching q to p would be an improvement */ -static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) -{ - struct ctl_table *to = p->ctl_table, *by = q->ctl_table; - struct ctl_table *next; - int is_better = 0; - int not_in_parent = !p->attached_by; - - while ((next = is_branch_in(by, to)) != NULL) { - if (by == q->attached_by) - is_better = 1; - if (to == p->attached_by) - not_in_parent = 1; - by = by->child; - to = next->child; - } - - if (is_better && not_in_parent) { - q->attached_by = by; - q->attached_to = to; - q->parent = p; - } -} - -/** - * __register_sysctl_paths - register a sysctl hierarchy - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * The members of the &struct ctl_table structure are used as follows: - * - * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not - * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file, and for sysctl(2) - * - * child - a pointer to the child sysctl table if this entry is a directory, or - * %NULL. - * - * proc_handler - the text handler routine (described below) - * - * de - for internal use by the sysctl routines - * - * extra1, extra2 - extra pointers usable by the proc handler routines - * - * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. - * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. The data and maxlen fields of the ctl_table - * struct enable minimal validation of the values being written to be - * performed, and the mode field allows minimal authentication. - * - * There must be a proc_handler routine for any terminal nodes - * mirrored under /proc/sys (non-terminals are handled by a built-in - * directory handler). Several default handlers are available to - * cover common cases - - * - * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), - * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), - * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() - * - * It is the handler's job to read the input buffer from user memory - * and process it. The handler should return 0 on success. - * - * This routine returns %NULL on a failure to register, and a pointer - * to the table header on success. - */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_root *root, - struct nsproxy *namespaces, - const struct ctl_path *path, struct ctl_table *table) -{ - struct ctl_table_header *header; - struct ctl_table *new, **prevp; - unsigned int n, npath; - struct ctl_table_set *set; - - /* Count the path components */ - for (npath = 0; path[npath].procname; ++npath) - ; - - /* - * For each path component, allocate a 2-element ctl_table array. - * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (procname == 0). - * - * We allocate everything in one go so that we don't have to - * worry about freeing additional memory in unregister_sysctl_table. - */ - header = kzalloc(sizeof(struct ctl_table_header) + - (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); - if (!header) - return NULL; - - new = (struct ctl_table *) (header + 1); - - /* Now connect the dots */ - prevp = &header->ctl_table; - for (n = 0; n < npath; ++n, ++path) { - /* Copy the procname */ - new->procname = path->procname; - new->mode = 0555; - - *prevp = new; - prevp = &new->child; - - new += 2; - } - *prevp = table; - header->ctl_table_arg = table; - - INIT_LIST_HEAD(&header->ctl_entry); - header->used = 0; - header->unregistering = NULL; - header->root = root; - sysctl_set_parent(NULL, header->ctl_table); - header->count = 1; -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - if (sysctl_check_table(namespaces, header->ctl_table)) { - kfree(header); - return NULL; - } -#endif - spin_lock(&sysctl_lock); - header->set = lookup_header_set(root, namespaces); - header->attached_by = header->ctl_table; - header->attached_to = root_table; - header->parent = &root_table_header; - for (set = header->set; set; set = set->parent) { - struct ctl_table_header *p; - list_for_each_entry(p, &set->list, ctl_entry) { - if (p->unregistering) - continue; - try_attach(p, header); - } - } - header->parent->count++; - list_add_tail(&header->ctl_entry, &header->set->list); - spin_unlock(&sysctl_lock); - - return header; -} - -/** - * register_sysctl_table_path - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, - path, table); -} - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} - -/** - * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table - * - * Unregisters the sysctl table and all children. proc entries may not - * actually be removed until they are no longer used by anyone. - */ -void unregister_sysctl_table(struct ctl_table_header * header) -{ - might_sleep(); - - if (header == NULL) - return; - - spin_lock(&sysctl_lock); - start_unregistering(header); - if (!--header->parent->count) { - WARN_ON(1); - kfree_rcu(header->parent, rcu); - } - if (!--header->count) - kfree_rcu(header, rcu); - spin_unlock(&sysctl_lock); -} - -int sysctl_is_seen(struct ctl_table_header *p) -{ - struct ctl_table_set *set = p->set; - int res; - spin_lock(&sysctl_lock); - if (p->unregistering) - res = 0; - else if (!set->is_seen) - res = 1; - else - res = set->is_seen(set); - spin_unlock(&sysctl_lock); - return res; -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ - INIT_LIST_HEAD(&p->list); - p->parent = parent ? parent : &sysctl_table_root.default_set; - p->is_seen = is_seen; -} - -#else /* !CONFIG_SYSCTL */ -struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -{ - return NULL; -} - -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return NULL; -} - -void unregister_sysctl_table(struct ctl_table_header * table) -{ -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ -} - -void sysctl_head_put(struct ctl_table_header *head) -{ -} - #endif /* CONFIG_SYSCTL */ /* @@ -3008,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); -EXPORT_SYMBOL(register_sysctl_table); -EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c deleted file mode 100644 index 362da653813d..000000000000 --- a/kernel/sysctl_check.c +++ /dev/null @@ -1,160 +0,0 @@ -#include <linux/stat.h> -#include <linux/sysctl.h> -#include "../fs/xfs/xfs_sysctl.h" -#include <linux/sunrpc/debug.h> -#include <linux/string.h> -#include <net/ip_vs.h> - - -static int sysctl_depth(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth; - - depth = 0; - for (tmp = table; tmp->parent; tmp = tmp->parent) - depth++; - - return depth; -} - -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) -{ - int i; - - for (i = 0; table && i < n; i++) - table = table->parent; - - return table; -} - - -static void sysctl_print_path(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth, i; - depth = sysctl_depth(table); - if (table->procname) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk("/%s", tmp->procname?tmp->procname:""); - } - } - printk(" "); -} - -static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table_header *head; - struct ctl_table *ref, *test; - int depth, cur_depth; - - depth = sysctl_depth(table); - - for (head = __sysctl_head_next(namespaces, NULL); head; - head = __sysctl_head_next(namespaces, head)) { - cur_depth = depth; - ref = head->ctl_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->procname; ref++) { - int match = 0; - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - } - ref = NULL; -out: - sysctl_head_finish(head); - return ref; -} - -static void set_fail(const char **fail, struct ctl_table *table, const char *str) -{ - if (*fail) { - printk(KERN_ERR "sysctl table check failed: "); - sysctl_print_path(table); - printk(" %s\n", *fail); - dump_stack(); - } - *fail = str; -} - -static void sysctl_check_leaf(struct nsproxy *namespaces, - struct ctl_table *table, const char **fail) -{ - struct ctl_table *ref; - - ref = sysctl_check_lookup(namespaces, table); - if (ref && (ref != table)) - set_fail(fail, table, "Sysctl already exists"); -} - -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) -{ - int error = 0; - for (; table->procname; table++) { - const char *fail = NULL; - - if (table->parent) { - if (!table->parent->procname) - set_fail(&fail, table, "Parent without procname"); - } - if (table->child) { - if (table->data) - set_fail(&fail, table, "Directory with data?"); - if (table->maxlen) - set_fail(&fail, table, "Directory with maxlen?"); - if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) - set_fail(&fail, table, "Writable sysctl directory"); - if (table->proc_handler) - set_fail(&fail, table, "Directory with proc_handler"); - if (table->extra1) - set_fail(&fail, table, "Directory with extra1"); - if (table->extra2) - set_fail(&fail, table, "Directory with extra2"); - } else { - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - set_fail(&fail, table, "No data"); - if (!table->maxlen) - set_fail(&fail, table, "No maxlen"); - } -#ifdef CONFIG_PROC_SYSCTL - if (!table->proc_handler) - set_fail(&fail, table, "No proc_handler"); -#endif - sysctl_check_leaf(namespaces, table, &fail); - } - if (table->mode > 0777) - set_fail(&fail, table, "bogus .mode"); - if (fail) { - set_fail(&fail, table, NULL); - error = -EINVAL; - } - if (table->child) - error |= sysctl_check_table(namespaces, table->child); - } - return error; -} diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 403c2a092830..15be32e19c6e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1260,6 +1260,8 @@ ktime_t ktime_get_monotonic_offset(void) return timespec_to_ktime(wtom); } +EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); + /** * xtime_update() - advances the timekeeping infrastructure diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c5a01873567d..859fae6b1825 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) return ret; } -int trace_seq_path(struct trace_seq *s, struct path *path) +int trace_seq_path(struct trace_seq *s, const struct path *path) { unsigned char *p; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14bc092fb12c..df30ee08bdd4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,6 +9,8 @@ * to those contributors as well. */ +#define pr_fmt(fmt) "NMI watchdog: " fmt + #include <linux/mm.h> #include <linux/cpu.h> #include <linux/nmi.h> @@ -319,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - sched_setscheduler(current, SCHED_FIFO, ¶m); - /* initialize timestamp */ __touch_watchdog(); @@ -349,8 +349,11 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } + /* + * Drop the policy/priority elevation during thread exit to avoid a + * scheduling latency spike. + */ __set_current_state(TASK_RUNNING); - param.sched_priority = 0; sched_setscheduler(current, SCHED_NORMAL, ¶m); return 0; } @@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (!IS_ERR(event)) { - printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + pr_info("enabled, takes one hw-pmu counter.\n"); goto out_save; } /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) - printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); + pr_warning("disabled (cpu%i): hardware events not enabled\n", + cpu); else - printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ @@ -439,9 +444,10 @@ static int watchdog_enable(int cpu) /* create the watchdog thread */ if (!p) { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { - printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + pr_err("softlockup watchdog for %i failed\n", cpu); if (!err) { /* if hardlockup hasn't already set this */ err = PTR_ERR(p); @@ -450,6 +456,7 @@ static int watchdog_enable(int cpu) } goto out; } + sched_setscheduler(p, SCHED_FIFO, ¶m); kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; per_cpu(softlockup_watchdog, cpu) = p; @@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void) watchdog_enabled = 1; if (!watchdog_enabled) - printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + pr_err("failed to be enabled on some cpus\n"); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f2c5638bb5ab..5abf42f63c08 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -476,13 +476,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { - if (likely(cpu < nr_cpu_ids)) { -#ifdef CONFIG_SMP + if (likely(cpu < nr_cpu_ids)) return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); -#else - return wq->cpu_wq.single; -#endif - } } else if (likely(cpu == WORK_CPU_UNBOUND)) return wq->cpu_wq.single; return NULL; @@ -2899,13 +2894,8 @@ static int alloc_cwqs(struct workqueue_struct *wq) const size_t size = sizeof(struct cpu_workqueue_struct); const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - if (percpu) + if (!(wq->flags & WQ_UNBOUND)) wq->cpu_wq.pcpu = __alloc_percpu(size, align); else { void *ptr; @@ -2929,13 +2919,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) static void free_cwqs(struct workqueue_struct *wq) { -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - - if (percpu) + if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->cpu_wq.pcpu); else if (wq->cpu_wq.single) { /* the pointer to free is stored right after the cwq */ |