diff options
Diffstat (limited to 'kernel')
53 files changed, 4744 insertions, 2467 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index af1de0f34eae..4b96415527b8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -67,6 +67,7 @@ #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/fs_struct.h> +#include <linux/compat.h> #include "audit.h" @@ -2710,13 +2711,16 @@ void audit_core_dumps(long signr) audit_log_end(ab); } -void __audit_seccomp(unsigned long syscall) +void __audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - audit_log_abend(ab, "seccomp", SIGKILL); + audit_log_abend(ab, "seccomp", signr); audit_log_format(ab, " syscall=%ld", syscall); + audit_log_format(ab, " compat=%d", is_compat_task()); + audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); + audit_log_format(ab, " code=0x%x", code); audit_log_end(ab); } diff --git a/kernel/capability.c b/kernel/capability.c index 3f1adb6c6470..493d97259484 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -419,3 +419,24 @@ bool nsown_capable(int cap) { return ns_capable(current_user_ns(), cap); } + +/** + * inode_capable - Check superior capability over inode + * @inode: The inode in question + * @cap: The capability in question + * + * Return true if the current task has the given superior capability + * targeted at it's own user namespace and that the given inode is owned + * by the current user namespace or a child namespace. + * + * Currently we check to see if an inode is owned by the current + * user namespace by seeing if the inode's owner maps into the + * current user namespace. + * + */ +bool inode_capable(const struct inode *inode, int cap) +{ + struct user_namespace *ns = current_user_ns(); + + return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); +} diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ed64ccac67c9..a0c6af34d500 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -60,9 +60,13 @@ #include <linux/eventfd.h> #include <linux/poll.h> #include <linux/flex_array.h> /* used in cgroup_attach_proc */ +#include <linux/kthread.h> #include <linux/atomic.h> +/* css deactivation bias, makes css->refcnt negative to deny new trygets */ +#define CSS_DEACT_BIAS INT_MIN + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -127,6 +131,9 @@ struct cgroupfs_root { /* A list running through the active hierarchies */ struct list_head root_list; + /* All cgroups on this root, cgroup_mutex protected */ + struct list_head allcg_list; + /* Hierarchy-specific flags */ unsigned long flags; @@ -145,6 +152,15 @@ struct cgroupfs_root { static struct cgroupfs_root rootnode; /* + * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. + */ +struct cfent { + struct list_head node; + struct dentry *dentry; + struct cftype *type; +}; + +/* * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when * cgroup_subsys->use_id != 0. */ @@ -239,6 +255,14 @@ int cgroup_lock_is_held(void) EXPORT_SYMBOL_GPL(cgroup_lock_is_held); +/* the current nr of refs, always >= 0 whether @css is deactivated or not */ +static int css_refcnt(struct cgroup_subsys_state *css) +{ + int v = atomic_read(&css->refcnt); + + return v >= 0 ? v : v - CSS_DEACT_BIAS; +} + /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) { @@ -279,6 +303,21 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling) #define for_each_active_root(_root) \ list_for_each_entry(_root, &roots, root_list) +static inline struct cgroup *__d_cgrp(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +static inline struct cfent *__d_cfe(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +static inline struct cftype *__d_cft(struct dentry *dentry) +{ + return __d_cfe(dentry)->type; +} + /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -816,12 +855,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) struct cgroup_subsys *ss; int ret = 0; - for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) { - ret = ss->pre_destroy(cgrp); - if (ret) - break; + for_each_subsys(cgrp->root, ss) { + if (!ss->pre_destroy) + continue; + + ret = ss->pre_destroy(cgrp); + if (ret) { + /* ->pre_destroy() failure is being deprecated */ + WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); + break; } + } return ret; } @@ -864,6 +908,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) BUG_ON(!list_empty(&cgrp->pidlists)); kfree_rcu(cgrp, rcu_head); + } else { + struct cfent *cfe = __d_cfe(dentry); + struct cgroup *cgrp = dentry->d_parent->d_fsdata; + + WARN_ONCE(!list_empty(&cfe->node) && + cgrp != &cgrp->root->top_cgroup, + "cfe still linked for %s\n", cfe->type->name); + kfree(cfe); } iput(inode); } @@ -882,34 +934,36 @@ static void remove_dir(struct dentry *d) dput(parent); } -static void cgroup_clear_directory(struct dentry *dentry) -{ - struct list_head *node; - - BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); - spin_lock(&dentry->d_lock); - node = dentry->d_subdirs.next; - while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_u.d_child); - - spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(node); - if (d->d_inode) { - /* This should never be called on a cgroup - * directory with child cgroups */ - BUG_ON(d->d_inode->i_mode & S_IFDIR); - dget_dlock(d); - spin_unlock(&d->d_lock); - spin_unlock(&dentry->d_lock); - d_delete(d); - simple_unlink(dentry->d_inode, d); - dput(d); - spin_lock(&dentry->d_lock); - } else - spin_unlock(&d->d_lock); - node = dentry->d_subdirs.next; +static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +{ + struct cfent *cfe; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); + + list_for_each_entry(cfe, &cgrp->files, node) { + struct dentry *d = cfe->dentry; + + if (cft && cfe->type != cft) + continue; + + dget(d); + d_delete(d); + simple_unlink(d->d_inode, d); + list_del_init(&cfe->node); + dput(d); + + return 0; } - spin_unlock(&dentry->d_lock); + return -ENOENT; +} + +static void cgroup_clear_directory(struct dentry *dir) +{ + struct cgroup *cgrp = __d_cgrp(dir); + + while (!list_empty(&cgrp->files)) + cgroup_rm_file(cgrp, NULL); } /* @@ -1294,6 +1348,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; + /* See feature-removal-schedule.txt */ + if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) + pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", + task_tgid_nr(current), current->comm); + /* Don't allow flags or name to change at remount */ if (opts.flags != root->flags || (opts.name && strcmp(opts.name, root->name))) { @@ -1308,7 +1367,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } - /* (re)populate subsystem files */ + /* clear out any existing files and repopulate subsystem files */ + cgroup_clear_directory(cgrp->dentry); cgroup_populate_dir(cgrp); if (opts.release_agent) @@ -1333,6 +1393,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) { INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); @@ -1344,11 +1405,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; + INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); + INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; + list_add_tail(&cgrp->allcg_node, &root->allcg_list); init_cgroup_housekeeping(cgrp); } @@ -1692,16 +1756,6 @@ static struct file_system_type cgroup_fs_type = { static struct kobject *cgroup_kobj; -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - /** * cgroup_path - generate the path of a cgroup * @cgrp: the cgroup in question @@ -2160,9 +2214,9 @@ retry_find_task: * only need to check permissions on one of them. */ tcred = __task_cred(tsk); - if (cred->euid && - cred->euid != tcred->uid && - cred->euid != tcred->suid) { + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) { rcu_read_unlock(); ret = -EACCES; goto out_unlock_cgroup; @@ -2172,6 +2226,18 @@ retry_find_task: if (threadgroup) tsk = tsk->group_leader; + + /* + * Workqueue threads may acquire PF_THREAD_BOUND and become + * trapped in a cpuset, or RT worker may be born in a cgroup + * with no rt_runtime allocated. Just say no. + */ + if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { + ret = -EINVAL; + rcu_read_unlock(); + goto out_unlock_cgroup; + } + get_task_struct(tsk); rcu_read_unlock(); @@ -2603,50 +2669,191 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -int cgroup_add_file(struct cgroup *cgrp, - struct cgroup_subsys *subsys, - const struct cftype *cft) +static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, + const struct cftype *cft) { struct dentry *dir = cgrp->dentry; + struct cgroup *parent = __d_cgrp(dir); struct dentry *dentry; + struct cfent *cfe; int error; umode_t mode; - char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; + + /* does @cft->flags tell us to skip creation on @cgrp? */ + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + return 0; + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + return 0; + if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { strcpy(name, subsys->name); strcat(name, "."); } strcat(name, cft->name); + BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); + + cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); + if (!cfe) + return -ENOMEM; + dentry = lookup_one_len(name, dir, strlen(name)); - if (!IS_ERR(dentry)) { - mode = cgroup_file_mode(cft); - error = cgroup_create_file(dentry, mode | S_IFREG, - cgrp->root->sb); - if (!error) - dentry->d_fsdata = (void *)cft; - dput(dentry); - } else + if (IS_ERR(dentry)) { error = PTR_ERR(dentry); + goto out; + } + + mode = cgroup_file_mode(cft); + error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); + if (!error) { + cfe->type = (void *)cft; + cfe->dentry = dentry; + dentry->d_fsdata = cfe; + list_add_tail(&cfe->node, &parent->files); + cfe = NULL; + } + dput(dentry); +out: + kfree(cfe); return error; } -EXPORT_SYMBOL_GPL(cgroup_add_file); -int cgroup_add_files(struct cgroup *cgrp, - struct cgroup_subsys *subsys, - const struct cftype cft[], - int count) +static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, + const struct cftype cfts[], bool is_add) { - int i, err; - for (i = 0; i < count; i++) { - err = cgroup_add_file(cgrp, subsys, &cft[i]); - if (err) - return err; + const struct cftype *cft; + int err, ret = 0; + + for (cft = cfts; cft->name[0] != '\0'; cft++) { + if (is_add) + err = cgroup_add_file(cgrp, subsys, cft); + else + err = cgroup_rm_file(cgrp, cft); + if (err) { + pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n", + is_add ? "add" : "remove", cft->name, err); + ret = err; + } + } + return ret; +} + +static DEFINE_MUTEX(cgroup_cft_mutex); + +static void cgroup_cfts_prepare(void) + __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) +{ + /* + * Thanks to the entanglement with vfs inode locking, we can't walk + * the existing cgroups under cgroup_mutex and create files. + * Instead, we increment reference on all cgroups and build list of + * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure + * exclusive access to the field. + */ + mutex_lock(&cgroup_cft_mutex); + mutex_lock(&cgroup_mutex); +} + +static void cgroup_cfts_commit(struct cgroup_subsys *ss, + const struct cftype *cfts, bool is_add) + __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) +{ + LIST_HEAD(pending); + struct cgroup *cgrp, *n; + + /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ + if (cfts && ss->root != &rootnode) { + list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { + dget(cgrp->dentry); + list_add_tail(&cgrp->cft_q_node, &pending); + } + } + + mutex_unlock(&cgroup_mutex); + + /* + * All new cgroups will see @cfts update on @ss->cftsets. Add/rm + * files for all cgroups which were created before. + */ + list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { + struct inode *inode = cgrp->dentry->d_inode; + + mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_mutex); + if (!cgroup_is_removed(cgrp)) + cgroup_addrm_files(cgrp, ss, cfts, is_add); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + + list_del_init(&cgrp->cft_q_node); + dput(cgrp->dentry); } + + mutex_unlock(&cgroup_cft_mutex); +} + +/** + * cgroup_add_cftypes - add an array of cftypes to a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Register @cfts to @ss. Files described by @cfts are created for all + * existing cgroups to which @ss is attached and all future cgroups will + * have them too. This function can be called anytime whether @ss is + * attached or not. + * + * Returns 0 on successful registration, -errno on failure. Note that this + * function currently returns 0 as long as @cfts registration is successful + * even if some file creation attempts on existing cgroups fail. + */ +int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +{ + struct cftype_set *set; + + set = kzalloc(sizeof(*set), GFP_KERNEL); + if (!set) + return -ENOMEM; + + cgroup_cfts_prepare(); + set->cfts = cfts; + list_add_tail(&set->node, &ss->cftsets); + cgroup_cfts_commit(ss, cfts, true); + return 0; } -EXPORT_SYMBOL_GPL(cgroup_add_files); +EXPORT_SYMBOL_GPL(cgroup_add_cftypes); + +/** + * cgroup_rm_cftypes - remove an array of cftypes from a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Unregister @cfts from @ss. Files described by @cfts are removed from + * all existing cgroups to which @ss is attached and all future cgroups + * won't have them either. This function can be called anytime whether @ss + * is attached or not. + * + * Returns 0 on successful unregistration, -ENOENT if @cfts is not + * registered with @ss. + */ +int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +{ + struct cftype_set *set; + + cgroup_cfts_prepare(); + + list_for_each_entry(set, &ss->cftsets, node) { + if (set->cfts == cfts) { + list_del_init(&set->node); + cgroup_cfts_commit(ss, cfts, false); + return 0; + } + } + + cgroup_cfts_commit(ss, NULL, false); + return -ENOENT; +} /** * cgroup_task_count - count the number of tasks in a cgroup. @@ -3625,13 +3832,14 @@ static struct cftype files[] = { .read_u64 = cgroup_clone_children_read, .write_u64 = cgroup_clone_children_write, }, -}; - -static struct cftype cft_release_agent = { - .name = "release_agent", - .read_seq_string = cgroup_release_agent_show, - .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, + { + .name = "release_agent", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = cgroup_release_agent_show, + .write_string = cgroup_release_agent_write, + .max_write_len = PATH_MAX, + }, + { } /* terminate */ }; static int cgroup_populate_dir(struct cgroup *cgrp) @@ -3639,22 +3847,18 @@ static int cgroup_populate_dir(struct cgroup *cgrp) int err; struct cgroup_subsys *ss; - /* First clear out any existing files */ - cgroup_clear_directory(cgrp->dentry); - - err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); + err = cgroup_addrm_files(cgrp, NULL, files, true); if (err < 0) return err; - if (cgrp == cgrp->top_cgroup) { - if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) - return err; - } - + /* process cftsets of each subsystem */ for_each_subsys(cgrp->root, ss) { - if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) - return err; + struct cftype_set *set; + + list_for_each_entry(set, &ss->cftsets, node) + cgroup_addrm_files(cgrp, ss, set->cfts, true); } + /* This cgroup is ready now */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; @@ -3670,6 +3874,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp) return 0; } +static void css_dput_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, dput_work); + + dput(css->cgroup->dentry); +} + static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) @@ -3682,6 +3894,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, set_bit(CSS_ROOT, &css->flags); BUG_ON(cgrp->subsys[ss->subsys_id]); cgrp->subsys[ss->subsys_id] = css; + + /* + * If !clear_css_refs, css holds an extra ref to @cgrp->dentry + * which is put on the last css_put(). dput() requires process + * context, which css_put() may be called without. @css->dput_work + * will be used to invoke dput() asynchronously from css_put(). + */ + INIT_WORK(&css->dput_work, css_dput_fn); + if (ss->__DEPRECATED_clear_css_refs) + set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } static void cgroup_lock_hierarchy(struct cgroupfs_root *root) @@ -3784,9 +4006,16 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err < 0) goto err_remove; + /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ + for_each_subsys(root, ss) + if (!ss->__DEPRECATED_clear_css_refs) + dget(dentry); + /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); + list_add_tail(&cgrp->allcg_node, &root->allcg_list); + err = cgroup_populate_dir(cgrp); /* If err < 0, we have a half-filled directory - oh well ;) */ @@ -3826,18 +4055,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } +/* + * Check the reference count on each subsystem. Since we already + * established that there are no tasks in the cgroup, if the css refcount + * is also 1, then there should be no outstanding references, so the + * subsystem is safe to destroy. We scan across all subsystems rather than + * using the per-hierarchy linked list of mounted subsystems since we can + * be called via check_for_release() with no synchronization other than + * RCU, and the subsystem linked list isn't RCU-safe. + */ static int cgroup_has_css_refs(struct cgroup *cgrp) { - /* Check the reference count on each subsystem. Since we - * already established that there are no tasks in the - * cgroup, if the css refcount is also 1, then there should - * be no outstanding references, so the subsystem is safe to - * destroy. We scan across all subsystems rather than using - * the per-hierarchy linked list of mounted subsystems since - * we can be called via check_for_release() with no - * synchronization other than RCU, and the subsystem linked - * list isn't RCU-safe */ int i; + /* * We won't need to lock the subsys array, because the subsystems * we're concerned about aren't going anywhere since our cgroup root @@ -3846,17 +4076,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; struct cgroup_subsys_state *css; + /* Skip subsystems not present or not in this hierarchy */ if (ss == NULL || ss->root != cgrp->root) continue; + css = cgrp->subsys[ss->subsys_id]; - /* When called from check_for_release() it's possible + /* + * When called from check_for_release() it's possible * that by this point the cgroup has been removed * and the css deleted. But a false-positive doesn't * matter, since it can only happen if the cgroup * has been deleted and hence no longer needs the - * release agent to be called anyway. */ - if (css && (atomic_read(&css->refcnt) > 1)) + * release agent to be called anyway. + */ + if (css && css_refcnt(css) > 1) return 1; } return 0; @@ -3866,51 +4100,63 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * Atomically mark all (or else none) of the cgroup's CSS objects as * CSS_REMOVED. Return true on success, or false if the cgroup has * busy subsystems. Call with cgroup_mutex held + * + * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or + * not, cgroup removal behaves differently. + * + * If clear is set, css refcnt for the subsystem should be zero before + * cgroup removal can be committed. This is implemented by + * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be + * called multiple times until all css refcnts reach zero and is allowed to + * veto removal on any invocation. This behavior is deprecated and will be + * removed as soon as the existing user (memcg) is updated. + * + * If clear is not set, each css holds an extra reference to the cgroup's + * dentry and cgroup removal proceeds regardless of css refs. + * ->pre_destroy() will be called at least once and is not allowed to fail. + * On the last put of each css, whenever that may be, the extra dentry ref + * is put so that dentry destruction happens only after all css's are + * released. */ - static int cgroup_clear_css_refs(struct cgroup *cgrp) { struct cgroup_subsys *ss; unsigned long flags; bool failed = false; + local_irq_save(flags); + + /* + * Block new css_tryget() by deactivating refcnt. If all refcnts + * for subsystems w/ clear_css_refs set were 1 at the moment of + * deactivation, we succeeded. + */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - int refcnt; - while (1) { - /* We can only remove a CSS with a refcnt==1 */ - refcnt = atomic_read(&css->refcnt); - if (refcnt > 1) { - failed = true; - goto done; - } - BUG_ON(!refcnt); - /* - * Drop the refcnt to 0 while we check other - * subsystems. This will cause any racing - * css_tryget() to spin until we set the - * CSS_REMOVED bits or abort - */ - if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) - break; - cpu_relax(); - } + + WARN_ON(atomic_read(&css->refcnt) < 0); + atomic_add(CSS_DEACT_BIAS, &css->refcnt); + + if (ss->__DEPRECATED_clear_css_refs) + failed |= css_refcnt(css) != 1; } - done: + + /* + * If succeeded, set REMOVED and put all the base refs; otherwise, + * restore refcnts to positive values. Either way, all in-progress + * css_tryget() will be released. + */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - if (failed) { - /* - * Restore old refcnt if we previously managed - * to clear it from 1 to 0 - */ - if (!atomic_read(&css->refcnt)) - atomic_set(&css->refcnt, 1); - } else { - /* Commit the fact that the CSS is removed */ + + if (!failed) { set_bit(CSS_REMOVED, &css->flags); + css_put(css); + } else { + atomic_sub(CSS_DEACT_BIAS, &css->refcnt); } } + local_irq_restore(flags); return !failed; } @@ -3995,6 +4241,8 @@ again: list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); + list_del_init(&cgrp->allcg_node); + d = dget(cgrp->dentry); cgroup_d_remove_dir(d); @@ -4021,12 +4269,29 @@ again: return 0; } +static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) +{ + INIT_LIST_HEAD(&ss->cftsets); + + /* + * base_cftset is embedded in subsys itself, no need to worry about + * deregistration. + */ + if (ss->base_cftypes) { + ss->base_cftset.cfts = ss->base_cftypes; + list_add_tail(&ss->base_cftset.node, &ss->cftsets); + } +} + static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + /* init base cftset */ + cgroup_init_cftsets(ss); + /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; @@ -4096,6 +4361,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) return 0; } + /* init base cftset */ + cgroup_init_cftsets(ss); + /* * need to register a subsys id before anything else - for example, * init_cgroup_css needs it. @@ -4685,21 +4953,41 @@ static void check_for_release(struct cgroup *cgrp) } /* Caller must verify that the css is not for root cgroup */ -void __css_put(struct cgroup_subsys_state *css, int count) +bool __css_tryget(struct cgroup_subsys_state *css) +{ + do { + int v = css_refcnt(css); + + if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) + return true; + cpu_relax(); + } while (!test_bit(CSS_REMOVED, &css->flags)); + + return false; +} +EXPORT_SYMBOL_GPL(__css_tryget); + +/* Caller must verify that the css is not for root cgroup */ +void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; - int val; + rcu_read_lock(); - val = atomic_sub_return(count, &css->refcnt); - if (val == 1) { + atomic_dec(&css->refcnt); + switch (css_refcnt(css)) { + case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } cgroup_wakeup_rmdir_waiter(cgrp); + break; + case 0: + if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) + schedule_work(&css->dput_work); + break; } rcu_read_unlock(); - WARN_ON_ONCE(val < 1); } EXPORT_SYMBOL_GPL(__css_put); @@ -4818,7 +5106,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) * on this or this is under rcu_read_lock(). Once css->id is allocated, * it's unchanged until freed. */ - cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); + cssid = rcu_dereference_check(css->id, css_refcnt(css)); if (cssid) return cssid->id; @@ -4830,7 +5118,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) { struct css_id *cssid; - cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); + cssid = rcu_dereference_check(css->id, css_refcnt(css)); if (cssid) return cssid->depth; @@ -5211,19 +5499,15 @@ static struct cftype debug_files[] = { .name = "releasable", .read_u64 = releasable_read, }, -}; -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, debug_files, - ARRAY_SIZE(debug_files)); -} + { } /* terminate */ +}; struct cgroup_subsys debug_subsys = { .name = "debug", .create = debug_create, .destroy = debug_destroy, - .populate = debug_populate, .subsys_id = debug_subsys_id, + .base_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f86e93920b62..3649fc6b3eaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup, static struct cftype files[] = { { .name = "state", + .flags = CFTYPE_NOT_ON_ROOT, .read_seq_string = freezer_read, .write_string = freezer_write, }, + { } /* terminate */ }; -static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) -{ - if (!cgroup->parent) - return 0; - return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); -} - struct cgroup_subsys freezer_subsys = { .name = "freezer", .create = freezer_create, .destroy = freezer_destroy, - .populate = freezer_populate, .subsys_id = freezer_subsys_id, .can_attach = freezer_can_attach, .fork = freezer_fork, + .base_cftypes = files, }; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 14f7070b4ba2..8c8bd652dd12 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1765,28 +1765,17 @@ static struct cftype files[] = { .write_u64 = cpuset_write_u64, .private = FILE_SPREAD_SLAB, }, -}; - -static struct cftype cft_memory_pressure_enabled = { - .name = "memory_pressure_enabled", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, -}; -static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - int err; + { + .name = "memory_pressure_enabled", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE_ENABLED, + }, - err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); - if (err) - return err; - /* memory_pressure_enabled is in root cpuset only */ - if (!cont->parent) - err = cgroup_add_file(cont, ss, - &cft_memory_pressure_enabled); - return err; -} + { } /* terminate */ +}; /* * post_clone() is called during cgroup_create() when the @@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = { .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, .attach = cpuset_attach, - .populate = cpuset_populate, .post_clone = cpuset_post_clone, .subsys_id = cpuset_subsys_id, + .base_cftypes = files, .early_init = 1, }; diff --git a/kernel/cred.c b/kernel/cred.c index e70683d9ec32..430557ea488f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -49,6 +49,14 @@ struct cred init_cred = { .subscribers = ATOMIC_INIT(2), .magic = CRED_MAGIC, #endif + .uid = GLOBAL_ROOT_UID, + .gid = GLOBAL_ROOT_GID, + .suid = GLOBAL_ROOT_UID, + .sgid = GLOBAL_ROOT_GID, + .euid = GLOBAL_ROOT_UID, + .egid = GLOBAL_ROOT_GID, + .fsuid = GLOBAL_ROOT_UID, + .fsgid = GLOBAL_ROOT_GID, .securebits = SECUREBITS_DEFAULT, .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, @@ -148,6 +156,7 @@ static void put_cred_rcu(struct rcu_head *rcu) if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); + put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } @@ -303,6 +312,7 @@ struct cred *prepare_creds(void) set_cred_subscribers(new, 0); get_group_info(new->group_info); get_uid(new->user); + get_user_ns(new->user_ns); #ifdef CONFIG_KEYS key_get(new->thread_keyring); @@ -414,11 +424,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) goto error_put; } - /* cache user_ns in cred. Doesn't need a refcount because it will - * stay pinned by cred->user - */ - new->user_ns = new->user->user_ns; - #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ @@ -493,10 +498,10 @@ int commit_creds(struct cred *new) get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ - if (old->euid != new->euid || - old->egid != new->egid || - old->fsuid != new->fsuid || - old->fsgid != new->fsgid || + if (!uid_eq(old->euid, new->euid) || + !gid_eq(old->egid, new->egid) || + !uid_eq(old->fsuid, new->fsuid) || + !gid_eq(old->fsgid, new->fsgid) || !cap_issubset(new->cap_permitted, old->cap_permitted)) { if (task->mm) set_dumpable(task->mm, suid_dumpable); @@ -505,9 +510,9 @@ int commit_creds(struct cred *new) } /* alter the thread keyring */ - if (new->fsuid != old->fsuid) + if (!uid_eq(new->fsuid, old->fsuid)) key_fsuid_changed(task); - if (new->fsgid != old->fsgid) + if (!gid_eq(new->fsgid, old->fsgid)) key_fsgid_changed(task); /* do it @@ -524,16 +529,16 @@ int commit_creds(struct cred *new) alter_cred_subscribers(old, -2); /* send notifications */ - if (new->uid != old->uid || - new->euid != old->euid || - new->suid != old->suid || - new->fsuid != old->fsuid) + if (!uid_eq(new->uid, old->uid) || + !uid_eq(new->euid, old->euid) || + !uid_eq(new->suid, old->suid) || + !uid_eq(new->fsuid, old->fsuid)) proc_id_connector(task, PROC_EVENT_UID); - if (new->gid != old->gid || - new->egid != old->egid || - new->sgid != old->sgid || - new->fsgid != old->fsgid) + if (!gid_eq(new->gid, old->gid) || + !gid_eq(new->egid, old->egid) || + !gid_eq(new->sgid, old->sgid) || + !gid_eq(new->fsgid, old->fsgid)) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ @@ -678,6 +683,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) atomic_set(&new->usage, 1); set_cred_subscribers(new, 0); get_uid(new->user); + get_user_ns(new->user_ns); get_group_info(new->group_info); #ifdef CONFIG_KEYS diff --git a/kernel/events/core.c b/kernel/events/core.c index fd126f82b57c..5b06cbbf6931 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4957,7 +4957,7 @@ void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) if (rctx < 0) return; - perf_sample_data_init(&data, addr); + perf_sample_data_init(&data, addr, 0); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); @@ -5215,7 +5215,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, .data = record, }; - perf_sample_data_init(&data, addr); + perf_sample_data_init(&data, addr, 0); data.raw = &raw; hlist_for_each_entry_rcu(event, node, head, hlist_entry) { @@ -5318,7 +5318,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; - perf_sample_data_init(&sample, bp->attr.bp_addr); + perf_sample_data_init(&sample, bp->attr.bp_addr, 0); if (!bp->hw.state && !perf_exclude_event(bp, regs)) perf_swevent_event(bp, 1, &sample, regs); @@ -5344,13 +5344,12 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); regs = get_irq_regs(); if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && is_idle_task(current))) - if (perf_event_overflow(event, &data, regs)) + if (__perf_event_overflow(event, 1, &data, regs)) ret = HRTIMER_NORESTART; } diff --git a/kernel/exit.c b/kernel/exit.c index d8bd3b425fa7..910a0716e17a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1214,7 +1214,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) unsigned long state; int retval, status, traced; pid_t pid = task_pid_vnr(p); - uid_t uid = __task_cred(p)->uid; + uid_t uid = from_kuid_munged(current_user_ns(), __task_cred(p)->uid); struct siginfo __user *infop; if (!likely(wo->wo_flags & WEXITED)) @@ -1427,7 +1427,7 @@ static int wait_task_stopped(struct wait_opts *wo, if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; - uid = task_uid(p); + uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1500,7 +1500,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = task_uid(p); + uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); diff --git a/kernel/extable.c b/kernel/extable.c index 5339705b8241..fe35a634bf76 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -35,10 +35,16 @@ DEFINE_MUTEX(text_mutex); extern struct exception_table_entry __start___ex_table[]; extern struct exception_table_entry __stop___ex_table[]; +/* Cleared by build time tools if the table is already sorted. */ +u32 __initdata main_extable_sort_needed = 1; + /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - sort_extable(__start___ex_table, __stop___ex_table); + if (main_extable_sort_needed) + sort_extable(__start___ex_table, __stop___ex_table); + else + pr_notice("__ex_table already sorted, skipping sort\n"); } /* Given an address, look for it in the exception tables. */ diff --git a/kernel/fork.c b/kernel/fork.c index 9f9b296fa6df..05c813dc9ecc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -34,6 +34,7 @@ #include <linux/cgroup.h> #include <linux/security.h> #include <linux/hugetlb.h> +#include <linux/seccomp.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> @@ -206,6 +207,7 @@ void free_task(struct task_struct *tsk) free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); + put_seccomp_filter(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -290,8 +292,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) int node = tsk_fork_get_node(orig); int err; - prepare_to_copy(orig); - tsk = alloc_task_struct_node(node); if (!tsk) return NULL; @@ -1192,6 +1192,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; ftrace_graph_init_task(p); + get_seccomp_filter(p); rt_mutex_init_task(p); diff --git a/kernel/groups.c b/kernel/groups.c index 99b53d1eb7ea..6b2588dd04ff 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -31,7 +31,7 @@ struct group_info *groups_alloc(int gidsetsize) group_info->blocks[0] = group_info->small_block; else { for (i = 0; i < nblocks; i++) { - gid_t *b; + kgid_t *b; b = (void *)__get_free_page(GFP_USER); if (!b) goto out_undo_partial_alloc; @@ -66,18 +66,15 @@ EXPORT_SYMBOL(groups_free); static int groups_to_user(gid_t __user *grouplist, const struct group_info *group_info) { + struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_to_user(grouplist, group_info->blocks[i], len)) + for (i = 0; i < count; i++) { + gid_t gid; + gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); + if (put_user(gid, grouplist+i)) return -EFAULT; - - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; } return 0; } @@ -86,18 +83,21 @@ static int groups_to_user(gid_t __user *grouplist, static int groups_from_user(struct group_info *group_info, gid_t __user *grouplist) { + struct user_namespace *user_ns = current_user_ns(); int i; unsigned int count = group_info->ngroups; - for (i = 0; i < group_info->nblocks; i++) { - unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); - unsigned int len = cp_count * sizeof(*grouplist); - - if (copy_from_user(group_info->blocks[i], grouplist, len)) + for (i = 0; i < count; i++) { + gid_t gid; + kgid_t kgid; + if (get_user(gid, grouplist+i)) return -EFAULT; - grouplist += NGROUPS_PER_BLOCK; - count -= cp_count; + kgid = make_kgid(user_ns, gid); + if (!gid_valid(kgid)) + return -EINVAL; + + GROUP_AT(group_info, i) = kgid; } return 0; } @@ -117,9 +117,9 @@ static void groups_sort(struct group_info *group_info) for (base = 0; base < max; base++) { int left = base; int right = left + stride; - gid_t tmp = GROUP_AT(group_info, right); + kgid_t tmp = GROUP_AT(group_info, right); - while (left >= 0 && GROUP_AT(group_info, left) > tmp) { + while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { GROUP_AT(group_info, right) = GROUP_AT(group_info, left); right = left; @@ -132,7 +132,7 @@ static void groups_sort(struct group_info *group_info) } /* a simple bsearch */ -int groups_search(const struct group_info *group_info, gid_t grp) +int groups_search(const struct group_info *group_info, kgid_t grp) { unsigned int left, right; @@ -143,9 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - if (grp > GROUP_AT(group_info, mid)) + if (gid_gt(grp, GROUP_AT(group_info, mid))) left = mid + 1; - else if (grp < GROUP_AT(group_info, mid)) + else if (gid_lt(grp, GROUP_AT(group_info, mid))) right = mid; else return 1; @@ -256,24 +256,24 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) /* * Check whether we're fsgid/egid or in the supplemental group.. */ -int in_group_p(gid_t grp) +int in_group_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; - if (grp != cred->fsgid) + if (!gid_eq(grp, cred->fsgid)) retval = groups_search(cred->group_info, grp); return retval; } EXPORT_SYMBOL(in_group_p); -int in_egroup_p(gid_t grp) +int in_egroup_p(kgid_t grp) { const struct cred *cred = current_cred(); int retval = 1; - if (grp != cred->egid) + if (!gid_eq(grp, cred->egid)) retval = groups_search(cred->group_info, grp); return retval; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3914c1e03cff..fc275e4f629b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -379,8 +379,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } handle_irq_event(desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 89a3ea82569b..bb32326afe87 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -565,8 +565,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_debug("No set_type function for IRQ %d (%s)\n", irq, - chip ? (chip->name ? : "unknown") : "unknown"); + pr_debug("genirq: No set_type function for IRQ %d (%s)\n", irq, + chip ? (chip->name ? : "unknown") : "unknown"); return 0; } @@ -600,7 +600,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, ret = 0; break; default: - pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("genirq: Setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); } if (unmask) @@ -837,8 +837,7 @@ void exit_irq_thread(void) action = kthread_data(tsk); - printk(KERN_ERR - "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + pr_err("genirq: exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, action->irq); desc = irq_to_desc(action->irq); @@ -878,7 +877,6 @@ static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; - const char *old_name = NULL; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; cpumask_var_t mask; @@ -972,10 +970,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ if (!((old->flags & new->flags) & IRQF_SHARED) || ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || - ((old->flags ^ new->flags) & IRQF_ONESHOT)) { - old_name = old->name; + ((old->flags ^ new->flags) & IRQF_ONESHOT)) goto mismatch; - } /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != @@ -1031,6 +1027,27 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * all existing action->thread_mask bits. */ new->thread_mask = 1 << ffz(thread_mask); + + } else if (new->handler == irq_default_primary_handler) { + /* + * The interrupt was requested with handler = NULL, so + * we use the default primary handler for it. But it + * does not have the oneshot flag set. In combination + * with level interrupts this is deadly, because the + * default primary handler just wakes the thread, then + * the irq lines is reenabled, but the device still + * has the level irq asserted. Rinse and repeat.... + * + * While this works for edge type interrupts, we play + * it safe and reject unconditionally because we can't + * say for sure which type this interrupt really + * has. The type flags are unreliable as the + * underlying chip implementation can override them. + */ + pr_err("genirq: Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", + irq); + ret = -EINVAL; + goto out_mask; } if (!shared) { @@ -1078,7 +1095,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("IRQ %d uses trigger mode %u; requested %u\n", + pr_warning("genirq: irq %d uses trigger mode %u; requested %u\n", irq, nmsk, omsk); } @@ -1115,14 +1132,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) return 0; mismatch: -#ifdef CONFIG_DEBUG_SHIRQ if (!(new->flags & IRQF_PROBE_SHARED)) { - printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); - if (old_name) - printk(KERN_ERR "current handler: %s\n", old_name); + pr_err("genirq: Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n", + irq, new->flags, new->name, old->flags, old->name); +#ifdef CONFIG_DEBUG_SHIRQ dump_stack(); - } #endif + } ret = -EBUSY; out_mask: @@ -1204,12 +1220,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Found it - now remove it from the list of entries: */ *action_ptr = action->next; - /* Currently used only by UML, might disappear one day: */ -#ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->irq_data.chip->release) - desc->irq_data.chip->release(irq, dev_id); -#endif - /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) irq_shutdown(desc); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a6..cb228bf21760 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -103,8 +103,13 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) { + /* + * Only interrupts which are marked as wakeup source + * and have not been disabled before the suspend check + * can abort suspend. + */ if (irqd_is_wakeup_set(&desc->irq_data)) { - if (desc->istate & IRQS_PENDING) + if (desc->depth == 1 && desc->istate & IRQS_PENDING) return -EBUSY; continue; } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c9..6454db7b6a4d 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -58,10 +58,13 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still - * active. + * active. Clear the pending bit so suspend/resume does not + * get confused. */ - if (irq_settings_is_level(desc)) + if (irq_settings_is_level(desc)) { + desc->istate &= ~IRQS_PENDING; return; + } if (desc->istate & IRQS_REPLAY) return; if (desc->istate & IRQS_PENDING) { diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e425..4edbd9c11aca 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2429,7 +2429,8 @@ static int copy_and_check(struct load_info *info, goto free_hdr; } - if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { + if (hdr->e_shoff >= len || + hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { err = -ENOEXEC; goto free_hdr; } @@ -2953,7 +2954,7 @@ static struct module *load_module(void __user *umod, /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, NULL); + -32768, 32767, &ddebug_dyndbg_module_param_cb); if (err < 0) goto unlink; diff --git a/kernel/params.c b/kernel/params.c index f37d82631347..ed35345be536 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -85,11 +85,13 @@ bool parameq(const char *a, const char *b) static int parse_one(char *param, char *val, + const char *doing, const struct kernel_param *params, unsigned num_params, s16 min_level, s16 max_level, - int (*handle_unknown)(char *param, char *val)) + int (*handle_unknown)(char *param, char *val, + const char *doing)) { unsigned int i; int err; @@ -104,8 +106,8 @@ static int parse_one(char *param, if (!val && params[i].ops->set != param_set_bool && params[i].ops->set != param_set_bint) return -EINVAL; - pr_debug("They are equal! Calling %p\n", - params[i].ops->set); + pr_debug("handling %s with %p\n", param, + params[i].ops->set); mutex_lock(¶m_lock); err = params[i].ops->set(val, ¶ms[i]); mutex_unlock(¶m_lock); @@ -114,11 +116,11 @@ static int parse_one(char *param, } if (handle_unknown) { - pr_debug("Unknown argument: calling %p\n", handle_unknown); - return handle_unknown(param, val); + pr_debug("doing %s: %s='%s'\n", doing, param, val); + return handle_unknown(param, val, doing); } - pr_debug("Unknown argument `%s'\n", param); + pr_debug("Unknown argument '%s'\n", param); return -ENOENT; } @@ -175,49 +177,47 @@ static char *next_arg(char *args, char **param, char **val) } /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ -int parse_args(const char *name, +int parse_args(const char *doing, char *args, const struct kernel_param *params, unsigned num, s16 min_level, s16 max_level, - int (*unknown)(char *param, char *val)) + int (*unknown)(char *param, char *val, const char *doing)) { char *param, *val; - pr_debug("Parsing ARGS: %s\n", args); - /* Chew leading spaces */ args = skip_spaces(args); + if (*args) + pr_debug("doing %s, parsing ARGS: '%s'\n", doing, args); + while (*args) { int ret; int irq_was_disabled; args = next_arg(args, ¶m, &val); irq_was_disabled = irqs_disabled(); - ret = parse_one(param, val, params, num, + ret = parse_one(param, val, doing, params, num, min_level, max_level, unknown); - if (irq_was_disabled && !irqs_disabled()) { - printk(KERN_WARNING "parse_args(): option '%s' enabled " - "irq's!\n", param); - } + if (irq_was_disabled && !irqs_disabled()) + pr_warn("%s: option '%s' enabled irq's!\n", + doing, param); + switch (ret) { case -ENOENT: - printk(KERN_ERR "%s: Unknown parameter `%s'\n", - name, param); + pr_err("%s: Unknown parameter `%s'\n", doing, param); return ret; case -ENOSPC: - printk(KERN_ERR - "%s: `%s' too large for parameter `%s'\n", - name, val ?: "", param); + pr_err("%s: `%s' too large for parameter `%s'\n", + doing, val ?: "", param); return ret; case 0: break; default: - printk(KERN_ERR - "%s: `%s' invalid for parameter `%s'\n", - name, val ?: "", param); + pr_err("%s: `%s' invalid for parameter `%s'\n", + doing, val ?: "", param); return ret; } } @@ -263,8 +263,7 @@ STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); int param_set_charp(const char *val, const struct kernel_param *kp) { if (strlen(val) > 1024) { - printk(KERN_ERR "%s: string parameter too long\n", - kp->name); + pr_err("%s: string parameter too long\n", kp->name); return -ENOSPC; } @@ -400,8 +399,7 @@ static int param_array(const char *name, int len; if (*num == max) { - printk(KERN_ERR "%s: can only take %i arguments\n", - name, max); + pr_err("%s: can only take %i arguments\n", name, max); return -EINVAL; } len = strcspn(val, ","); @@ -420,8 +418,7 @@ static int param_array(const char *name, } while (save == ','); if (*num < min) { - printk(KERN_ERR "%s: needs at least %i arguments\n", - name, min); + pr_err("%s: needs at least %i arguments\n", name, min); return -EINVAL; } return 0; @@ -480,7 +477,7 @@ int param_set_copystring(const char *val, const struct kernel_param *kp) const struct kparam_string *kps = kp->str; if (strlen(val)+1 > kps->maxlen) { - printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", + pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; } @@ -750,11 +747,8 @@ static struct module_kobject * __init locate_module_kobject(const char *name) #endif if (err) { kobject_put(&mk->kobj); - printk(KERN_ERR - "Module '%s' failed add to sysfs, error number %d\n", + pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", name, err); - printk(KERN_ERR - "The system will be unstable now.\n"); return NULL; } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index deb5461e3216..8f9b4eb974e0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -103,6 +103,33 @@ config PM_SLEEP_SMP select HOTPLUG select HOTPLUG_CPU +config PM_AUTOSLEEP + bool "Opportunistic sleep" + depends on PM_SLEEP + default n + ---help--- + Allow the kernel to trigger a system transition into a global sleep + state automatically whenever there are no active wakeup sources. + +config PM_WAKELOCKS + bool "User space wakeup sources interface" + depends on PM_SLEEP + default n + ---help--- + Allow user space to create, activate and deactivate wakeup source + objects with the help of a sysfs-based interface. + +config PM_WAKELOCKS_LIMIT + int "Maximum number of user space wakeup sources (0 = no limit)" + range 0 100000 + default 100 + depends on PM_WAKELOCKS + +config PM_WAKELOCKS_GC + bool "Garbage collector for user space wakeup sources" + depends on PM_WAKELOCKS + default y + config PM_RUNTIME bool "Run-time PM core functionality" depends on !IA64_HP_SIM diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 66d808ec5252..29472bff11ef 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -9,5 +9,7 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o +obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o +obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 000000000000..ca304046d9e2 --- /dev/null +++ b/kernel/power/autosleep.c @@ -0,0 +1,127 @@ +/* + * kernel/power/autosleep.c + * + * Opportunistic sleep support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + */ + +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/pm_wakeup.h> + +#include "power.h" + +static suspend_state_t autosleep_state; +static struct workqueue_struct *autosleep_wq; +/* + * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source + * is active, otherwise a deadlock with try_to_suspend() is possible. + * Alternatively mutex_lock_interruptible() can be used. This will then fail + * if an auto_sleep cycle tries to freeze processes. + */ +static DEFINE_MUTEX(autosleep_lock); +static struct wakeup_source *autosleep_ws; + +static void try_to_suspend(struct work_struct *work) +{ + unsigned int initial_count, final_count; + + if (!pm_get_wakeup_count(&initial_count, true)) + goto out; + + mutex_lock(&autosleep_lock); + + if (!pm_save_wakeup_count(initial_count)) { + mutex_unlock(&autosleep_lock); + goto out; + } + + if (autosleep_state == PM_SUSPEND_ON) { + mutex_unlock(&autosleep_lock); + return; + } + if (autosleep_state >= PM_SUSPEND_MAX) + hibernate(); + else + pm_suspend(autosleep_state); + + mutex_unlock(&autosleep_lock); + + if (!pm_get_wakeup_count(&final_count, false)) + goto out; + + /* + * If the wakeup occured for an unknown reason, wait to prevent the + * system from trying to suspend and waking up in a tight loop. + */ + if (final_count == initial_count) + schedule_timeout_uninterruptible(HZ / 2); + + out: + queue_up_suspend_work(); +} + +static DECLARE_WORK(suspend_work, try_to_suspend); + +void queue_up_suspend_work(void) +{ + if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) + queue_work(autosleep_wq, &suspend_work); +} + +suspend_state_t pm_autosleep_state(void) +{ + return autosleep_state; +} + +int pm_autosleep_lock(void) +{ + return mutex_lock_interruptible(&autosleep_lock); +} + +void pm_autosleep_unlock(void) +{ + mutex_unlock(&autosleep_lock); +} + +int pm_autosleep_set_state(suspend_state_t state) +{ + +#ifndef CONFIG_HIBERNATION + if (state >= PM_SUSPEND_MAX) + return -EINVAL; +#endif + + __pm_stay_awake(autosleep_ws); + + mutex_lock(&autosleep_lock); + + autosleep_state = state; + + __pm_relax(autosleep_ws); + + if (state > PM_SUSPEND_ON) { + pm_wakep_autosleep_enabled(true); + queue_up_suspend_work(); + } else { + pm_wakep_autosleep_enabled(false); + } + + mutex_unlock(&autosleep_lock); + return 0; +} + +int __init pm_autosleep_init(void) +{ + autosleep_ws = wakeup_source_register("autosleep"); + if (!autosleep_ws) + return -ENOMEM; + + autosleep_wq = alloc_ordered_workqueue("autosleep", 0); + if (autosleep_wq) + return 0; + + wakeup_source_unregister(autosleep_ws); + return -ENOMEM; +} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e09dfbfeecee..8b53db38a279 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -25,6 +25,8 @@ #include <linux/freezer.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> +#include <linux/ctype.h> +#include <linux/genhd.h> #include <scsi/scsi_scan.h> #include "power.h" @@ -722,6 +724,17 @@ static int software_resume(void) /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); + + /* + * name_to_dev_t is ineffective to verify parition if resume_file is in + * integer format. (e.g. major:minor) + */ + if (isdigit(resume_file[0]) && resume_wait) { + int partno; + while (!get_gendisk(swsusp_resume_device, &partno)) + msleep(10); + } + if (!swsusp_resume_device) { /* * Some device discovery might still be in progress; we need diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c12581f1c62..428f8a034e96 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, return (s - buf); } -static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) +static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state = PM_SUSPEND_STANDBY; @@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, #endif char *p; int len; - int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; - /* First, check if we are requested to hibernate */ - if (len == 4 && !strncmp(buf, "disk", len)) { - error = hibernate(); - goto Exit; - } + /* Check hibernation first. */ + if (len == 4 && !strncmp(buf, "disk", len)) + return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { - error = pm_suspend(state); - break; - } - } + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + return state; #endif - Exit: + return PM_SUSPEND_ON; +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_state(buf, n); + if (state < PM_SUSPEND_MAX) + error = pm_suspend(state); + else if (state == PM_SUSPEND_MAX) + error = hibernate(); + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); return error ? error : n; } @@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj, { unsigned int val; - return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; + return pm_get_wakeup_count(&val, true) ? + sprintf(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, @@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj, const char *buf, size_t n) { unsigned int val; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + error = -EINVAL; if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) - return n; + error = n; } - return -EINVAL; + + out: + pm_autosleep_unlock(); + return error; } power_attr(wakeup_count); + +#ifdef CONFIG_PM_AUTOSLEEP +static ssize_t autosleep_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + suspend_state_t state = pm_autosleep_state(); + + if (state == PM_SUSPEND_ON) + return sprintf(buf, "off\n"); + +#ifdef CONFIG_SUSPEND + if (state < PM_SUSPEND_MAX) + return sprintf(buf, "%s\n", valid_state(state) ? + pm_states[state] : "error"); +#endif +#ifdef CONFIG_HIBERNATION + return sprintf(buf, "disk\n"); +#else + return sprintf(buf, "error"); +#endif +} + +static ssize_t autosleep_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state = decode_state(buf, n); + int error; + + if (state == PM_SUSPEND_ON + && strcmp(buf, "off") && strcmp(buf, "off\n")) + return -EINVAL; + + error = pm_autosleep_set_state(state); + return error ? error : n; +} + +power_attr(autosleep); +#endif /* CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS +static ssize_t wake_lock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, true); +} + +static ssize_t wake_lock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_lock(buf); + return error ? error : n; +} + +power_attr(wake_lock); + +static ssize_t wake_unlock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, false); +} + +static ssize_t wake_unlock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_unlock(buf); + return error ? error : n; +} + +power_attr(wake_unlock); + +#endif /* CONFIG_PM_WAKELOCKS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE @@ -409,6 +521,13 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, +#ifdef CONFIG_PM_AUTOSLEEP + &autosleep_attr.attr, +#endif +#ifdef CONFIG_PM_WAKELOCKS + &wake_lock_attr.attr, + &wake_unlock_attr.attr, +#endif #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif @@ -444,7 +563,10 @@ static int __init pm_init(void) power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; - return sysfs_create_group(power_kobj, &attr_group); + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return error; + return pm_autosleep_init(); } core_initcall(pm_init); diff --git a/kernel/power/power.h b/kernel/power/power.h index 98f3622d7407..b0bd4beaebfe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void) { } #endif + +#ifdef CONFIG_PM_AUTOSLEEP + +/* kernel/power/autosleep.c */ +extern int pm_autosleep_init(void); +extern int pm_autosleep_lock(void); +extern void pm_autosleep_unlock(void); +extern suspend_state_t pm_autosleep_state(void); +extern int pm_autosleep_set_state(suspend_state_t state); + +#else /* !CONFIG_PM_AUTOSLEEP */ + +static inline int pm_autosleep_init(void) { return 0; } +static inline int pm_autosleep_lock(void) { return 0; } +static inline void pm_autosleep_unlock(void) {} +static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } + +#endif /* !CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS + +/* kernel/power/wakelock.c */ +extern ssize_t pm_show_wakelocks(char *buf, bool show_active); +extern int pm_wake_lock(const char *buf); +extern int pm_wake_unlock(const char *buf); + +#endif /* !CONFIG_PM_WAKELOCKS */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index eef311a58a64..11e22c068e8b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -6,7 +6,7 @@ * * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> + * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> * * This file is released under the GPLv2. * @@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) return -ENOSPC; if (bio_chain) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); if (src) { copy_page(src, buf); } else { ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + src = (void *)__get_free_page(__GFP_WAIT | + __GFP_NOWARN | + __GFP_NORETRY); if (src) { copy_page(src, buf); } else { @@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; - } - if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { - error = hib_wait_on_bio_chain(bio_chain); - if (error) - goto out; - handle->reqd_free_pages = reqd_free_pages(); + + if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_on_bio_chain(bio_chain); + if (error) + goto out; + /* + * Recalculate the number of required free pages, to + * make sure we never take more than half. + */ + handle->reqd_free_pages = reqd_free_pages(); + } } out: return error; @@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, /* Maximum number of threads for compression/decompression. */ #define LZO_THREADS 3 -/* Maximum number of pages for read buffering. */ -#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) +/* Minimum/maximum number of pages for read buffering. */ +#define LZO_MIN_RD_PAGES 1024 +#define LZO_MAX_RD_PAGES 8192 /** @@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle, } /* - * Adjust number of free pages after all allocations have been done. - * We don't want to run out of pages when writing. - */ - handle->reqd_free_pages = reqd_free_pages(); - - /* * Start the CRC32 thread. */ init_waitqueue_head(&crc->go); @@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle, goto out_clean; } + /* + * Adjust the number of required free pages after all allocations have + * been done. We don't want to run out of pages when writing. + */ + handle->reqd_free_pages = reqd_free_pages(); + printk(KERN_INFO "PM: Using %u thread(s) for compression.\n" "PM: Compressing and saving image data (%u pages) ... ", @@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned i, thr, run_threads, nr_threads; unsigned ring = 0, pg = 0, ring_size = 0, have = 0, want, need, asked = 0; - unsigned long read_pages; + unsigned long read_pages = 0; unsigned char **page = NULL; struct dec_data *data = NULL; struct crc_data *crc = NULL; @@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = vmalloc(sizeof(*page) * LZO_READ_PAGES); + page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, } /* - * Adjust number of pages for read buffering, in case we are short. + * Set the number of pages for read buffering. + * This is complete guesswork, because we'll only know the real + * picture once prepare_image() is called, which is much later on + * during the image load phase. We'll assume the worst case and + * say that none of the image pages are from high memory. */ - read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; - read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); + if (low_free_pages() > snapshot_get_image_size()) + read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; + read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT); + __GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); + if (!page[i]) { if (i < LZO_CMP_PAGES) { ring_size = i; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 000000000000..c8fba3380076 --- /dev/null +++ b/kernel/power/wakelock.c @@ -0,0 +1,259 @@ +/* + * kernel/power/wakelock.c + * + * User space wakeup sources support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + * + * This code is based on the analogous interface allowing user space to + * manipulate wakelocks on Android. + */ + +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/slab.h> + +static DEFINE_MUTEX(wakelocks_lock); + +struct wakelock { + char *name; + struct rb_node node; + struct wakeup_source ws; +#ifdef CONFIG_PM_WAKELOCKS_GC + struct list_head lru; +#endif +}; + +static struct rb_root wakelocks_tree = RB_ROOT; + +ssize_t pm_show_wakelocks(char *buf, bool show_active) +{ + struct rb_node *node; + struct wakelock *wl; + char *str = buf; + char *end = buf + PAGE_SIZE; + + mutex_lock(&wakelocks_lock); + + for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { + wl = rb_entry(node, struct wakelock, node); + if (wl->ws.active == show_active) + str += scnprintf(str, end - str, "%s ", wl->name); + } + if (str > buf) + str--; + + str += scnprintf(str, end - str, "\n"); + + mutex_unlock(&wakelocks_lock); + return (str - buf); +} + +#if CONFIG_PM_WAKELOCKS_LIMIT > 0 +static unsigned int number_of_wakelocks; + +static inline bool wakelocks_limit_exceeded(void) +{ + return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; +} + +static inline void increment_wakelocks_number(void) +{ + number_of_wakelocks++; +} + +static inline void decrement_wakelocks_number(void) +{ + number_of_wakelocks--; +} +#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ +static inline bool wakelocks_limit_exceeded(void) { return false; } +static inline void increment_wakelocks_number(void) {} +static inline void decrement_wakelocks_number(void) {} +#endif /* CONFIG_PM_WAKELOCKS_LIMIT */ + +#ifdef CONFIG_PM_WAKELOCKS_GC +#define WL_GC_COUNT_MAX 100 +#define WL_GC_TIME_SEC 300 + +static LIST_HEAD(wakelocks_lru_list); +static unsigned int wakelocks_gc_count; + +static inline void wakelocks_lru_add(struct wakelock *wl) +{ + list_add(&wl->lru, &wakelocks_lru_list); +} + +static inline void wakelocks_lru_most_recent(struct wakelock *wl) +{ + list_move(&wl->lru, &wakelocks_lru_list); +} + +static void wakelocks_gc(void) +{ + struct wakelock *wl, *aux; + ktime_t now; + + if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) + return; + + now = ktime_get(); + list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { + u64 idle_time_ns; + bool active; + + spin_lock_irq(&wl->ws.lock); + idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); + active = wl->ws.active; + spin_unlock_irq(&wl->ws.lock); + + if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) + break; + + if (!active) { + wakeup_source_remove(&wl->ws); + rb_erase(&wl->node, &wakelocks_tree); + list_del(&wl->lru); + kfree(wl->name); + kfree(wl); + decrement_wakelocks_number(); + } + } + wakelocks_gc_count = 0; +} +#else /* !CONFIG_PM_WAKELOCKS_GC */ +static inline void wakelocks_lru_add(struct wakelock *wl) {} +static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} +static inline void wakelocks_gc(void) {} +#endif /* !CONFIG_PM_WAKELOCKS_GC */ + +static struct wakelock *wakelock_lookup_add(const char *name, size_t len, + bool add_if_not_found) +{ + struct rb_node **node = &wakelocks_tree.rb_node; + struct rb_node *parent = *node; + struct wakelock *wl; + + while (*node) { + int diff; + + parent = *node; + wl = rb_entry(*node, struct wakelock, node); + diff = strncmp(name, wl->name, len); + if (diff == 0) { + if (wl->name[len]) + diff = -1; + else + return wl; + } + if (diff < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + if (!add_if_not_found) + return ERR_PTR(-EINVAL); + + if (wakelocks_limit_exceeded()) + return ERR_PTR(-ENOSPC); + + /* Not found, we have to add a new one. */ + wl = kzalloc(sizeof(*wl), GFP_KERNEL); + if (!wl) + return ERR_PTR(-ENOMEM); + + wl->name = kstrndup(name, len, GFP_KERNEL); + if (!wl->name) { + kfree(wl); + return ERR_PTR(-ENOMEM); + } + wl->ws.name = wl->name; + wakeup_source_add(&wl->ws); + rb_link_node(&wl->node, parent, node); + rb_insert_color(&wl->node, &wakelocks_tree); + wakelocks_lru_add(wl); + increment_wakelocks_number(); + return wl; +} + +int pm_wake_lock(const char *buf) +{ + const char *str = buf; + struct wakelock *wl; + u64 timeout_ns = 0; + size_t len; + int ret = 0; + + while (*str && !isspace(*str)) + str++; + + len = str - buf; + if (!len) + return -EINVAL; + + if (*str && *str != '\n') { + /* Find out if there's a valid timeout string appended. */ + ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); + if (ret) + return -EINVAL; + } + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, true); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + if (timeout_ns) { + u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; + + do_div(timeout_ms, NSEC_PER_MSEC); + __pm_wakeup_event(&wl->ws, timeout_ms); + } else { + __pm_stay_awake(&wl->ws); + } + + wakelocks_lru_most_recent(wl); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} + +int pm_wake_unlock(const char *buf) +{ + struct wakelock *wl; + size_t len; + int ret = 0; + + len = strlen(buf); + if (!len) + return -EINVAL; + + if (buf[len-1] == '\n') + len--; + + if (!len) + return -EINVAL; + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, false); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + __pm_relax(&wl->ws); + + wakelocks_lru_most_recent(wl); + wakelocks_gc(); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d39..32462d2b364a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -41,6 +41,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/rculist.h> +#include <linux/poll.h> #include <asm/uaccess.h> @@ -54,8 +55,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) { } -#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) - /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -99,24 +98,6 @@ EXPORT_SYMBOL_GPL(console_drivers); static int console_locked, console_suspended; /* - * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars - * It is also used in interesting ways to provide interlocking in - * console_unlock();. - */ -static DEFINE_RAW_SPINLOCK(logbuf_lock); - -#define LOG_BUF_MASK (log_buf_len-1) -#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) - -/* - * The indices into log_buf are not constrained to log_buf_len - they - * must be masked before subscripting - */ -static unsigned log_start; /* Index into log_buf: next char to be read by syslog() */ -static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ -static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ - -/* * If exclusive_console is non-NULL then only this console is to be printed to. */ static struct console *exclusive_console; @@ -145,13 +126,491 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +/* + * The printk log buffer consists of a chain of concatenated variable + * length records. Every record starts with a record header, containing + * the overall length of the record. + * + * The heads to the first and last entry in the buffer, as well as the + * sequence numbers of these both entries are maintained when messages + * are stored.. + * + * If the heads indicate available messages, the length in the header + * tells the start next message. A length == 0 for the next message + * indicates a wrap-around to the beginning of the buffer. + * + * Every record carries the monotonic timestamp in microseconds, as well as + * the standard userspace syslog level and syslog facility. The usual + * kernel messages use LOG_KERN; userspace-injected messages always carry + * a matching syslog facility, by default LOG_USER. The origin of every + * message can be reliably determined that way. + * + * The human readable log message directly follows the message header. The + * length of the message text is stored in the header, the stored message + * is not terminated. + * + * Optionally, a message can carry a dictionary of properties (key/value pairs), + * to provide userspace with a machine-readable message context. + * + * Examples for well-defined, commonly used property names are: + * DEVICE=b12:8 device identifier + * b12:8 block dev_t + * c127:3 char dev_t + * n8 netdev ifindex + * +sound:card0 subsystem:devname + * SUBSYSTEM=pci driver-core subsystem name + * + * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value + * follows directly after a '=' character. Every property is terminated by + * a '\0' character. The last property is not terminated. + * + * Example of a message structure: + * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec + * 0008 34 00 record is 52 bytes long + * 000a 0b 00 text is 11 bytes long + * 000c 1f 00 dictionary is 23 bytes long + * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) + * 0010 69 74 27 73 20 61 20 6c "it's a l" + * 69 6e 65 "ine" + * 001b 44 45 56 49 43 "DEVIC" + * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" + * 52 49 56 45 52 3d 62 75 "RIVER=bu" + * 67 "g" + * 0032 00 00 00 padding to next message header + * + * The 'struct log' buffer header must never be directly exported to + * userspace, it is a kernel-private implementation detail that might + * need to be changed in the future, when the requirements change. + * + * /dev/kmsg exports the structured data in the following line format: + * "level,sequnum,timestamp;<message text>\n" + * + * The optional key/value pairs are attached as continuation lines starting + * with a space character and terminated by a newline. All possible + * non-prinatable characters are escaped in the "\xff" notation. + * + * Users of the export format should ignore possible additional values + * separated by ',', and find the message after the ';' character. + */ + +struct log { + u64 ts_nsec; /* timestamp in nanoseconds */ + u16 len; /* length of entire record */ + u16 text_len; /* length of text buffer */ + u16 dict_len; /* length of dictionary buffer */ + u16 level; /* syslog level + facility */ +}; + +/* + * The logbuf_lock protects kmsg buffer, indices, counters. It is also + * used in interesting ways to provide interlocking in console_unlock(); + */ +static DEFINE_RAW_SPINLOCK(logbuf_lock); + +/* the next printk record to read by syslog(READ) or /proc/kmsg */ +static u64 syslog_seq; +static u32 syslog_idx; + +/* index and sequence number of the first record stored in the buffer */ +static u64 log_first_seq; +static u32 log_first_idx; + +/* index and sequence number of the next record to store in the buffer */ +static u64 log_next_seq; #ifdef CONFIG_PRINTK +static u32 log_next_idx; + +/* the next printk record to read after the last 'clear' command */ +static u64 clear_seq; +static u32 clear_idx; + +#define LOG_LINE_MAX 1024 -static char __log_buf[__LOG_BUF_LEN]; +/* record buffer */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#define LOG_ALIGN 4 +#else +#define LOG_ALIGN 8 +#endif +#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; -static int log_buf_len = __LOG_BUF_LEN; -static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ -static int saved_console_loglevel = -1; +static u32 log_buf_len = __LOG_BUF_LEN; + +/* cpu currently holding logbuf_lock */ +static volatile unsigned int logbuf_cpu = UINT_MAX; + +/* human readable text of the record */ +static char *log_text(const struct log *msg) +{ + return (char *)msg + sizeof(struct log); +} + +/* optional key/value pair dictionary attached to the record */ +static char *log_dict(const struct log *msg) +{ + return (char *)msg + sizeof(struct log) + msg->text_len; +} + +/* get record by index; idx must point to valid msg */ +static struct log *log_from_idx(u32 idx) +{ + struct log *msg = (struct log *)(log_buf + idx); + + /* + * A length == 0 record is the end of buffer marker. Wrap around and + * read the message at the start of the buffer. + */ + if (!msg->len) + return (struct log *)log_buf; + return msg; +} + +/* get next record; idx must point to valid msg */ +static u32 log_next(u32 idx) +{ + struct log *msg = (struct log *)(log_buf + idx); + + /* length == 0 indicates the end of the buffer; wrap */ + /* + * A length == 0 record is the end of buffer marker. Wrap around and + * read the message at the start of the buffer as *this* one, and + * return the one after that. + */ + if (!msg->len) { + msg = (struct log *)log_buf; + return msg->len; + } + return idx + msg->len; +} + +/* insert record into the buffer, discard old ones, update heads */ +static void log_store(int facility, int level, + const char *dict, u16 dict_len, + const char *text, u16 text_len) +{ + struct log *msg; + u32 size, pad_len; + + /* number of '\0' padding bytes to next message */ + size = sizeof(struct log) + text_len + dict_len; + pad_len = (-size) & (LOG_ALIGN - 1); + size += pad_len; + + while (log_first_seq < log_next_seq) { + u32 free; + + if (log_next_idx > log_first_idx) + free = max(log_buf_len - log_next_idx, log_first_idx); + else + free = log_first_idx - log_next_idx; + + if (free > size + sizeof(struct log)) + break; + + /* drop old messages until we have enough contiuous space */ + log_first_idx = log_next(log_first_idx); + log_first_seq++; + } + + if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { + /* + * This message + an additional empty header does not fit + * at the end of the buffer. Add an empty header with len == 0 + * to signify a wrap around. + */ + memset(log_buf + log_next_idx, 0, sizeof(struct log)); + log_next_idx = 0; + } + + /* fill message */ + msg = (struct log *)(log_buf + log_next_idx); + memcpy(log_text(msg), text, text_len); + msg->text_len = text_len; + memcpy(log_dict(msg), dict, dict_len); + msg->dict_len = dict_len; + msg->level = (facility << 3) | (level & 7); + msg->ts_nsec = local_clock(); + memset(log_dict(msg) + dict_len, 0, pad_len); + msg->len = sizeof(struct log) + text_len + dict_len + pad_len; + + /* insert message */ + log_next_idx += msg->len; + log_next_seq++; +} + +/* /dev/kmsg - userspace message inject/listen interface */ +struct devkmsg_user { + u64 seq; + u32 idx; + struct mutex lock; + char buf[8192]; +}; + +static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + char *buf, *line; + int i; + int level = default_message_loglevel; + int facility = 1; /* LOG_USER */ + size_t len = iov_length(iv, count); + ssize_t ret = len; + + if (len > LOG_LINE_MAX) + return -EINVAL; + buf = kmalloc(len+1, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + line = buf; + for (i = 0; i < count; i++) { + if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) + goto out; + line += iv[i].iov_len; + } + + /* + * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace + * the decimal value represents 32bit, the lower 3 bit are the log + * level, the rest are the log facility. + * + * If no prefix or no userspace facility is specified, we + * enforce LOG_USER, to be able to reliably distinguish + * kernel-generated messages from userspace-injected ones. + */ + line = buf; + if (line[0] == '<') { + char *endp = NULL; + + i = simple_strtoul(line+1, &endp, 10); + if (endp && endp[0] == '>') { + level = i & 7; + if (i >> 3) + facility = i >> 3; + endp++; + len -= endp - line; + line = endp; + } + } + line[len] = '\0'; + + printk_emit(facility, level, NULL, 0, "%s", line); +out: + kfree(buf); + return ret; +} + +static ssize_t devkmsg_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct devkmsg_user *user = file->private_data; + struct log *msg; + u64 ts_usec; + size_t i; + size_t len; + ssize_t ret; + + if (!user) + return -EBADF; + + mutex_lock(&user->lock); + raw_spin_lock(&logbuf_lock); + while (user->seq == log_next_seq) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + raw_spin_unlock(&logbuf_lock); + goto out; + } + + raw_spin_unlock(&logbuf_lock); + ret = wait_event_interruptible(log_wait, + user->seq != log_next_seq); + if (ret) + goto out; + raw_spin_lock(&logbuf_lock); + } + + if (user->seq < log_first_seq) { + /* our last seen message is gone, return error and reset */ + user->idx = log_first_idx; + user->seq = log_first_seq; + ret = -EPIPE; + raw_spin_unlock(&logbuf_lock); + goto out; + } + + msg = log_from_idx(user->idx); + ts_usec = msg->ts_nsec; + do_div(ts_usec, 1000); + len = sprintf(user->buf, "%u,%llu,%llu;", + msg->level, user->seq, ts_usec); + + /* escape non-printable characters */ + for (i = 0; i < msg->text_len; i++) { + unsigned char c = log_text(msg)[i]; + + if (c < ' ' || c >= 128) + len += sprintf(user->buf + len, "\\x%02x", c); + else + user->buf[len++] = c; + } + user->buf[len++] = '\n'; + + if (msg->dict_len) { + bool line = true; + + for (i = 0; i < msg->dict_len; i++) { + unsigned char c = log_dict(msg)[i]; + + if (line) { + user->buf[len++] = ' '; + line = false; + } + + if (c == '\0') { + user->buf[len++] = '\n'; + line = true; + continue; + } + + if (c < ' ' || c >= 128) { + len += sprintf(user->buf + len, "\\x%02x", c); + continue; + } + + user->buf[len++] = c; + } + user->buf[len++] = '\n'; + } + + user->idx = log_next(user->idx); + user->seq++; + raw_spin_unlock(&logbuf_lock); + + if (len > count) { + ret = -EINVAL; + goto out; + } + + if (copy_to_user(buf, user->buf, len)) { + ret = -EFAULT; + goto out; + } + ret = len; +out: + mutex_unlock(&user->lock); + return ret; +} + +static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) +{ + struct devkmsg_user *user = file->private_data; + loff_t ret = 0; + + if (!user) + return -EBADF; + if (offset) + return -ESPIPE; + + raw_spin_lock(&logbuf_lock); + switch (whence) { + case SEEK_SET: + /* the first record */ + user->idx = log_first_idx; + user->seq = log_first_seq; + break; + case SEEK_DATA: + /* + * The first record after the last SYSLOG_ACTION_CLEAR, + * like issued by 'dmesg -c'. Reading /dev/kmsg itself + * changes no global state, and does not clear anything. + */ + user->idx = clear_idx; + user->seq = clear_seq; + break; + case SEEK_END: + /* after the last record */ + user->idx = log_next_idx; + user->seq = log_next_seq; + break; + default: + ret = -EINVAL; + } + raw_spin_unlock(&logbuf_lock); + return ret; +} + +static unsigned int devkmsg_poll(struct file *file, poll_table *wait) +{ + struct devkmsg_user *user = file->private_data; + int ret = 0; + + if (!user) + return POLLERR|POLLNVAL; + + poll_wait(file, &log_wait, wait); + + raw_spin_lock(&logbuf_lock); + if (user->seq < log_next_seq) { + /* return error when data has vanished underneath us */ + if (user->seq < log_first_seq) + ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; + ret = POLLIN|POLLRDNORM; + } + raw_spin_unlock(&logbuf_lock); + + return ret; +} + +static int devkmsg_open(struct inode *inode, struct file *file) +{ + struct devkmsg_user *user; + int err; + + /* write-only does not need any file context */ + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + return 0; + + err = security_syslog(SYSLOG_ACTION_READ_ALL); + if (err) + return err; + + user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); + if (!user) + return -ENOMEM; + + mutex_init(&user->lock); + + raw_spin_lock(&logbuf_lock); + user->idx = log_first_idx; + user->seq = log_first_seq; + raw_spin_unlock(&logbuf_lock); + + file->private_data = user; + return 0; +} + +static int devkmsg_release(struct inode *inode, struct file *file) +{ + struct devkmsg_user *user = file->private_data; + + if (!user) + return 0; + + mutex_destroy(&user->lock); + kfree(user); + return 0; +} + +const struct file_operations kmsg_fops = { + .open = devkmsg_open, + .read = devkmsg_read, + .aio_write = devkmsg_writev, + .llseek = devkmsg_llseek, + .poll = devkmsg_poll, + .release = devkmsg_release, +}; #ifdef CONFIG_KEXEC /* @@ -165,9 +624,9 @@ static int saved_console_loglevel = -1; void log_buf_kexec_setup(void) { VMCOREINFO_SYMBOL(log_buf); - VMCOREINFO_SYMBOL(log_end); VMCOREINFO_SYMBOL(log_buf_len); - VMCOREINFO_SYMBOL(logged_chars); + VMCOREINFO_SYMBOL(log_first_idx); + VMCOREINFO_SYMBOL(log_next_idx); } #endif @@ -191,7 +650,6 @@ early_param("log_buf_len", log_buf_len_setup); void __init setup_log_buf(int early) { unsigned long flags; - unsigned start, dest_idx, offset; char *new_log_buf; int free; @@ -219,20 +677,8 @@ void __init setup_log_buf(int early) log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; - free = __LOG_BUF_LEN - log_end; - - offset = start = min(con_start, log_start); - dest_idx = 0; - while (start != log_end) { - unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); - - log_buf[dest_idx] = __log_buf[log_idx_mask]; - start++; - dest_idx++; - } - log_start -= offset; - con_start -= offset; - log_end -= offset; + free = __LOG_BUF_LEN - log_next_idx; + memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); pr_info("log_buf_len: %d\n", log_buf_len); @@ -332,11 +778,202 @@ static int check_syslog_permissions(int type, bool from_file) return 0; } +#if defined(CONFIG_PRINTK_TIME) +static bool printk_time = 1; +#else +static bool printk_time; +#endif +module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); + +static size_t print_prefix(const struct log *msg, bool syslog, char *buf) +{ + size_t len = 0; + + if (syslog) { + if (buf) { + len += sprintf(buf, "<%u>", msg->level); + } else { + len += 3; + if (msg->level > 9) + len++; + if (msg->level > 99) + len++; + } + } + + if (printk_time) { + if (buf) { + unsigned long long ts = msg->ts_nsec; + unsigned long rem_nsec = do_div(ts, 1000000000); + + len += sprintf(buf + len, "[%5lu.%06lu] ", + (unsigned long) ts, rem_nsec / 1000); + } else { + len += 15; + } + } + + return len; +} + +static size_t msg_print_text(const struct log *msg, bool syslog, + char *buf, size_t size) +{ + const char *text = log_text(msg); + size_t text_size = msg->text_len; + size_t len = 0; + + do { + const char *next = memchr(text, '\n', text_size); + size_t text_len; + + if (next) { + text_len = next - text; + next++; + text_size -= next - text; + } else { + text_len = text_size; + } + + if (buf) { + if (print_prefix(msg, syslog, NULL) + + text_len + 1>= size - len) + break; + + len += print_prefix(msg, syslog, buf + len); + memcpy(buf + len, text, text_len); + len += text_len; + buf[len++] = '\n'; + } else { + /* SYSLOG_ACTION_* buffer size only calculation */ + len += print_prefix(msg, syslog, NULL); + len += text_len + 1; + } + + text = next; + } while (text); + + return len; +} + +static int syslog_print(char __user *buf, int size) +{ + char *text; + struct log *msg; + int len; + + text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + if (!text) + return -ENOMEM; + + raw_spin_lock_irq(&logbuf_lock); + if (syslog_seq < log_first_seq) { + /* messages are gone, move to first one */ + syslog_seq = log_first_seq; + syslog_idx = log_first_idx; + } + msg = log_from_idx(syslog_idx); + len = msg_print_text(msg, true, text, LOG_LINE_MAX); + syslog_idx = log_next(syslog_idx); + syslog_seq++; + raw_spin_unlock_irq(&logbuf_lock); + + if (len > 0 && copy_to_user(buf, text, len)) + len = -EFAULT; + + kfree(text); + return len; +} + +static int syslog_print_all(char __user *buf, int size, bool clear) +{ + char *text; + int len = 0; + + text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + if (!text) + return -ENOMEM; + + raw_spin_lock_irq(&logbuf_lock); + if (buf) { + u64 next_seq; + u64 seq; + u32 idx; + + if (clear_seq < log_first_seq) { + /* messages are gone, move to first available one */ + clear_seq = log_first_seq; + clear_idx = log_first_idx; + } + + /* + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. + */ + seq = clear_seq; + idx = clear_idx; + while (seq < log_next_seq) { + struct log *msg = log_from_idx(idx); + + len += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + seq = clear_seq; + idx = clear_idx; + while (len > size && seq < log_next_seq) { + struct log *msg = log_from_idx(idx); + + len -= msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + + /* last message in this dump */ + next_seq = log_next_seq; + + len = 0; + while (len >= 0 && seq < next_seq) { + struct log *msg = log_from_idx(idx); + int textlen; + + textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); + if (textlen < 0) { + len = textlen; + break; + } + idx = log_next(idx); + seq++; + + raw_spin_unlock_irq(&logbuf_lock); + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else + len += textlen; + raw_spin_lock_irq(&logbuf_lock); + + if (seq < log_first_seq) { + /* messages are gone, move to next one */ + seq = log_first_seq; + idx = log_first_idx; + } + } + } + + if (clear) { + clear_seq = log_next_seq; + clear_idx = log_next_idx; + } + raw_spin_unlock_irq(&logbuf_lock); + + kfree(text); + return len; +} + int do_syslog(int type, char __user *buf, int len, bool from_file) { - unsigned i, j, limit, count; - int do_clear = 0; - char c; + bool clear = false; + static int saved_console_loglevel = -1; int error; error = check_syslog_permissions(type, from_file); @@ -364,28 +1001,14 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) goto out; } error = wait_event_interruptible(log_wait, - (log_start - log_end)); + syslog_seq != log_next_seq); if (error) goto out; - i = 0; - raw_spin_lock_irq(&logbuf_lock); - while (!error && (log_start != log_end) && i < len) { - c = LOG_BUF(log_start); - log_start++; - raw_spin_unlock_irq(&logbuf_lock); - error = __put_user(c,buf); - buf++; - i++; - cond_resched(); - raw_spin_lock_irq(&logbuf_lock); - } - raw_spin_unlock_irq(&logbuf_lock); - if (!error) - error = i; + error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: - do_clear = 1; + clear = true; /* FALL THRU */ /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: @@ -399,52 +1022,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = -EFAULT; goto out; } - count = len; - if (count > log_buf_len) - count = log_buf_len; - raw_spin_lock_irq(&logbuf_lock); - if (count > logged_chars) - count = logged_chars; - if (do_clear) - logged_chars = 0; - limit = log_end; - /* - * __put_user() could sleep, and while we sleep - * printk() could overwrite the messages - * we try to copy to user space. Therefore - * the messages are copied in reverse. <manfreds> - */ - for (i = 0; i < count && !error; i++) { - j = limit-1-i; - if (j + log_buf_len < log_end) - break; - c = LOG_BUF(j); - raw_spin_unlock_irq(&logbuf_lock); - error = __put_user(c,&buf[count-1-i]); - cond_resched(); - raw_spin_lock_irq(&logbuf_lock); - } - raw_spin_unlock_irq(&logbuf_lock); - if (error) - break; - error = i; - if (i != count) { - int offset = count-error; - /* buffer overflow during copy, correct user buffer. */ - for (i = 0; i < error; i++) { - if (__get_user(c,&buf[i+offset]) || - __put_user(c,&buf[i])) { - error = -EFAULT; - break; - } - cond_resched(); - } - } + error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: - logged_chars = 0; - break; + syslog_print_all(NULL, 0, true); /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) @@ -472,7 +1054,35 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - error = log_end - log_start; + raw_spin_lock_irq(&logbuf_lock); + if (syslog_seq < log_first_seq) { + /* messages are gone, move to first one */ + syslog_seq = log_first_seq; + syslog_idx = log_first_idx; + } + if (from_file) { + /* + * Short-cut for poll(/"proc/kmsg") which simply checks + * for pending data, not the size; return the count of + * records, not the length. + */ + error = log_next_idx - syslog_idx; + } else { + u64 seq; + u32 idx; + + error = 0; + seq = syslog_seq; + idx = syslog_idx; + while (seq < log_next_seq) { + struct log *msg = log_from_idx(idx); + + error += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + } + raw_spin_unlock_irq(&logbuf_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -501,29 +1111,11 @@ void kdb_syslog_data(char *syslog_data[4]) { syslog_data[0] = log_buf; syslog_data[1] = log_buf + log_buf_len; - syslog_data[2] = log_buf + log_end - - (logged_chars < log_buf_len ? logged_chars : log_buf_len); - syslog_data[3] = log_buf + log_end; + syslog_data[2] = log_buf + log_first_idx; + syslog_data[3] = log_buf + log_next_idx; } #endif /* CONFIG_KGDB_KDB */ -/* - * Call the console drivers on a range of log_buf - */ -static void __call_console_drivers(unsigned start, unsigned end) -{ - struct console *con; - - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if ((con->flags & CON_ENABLED) && con->write && - (cpu_online(smp_processor_id()) || - (con->flags & CON_ANYTIME))) - con->write(con, &LOG_BUF(start), end - start); - } -} - static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) @@ -540,142 +1132,33 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" "print all kernel messages to the console."); /* - * Write out chars from start to end - 1 inclusive - */ -static void _call_console_drivers(unsigned start, - unsigned end, int msg_log_level) -{ - trace_console(&LOG_BUF(0), start, end, log_buf_len); - - if ((msg_log_level < console_loglevel || ignore_loglevel) && - console_drivers && start != end) { - if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { - /* wrapped write */ - __call_console_drivers(start & LOG_BUF_MASK, - log_buf_len); - __call_console_drivers(0, end & LOG_BUF_MASK); - } else { - __call_console_drivers(start, end); - } - } -} - -/* - * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the - * lower 3 bit are the log level, the rest are the log facility. In case - * userspace passes usual userspace syslog messages to /dev/kmsg or - * /dev/ttyprintk, the log prefix might contain the facility. Printk needs - * to extract the correct log level for in-kernel processing, and not mangle - * the original value. - * - * If a prefix is found, the length of the prefix is returned. If 'level' is - * passed, it will be filled in with the log level without a possible facility - * value. If 'special' is passed, the special printk prefix chars are accepted - * and returned. If no valid header is found, 0 is returned and the passed - * variables are not touched. - */ -static size_t log_prefix(const char *p, unsigned int *level, char *special) -{ - unsigned int lev = 0; - char sp = '\0'; - size_t len; - - if (p[0] != '<' || !p[1]) - return 0; - if (p[2] == '>') { - /* usual single digit level number or special char */ - switch (p[1]) { - case '0' ... '7': - lev = p[1] - '0'; - break; - case 'c': /* KERN_CONT */ - case 'd': /* KERN_DEFAULT */ - sp = p[1]; - break; - default: - return 0; - } - len = 3; - } else { - /* multi digit including the level and facility number */ - char *endp = NULL; - - lev = (simple_strtoul(&p[1], &endp, 10) & 7); - if (endp == NULL || endp[0] != '>') - return 0; - len = (endp + 1) - p; - } - - /* do not accept special char if not asked for */ - if (sp && !special) - return 0; - - if (special) { - *special = sp; - /* return special char, do not touch level */ - if (sp) - return len; - } - - if (level) - *level = lev; - return len; -} - -/* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. */ -static void call_console_drivers(unsigned start, unsigned end) +static void call_console_drivers(int level, const char *text, size_t len) { - unsigned cur_index, start_print; - static int msg_level = -1; + struct console *con; - BUG_ON(((int)(start - end)) > 0); + trace_console(text, 0, len, len); - cur_index = start; - start_print = start; - while (cur_index != end) { - if (msg_level < 0 && ((end - cur_index) > 2)) { - /* strip log prefix */ - cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); - start_print = cur_index; - } - while (cur_index != end) { - char c = LOG_BUF(cur_index); - - cur_index++; - if (c == '\n') { - if (msg_level < 0) { - /* - * printk() has already given us loglevel tags in - * the buffer. This code is here in case the - * log buffer has wrapped right round and scribbled - * on those tags - */ - msg_level = default_message_loglevel; - } - _call_console_drivers(start_print, cur_index, msg_level); - msg_level = -1; - start_print = cur_index; - break; - } - } - } - _call_console_drivers(start_print, end, msg_level); -} + if (level >= console_loglevel && !ignore_loglevel) + return; + if (!console_drivers) + return; -static void emit_log_char(char c) -{ - LOG_BUF(log_end) = c; - log_end++; - if (log_end - log_start > log_buf_len) - log_start = log_end - log_buf_len; - if (log_end - con_start > log_buf_len) - con_start = log_end - log_buf_len; - if (logged_chars < log_buf_len) - logged_chars++; + for_each_console(con) { + if (exclusive_console && con != exclusive_console) + continue; + if (!(con->flags & CON_ENABLED)) + continue; + if (!con->write) + continue; + if (!cpu_online(smp_processor_id()) && + !(con->flags & CON_ANYTIME)) + continue; + con->write(con, text, len); + } } /* @@ -700,16 +1183,6 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time = 0; -#endif -module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); - -static bool always_kmsg_dump; -module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); - /* Check if we have any console registered that can be called early in boot. */ static int have_callable_console(void) { @@ -722,51 +1195,6 @@ static int have_callable_console(void) return 0; } -/** - * printk - print a kernel message - * @fmt: format string - * - * This is printk(). It can be called from any context. We want it to work. - * - * We try to grab the console_lock. If we succeed, it's easy - we log the output and - * call the console drivers. If we fail to get the semaphore we place the output - * into the log buffer and return. The current holder of the console_sem will - * notice the new output in console_unlock(); and will send it to the - * consoles before releasing the lock. - * - * One effect of this deferred printing is that code which calls printk() and - * then changes console_loglevel may break. This is because console_loglevel - * is inspected when the actual printing occurs. - * - * See also: - * printf(3) - * - * See the vsnprintf() documentation for format string extensions over C99. - */ - -asmlinkage int printk(const char *fmt, ...) -{ - va_list args; - int r; - -#ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { - va_start(args, fmt); - r = vkdb_printf(fmt, args); - va_end(args); - return r; - } -#endif - va_start(args, fmt); - r = vprintk(fmt, args); - va_end(args); - - return r; -} - -/* cpu currently holding logbuf_lock */ -static volatile unsigned int printk_cpu = UINT_MAX; - /* * Can we actually use the console at this time on this cpu? * @@ -810,17 +1238,12 @@ static int console_trylock_for_printk(unsigned int cpu) retval = 0; } } - printk_cpu = UINT_MAX; + logbuf_cpu = UINT_MAX; if (wake) up(&console_sem); raw_spin_unlock(&logbuf_lock); return retval; } -static const char recursion_bug_msg [] = - KERN_CRIT "BUG: recent printk recursion!\n"; -static int recursion_bug; -static int new_text_line = 1; -static char printk_buf[1024]; int printk_delay_msec __read_mostly; @@ -836,15 +1259,23 @@ static inline void printk_delay(void) } } -asmlinkage int vprintk(const char *fmt, va_list args) +asmlinkage int vprintk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) { - int printed_len = 0; - int current_log_level = default_message_loglevel; + static int recursion_bug; + static char cont_buf[LOG_LINE_MAX]; + static size_t cont_len; + static int cont_level; + static struct task_struct *cont_task; + static char textbuf[LOG_LINE_MAX]; + char *text = textbuf; + size_t text_len; unsigned long flags; int this_cpu; - char *p; - size_t plen; - char special; + bool newline = false; + bool prefix = false; + int printed_len = 0; boot_delay_msec(); printk_delay(); @@ -856,7 +1287,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) /* * Ouch, printk recursed into itself! */ - if (unlikely(printk_cpu == this_cpu)) { + if (unlikely(logbuf_cpu == this_cpu)) { /* * If a crash is occurring during printk() on this CPU, * then try to get the crash message out but make sure @@ -873,97 +1304,110 @@ asmlinkage int vprintk(const char *fmt, va_list args) lockdep_off(); raw_spin_lock(&logbuf_lock); - printk_cpu = this_cpu; + logbuf_cpu = this_cpu; if (recursion_bug) { + static const char recursion_msg[] = + "BUG: recent printk recursion!"; + recursion_bug = 0; - strcpy(printk_buf, recursion_bug_msg); - printed_len = strlen(recursion_bug_msg); + printed_len += strlen(recursion_msg); + /* emit KERN_CRIT message */ + log_store(0, 2, NULL, 0, recursion_msg, printed_len); } - /* Emit the output into the temporary buffer */ - printed_len += vscnprintf(printk_buf + printed_len, - sizeof(printk_buf) - printed_len, fmt, args); - p = printk_buf; + /* + * The printf needs to come first; we need the syslog + * prefix which might be passed-in as a parameter. + */ + text_len = vscnprintf(text, sizeof(textbuf), fmt, args); - /* Read log level and handle special printk prefix */ - plen = log_prefix(p, ¤t_log_level, &special); - if (plen) { - p += plen; + /* mark and strip a trailing newline */ + if (text_len && text[text_len-1] == '\n') { + text_len--; + newline = true; + } - switch (special) { - case 'c': /* Strip <c> KERN_CONT, continue line */ - plen = 0; - break; - case 'd': /* Strip <d> KERN_DEFAULT, start new line */ - plen = 0; - default: - if (!new_text_line) { - emit_log_char('\n'); - new_text_line = 1; - } + /* strip syslog prefix and extract log level or control flags */ + if (text[0] == '<' && text[1] && text[2] == '>') { + switch (text[1]) { + case '0' ... '7': + if (level == -1) + level = text[1] - '0'; + case 'd': /* KERN_DEFAULT */ + prefix = true; + case 'c': /* KERN_CONT */ + text += 3; + text_len -= 3; } } - /* - * Copy the output into log_buf. If the caller didn't provide - * the appropriate log prefix, we insert them here - */ - for (; *p; p++) { - if (new_text_line) { - new_text_line = 0; - - if (plen) { - /* Copy original log prefix */ - int i; - - for (i = 0; i < plen; i++) - emit_log_char(printk_buf[i]); - printed_len += plen; - } else { - /* Add log prefix */ - emit_log_char('<'); - emit_log_char(current_log_level + '0'); - emit_log_char('>'); - printed_len += 3; - } + if (level == -1) + level = default_message_loglevel; - if (printk_time) { - /* Add the current time stamp */ - char tbuf[50], *tp; - unsigned tlen; - unsigned long long t; - unsigned long nanosec_rem; - - t = cpu_clock(printk_cpu); - nanosec_rem = do_div(t, 1000000000); - tlen = sprintf(tbuf, "[%5lu.%06lu] ", - (unsigned long) t, - nanosec_rem / 1000); - - for (tp = tbuf; tp < tbuf + tlen; tp++) - emit_log_char(*tp); - printed_len += tlen; - } + if (dict) { + prefix = true; + newline = true; + } - if (!*p) - break; + if (!newline) { + if (cont_len && (prefix || cont_task != current)) { + /* + * Flush earlier buffer, which is either from a + * different thread, or when we got a new prefix. + */ + log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); + cont_len = 0; } - emit_log_char(*p); - if (*p == '\n') - new_text_line = 1; + if (!cont_len) { + cont_level = level; + cont_task = current; + } + + /* buffer or append to earlier buffer from the same thread */ + if (cont_len + text_len > sizeof(cont_buf)) + text_len = sizeof(cont_buf) - cont_len; + memcpy(cont_buf + cont_len, text, text_len); + cont_len += text_len; + } else { + if (cont_len && cont_task == current) { + if (prefix) { + /* + * New prefix from the same thread; flush. We + * either got no earlier newline, or we race + * with an interrupt. + */ + log_store(facility, cont_level, + NULL, 0, cont_buf, cont_len); + cont_len = 0; + } + + /* append to the earlier buffer and flush */ + if (cont_len + text_len > sizeof(cont_buf)) + text_len = sizeof(cont_buf) - cont_len; + memcpy(cont_buf + cont_len, text, text_len); + cont_len += text_len; + log_store(facility, cont_level, + NULL, 0, cont_buf, cont_len); + cont_len = 0; + cont_task = NULL; + printed_len = cont_len; + } else { + /* ordinary single and terminated line */ + log_store(facility, level, + dict, dictlen, text, text_len); + printed_len = text_len; + } } /* - * Try to acquire and then immediately release the - * console semaphore. The release will do all the - * actual magic (print out buffers, wake up klogd, - * etc). + * Try to acquire and then immediately release the console semaphore. + * The release will print out buffers and wake up /dev/kmsg and syslog() + * users. * - * The console_trylock_for_printk() function - * will release 'logbuf_lock' regardless of whether it - * actually gets the semaphore or not. + * The console_trylock_for_printk() function will release 'logbuf_lock' + * regardless of whether it actually gets the console semaphore or not. */ if (console_trylock_for_printk(this_cpu)) console_unlock(); @@ -974,16 +1418,81 @@ out_restore_irqs: return printed_len; } -EXPORT_SYMBOL(printk); -EXPORT_SYMBOL(vprintk); +EXPORT_SYMBOL(vprintk_emit); -#else +asmlinkage int vprintk(const char *fmt, va_list args) +{ + return vprintk_emit(0, -1, NULL, 0, fmt, args); +} +EXPORT_SYMBOL(vprintk); -static void call_console_drivers(unsigned start, unsigned end) +asmlinkage int printk_emit(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, ...) { + va_list args; + int r; + + va_start(args, fmt); + r = vprintk_emit(facility, level, dict, dictlen, fmt, args); + va_end(args); + + return r; } +EXPORT_SYMBOL(printk_emit); +/** + * printk - print a kernel message + * @fmt: format string + * + * This is printk(). It can be called from any context. We want it to work. + * + * We try to grab the console_lock. If we succeed, it's easy - we log the + * output and call the console drivers. If we fail to get the semaphore, we + * place the output into the log buffer and return. The current holder of + * the console_sem will notice the new output in console_unlock(); and will + * send it to the consoles before releasing the lock. + * + * One effect of this deferred printing is that code which calls printk() and + * then changes console_loglevel may break. This is because console_loglevel + * is inspected when the actual printing occurs. + * + * See also: + * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. + */ +asmlinkage int printk(const char *fmt, ...) +{ + va_list args; + int r; + +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + va_start(args, fmt); + r = vkdb_printf(fmt, args); + va_end(args); + return r; + } #endif + va_start(args, fmt); + r = vprintk_emit(0, -1, NULL, 0, fmt, args); + va_end(args); + + return r; +} +EXPORT_SYMBOL(printk); + +#else + +#define LOG_LINE_MAX 0 +static struct log *log_from_idx(u32 idx) { return NULL; } +static u32 log_next(u32 idx) { return 0; } +static void call_console_drivers(int level, const char *text, size_t len) {} +static size_t msg_print_text(const struct log *msg, bool syslog, + char *buf, size_t size) { return 0; } + +#endif /* CONFIG_PRINTK */ static int __add_preferred_console(char *name, int idx, char *options, char *brl_options) @@ -1217,7 +1726,7 @@ int is_console_locked(void) } /* - * Delayed printk facility, for scheduler-internal messages: + * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_BUF_SIZE 512 @@ -1253,6 +1762,10 @@ void wake_up_klogd(void) this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } +/* the next printk record to write to the console */ +static u64 console_seq; +static u32 console_idx; + /** * console_unlock - unlock the console system * @@ -1263,15 +1776,16 @@ void wake_up_klogd(void) * by printk(). If this is the case, console_unlock(); emits * the output prior to releasing the lock. * - * If there is output waiting for klogd, we wake it up. + * If there is output waiting, we wake /dev/kmsg and syslog() users. * * console_unlock(); may be called from any context. */ void console_unlock(void) { + static u64 seen_seq; unsigned long flags; - unsigned _con_start, _log_end; - unsigned wake_klogd = 0, retry = 0; + bool wake_klogd = false; + bool retry; if (console_suspended) { up(&console_sem); @@ -1281,17 +1795,38 @@ void console_unlock(void) console_may_schedule = 0; again: - for ( ; ; ) { + for (;;) { + struct log *msg; + static char text[LOG_LINE_MAX]; + size_t len; + int level; + raw_spin_lock_irqsave(&logbuf_lock, flags); - wake_klogd |= log_start - log_end; - if (con_start == log_end) - break; /* Nothing to print */ - _con_start = con_start; - _log_end = log_end; - con_start = log_end; /* Flush */ + if (seen_seq != log_next_seq) { + wake_klogd = true; + seen_seq = log_next_seq; + } + + if (console_seq < log_first_seq) { + /* messages are gone, move to first one */ + console_seq = log_first_seq; + console_idx = log_first_idx; + } + + if (console_seq == log_next_seq) + break; + + msg = log_from_idx(console_idx); + level = msg->level & 7; + + len = msg_print_text(msg, false, text, sizeof(text)); + + console_idx = log_next(console_idx); + console_seq++; raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(_con_start, _log_end); + call_console_drivers(level, text, len); start_critical_timings(); local_irq_restore(flags); } @@ -1312,8 +1847,7 @@ again: * flush, no worries. */ raw_spin_lock(&logbuf_lock); - if (con_start != log_end) - retry = 1; + retry = console_seq != log_next_seq; raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (retry && console_trylock()) @@ -1549,7 +2083,8 @@ void register_console(struct console *newcon) * for us. */ raw_spin_lock_irqsave(&logbuf_lock, flags); - con_start = log_start; + console_seq = syslog_seq; + console_idx = syslog_idx; raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the @@ -1758,6 +2293,9 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); +static bool always_kmsg_dump; +module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); + /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping @@ -1767,8 +2305,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_unregister); */ void kmsg_dump(enum kmsg_dump_reason reason) { - unsigned long end; - unsigned chars; + u64 idx; struct kmsg_dumper *dumper; const char *s1, *s2; unsigned long l1, l2; @@ -1780,24 +2317,27 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ + raw_spin_lock_irqsave(&logbuf_lock, flags); - end = log_end & LOG_BUF_MASK; - chars = logged_chars; - raw_spin_unlock_irqrestore(&logbuf_lock, flags); + if (syslog_seq < log_first_seq) + idx = syslog_idx; + else + idx = log_first_idx; - if (chars > end) { - s1 = log_buf + log_buf_len - chars + end; - l1 = chars - end; + if (idx > log_next_idx) { + s1 = log_buf; + l1 = log_next_idx; - s2 = log_buf; - l2 = end; + s2 = log_buf + idx; + l2 = log_buf_len - idx; } else { s1 = ""; l1 = 0; - s2 = log_buf + end - chars; - l2 = chars; + s2 = log_buf + idx; + l2 = log_next_idx - idx; } + raw_spin_unlock_irqrestore(&logbuf_lock, flags); rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ee8d49b9c309..a232bb59d93f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -198,15 +198,14 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) return 0; rcu_read_lock(); tcred = __task_cred(task); - if (cred->user->user_ns == tcred->user->user_ns && - (cred->uid == tcred->euid && - cred->uid == tcred->suid && - cred->uid == tcred->uid && - cred->gid == tcred->egid && - cred->gid == tcred->sgid && - cred->gid == tcred->gid)) + if (uid_eq(cred->uid, tcred->euid) && + uid_eq(cred->uid, tcred->suid) && + uid_eq(cred->uid, tcred->uid) && + gid_eq(cred->gid, tcred->egid) && + gid_eq(cred->gid, tcred->sgid) && + gid_eq(cred->gid, tcred->gid)) goto ok; - if (ptrace_has_cap(tcred->user->user_ns, mode)) + if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index d508363858b3..bebe2b170d49 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) counter->parent = parent; } -int res_counter_charge_locked(struct res_counter *counter, unsigned long val) +int res_counter_charge_locked(struct res_counter *counter, unsigned long val, + bool force) { + int ret = 0; + if (counter->usage + val > counter->limit) { counter->failcnt++; - return -ENOMEM; + ret = -ENOMEM; + if (!force) + return ret; } counter->usage += val; if (counter->usage > counter->max_usage) counter->max_usage = counter->usage; - return 0; + return ret; } -int res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) +static int __res_counter_charge(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at, bool force) { - int ret; + int ret, r; unsigned long flags; struct res_counter *c, *u; + r = ret = 0; *limit_fail_at = NULL; local_irq_save(flags); for (c = counter; c != NULL; c = c->parent) { spin_lock(&c->lock); - ret = res_counter_charge_locked(c, val); + r = res_counter_charge_locked(c, val, force); spin_unlock(&c->lock); - if (ret < 0) { + if (r < 0 && !ret) { + ret = r; *limit_fail_at = c; - goto undo; + if (!force) + break; } } - ret = 0; - goto done; -undo: - for (u = counter; u != c; u = u->parent) { - spin_lock(&u->lock); - res_counter_uncharge_locked(u, val); - spin_unlock(&u->lock); + + if (ret < 0 && !force) { + for (u = counter; u != c; u = u->parent) { + spin_lock(&u->lock); + res_counter_uncharge_locked(u, val); + spin_unlock(&u->lock); + } } -done: local_irq_restore(flags); + return ret; } +int res_counter_charge(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) +{ + return __res_counter_charge(counter, val, limit_fail_at, false); +} + int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, struct res_counter **limit_fail_at) { - int ret, r; - unsigned long flags; - struct res_counter *c; - - r = ret = 0; - *limit_fail_at = NULL; - local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { - spin_lock(&c->lock); - r = res_counter_charge_locked(c, val); - if (r) - c->usage += val; - spin_unlock(&c->lock); - if (r < 0 && ret == 0) { - *limit_fail_at = c; - ret = r; - } - } - local_irq_restore(flags); - - return ret; + return __res_counter_charge(counter, val, limit_fail_at, true); } + void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ea8a4769fea5..39eb6011bc38 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -693,8 +693,6 @@ int tg_nop(struct task_group *tg, void *data) } #endif -void update_cpu_load(struct rq *this_rq); - static void set_load_weight(struct task_struct *p) { int prio = p->static_prio - MAX_RT_PRIO; @@ -2488,22 +2486,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -void update_cpu_load(struct rq *this_rq) +static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, + unsigned long pending_updates) { - unsigned long this_load = this_rq->load.weight; - unsigned long curr_jiffies = jiffies; - unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; - /* Avoid repeated calls on same jiffy, when moving in and out of idle */ - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - /* Update our load: */ this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { @@ -2528,9 +2517,45 @@ void update_cpu_load(struct rq *this_rq) sched_avg_update(this_rq); } +/* + * Called from nohz_idle_balance() to update the load ratings before doing the + * idle balance. + */ +void update_idle_cpu_load(struct rq *this_rq) +{ + unsigned long curr_jiffies = jiffies; + unsigned long load = this_rq->load.weight; + unsigned long pending_updates; + + /* + * Bloody broken means of dealing with nohz, but better than nothing.. + * jiffies is updated by one cpu, another cpu can drift wrt the jiffy + * update and see 0 difference the one time and 2 the next, even though + * we ticked at roughtly the same rate. + * + * Hence we only use this from nohz_idle_balance() and skip this + * nonsense when called from the scheduler_tick() since that's + * guaranteed a stable rate. + */ + if (load || curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + __update_cpu_load(this_rq, load, pending_updates); +} + +/* + * Called from scheduler_tick() + */ static void update_cpu_load_active(struct rq *this_rq) { - update_cpu_load(this_rq); + /* + * See the mess in update_idle_cpu_load(). + */ + this_rq->last_load_update_tick = jiffies; + __update_cpu_load(this_rq, this_rq->load.weight, 1); calc_load_account_active(this_rq); } @@ -3115,6 +3140,7 @@ static noinline void __schedule_bug(struct task_struct *prev) if (irqs_disabled()) print_irqtrace_events(prev); dump_stack(); + add_taint(TAINT_WARN); } /* @@ -4044,11 +4070,8 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - if (cred->user->user_ns == pcred->user->user_ns) - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); - else - match = false; + match = (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); rcu_read_unlock(); return match; } @@ -5562,7 +5585,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (cpumask_intersects(groupmask, sched_group_cpus(group))) { + if (!(sd->flags & SD_OVERLAP) && + cpumask_intersects(groupmask, sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; @@ -5900,99 +5924,11 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -#ifdef CONFIG_NUMA - -/** - * find_next_best_node - find the next node to include in a sched_domain - * @node: node whose sched_domain we're building - * @used_nodes: nodes already in the sched_domain - * - * Find the next node to include in a given scheduling domain. Simply - * finds the closest node not already in the @used_nodes map. - * - * Should use nodemask_t. - */ -static int find_next_best_node(int node, nodemask_t *used_nodes) -{ - int i, n, val, min_val, best_node = -1; - - min_val = INT_MAX; - - for (i = 0; i < nr_node_ids; i++) { - /* Start at @node */ - n = (node + i) % nr_node_ids; - - if (!nr_cpus_node(n)) - continue; - - /* Skip already used nodes */ - if (node_isset(n, *used_nodes)) - continue; - - /* Simple min distance search */ - val = node_distance(node, n); - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - if (best_node != -1) - node_set(best_node, *used_nodes); - return best_node; -} - -/** - * sched_domain_node_span - get a cpumask for a node's sched_domain - * @node: node whose cpumask we're constructing - * @span: resulting cpumask - * - * Given a node, construct a good cpumask for its sched_domain to span. It - * should be one that prevents unnecessary balancing, but also spreads tasks - * out optimally. - */ -static void sched_domain_node_span(int node, struct cpumask *span) -{ - nodemask_t used_nodes; - int i; - - cpumask_clear(span); - nodes_clear(used_nodes); - - cpumask_or(span, span, cpumask_of_node(node)); - node_set(node, used_nodes); - - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, &used_nodes); - if (next_node < 0) - break; - cpumask_or(span, span, cpumask_of_node(next_node)); - } -} - -static const struct cpumask *cpu_node_mask(int cpu) -{ - lockdep_assert_held(&sched_domains_mutex); - - sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); - - return sched_domains_tmpmask; -} - -static const struct cpumask *cpu_allnodes_mask(int cpu) -{ - return cpu_possible_mask; -} -#endif /* CONFIG_NUMA */ - static const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } -int sched_smt_power_savings = 0, sched_mc_power_savings = 0; - struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; @@ -6022,6 +5958,7 @@ struct sched_domain_topology_level { sched_domain_init_f init; sched_domain_mask_f mask; int flags; + int numa_level; struct sd_data data; }; @@ -6213,10 +6150,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ } SD_INIT_FUNC(CPU) -#ifdef CONFIG_NUMA - SD_INIT_FUNC(ALLNODES) - SD_INIT_FUNC(NODE) -#endif #ifdef CONFIG_SCHED_SMT SD_INIT_FUNC(SIBLING) #endif @@ -6338,15 +6271,184 @@ static struct sched_domain_topology_level default_topology[] = { { sd_init_BOOK, cpu_book_mask, }, #endif { sd_init_CPU, cpu_cpu_mask, }, -#ifdef CONFIG_NUMA - { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, - { sd_init_ALLNODES, cpu_allnodes_mask, }, -#endif { NULL, }, }; static struct sched_domain_topology_level *sched_domain_topology = default_topology; +#ifdef CONFIG_NUMA + +static int sched_domains_numa_levels; +static int sched_domains_numa_scale; +static int *sched_domains_numa_distance; +static struct cpumask ***sched_domains_numa_masks; +static int sched_domains_curr_level; + +static inline int sd_local_flags(int level) +{ + if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) + return 0; + + return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; +} + +static struct sched_domain * +sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +{ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); + int level = tl->numa_level; + int sd_weight = cpumask_weight( + sched_domains_numa_masks[level][cpu_to_node(cpu)]); + + *sd = (struct sched_domain){ + .min_interval = sd_weight, + .max_interval = 2*sd_weight, + .busy_factor = 32, + .imbalance_pct = 125, + .cache_nice_tries = 2, + .busy_idx = 3, + .idle_idx = 2, + .newidle_idx = 0, + .wake_idx = 0, + .forkexec_idx = 0, + + .flags = 1*SD_LOAD_BALANCE + | 1*SD_BALANCE_NEWIDLE + | 0*SD_BALANCE_EXEC + | 0*SD_BALANCE_FORK + | 0*SD_BALANCE_WAKE + | 0*SD_WAKE_AFFINE + | 0*SD_PREFER_LOCAL + | 0*SD_SHARE_CPUPOWER + | 0*SD_SHARE_PKG_RESOURCES + | 1*SD_SERIALIZE + | 0*SD_PREFER_SIBLING + | sd_local_flags(level) + , + .last_balance = jiffies, + .balance_interval = sd_weight, + }; + SD_INIT_NAME(sd, NUMA); + sd->private = &tl->data; + + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; + + return sd; +} + +static const struct cpumask *sd_numa_mask(int cpu) +{ + return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; +} + +static void sched_init_numa(void) +{ + int next_distance, curr_distance = node_distance(0, 0); + struct sched_domain_topology_level *tl; + int level = 0; + int i, j, k; + + sched_domains_numa_scale = curr_distance; + sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + if (!sched_domains_numa_distance) + return; + + /* + * O(nr_nodes^2) deduplicating selection sort -- in order to find the + * unique distances in the node_distance() table. + * + * Assumes node_distance(0,j) includes all distances in + * node_distance(i,j) in order to avoid cubic time. + * + * XXX: could be optimized to O(n log n) by using sort() + */ + next_distance = curr_distance; + for (i = 0; i < nr_node_ids; i++) { + for (j = 0; j < nr_node_ids; j++) { + int distance = node_distance(0, j); + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; + } + /* + * 'level' contains the number of unique distances, excluding the + * identity distance node_distance(i,i). + * + * The sched_domains_nume_distance[] array includes the actual distance + * numbers. + */ + + sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + if (!sched_domains_numa_masks) + return; + + /* + * Now for each level, construct a mask per node which contains all + * cpus of nodes that are that many hops away from us. + */ + for (i = 0; i < level; i++) { + sched_domains_numa_masks[i] = + kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); + if (!sched_domains_numa_masks[i]) + return; + + for (j = 0; j < nr_node_ids; j++) { + struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + if (!mask) + return; + + sched_domains_numa_masks[i][j] = mask; + + for (k = 0; k < nr_node_ids; k++) { + if (node_distance(j, k) > sched_domains_numa_distance[i]) + continue; + + cpumask_or(mask, mask, cpumask_of_node(k)); + } + } + } + + tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + sizeof(struct sched_domain_topology_level), GFP_KERNEL); + if (!tl) + return; + + /* + * Copy the default topology bits.. + */ + for (i = 0; default_topology[i].init; i++) + tl[i] = default_topology[i]; + + /* + * .. and append 'j' levels of NUMA goodness. + */ + for (j = 0; j < level; i++, j++) { + tl[i] = (struct sched_domain_topology_level){ + .init = sd_numa_init, + .mask = sd_numa_mask, + .flags = SDTL_OVERLAP, + .numa_level = j, + }; + } + + sched_domain_topology = tl; +} +#else +static inline void sched_init_numa(void) +{ +} +#endif /* CONFIG_NUMA */ + static int __sdt_alloc(const struct cpumask *cpu_map) { struct sched_domain_topology_level *tl; @@ -6714,97 +6816,6 @@ match2: mutex_unlock(&sched_domains_mutex); } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static void reinit_sched_domains(void) -{ - get_online_cpus(); - - /* Destroy domains first to force the rebuild */ - partition_sched_domains(0, NULL, NULL); - - rebuild_sched_domains(); - put_online_cpus(); -} - -static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) -{ - unsigned int level = 0; - - if (sscanf(buf, "%u", &level) != 1) - return -EINVAL; - - /* - * level is always be positive so don't check for - * level < POWERSAVINGS_BALANCE_NONE which is 0 - * What happens on 0 or 1 byte write, - * need to check for count as well? - */ - - if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) - return -EINVAL; - - if (smt) - sched_smt_power_savings = level; - else - sched_mc_power_savings = level; - - reinit_sched_domains(); - - return count; -} - -#ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_mc_power_savings); -} -static ssize_t sched_mc_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 0); -} -static DEVICE_ATTR(sched_mc_power_savings, 0644, - sched_mc_power_savings_show, - sched_mc_power_savings_store); -#endif - -#ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", sched_smt_power_savings); -} -static ssize_t sched_smt_power_savings_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - return sched_power_savings_store(buf, count, 1); -} -static DEVICE_ATTR(sched_smt_power_savings, 0644, - sched_smt_power_savings_show, - sched_smt_power_savings_store); -#endif - -int __init sched_create_sysfs_power_savings_entries(struct device *dev) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = device_create_file(dev, &dev_attr_sched_smt_power_savings); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = device_create_file(dev, &dev_attr_sched_mc_power_savings); -#endif - return err; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper @@ -6842,6 +6853,8 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + sched_init_numa(); + get_online_cpus(); mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); @@ -7985,13 +7998,9 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_rt_period_write_uint, }, #endif + { } /* terminate */ }; -static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); -} - struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, @@ -7999,8 +8008,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, + .base_cftypes = cpu_files, .early_init = 1, }; @@ -8185,13 +8194,9 @@ static struct cftype files[] = { .name = "stat", .read_map = cpuacct_stats_show, }, + { } /* terminate */ }; -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) -{ - return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); -} - /* * charge this task's execution time to its accounting group. * @@ -8223,7 +8228,7 @@ struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .create = cpuacct_create, .destroy = cpuacct_destroy, - .populate = cpuacct_populate, .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, }; #endif /* CONFIG_CGROUP_CPUACCT */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 09acaa15161d..6f79596e0ea9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP @@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu) SEQ_printf(m, "\ncpu#%d\n", cpu); #endif -#define P(x) \ - SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define P(x) \ +do { \ + if (sizeof(rq->x) == 4) \ + SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \ + else \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ +} while (0) + #define PN(x) \ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9553640c1c3..940e6d17cf96 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. */ - if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { + if (tmp->flags & (SD_PREFER_LOCAL)) { unsigned long power = 0; unsigned long nr_running = 0; unsigned long capacity; @@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); - if (tmp->flags & SD_POWERSAVINGS_BALANCE) - nr_running /= 2; - if (nr_running < capacity) want_sd = 0; } @@ -3082,7 +3079,7 @@ struct lb_env { struct rq *dst_rq; enum cpu_idle_type idle; - long load_move; + long imbalance; unsigned int flags; unsigned int loop; @@ -3218,7 +3215,7 @@ static unsigned long task_h_load(struct task_struct *p); static const unsigned int sched_nr_migrate_break = 32; /* - * move_tasks tries to move up to load_move weighted load from busiest to + * move_tasks tries to move up to imbalance weighted load from busiest to * this_rq, as part of a balancing operation within domain "sd". * Returns 1 if successful and 0 otherwise. * @@ -3231,7 +3228,7 @@ static int move_tasks(struct lb_env *env) unsigned long load; int pulled = 0; - if (env->load_move <= 0) + if (env->imbalance <= 0) return 0; while (!list_empty(tasks)) { @@ -3257,7 +3254,7 @@ static int move_tasks(struct lb_env *env) if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; - if ((load / 2) > env->load_move) + if ((load / 2) > env->imbalance) goto next; if (!can_migrate_task(p, env)) @@ -3265,7 +3262,7 @@ static int move_tasks(struct lb_env *env) move_task(p, env); pulled++; - env->load_move -= load; + env->imbalance -= load; #ifdef CONFIG_PREEMPT /* @@ -3281,7 +3278,7 @@ static int move_tasks(struct lb_env *env) * We only want to steal up to the prescribed amount of * weighted load. */ - if (env->load_move <= 0) + if (env->imbalance <= 0) break; continue; @@ -3435,14 +3432,6 @@ struct sd_lb_stats { unsigned int busiest_group_weight; int group_imb; /* Is there imbalance in this sd */ -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance; /* Is powersave balance needed for this sd */ - struct sched_group *group_min; /* Least loaded group in sd */ - struct sched_group *group_leader; /* Group which relieves group_min */ - unsigned long min_load_per_task; /* load_per_task in group_min */ - unsigned long leader_nr_running; /* Nr running of group_leader */ - unsigned long min_nr_running; /* Nr running of group_min */ -#endif }; /* @@ -3486,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } - -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * init_sd_power_savings_stats - Initialize power savings statistics for - * the given sched_domain, during load balancing. - * - * @sd: Sched domain whose power-savings statistics are to be initialized. - * @sds: Variable containing the statistics for sd. - * @idle: Idle status of the CPU at which we're performing load-balancing. - */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -} - -/** - * update_sd_power_savings_stats - Update the power saving stats for a - * sched_domain while performing load balancing. - * - * @group: sched_group belonging to the sched_domain under consideration. - * @sds: Variable containing the statistics of the sched_domain - * @local_group: Does group contain the CPU for which we're performing - * load balancing ? - * @sgs: Variable containing the statistics of the group. - */ -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - - if (!sds->power_savings_balance) - return; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && (sds->this_nr_running >= sgs->group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs->sum_nr_running >= sgs->group_capacity || - !sgs->sum_nr_running) - return; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs->sum_nr_running < sds->min_nr_running) || - (sgs->sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs->sum_nr_running; - sds->min_load_per_task = sgs->sum_weighted_load / - sgs->sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs->sum_nr_running + 1 > sgs->group_capacity) - return; - - if (sgs->sum_nr_running > sds->leader_nr_running || - (sgs->sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs->sum_nr_running; - } -} - -/** - * check_power_save_busiest_group - see if there is potential for some power-savings balance - * @sds: Variable containing the statistics of the sched_domain - * under consideration. - * @this_cpu: Cpu at which we're currently performing load-balancing. - * @imbalance: Variable to store the imbalance. - * - * Description: - * Check if we have potential to perform some power-savings balance. - * If yes, set the busiest group to be the least loaded group in the - * sched_domain, so that it's CPUs can be put to idle. - * - * Returns 1 if there is potential to perform power-savings balance. - * Else returns 0. - */ -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - if (!sds->power_savings_balance) - return 0; - - if (sds->this != sds->group_leader || - sds->group_leader == sds->group_min) - return 0; - - *imbalance = sds->min_load_per_task; - sds->busiest = sds->group_min; - - return 1; - -} -#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static inline void init_sd_power_savings_stats(struct sched_domain *sd, - struct sd_lb_stats *sds, enum cpu_idle_type idle) -{ - return; -} - -static inline void update_sd_power_savings_stats(struct sched_group *group, - struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) -{ - return; -} - -static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) -{ - return 0; -} -#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ - - unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; @@ -3765,24 +3612,22 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. * @group: sched_group whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ -static inline void update_sg_lb_stats(struct sched_domain *sd, - struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, +static inline void update_sg_lb_stats(struct lb_env *env, + struct sched_group *group, int load_idx, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { - unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; - int i; + unsigned long nr_running, max_nr_running, min_nr_running; + unsigned long load, max_cpu_load, min_cpu_load; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; + int i; if (local_group) balance_cpu = group_first_cpu(group); @@ -3791,10 +3636,13 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, max_cpu_load = 0; min_cpu_load = ~0UL; max_nr_running = 0; + min_nr_running = ~0UL; for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); + nr_running = rq->nr_running; + /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { @@ -3805,16 +3653,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - if (load > max_cpu_load) { + if (load > max_cpu_load) max_cpu_load = load; - max_nr_running = rq->nr_running; - } if (min_cpu_load > load) min_cpu_load = load; + + if (nr_running > max_nr_running) + max_nr_running = nr_running; + if (min_nr_running > nr_running) + min_nr_running = nr_running; } sgs->group_load += load; - sgs->sum_nr_running += rq->nr_running; + sgs->sum_nr_running += nr_running; sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; @@ -3827,14 +3678,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * to do the newly idle load balance. */ if (local_group) { - if (idle != CPU_NEWLY_IDLE) { - if (balance_cpu != this_cpu) { + if (env->idle != CPU_NEWLY_IDLE) { + if (balance_cpu != env->dst_cpu) { *balance = 0; return; } - update_group_power(sd, this_cpu); + update_group_power(env->sd, env->dst_cpu); } else if (time_after_eq(jiffies, group->sgp->next_update)) - update_group_power(sd, this_cpu); + update_group_power(env->sd, env->dst_cpu); } /* Adjust by relative CPU power of the group */ @@ -3852,13 +3703,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && + (max_nr_running - min_nr_running) > 1) sgs->group_imb = 1; sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); if (!sgs->group_capacity) - sgs->group_capacity = fix_small_capacity(sd, group); + sgs->group_capacity = fix_small_capacity(env->sd, group); sgs->group_weight = group->group_weight; if (sgs->group_capacity > sgs->sum_nr_running) @@ -3876,11 +3728,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * Determine if @sg is a busier group than the previously selected * busiest group. */ -static bool update_sd_pick_busiest(struct sched_domain *sd, +static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *sg, - struct sg_lb_stats *sgs, - int this_cpu) + struct sg_lb_stats *sgs) { if (sgs->avg_load <= sds->max_load) return false; @@ -3896,8 +3747,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * numbered CPUs in the group, therefore mark all groups * higher than ourself as busy. */ - if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && - this_cpu < group_first_cpu(sg)) { + if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + env->dst_cpu < group_first_cpu(sg)) { if (!sds->busiest) return true; @@ -3917,28 +3768,27 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ -static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, const struct cpumask *cpus, - int *balance, struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, + const struct cpumask *cpus, + int *balance, struct sd_lb_stats *sds) { - struct sched_domain *child = sd->child; - struct sched_group *sg = sd->groups; + struct sched_domain *child = env->sd->child; + struct sched_group *sg = env->sd->groups; struct sg_lb_stats sgs; int load_idx, prefer_sibling = 0; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; - init_sd_power_savings_stats(sd, sds, idle); - load_idx = get_sd_load_idx(sd, idle); + load_idx = get_sd_load_idx(env->sd, env->idle); do { int local_group; - local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); + local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, - local_group, cpus, balance, &sgs); + update_sg_lb_stats(env, sg, load_idx, local_group, + cpus, balance, &sgs); if (local_group && !(*balance)) return; @@ -3966,7 +3816,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->this_load_per_task = sgs.sum_weighted_load; sds->this_has_capacity = sgs.group_has_capacity; sds->this_idle_cpus = sgs.idle_cpus; - } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { + } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { sds->max_load = sgs.avg_load; sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; @@ -3978,9 +3828,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->group_imb = sgs.group_imb; } - update_sd_power_savings_stats(sg, sds, local_group, &sgs); sg = sg->next; - } while (sg != sd->groups); + } while (sg != env->sd->groups); } /** @@ -4008,24 +3857,23 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, * @this_cpu: The cpu at whose sched_domain we're performing load-balance. * @imbalance: returns amount of imbalanced due to packing. */ -static int check_asym_packing(struct sched_domain *sd, - struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) +static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) { int busiest_cpu; - if (!(sd->flags & SD_ASYM_PACKING)) + if (!(env->sd->flags & SD_ASYM_PACKING)) return 0; if (!sds->busiest) return 0; busiest_cpu = group_first_cpu(sds->busiest); - if (this_cpu > busiest_cpu) + if (env->dst_cpu > busiest_cpu) return 0; - *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, - SCHED_POWER_SCALE); + env->imbalance = DIV_ROUND_CLOSEST( + sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); + return 1; } @@ -4037,8 +3885,8 @@ static int check_asym_packing(struct sched_domain *sd, * @this_cpu: The cpu at whose sched_domain we're performing load-balance. * @imbalance: Variable to store the imbalance. */ -static inline void fix_small_imbalance(struct sd_lb_stats *sds, - int this_cpu, unsigned long *imbalance) +static inline +void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { unsigned long tmp, pwr_now = 0, pwr_move = 0; unsigned int imbn = 2; @@ -4049,9 +3897,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, if (sds->busiest_load_per_task > sds->this_load_per_task) imbn = 1; - } else + } else { sds->this_load_per_task = - cpu_avg_load_per_task(this_cpu); + cpu_avg_load_per_task(env->dst_cpu); + } scaled_busy_load_per_task = sds->busiest_load_per_task * SCHED_POWER_SCALE; @@ -4059,7 +3908,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { - *imbalance = sds->busiest_load_per_task; + env->imbalance = sds->busiest_load_per_task; return; } @@ -4096,18 +3945,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, /* Move if we gain throughput */ if (pwr_move > pwr_now) - *imbalance = sds->busiest_load_per_task; + env->imbalance = sds->busiest_load_per_task; } /** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. + * @env: load balance environment * @sds: statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: Cpu for which currently load balance is being performed. - * @imbalance: The variable to store the imbalance. */ -static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, - unsigned long *imbalance) +static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { unsigned long max_pull, load_above_capacity = ~0UL; @@ -4123,8 +3970,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * its cpu_power, while calculating max_load..) */ if (sds->max_load < sds->avg_load) { - *imbalance = 0; - return fix_small_imbalance(sds, this_cpu, imbalance); + env->imbalance = 0; + return fix_small_imbalance(env, sds); } if (!sds->group_imb) { @@ -4152,7 +3999,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->sgp->power, + env->imbalance = min(max_pull * sds->busiest->sgp->power, (sds->avg_load - sds->this_load) * sds->this->sgp->power) / SCHED_POWER_SCALE; @@ -4162,8 +4009,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance < sds->busiest_load_per_task) - return fix_small_imbalance(sds, this_cpu, imbalance); + if (env->imbalance < sds->busiest_load_per_task) + return fix_small_imbalance(env, sds); } @@ -4194,9 +4041,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - const struct cpumask *cpus, int *balance) +find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; @@ -4206,7 +4051,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); + update_sd_lb_stats(env, cpus, balance, &sds); /* * this_cpu is not the appropriate cpu to perform load balancing at @@ -4215,8 +4060,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!(*balance)) goto ret; - if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && - check_asym_packing(sd, &sds, this_cpu, imbalance)) + if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && + check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ @@ -4234,7 +4079,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && + if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) goto force_balance; @@ -4252,7 +4097,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.this_load >= sds.avg_load) goto out_balanced; - if (idle == CPU_IDLE) { + if (env->idle == CPU_IDLE) { /* * This cpu is idle. If the busiest group load doesn't * have more tasks than the number of available cpu's and @@ -4267,34 +4112,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use * imbalance_pct to be conservative. */ - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) goto out_balanced; } force_balance: /* Looks like there is an imbalance. Compute it */ - calculate_imbalance(&sds, this_cpu, imbalance); + calculate_imbalance(env, &sds); return sds.busiest; out_balanced: - /* - * There is no obvious imbalance. But check if we can do some balancing - * to save power. - */ - if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) - return sds.busiest; ret: - *imbalance = 0; + env->imbalance = 0; return NULL; } /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -static struct rq * -find_busiest_queue(struct sched_domain *sd, struct sched_group *group, - enum cpu_idle_type idle, unsigned long imbalance, - const struct cpumask *cpus) +static struct rq *find_busiest_queue(struct lb_env *env, + struct sched_group *group, + const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -4307,7 +4145,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, unsigned long wl; if (!capacity) - capacity = fix_small_capacity(sd, group); + capacity = fix_small_capacity(env->sd, group); if (!cpumask_test_cpu(i, cpus)) continue; @@ -4319,7 +4157,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu power. */ - if (capacity && rq->nr_running == 1 && wl > imbalance) + if (capacity && rq->nr_running == 1 && wl > env->imbalance) continue; /* @@ -4348,40 +4186,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, /* Working cpumask for load_balance and load_balance_newidle. */ DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int idle, - int busiest_cpu, int this_cpu) +static int need_active_balance(struct lb_env *env) { - if (idle == CPU_NEWLY_IDLE) { + struct sched_domain *sd = env->sd; + + if (env->idle == CPU_NEWLY_IDLE) { /* * ASYM_PACKING needs to force migrate tasks from busy but * higher numbered CPUs in order to pack all tasks in the * lowest numbered CPUs. */ - if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) return 1; - - /* - * The only task running in a non-idle cpu can be moved to this - * cpu in an attempt to completely freeup the other CPU - * package. - * - * The package power saving logic comes from - * find_busiest_group(). If there are no imbalance, then - * f_b_g() will return NULL. However when sched_mc={1,2} then - * f_b_g() will select a group from which a running task may be - * pulled to this cpu in order to make the other package idle. - * If there is no opportunity to make a package idle and if - * there are no imbalance, then f_b_g() will return NULL and no - * action will be taken in load_balance_newidle(). - * - * Under normal task pull operation due to imbalance, there - * will be more than one task in the source run queue and - * move_tasks() will succeed. ld_moved will be true and this - * active balance code will not be triggered. - */ - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) - return 0; } return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); @@ -4399,7 +4216,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, { int ld_moved, active_balance = 0; struct sched_group *group; - unsigned long imbalance; struct rq *busiest; unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); @@ -4417,8 +4233,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, idle, - cpus, balance); + group = find_busiest_group(&env, cpus, balance); if (*balance == 0) goto out_balanced; @@ -4428,7 +4243,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); + busiest = find_busiest_queue(&env, group, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -4436,7 +4251,7 @@ redo: BUG_ON(busiest == this_rq); - schedstat_add(sd, lb_imbalance[idle], imbalance); + schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; if (busiest->nr_running > 1) { @@ -4447,10 +4262,9 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.load_move = imbalance; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; - env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: local_irq_save(flags); @@ -4492,7 +4306,7 @@ more_balance: if (idle != CPU_NEWLY_IDLE) sd->nr_balance_failed++; - if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { + if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -4519,10 +4333,11 @@ more_balance: } raw_spin_unlock_irqrestore(&busiest->lock, flags); - if (active_balance) + if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); + } /* * We've kicked active balancing, reset the failure @@ -4703,104 +4518,15 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -/** - * lowest_flag_domain - Return lowest sched_domain containing flag. - * @cpu: The cpu whose lowest level of sched domain is to - * be returned. - * @flag: The flag to check for the lowest sched_domain - * for the given cpu. - * - * Returns the lowest sched_domain of a cpu which contains the given flag. - */ -static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -{ - struct sched_domain *sd; - - for_each_domain(cpu, sd) - if (sd->flags & flag) - break; - - return sd; -} - -/** - * for_each_flag_domain - Iterates over sched_domains containing the flag. - * @cpu: The cpu whose domains we're iterating over. - * @sd: variable holding the value of the power_savings_sd - * for cpu. - * @flag: The flag to filter the sched_domains to be iterated. - * - * Iterates over all the scheduler domains for a given cpu that has the 'flag' - * set, starting from the lowest sched_domain to the highest. - */ -#define for_each_flag_domain(cpu, sd, flag) \ - for (sd = lowest_flag_domain(cpu, flag); \ - (sd && (sd->flags & flag)); sd = sd->parent) - -/** - * find_new_ilb - Finds the optimum idle load balancer for nomination. - * @cpu: The cpu which is nominating a new idle_load_balancer. - * - * Returns: Returns the id of the idle load balancer if it exists, - * Else, returns >= nr_cpu_ids. - * - * This algorithm picks the idle load balancer such that it belongs to a - * semi-idle powersavings sched_domain. The idea is to try and avoid - * completely idle packages/cores just for the purpose of idle load balancing - * when there are other idle cpu's which are better suited for that job. - */ -static int find_new_ilb(int cpu) +static inline int find_new_ilb(int call_cpu) { int ilb = cpumask_first(nohz.idle_cpus_mask); - struct sched_group *ilbg; - struct sched_domain *sd; - - /* - * Have idle load balancer selection from semi-idle packages only - * when power-aware load balancing is enabled - */ - if (!(sched_smt_power_savings || sched_mc_power_savings)) - goto out_done; - - /* - * Optimize for the case when we have no idle CPUs or only one - * idle CPU. Don't walk the sched_domain hierarchy in such cases - */ - if (cpumask_weight(nohz.idle_cpus_mask) < 2) - goto out_done; - - rcu_read_lock(); - for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { - ilbg = sd->groups; - - do { - if (ilbg->group_weight != - atomic_read(&ilbg->sgp->nr_busy_cpus)) { - ilb = cpumask_first_and(nohz.idle_cpus_mask, - sched_group_cpus(ilbg)); - goto unlock; - } - - ilbg = ilbg->next; - - } while (ilbg != sd->groups); - } -unlock: - rcu_read_unlock(); -out_done: if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; return nr_cpu_ids; } -#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ -static inline int find_new_ilb(int call_cpu) -{ - return nr_cpu_ids; -} -#endif /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick the @@ -5023,7 +4749,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) raw_spin_lock_irq(&this_rq->lock); update_rq_clock(this_rq); - update_cpu_load(this_rq); + update_idle_cpu_load(this_rq); raw_spin_unlock_irq(&this_rq->lock); rebalance_domains(balance_cpu, CPU_IDLE); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 91b4c957f289..b44d604b35d1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -4,7 +4,7 @@ * idle-task scheduling class. * * (NOTE: these are not related to SCHED_IDLE tasks which are - * handled in sched_fair.c) + * handled in sched/fair.c) */ #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 44af55e6d5d0..c5565c3c515f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1803,44 +1803,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_rt(struct task_struct *p, const struct cpumask *new_mask) { - int weight = cpumask_weight(new_mask); + struct rq *rq; + int weight; BUG_ON(!rt_task(p)); - /* - * Update the migration status of the RQ if we have an RT task - * which is running AND changing its weight value. - */ - if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { - struct rq *rq = task_rq(p); - - if (!task_current(rq, p)) { - /* - * Make sure we dequeue this task from the pushable list - * before going further. It will either remain off of - * the list because we are no longer pushable, or it - * will be requeued. - */ - if (p->rt.nr_cpus_allowed > 1) - dequeue_pushable_task(rq, p); + if (!p->on_rq) + return; - /* - * Requeue if our weight is changing and still > 1 - */ - if (weight > 1) - enqueue_pushable_task(rq, p); + weight = cpumask_weight(new_mask); - } + /* + * Only update if the process changes its state from whether it + * can migrate or not. + */ + if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) + return; - if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { - rq->rt.rt_nr_migratory++; - } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { - BUG_ON(!rq->rt.rt_nr_migratory); - rq->rt.rt_nr_migratory--; - } + rq = task_rq(p); - update_rt_migration(&rq->rt); + /* + * The process used to be able to migrate OR it can now migrate + */ + if (weight <= 1) { + if (!task_current(rq, p)) + dequeue_pushable_task(rq, p); + BUG_ON(!rq->rt.rt_nr_migratory); + rq->rt.rt_nr_migratory--; + } else { + if (!task_current(rq, p)) + enqueue_pushable_task(rq, p); + rq->rt.rt_nr_migratory++; } + + update_rt_migration(&rq->rt); } /* Assumes rq->lock is held */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb3acba4d52e..ba9dccfd24ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -201,7 +201,7 @@ struct cfs_bandwidth { }; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned long nr_running, h_nr_running; + unsigned int nr_running, h_nr_running; u64 exec_clock; u64 min_vruntime; @@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void) /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; - unsigned long rt_nr_running; + unsigned int rt_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ @@ -353,7 +353,7 @@ struct rq { * nr_running and cpu_load should be in the same cacheline because * remote CPUs use both these fields when doing load calculation. */ - unsigned long nr_running; + unsigned int nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; @@ -876,7 +876,7 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); -extern void update_cpu_load(struct rq *this_rq); +extern void update_idle_cpu_load(struct rq *this_rq); #ifdef CONFIG_CGROUP_CPUACCT #include <linux/cgroup.h> diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e8d76c5895ea..ee376beedaf9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -3,16 +3,357 @@ * * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> * - * This defines a simple but solid secure-computing mode. + * Copyright (C) 2012 Google, Inc. + * Will Drewry <wad@chromium.org> + * + * This defines a simple but solid secure-computing facility. + * + * Mode 1 uses a fixed list of allowed system calls. + * Mode 2 allows user-defined system call filters in the form + * of Berkeley Packet Filters/Linux Socket Filters. */ +#include <linux/atomic.h> #include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/sched.h> #include <linux/compat.h> +#include <linux/sched.h> +#include <linux/seccomp.h> /* #define SECCOMP_DEBUG 1 */ -#define NR_SECCOMP_MODES 1 + +#ifdef CONFIG_SECCOMP_FILTER +#include <asm/syscall.h> +#include <linux/filter.h> +#include <linux/ptrace.h> +#include <linux/security.h> +#include <linux/slab.h> +#include <linux/tracehook.h> +#include <linux/uaccess.h> + +/** + * struct seccomp_filter - container for seccomp BPF programs + * + * @usage: reference count to manage the object lifetime. + * get/put helpers should be used when accessing an instance + * outside of a lifetime-guarded section. In general, this + * is only needed for handling filters shared across tasks. + * @prev: points to a previously installed, or inherited, filter + * @len: the number of instructions in the program + * @insns: the BPF program instructions to evaluate + * + * seccomp_filter objects are organized in a tree linked via the @prev + * pointer. For any task, it appears to be a singly-linked list starting + * with current->seccomp.filter, the most recently attached or inherited filter. + * However, multiple filters may share a @prev node, by way of fork(), which + * results in a unidirectional tree existing in memory. This is similar to + * how namespaces work. + * + * seccomp_filter objects should never be modified after being attached + * to a task_struct (other than @usage). + */ +struct seccomp_filter { + atomic_t usage; + struct seccomp_filter *prev; + unsigned short len; /* Instruction count */ + struct sock_filter insns[]; +}; + +/* Limit any path through the tree to 256KB worth of instructions. */ +#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) + +/** + * get_u32 - returns a u32 offset into data + * @data: a unsigned 64 bit value + * @index: 0 or 1 to return the first or second 32-bits + * + * This inline exists to hide the length of unsigned long. If a 32-bit + * unsigned long is passed in, it will be extended and the top 32-bits will be + * 0. If it is a 64-bit unsigned long, then whatever data is resident will be + * properly returned. + * + * Endianness is explicitly ignored and left for BPF program authors to manage + * as per the specific architecture. + */ +static inline u32 get_u32(u64 data, int index) +{ + return ((u32 *)&data)[index]; +} + +/* Helper for bpf_load below. */ +#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) +/** + * bpf_load: checks and returns a pointer to the requested offset + * @off: offset into struct seccomp_data to load from + * + * Returns the requested 32-bits of data. + * seccomp_check_filter() should assure that @off is 32-bit aligned + * and not out of bounds. Failure to do so is a BUG. + */ +u32 seccomp_bpf_load(int off) +{ + struct pt_regs *regs = task_pt_regs(current); + if (off == BPF_DATA(nr)) + return syscall_get_nr(current, regs); + if (off == BPF_DATA(arch)) + return syscall_get_arch(current, regs); + if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { + unsigned long value; + int arg = (off - BPF_DATA(args[0])) / sizeof(u64); + int index = !!(off % sizeof(u64)); + syscall_get_arguments(current, regs, arg, 1, &value); + return get_u32(value, index); + } + if (off == BPF_DATA(instruction_pointer)) + return get_u32(KSTK_EIP(current), 0); + if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) + return get_u32(KSTK_EIP(current), 1); + /* seccomp_check_filter should make this impossible. */ + BUG(); +} + +/** + * seccomp_check_filter - verify seccomp filter code + * @filter: filter to verify + * @flen: length of filter + * + * Takes a previously checked filter (by sk_chk_filter) and + * redirects all filter code that loads struct sk_buff data + * and related data through seccomp_bpf_load. It also + * enforces length and alignment checking of those loads. + * + * Returns 0 if the rule set is legal or -EINVAL if not. + */ +static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) +{ + int pc; + for (pc = 0; pc < flen; pc++) { + struct sock_filter *ftest = &filter[pc]; + u16 code = ftest->code; + u32 k = ftest->k; + + switch (code) { + case BPF_S_LD_W_ABS: + ftest->code = BPF_S_ANC_SECCOMP_LD_W; + /* 32-bit aligned and not out of bounds. */ + if (k >= sizeof(struct seccomp_data) || k & 3) + return -EINVAL; + continue; + case BPF_S_LD_W_LEN: + ftest->code = BPF_S_LD_IMM; + ftest->k = sizeof(struct seccomp_data); + continue; + case BPF_S_LDX_W_LEN: + ftest->code = BPF_S_LDX_IMM; + ftest->k = sizeof(struct seccomp_data); + continue; + /* Explicitly include allowed calls. */ + case BPF_S_RET_K: + case BPF_S_RET_A: + case BPF_S_ALU_ADD_K: + case BPF_S_ALU_ADD_X: + case BPF_S_ALU_SUB_K: + case BPF_S_ALU_SUB_X: + case BPF_S_ALU_MUL_K: + case BPF_S_ALU_MUL_X: + case BPF_S_ALU_DIV_X: + case BPF_S_ALU_AND_K: + case BPF_S_ALU_AND_X: + case BPF_S_ALU_OR_K: + case BPF_S_ALU_OR_X: + case BPF_S_ALU_LSH_K: + case BPF_S_ALU_LSH_X: + case BPF_S_ALU_RSH_K: + case BPF_S_ALU_RSH_X: + case BPF_S_ALU_NEG: + case BPF_S_LD_IMM: + case BPF_S_LDX_IMM: + case BPF_S_MISC_TAX: + case BPF_S_MISC_TXA: + case BPF_S_ALU_DIV_K: + case BPF_S_LD_MEM: + case BPF_S_LDX_MEM: + case BPF_S_ST: + case BPF_S_STX: + case BPF_S_JMP_JA: + case BPF_S_JMP_JEQ_K: + case BPF_S_JMP_JEQ_X: + case BPF_S_JMP_JGE_K: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JSET_K: + case BPF_S_JMP_JSET_X: + continue; + default: + return -EINVAL; + } + } + return 0; +} + +/** + * seccomp_run_filters - evaluates all seccomp filters against @syscall + * @syscall: number of the current system call + * + * Returns valid seccomp BPF response codes. + */ +static u32 seccomp_run_filters(int syscall) +{ + struct seccomp_filter *f; + u32 ret = SECCOMP_RET_ALLOW; + + /* Ensure unexpected behavior doesn't result in failing open. */ + if (WARN_ON(current->seccomp.filter == NULL)) + return SECCOMP_RET_KILL; + + /* + * All filters in the list are evaluated and the lowest BPF return + * value always takes priority (ignoring the DATA). + */ + for (f = current->seccomp.filter; f; f = f->prev) { + u32 cur_ret = sk_run_filter(NULL, f->insns); + if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) + ret = cur_ret; + } + return ret; +} + +/** + * seccomp_attach_filter: Attaches a seccomp filter to current. + * @fprog: BPF program to install + * + * Returns 0 on success or an errno on failure. + */ +static long seccomp_attach_filter(struct sock_fprog *fprog) +{ + struct seccomp_filter *filter; + unsigned long fp_size = fprog->len * sizeof(struct sock_filter); + unsigned long total_insns = fprog->len; + long ret; + + if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) + return -EINVAL; + + for (filter = current->seccomp.filter; filter; filter = filter->prev) + total_insns += filter->len + 4; /* include a 4 instr penalty */ + if (total_insns > MAX_INSNS_PER_PATH) + return -ENOMEM; + + /* + * Installing a seccomp filter requires that the task have + * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. + * This avoids scenarios where unprivileged tasks can affect the + * behavior of privileged children. + */ + if (!current->no_new_privs && + security_capable_noaudit(current_cred(), current_user_ns(), + CAP_SYS_ADMIN) != 0) + return -EACCES; + + /* Allocate a new seccomp_filter */ + filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, + GFP_KERNEL|__GFP_NOWARN); + if (!filter) + return -ENOMEM; + atomic_set(&filter->usage, 1); + filter->len = fprog->len; + + /* Copy the instructions from fprog. */ + ret = -EFAULT; + if (copy_from_user(filter->insns, fprog->filter, fp_size)) + goto fail; + + /* Check and rewrite the fprog via the skb checker */ + ret = sk_chk_filter(filter->insns, filter->len); + if (ret) + goto fail; + + /* Check and rewrite the fprog for seccomp use */ + ret = seccomp_check_filter(filter->insns, filter->len); + if (ret) + goto fail; + + /* + * If there is an existing filter, make it the prev and don't drop its + * task reference. + */ + filter->prev = current->seccomp.filter; + current->seccomp.filter = filter; + return 0; +fail: + kfree(filter); + return ret; +} + +/** + * seccomp_attach_user_filter - attaches a user-supplied sock_fprog + * @user_filter: pointer to the user data containing a sock_fprog. + * + * Returns 0 on success and non-zero otherwise. + */ +long seccomp_attach_user_filter(char __user *user_filter) +{ + struct sock_fprog fprog; + long ret = -EFAULT; + +#ifdef CONFIG_COMPAT + if (is_compat_task()) { + struct compat_sock_fprog fprog32; + if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) + goto out; + fprog.len = fprog32.len; + fprog.filter = compat_ptr(fprog32.filter); + } else /* falls through to the if below. */ +#endif + if (copy_from_user(&fprog, user_filter, sizeof(fprog))) + goto out; + ret = seccomp_attach_filter(&fprog); +out: + return ret; +} + +/* get_seccomp_filter - increments the reference count of the filter on @tsk */ +void get_seccomp_filter(struct task_struct *tsk) +{ + struct seccomp_filter *orig = tsk->seccomp.filter; + if (!orig) + return; + /* Reference count is bounded by the number of total processes. */ + atomic_inc(&orig->usage); +} + +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + struct seccomp_filter *orig = tsk->seccomp.filter; + /* Clean up single-reference branches iteratively. */ + while (orig && atomic_dec_and_test(&orig->usage)) { + struct seccomp_filter *freeme = orig; + orig = orig->prev; + kfree(freeme); + } +} + +/** + * seccomp_send_sigsys - signals the task to allow in-process syscall emulation + * @syscall: syscall number to send to userland + * @reason: filter-supplied reason code to send to userland (via si_errno) + * + * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. + */ +static void seccomp_send_sigsys(int syscall, int reason) +{ + struct siginfo info; + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSYS; + info.si_code = SYS_SECCOMP; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = reason; + info.si_arch = syscall_get_arch(current, task_pt_regs(current)); + info.si_syscall = syscall; + force_sig_info(SIGSYS, &info, current); +} +#endif /* CONFIG_SECCOMP_FILTER */ /* * Secure computing mode 1 allows only read/write/exit/sigreturn. @@ -31,13 +372,15 @@ static int mode1_syscalls_32[] = { }; #endif -void __secure_computing(int this_syscall) +int __secure_computing(int this_syscall) { int mode = current->seccomp.mode; - int * syscall; + int exit_sig = 0; + int *syscall; + u32 ret; switch (mode) { - case 1: + case SECCOMP_MODE_STRICT: syscall = mode1_syscalls; #ifdef CONFIG_COMPAT if (is_compat_task()) @@ -45,9 +388,54 @@ void __secure_computing(int this_syscall) #endif do { if (*syscall == this_syscall) - return; + return 0; } while (*++syscall); + exit_sig = SIGKILL; + ret = SECCOMP_RET_KILL; + break; +#ifdef CONFIG_SECCOMP_FILTER + case SECCOMP_MODE_FILTER: { + int data; + ret = seccomp_run_filters(this_syscall); + data = ret & SECCOMP_RET_DATA; + ret &= SECCOMP_RET_ACTION; + switch (ret) { + case SECCOMP_RET_ERRNO: + /* Set the low-order 16-bits as a errno. */ + syscall_set_return_value(current, task_pt_regs(current), + -data, 0); + goto skip; + case SECCOMP_RET_TRAP: + /* Show the handler the original registers. */ + syscall_rollback(current, task_pt_regs(current)); + /* Let the filter pass back 16 bits of data. */ + seccomp_send_sigsys(this_syscall, data); + goto skip; + case SECCOMP_RET_TRACE: + /* Skip these calls if there is no tracer. */ + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) + goto skip; + /* Allow the BPF to provide the event message */ + ptrace_event(PTRACE_EVENT_SECCOMP, data); + /* + * The delivery of a fatal signal during event + * notification may silently skip tracer notification. + * Terminating the task now avoids executing a system + * call that may not be intended. + */ + if (fatal_signal_pending(current)) + break; + return 0; + case SECCOMP_RET_ALLOW: + return 0; + case SECCOMP_RET_KILL: + default: + break; + } + exit_sig = SIGSYS; break; + } +#endif default: BUG(); } @@ -55,8 +443,13 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif - audit_seccomp(this_syscall); - do_exit(SIGKILL); + audit_seccomp(this_syscall, exit_sig, ret); + do_exit(exit_sig); +#ifdef CONFIG_SECCOMP_FILTER +skip: + audit_seccomp(this_syscall, exit_sig, ret); +#endif + return -1; } long prctl_get_seccomp(void) @@ -64,25 +457,48 @@ long prctl_get_seccomp(void) return current->seccomp.mode; } -long prctl_set_seccomp(unsigned long seccomp_mode) +/** + * prctl_set_seccomp: configures current->seccomp.mode + * @seccomp_mode: requested mode to use + * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * + * This function may be called repeatedly with a @seccomp_mode of + * SECCOMP_MODE_FILTER to install additional filters. Every filter + * successfully installed will be evaluated (in reverse order) for each system + * call the task makes. + * + * Once current->seccomp.mode is non-zero, it may not be changed. + * + * Returns 0 on success or -EINVAL on failure. + */ +long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) { - long ret; + long ret = -EINVAL; - /* can set it only once to be even more secure */ - ret = -EPERM; - if (unlikely(current->seccomp.mode)) + if (current->seccomp.mode && + current->seccomp.mode != seccomp_mode) goto out; - ret = -EINVAL; - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { - current->seccomp.mode = seccomp_mode; - set_thread_flag(TIF_SECCOMP); + switch (seccomp_mode) { + case SECCOMP_MODE_STRICT: + ret = 0; #ifdef TIF_NOTSC disable_TSC(); #endif - ret = 0; + break; +#ifdef CONFIG_SECCOMP_FILTER + case SECCOMP_MODE_FILTER: + ret = seccomp_attach_user_filter(filter); + if (ret) + goto out; + break; +#endif + default: + goto out; } - out: + current->seccomp.mode = seccomp_mode; + set_thread_flag(TIF_SECCOMP); +out: return ret; } diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 60636a4e25c3..4567fc020fe3 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -118,7 +118,7 @@ EXPORT_SYMBOL(down_killable); * down_trylock - try to acquire the semaphore, without waiting * @sem: the semaphore to be acquired * - * Try to acquire the semaphore atomically. Returns 0 if the mutex has + * Try to acquire the semaphore atomically. Returns 0 if the semaphore has * been acquired successfully or 1 if it it cannot be acquired. * * NOTE: This return value is inverted from both spin_trylock and diff --git a/kernel/signal.c b/kernel/signal.c index 3ad220a81619..4dbf00dfb359 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -160,7 +160,7 @@ void recalc_sigpending(void) #define SYNCHRONOUS_MASK \ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ - sigmask(SIGTRAP) | sigmask(SIGFPE)) + sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) int next_signal(struct sigpending *pending, sigset_t *mask) { @@ -767,14 +767,13 @@ static int kill_ok_by_cred(struct task_struct *t) const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); - if (cred->user->user_ns == tcred->user->user_ns && - (cred->euid == tcred->suid || - cred->euid == tcred->uid || - cred->uid == tcred->suid || - cred->uid == tcred->uid)) + if (uid_eq(cred->euid, tcred->suid) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->uid, tcred->suid) || + uid_eq(cred->uid, tcred->uid)) return 1; - if (ns_capable(tcred->user->user_ns, CAP_KILL)) + if (ns_capable(tcred->user_ns, CAP_KILL)) return 1; return 0; @@ -1020,15 +1019,6 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -/* - * map the uid in struct cred into user namespace *ns - */ -static inline uid_t map_cred_ns(const struct cred *cred, - struct user_namespace *ns) -{ - return user_ns_map_uid(ns, cred, cred->uid); -} - #ifdef CONFIG_USER_NS static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) { @@ -1038,8 +1028,10 @@ static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_str if (SI_FROMKERNEL(info)) return; - info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), - current_cred(), info->si_uid); + rcu_read_lock(); + info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), + make_kuid(current_user_ns(), info->si_uid)); + rcu_read_unlock(); } #else static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) @@ -1106,7 +1098,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); - q->info.si_uid = current_uid(); + q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); break; case (unsigned long) SEND_SIG_PRIV: q->info.si_signo = sig; @@ -1387,10 +1379,8 @@ static int kill_as_cred_perm(const struct cred *cred, struct task_struct *target) { const struct cred *pcred = __task_cred(target); - if (cred->user_ns != pcred->user_ns) - return 0; - if (cred->euid != pcred->suid && cred->euid != pcred->uid && - cred->uid != pcred->suid && cred->uid != pcred->uid) + if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && + !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) return 0; return 1; } @@ -1678,8 +1668,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = map_cred_ns(__task_cred(tsk), - task_cred_xxx(tsk->parent, user_ns)); + info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), + task_uid(tsk)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); @@ -1762,8 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = map_cred_ns(__task_cred(tsk), - task_cred_xxx(parent, user_ns)); + info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime); @@ -1973,7 +1962,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); - info.si_uid = current_uid(); + info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ ptrace_stop(exit_code, why, 1, &info); @@ -2181,8 +2170,8 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_code = SI_USER; rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); - info->si_uid = map_cred_ns(__task_cred(current->parent), - current_user_ns()); + info->si_uid = from_kuid_munged(current_user_ns(), + task_uid(current->parent)); rcu_read_unlock(); } @@ -2706,6 +2695,13 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; +#ifdef __ARCH_SIGSYS + case __SI_SYS: + err |= __put_user(from->si_call_addr, &to->si_call_addr); + err |= __put_user(from->si_syscall, &to->si_syscall); + err |= __put_user(from->si_arch, &to->si_arch); + break; +#endif default: /* this is just in case for now ... */ err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); @@ -2828,7 +2824,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) info.si_errno = 0; info.si_code = SI_USER; info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); return kill_something_info(sig, &info, pid); } @@ -2871,7 +2867,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) info.si_errno = 0; info.si_code = SI_TKILL; info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); return do_send_specific(tgid, pid, sig, &info); } diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e4..6df42624e454 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -93,10 +93,8 @@ int overflowuid = DEFAULT_OVERFLOWUID; int overflowgid = DEFAULT_OVERFLOWGID; -#ifdef CONFIG_UID16 EXPORT_SYMBOL(overflowuid); EXPORT_SYMBOL(overflowgid); -#endif /* * the same as above, but for filesystems which can only store a 16-bit @@ -133,11 +131,10 @@ static bool set_one_prio_perm(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred = __task_cred(p); - if (pcred->user->user_ns == cred->user->user_ns && - (pcred->uid == cred->euid || - pcred->euid == cred->euid)) + if (uid_eq(pcred->uid, cred->euid) || + uid_eq(pcred->euid, cred->euid)) return true; - if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) return true; return false; } @@ -177,6 +174,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; + kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) goto out; @@ -209,18 +207,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = (struct user_struct *) cred->user; + uid = make_kuid(cred->user_ns, who); + user = cred->user; if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && + !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - if (__task_cred(p)->uid == who) + if (uid_eq(task_uid(p), uid)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); - if (who != cred->uid) + if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } @@ -244,6 +243,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; + kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; @@ -274,21 +274,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = (struct user_struct *) cred->user; + uid = make_kuid(cred->user_ns, who); + user = cred->user; if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && + !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - if (__task_cred(p)->uid == who) { + if (uid_eq(task_uid(p), uid)) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } } while_each_thread(g, p); - if (who != cred->uid) + if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } @@ -553,9 +554,19 @@ void ctrl_alt_del(void) */ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kgid_t krgid, kegid; + + krgid = make_kgid(ns, rgid); + kegid = make_kgid(ns, egid); + + if ((rgid != (gid_t) -1) && !gid_valid(krgid)) + return -EINVAL; + if ((egid != (gid_t) -1) && !gid_valid(kegid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -564,25 +575,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) retval = -EPERM; if (rgid != (gid_t) -1) { - if (old->gid == rgid || - old->egid == rgid || + if (gid_eq(old->gid, krgid) || + gid_eq(old->egid, krgid) || nsown_capable(CAP_SETGID)) - new->gid = rgid; + new->gid = krgid; else goto error; } if (egid != (gid_t) -1) { - if (old->gid == egid || - old->egid == egid || - old->sgid == egid || + if (gid_eq(old->gid, kegid) || + gid_eq(old->egid, kegid) || + gid_eq(old->sgid, kegid) || nsown_capable(CAP_SETGID)) - new->egid = egid; + new->egid = kegid; else goto error; } if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old->gid)) + (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) new->sgid = new->egid; new->fsgid = new->egid; @@ -600,9 +611,15 @@ error: */ SYSCALL_DEFINE1(setgid, gid_t, gid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kgid_t kgid; + + kgid = make_kgid(ns, gid); + if (!gid_valid(kgid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -611,9 +628,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) retval = -EPERM; if (nsown_capable(CAP_SETGID)) - new->gid = new->egid = new->sgid = new->fsgid = gid; - else if (gid == old->gid || gid == old->sgid) - new->egid = new->fsgid = gid; + new->gid = new->egid = new->sgid = new->fsgid = kgid; + else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) + new->egid = new->fsgid = kgid; else goto error; @@ -631,7 +648,7 @@ static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current_user_ns(), new->uid); + new_user = alloc_uid(new->uid); if (!new_user) return -EAGAIN; @@ -670,9 +687,19 @@ static int set_user(struct cred *new) */ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kuid_t kruid, keuid; + + kruid = make_kuid(ns, ruid); + keuid = make_kuid(ns, euid); + + if ((ruid != (uid_t) -1) && !uid_valid(kruid)) + return -EINVAL; + if ((euid != (uid_t) -1) && !uid_valid(keuid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -681,29 +708,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) retval = -EPERM; if (ruid != (uid_t) -1) { - new->uid = ruid; - if (old->uid != ruid && - old->euid != ruid && + new->uid = kruid; + if (!uid_eq(old->uid, kruid) && + !uid_eq(old->euid, kruid) && !nsown_capable(CAP_SETUID)) goto error; } if (euid != (uid_t) -1) { - new->euid = euid; - if (old->uid != euid && - old->euid != euid && - old->suid != euid && + new->euid = keuid; + if (!uid_eq(old->uid, keuid) && + !uid_eq(old->euid, keuid) && + !uid_eq(old->suid, keuid) && !nsown_capable(CAP_SETUID)) goto error; } - if (new->uid != old->uid) { + if (!uid_eq(new->uid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old->uid)) + (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) new->suid = new->euid; new->fsuid = new->euid; @@ -731,9 +758,15 @@ error: */ SYSCALL_DEFINE1(setuid, uid_t, uid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kuid_t kuid; + + kuid = make_kuid(ns, uid); + if (!uid_valid(kuid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -742,17 +775,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (nsown_capable(CAP_SETUID)) { - new->suid = new->uid = uid; - if (uid != old->uid) { + new->suid = new->uid = kuid; + if (!uid_eq(kuid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } - } else if (uid != old->uid && uid != new->suid) { + } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { goto error; } - new->fsuid = new->euid = uid; + new->fsuid = new->euid = kuid; retval = security_task_fix_setuid(new, old, LSM_SETID_ID); if (retval < 0) @@ -772,9 +805,24 @@ error: */ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kuid_t kruid, keuid, ksuid; + + kruid = make_kuid(ns, ruid); + keuid = make_kuid(ns, euid); + ksuid = make_kuid(ns, suid); + + if ((ruid != (uid_t) -1) && !uid_valid(kruid)) + return -EINVAL; + + if ((euid != (uid_t) -1) && !uid_valid(keuid)) + return -EINVAL; + + if ((suid != (uid_t) -1) && !uid_valid(ksuid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -784,29 +832,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) retval = -EPERM; if (!nsown_capable(CAP_SETUID)) { - if (ruid != (uid_t) -1 && ruid != old->uid && - ruid != old->euid && ruid != old->suid) + if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && + !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; - if (euid != (uid_t) -1 && euid != old->uid && - euid != old->euid && euid != old->suid) + if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && + !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) goto error; - if (suid != (uid_t) -1 && suid != old->uid && - suid != old->euid && suid != old->suid) + if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && + !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) goto error; } if (ruid != (uid_t) -1) { - new->uid = ruid; - if (ruid != old->uid) { + new->uid = kruid; + if (!uid_eq(kruid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } if (euid != (uid_t) -1) - new->euid = euid; + new->euid = keuid; if (suid != (uid_t) -1) - new->suid = suid; + new->suid = ksuid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RES); @@ -820,14 +868,19 @@ error: return retval; } -SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) +SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); int retval; + uid_t ruid, euid, suid; - if (!(retval = put_user(cred->uid, ruid)) && - !(retval = put_user(cred->euid, euid))) - retval = put_user(cred->suid, suid); + ruid = from_kuid_munged(cred->user_ns, cred->uid); + euid = from_kuid_munged(cred->user_ns, cred->euid); + suid = from_kuid_munged(cred->user_ns, cred->suid); + + if (!(retval = put_user(ruid, ruidp)) && + !(retval = put_user(euid, euidp))) + retval = put_user(suid, suidp); return retval; } @@ -837,9 +890,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u */ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kgid_t krgid, kegid, ksgid; + + krgid = make_kgid(ns, rgid); + kegid = make_kgid(ns, egid); + ksgid = make_kgid(ns, sgid); + + if ((rgid != (gid_t) -1) && !gid_valid(krgid)) + return -EINVAL; + if ((egid != (gid_t) -1) && !gid_valid(kegid)) + return -EINVAL; + if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -848,23 +914,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) retval = -EPERM; if (!nsown_capable(CAP_SETGID)) { - if (rgid != (gid_t) -1 && rgid != old->gid && - rgid != old->egid && rgid != old->sgid) + if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && + !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) goto error; - if (egid != (gid_t) -1 && egid != old->gid && - egid != old->egid && egid != old->sgid) + if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && + !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) goto error; - if (sgid != (gid_t) -1 && sgid != old->gid && - sgid != old->egid && sgid != old->sgid) + if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && + !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) goto error; } if (rgid != (gid_t) -1) - new->gid = rgid; + new->gid = krgid; if (egid != (gid_t) -1) - new->egid = egid; + new->egid = kegid; if (sgid != (gid_t) -1) - new->sgid = sgid; + new->sgid = ksgid; new->fsgid = new->egid; return commit_creds(new); @@ -874,14 +940,19 @@ error: return retval; } -SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) +SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); int retval; + gid_t rgid, egid, sgid; + + rgid = from_kgid_munged(cred->user_ns, cred->gid); + egid = from_kgid_munged(cred->user_ns, cred->egid); + sgid = from_kgid_munged(cred->user_ns, cred->sgid); - if (!(retval = put_user(cred->gid, rgid)) && - !(retval = put_user(cred->egid, egid))) - retval = put_user(cred->sgid, sgid); + if (!(retval = put_user(rgid, rgidp)) && + !(retval = put_user(egid, egidp))) + retval = put_user(sgid, sgidp); return retval; } @@ -898,18 +969,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) const struct cred *old; struct cred *new; uid_t old_fsuid; + kuid_t kuid; + + old = current_cred(); + old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); + + kuid = make_kuid(old->user_ns, uid); + if (!uid_valid(kuid)) + return old_fsuid; new = prepare_creds(); if (!new) - return current_fsuid(); - old = current_cred(); - old_fsuid = old->fsuid; + return old_fsuid; - if (uid == old->uid || uid == old->euid || - uid == old->suid || uid == old->fsuid || + if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || + uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || nsown_capable(CAP_SETUID)) { - if (uid != old_fsuid) { - new->fsuid = uid; + if (!uid_eq(kuid, old->fsuid)) { + new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) goto change_okay; } @@ -931,18 +1008,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) const struct cred *old; struct cred *new; gid_t old_fsgid; + kgid_t kgid; + + old = current_cred(); + old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); + + kgid = make_kgid(old->user_ns, gid); + if (!gid_valid(kgid)) + return old_fsgid; new = prepare_creds(); if (!new) - return current_fsgid(); - old = current_cred(); - old_fsgid = old->fsgid; + return old_fsgid; - if (gid == old->gid || gid == old->egid || - gid == old->sgid || gid == old->fsgid || + if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || + gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || nsown_capable(CAP_SETGID)) { - if (gid != old_fsgid) { - new->fsgid = gid; + if (!gid_eq(kgid, old->fsgid)) { + new->fsgid = kgid; goto change_okay; } } @@ -1498,15 +1581,14 @@ static int check_prlimit_permission(struct task_struct *task) return 0; tcred = __task_cred(task); - if (cred->user->user_ns == tcred->user->user_ns && - (cred->uid == tcred->euid && - cred->uid == tcred->suid && - cred->uid == tcred->uid && - cred->gid == tcred->egid && - cred->gid == tcred->sgid && - cred->gid == tcred->gid)) + if (uid_eq(cred->uid, tcred->euid) && + uid_eq(cred->uid, tcred->suid) && + uid_eq(cred->uid, tcred->uid) && + gid_eq(cred->gid, tcred->egid) && + gid_eq(cred->gid, tcred->sgid) && + gid_eq(cred->gid, tcred->gid)) return 0; - if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) return 0; return -EPERM; @@ -1908,7 +1990,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = prctl_get_seccomp(); break; case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2); + error = prctl_set_seccomp(arg2, (char __user *)arg3); break; case PR_GET_TSC: error = GET_TSC_CTL(arg2); @@ -1979,6 +2061,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = put_user(me->signal->is_child_subreaper, (int __user *) arg2); break; + case PR_SET_NO_NEW_PRIVS: + if (arg2 != 1 || arg3 || arg4 || arg5) + return -EINVAL; + + current->no_new_privs = 1; + break; + case PR_GET_NO_NEW_PRIVS: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + return current->no_new_privs ? 1 : 0; default: error = -EINVAL; break; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7b..aa27d391bfc8 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); * If one has not already been chosen, it checks to see if a * functional rtc device is available. */ -static struct rtc_device *alarmtimer_get_rtcdev(void) +struct rtc_device *alarmtimer_get_rtcdev(void) { unsigned long flags; struct rtc_device *ret; @@ -115,7 +115,7 @@ static void alarmtimer_rtc_interface_remove(void) class_interface_unregister(&alarmtimer_rtc_interface); } #else -static inline struct rtc_device *alarmtimer_get_rtcdev(void) +struct rtc_device *alarmtimer_get_rtcdev(void) { return NULL; } diff --git a/kernel/timer.c b/kernel/timer.c index 837c552fe838..6ec7e7e0db43 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1108,7 +1108,9 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), * warnings as well as problems when looking into * timer->lockdep_map, make a copy and use that here. */ - struct lockdep_map lockdep_map = timer->lockdep_map; + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &timer->lockdep_map); #endif /* * Couple the lock chain with the lock chain at @@ -1433,25 +1435,25 @@ SYSCALL_DEFINE0(getppid) SYSCALL_DEFINE0(getuid) { /* Only we change this so SMP safe */ - return current_uid(); + return from_kuid_munged(current_user_ns(), current_uid()); } SYSCALL_DEFINE0(geteuid) { /* Only we change this so SMP safe */ - return current_euid(); + return from_kuid_munged(current_user_ns(), current_euid()); } SYSCALL_DEFINE0(getgid) { /* Only we change this so SMP safe */ - return current_gid(); + return from_kgid_munged(current_user_ns(), current_gid()); } SYSCALL_DEFINE0(getegid) { /* Only we change this so SMP safe */ - return current_egid(); + return from_kgid_munged(current_user_ns(), current_egid()); } #endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a1d2849f2473..f347ac91292d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,7 +141,6 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER @@ -272,7 +271,7 @@ config PROFILE_ANNOTATED_BRANCHES bool "Trace likely/unlikely profiler" select TRACE_BRANCH_PROFILING help - This tracer profiles all the the likely and unlikely macros + This tracer profiles all likely and unlikely macros in the kernel. It will display the results in: /sys/kernel/debug/tracing/trace_stat/branch_annotated diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5ea..b3afe0e76f79 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) obj-$(CONFIG_EVENT_TRACING) += blktrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0fa92f677c92..a008663d86c8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1383,44 +1383,73 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) static int ftrace_cmp_recs(const void *a, const void *b) { - const struct dyn_ftrace *reca = a; - const struct dyn_ftrace *recb = b; + const struct dyn_ftrace *key = a; + const struct dyn_ftrace *rec = b; - if (reca->ip > recb->ip) - return 1; - if (reca->ip < recb->ip) + if (key->flags < rec->ip) return -1; + if (key->ip >= rec->ip + MCOUNT_INSN_SIZE) + return 1; return 0; } -/** - * ftrace_location - return true if the ip giving is a traced location - * @ip: the instruction pointer to check - * - * Returns 1 if @ip given is a pointer to a ftrace location. - * That is, the instruction that is either a NOP or call to - * the function tracer. It checks the ftrace internal tables to - * determine if the address belongs or not. - */ -int ftrace_location(unsigned long ip) +static unsigned long ftrace_location_range(unsigned long start, unsigned long end) { struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; - key.ip = ip; + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; rec = bsearch(&key, pg->records, pg->index, sizeof(struct dyn_ftrace), ftrace_cmp_recs); if (rec) - return 1; + return rec->ip; } return 0; } +/** + * ftrace_location - return true if the ip giving is a traced location + * @ip: the instruction pointer to check + * + * Returns rec->ip if @ip given is a pointer to a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +unsigned long ftrace_location(unsigned long ip) +{ + return ftrace_location_range(ip, ip); +} + +/** + * ftrace_text_reserved - return true if range contains an ftrace location + * @start: start of range to search + * @end: end of range to search (inclusive). @end points to the last byte to check. + * + * Returns 1 if @start and @end contains a ftrace location. + * That is, the instruction that is either a NOP or call to + * the function tracer. It checks the ftrace internal tables to + * determine if the address belongs or not. + */ +int ftrace_text_reserved(void *start, void *end) +{ + unsigned long ret; + + ret = ftrace_location_range((unsigned long)start, + (unsigned long)end); + + return (int)!!ret; +} + static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1520,35 +1549,6 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops, __ftrace_hash_rec_update(ops, filter_hash, 1); } -static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) -{ - if (ftrace_pages->index == ftrace_pages->size) { - /* We should have allocated enough */ - if (WARN_ON(!ftrace_pages->next)) - return NULL; - ftrace_pages = ftrace_pages->next; - } - - return &ftrace_pages->records[ftrace_pages->index++]; -} - -static struct dyn_ftrace * -ftrace_record_ip(unsigned long ip) -{ - struct dyn_ftrace *rec; - - if (ftrace_disabled) - return NULL; - - rec = ftrace_alloc_dyn_node(ip); - if (!rec) - return NULL; - - rec->ip = ip; - - return rec; -} - static void print_ip_ins(const char *fmt, unsigned char *p) { int i; @@ -1598,21 +1598,6 @@ void ftrace_bug(int failed, unsigned long ip) } } - -/* Return 1 if the address range is reserved for ftrace */ -int ftrace_text_reserved(void *start, void *end) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - - do_for_each_ftrace_rec(pg, rec) { - if (rec->ip <= (unsigned long)end && - rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) - return 1; - } while_for_each_ftrace_rec(); - return 0; -} - static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { unsigned long flag = 0UL; @@ -1698,7 +1683,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return -1; /* unknow ftrace bug */ } -static void ftrace_replace_code(int update) +void __weak ftrace_replace_code(int enable) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -1708,7 +1693,7 @@ static void ftrace_replace_code(int update) return; do_for_each_ftrace_rec(pg, rec) { - failed = __ftrace_replace_code(rec, update); + failed = __ftrace_replace_code(rec, enable); if (failed) { ftrace_bug(failed, rec->ip); /* Stop processing */ @@ -1826,22 +1811,27 @@ int __weak ftrace_arch_code_modify_post_process(void) return 0; } -static int __ftrace_modify_code(void *data) +void ftrace_modify_all_code(int command) { - int *command = data; - - if (*command & FTRACE_UPDATE_CALLS) + if (command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); - else if (*command & FTRACE_DISABLE_CALLS) + else if (command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - if (*command & FTRACE_UPDATE_TRACE_FUNC) + if (command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); - if (*command & FTRACE_START_FUNC_RET) + if (command & FTRACE_START_FUNC_RET) ftrace_enable_ftrace_graph_caller(); - else if (*command & FTRACE_STOP_FUNC_RET) + else if (command & FTRACE_STOP_FUNC_RET) ftrace_disable_ftrace_graph_caller(); +} + +static int __ftrace_modify_code(void *data) +{ + int *command = data; + + ftrace_modify_all_code(*command); return 0; } @@ -2469,57 +2459,35 @@ static int ftrace_avail_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; - int ret; if (unlikely(ftrace_disabled)) return -ENODEV; - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - iter->pg = ftrace_pages_start; - iter->ops = &global_ops; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = iter; - } else { - kfree(iter); + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (iter) { + iter->pg = ftrace_pages_start; + iter->ops = &global_ops; } - return ret; + return iter ? 0 : -ENOMEM; } static int ftrace_enabled_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; - int ret; if (unlikely(ftrace_disabled)) return -ENODEV; - iter = kzalloc(sizeof(*iter), GFP_KERNEL); - if (!iter) - return -ENOMEM; - - iter->pg = ftrace_pages_start; - iter->flags = FTRACE_ITER_ENABLED; - iter->ops = &global_ops; - - ret = seq_open(file, &show_ftrace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = iter; - } else { - kfree(iter); + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); + if (iter) { + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; } - return ret; + return iter ? 0 : -ENOMEM; } static void ftrace_filter_reset(struct ftrace_hash *hash) @@ -3688,22 +3656,36 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) return 0; } -static void ftrace_swap_recs(void *a, void *b, int size) +static int ftrace_cmp_ips(const void *a, const void *b) +{ + const unsigned long *ipa = a; + const unsigned long *ipb = b; + + if (*ipa > *ipb) + return 1; + if (*ipa < *ipb) + return -1; + return 0; +} + +static void ftrace_swap_ips(void *a, void *b, int size) { - struct dyn_ftrace *reca = a; - struct dyn_ftrace *recb = b; - struct dyn_ftrace t; + unsigned long *ipa = a; + unsigned long *ipb = b; + unsigned long t; - t = *reca; - *reca = *recb; - *recb = t; + t = *ipa; + *ipa = *ipb; + *ipb = t; } static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) { + struct ftrace_page *start_pg; struct ftrace_page *pg; + struct dyn_ftrace *rec; unsigned long count; unsigned long *p; unsigned long addr; @@ -3715,8 +3697,11 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; - pg = ftrace_allocate_pages(count); - if (!pg) + sort(start, count, sizeof(*start), + ftrace_cmp_ips, ftrace_swap_ips); + + start_pg = ftrace_allocate_pages(count); + if (!start_pg) return -ENOMEM; mutex_lock(&ftrace_lock); @@ -3729,7 +3714,7 @@ static int ftrace_process_locs(struct module *mod, if (!mod) { WARN_ON(ftrace_pages || ftrace_pages_start); /* First initialization */ - ftrace_pages = ftrace_pages_start = pg; + ftrace_pages = ftrace_pages_start = start_pg; } else { if (!ftrace_pages) goto out; @@ -3740,11 +3725,11 @@ static int ftrace_process_locs(struct module *mod, ftrace_pages = ftrace_pages->next; } - ftrace_pages->next = pg; - ftrace_pages = pg; + ftrace_pages->next = start_pg; } p = start; + pg = start_pg; while (p < end) { addr = ftrace_call_adjust(*p++); /* @@ -3755,17 +3740,26 @@ static int ftrace_process_locs(struct module *mod, */ if (!addr) continue; - if (!ftrace_record_ip(addr)) - break; + + if (pg->index == pg->size) { + /* We should have allocated enough */ + if (WARN_ON(!pg->next)) + break; + pg = pg->next; + } + + rec = &pg->records[pg->index++]; + rec->ip = addr; } - /* These new locations need to be initialized */ - ftrace_new_pgs = pg; + /* We should have used all pages */ + WARN_ON(pg->next); + + /* Assign the last page to ftrace_pages */ + ftrace_pages = pg; - /* Make each individual set of pages sorted by ips */ - for (; pg; pg = pg->next) - sort(pg->records, pg->index, sizeof(struct dyn_ftrace), - ftrace_cmp_recs, ftrace_swap_recs); + /* These new locations need to be initialized */ + ftrace_new_pgs = start_pg; /* * We only need to disable interrupts on start up diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cf8d11e91efd..6420cda62336 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -23,6 +23,8 @@ #include <asm/local.h> #include "trace.h" +static void update_pages_handler(struct work_struct *work); + /* * The ring buffer header is special. We must manually up keep it. */ @@ -449,6 +451,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; + unsigned int nr_pages; struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ @@ -466,13 +469,18 @@ struct ring_buffer_per_cpu { unsigned long read_bytes; u64 write_stamp; u64 read_stamp; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ + int nr_pages_to_update; + struct list_head new_pages; /* new pages to add */ + struct work_struct update_pages_work; + struct completion update_done; }; struct ring_buffer { - unsigned pages; unsigned flags; int cpus; atomic_t record_disabled; + atomic_t resize_disabled; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; @@ -937,6 +945,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) struct list_head *head = cpu_buffer->pages; struct buffer_page *bpage, *tmp; + /* Reset the head page if it exists */ + if (cpu_buffer->head_page) + rb_set_head_page(cpu_buffer); + rb_head_page_deactivate(cpu_buffer); if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) @@ -963,14 +975,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, - unsigned nr_pages) +static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) { + int i; struct buffer_page *bpage, *tmp; - LIST_HEAD(pages); - unsigned i; - - WARN_ON(!nr_pages); for (i = 0; i < nr_pages; i++) { struct page *page; @@ -981,15 +989,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, */ bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL | __GFP_NORETRY, - cpu_to_node(cpu_buffer->cpu)); + cpu_to_node(cpu)); if (!bpage) goto free_pages; - rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, pages); - list_add(&bpage->list, &pages); - - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) goto free_pages; @@ -997,6 +1003,27 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_init_page(bpage->page); } + return 0; + +free_pages: + list_for_each_entry_safe(bpage, tmp, pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + + return -ENOMEM; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned nr_pages) +{ + LIST_HEAD(pages); + + WARN_ON(!nr_pages); + + if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) + return -ENOMEM; + /* * The ring buffer page list is a circular list that does not * start and end with a list head. All page list items point to @@ -1005,20 +1032,15 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->pages = pages.next; list_del(&pages); + cpu_buffer->nr_pages = nr_pages; + rb_check_pages(cpu_buffer); return 0; - - free_pages: - list_for_each_entry_safe(bpage, tmp, &pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - return -ENOMEM; } static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; @@ -1035,6 +1057,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); + init_completion(&cpu_buffer->update_done); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1052,7 +1076,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); - ret = rb_allocate_pages(cpu_buffer, buffer->pages); + ret = rb_allocate_pages(cpu_buffer, nr_pages); if (ret < 0) goto fail_free_reader; @@ -1113,7 +1137,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, { struct ring_buffer *buffer; int bsize; - int cpu; + int cpu, nr_pages; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1124,14 +1148,14 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) goto fail_free_buffer; - buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; buffer->clock = trace_clock_local; buffer->reader_lock_key = key; /* need at least two pages */ - if (buffer->pages < 2) - buffer->pages = 2; + if (nr_pages < 2) + nr_pages = 2; /* * In case of non-hotplug cpu, if the ring-buffer is allocated @@ -1154,7 +1178,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, for_each_buffer_cpu(buffer, cpu) { buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, cpu); + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) goto fail_free_buffers; } @@ -1222,58 +1246,222 @@ void ring_buffer_set_clock(struct ring_buffer *buffer, static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); -static void -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +static inline unsigned long rb_page_entries(struct buffer_page *bpage) { - struct buffer_page *bpage; - struct list_head *p; - unsigned i; + return local_read(&bpage->entries) & RB_WRITE_MASK; +} + +static inline unsigned long rb_page_write(struct buffer_page *bpage) +{ + return local_read(&bpage->write) & RB_WRITE_MASK; +} + +static int +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +{ + struct list_head *tail_page, *to_remove, *next_page; + struct buffer_page *to_remove_page, *tmp_iter_page; + struct buffer_page *last_page, *first_page; + unsigned int nr_removed; + unsigned long head_bit; + int page_entries; + + head_bit = 0; raw_spin_lock_irq(&cpu_buffer->reader_lock); - rb_head_page_deactivate(cpu_buffer); + atomic_inc(&cpu_buffer->record_disabled); + /* + * We don't race with the readers since we have acquired the reader + * lock. We also don't race with writers after disabling recording. + * This makes it easy to figure out the first and the last page to be + * removed from the list. We unlink all the pages in between including + * the first and last pages. This is done in a busy loop so that we + * lose the least number of traces. + * The pages are freed after we restart recording and unlock readers. + */ + tail_page = &cpu_buffer->tail_page->list; - for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - goto out; - p = cpu_buffer->pages->next; - bpage = list_entry(p, struct buffer_page, list); - list_del_init(&bpage->list); - free_buffer_page(bpage); + /* + * tail page might be on reader page, we remove the next page + * from the ring buffer + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + tail_page = rb_list_head(tail_page->next); + to_remove = tail_page; + + /* start of pages to remove */ + first_page = list_entry(rb_list_head(to_remove->next), + struct buffer_page, list); + + for (nr_removed = 0; nr_removed < nr_pages; nr_removed++) { + to_remove = rb_list_head(to_remove)->next; + head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } - if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) - goto out; - rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); + next_page = rb_list_head(to_remove)->next; -out: + /* + * Now we remove all pages between tail_page and next_page. + * Make sure that we have head_bit value preserved for the + * next page + */ + tail_page->next = (struct list_head *)((unsigned long)next_page | + head_bit); + next_page = rb_list_head(next_page); + next_page->prev = tail_page; + + /* make sure pages points to a valid page in the ring buffer */ + cpu_buffer->pages = next_page; + + /* update head page */ + if (head_bit) + cpu_buffer->head_page = list_entry(next_page, + struct buffer_page, list); + + /* + * change read pointer to make sure any read iterators reset + * themselves + */ + cpu_buffer->read = 0; + + /* pages are removed, resume tracing and then free the pages */ + atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)); + + /* last buffer page to remove */ + last_page = list_entry(rb_list_head(to_remove), struct buffer_page, + list); + tmp_iter_page = first_page; + + do { + to_remove_page = tmp_iter_page; + rb_inc_page(cpu_buffer, &tmp_iter_page); + + /* update the counters */ + page_entries = rb_page_entries(to_remove_page); + if (page_entries) { + /* + * If something was added to this page, it was full + * since it is not the tail page. So we deduct the + * bytes consumed in ring buffer from here. + * No need to update overruns, since this page is + * deleted from ring buffer and its entries are + * already accounted for. + */ + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + } + + /* + * We have already removed references to this list item, just + * free up the buffer_page and its page + */ + free_buffer_page(to_remove_page); + nr_removed--; + + } while (to_remove_page != last_page); + + RB_WARN_ON(cpu_buffer, nr_removed); + + return nr_removed == 0; } -static void -rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, - struct list_head *pages, unsigned nr_pages) +static int +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer) { - struct buffer_page *bpage; - struct list_head *p; - unsigned i; + struct list_head *pages = &cpu_buffer->new_pages; + int retries, success; raw_spin_lock_irq(&cpu_buffer->reader_lock); - rb_head_page_deactivate(cpu_buffer); + /* + * We are holding the reader lock, so the reader page won't be swapped + * in the ring buffer. Now we are racing with the writer trying to + * move head page and the tail page. + * We are going to adapt the reader page update process where: + * 1. We first splice the start and end of list of new pages between + * the head page and its previous page. + * 2. We cmpxchg the prev_page->next to point from head page to the + * start of new pages list. + * 3. Finally, we update the head->prev to the end of new list. + * + * We will try this process 10 times, to make sure that we don't keep + * spinning. + */ + retries = 10; + success = 0; + while (retries--) { + struct list_head *head_page, *prev_page, *r; + struct list_head *last_page, *first_page; + struct list_head *head_page_with_bit; - for (i = 0; i < nr_pages; i++) { - if (RB_WARN_ON(cpu_buffer, list_empty(pages))) - goto out; - p = pages->next; - bpage = list_entry(p, struct buffer_page, list); - list_del_init(&bpage->list); - list_add_tail(&bpage->list, cpu_buffer->pages); + head_page = &rb_set_head_page(cpu_buffer)->list; + prev_page = head_page->prev; + + first_page = pages->next; + last_page = pages->prev; + + head_page_with_bit = (struct list_head *) + ((unsigned long)head_page | RB_PAGE_HEAD); + + last_page->next = head_page_with_bit; + first_page->prev = prev_page; + + r = cmpxchg(&prev_page->next, head_page_with_bit, first_page); + + if (r == head_page_with_bit) { + /* + * yay, we replaced the page pointer to our new list, + * now, we just have to update to head page's prev + * pointer to point to end of list + */ + head_page->prev = last_page; + success = 1; + break; + } } - rb_reset_cpu(cpu_buffer); - rb_check_pages(cpu_buffer); -out: + if (success) + INIT_LIST_HEAD(pages); + /* + * If we weren't successful in adding in new pages, warn and stop + * tracing + */ + RB_WARN_ON(cpu_buffer, !success); raw_spin_unlock_irq(&cpu_buffer->reader_lock); + + /* free pages if they weren't inserted */ + if (!success) { + struct buffer_page *bpage, *tmp; + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } + return success; +} + +static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + int success; + + if (cpu_buffer->nr_pages_to_update > 0) + success = rb_insert_pages(cpu_buffer); + else + success = rb_remove_pages(cpu_buffer, + -cpu_buffer->nr_pages_to_update); + + if (success) + cpu_buffer->nr_pages += cpu_buffer->nr_pages_to_update; +} + +static void update_pages_handler(struct work_struct *work) +{ + struct ring_buffer_per_cpu *cpu_buffer = container_of(work, + struct ring_buffer_per_cpu, update_pages_work); + rb_update_pages(cpu_buffer); + complete(&cpu_buffer->update_done); } /** @@ -1283,16 +1471,14 @@ out: * * Minimum size is 2 * BUF_PAGE_SIZE. * - * Returns -1 on failure. + * Returns 0 on success and < 0 on failure. */ -int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, + int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned nr_pages, rm_pages, new_pages; - struct buffer_page *bpage, *tmp; - unsigned long buffer_size; - LIST_HEAD(pages); - int i, cpu; + unsigned nr_pages; + int cpu, err = 0; /* * Always succeed at resizing a non-existent buffer: @@ -1302,113 +1488,154 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); size *= BUF_PAGE_SIZE; - buffer_size = buffer->pages * BUF_PAGE_SIZE; /* we need a minimum of two pages */ if (size < BUF_PAGE_SIZE * 2) size = BUF_PAGE_SIZE * 2; - if (size == buffer_size) - return size; - - atomic_inc(&buffer->record_disabled); + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - /* Make sure all writers are done with this buffer. */ - synchronize_sched(); + /* + * Don't succeed if resizing is disabled, as a reader might be + * manipulating the ring buffer and is expecting a sane state while + * this is true. + */ + if (atomic_read(&buffer->resize_disabled)) + return -EBUSY; + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); - get_online_cpus(); - - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - if (size < buffer_size) { + if (cpu_id == RING_BUFFER_ALL_CPUS) { + /* calculate the pages to update */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; - /* easy case, just free pages */ - if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) - goto out_fail; + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + /* + * nothing more to do for removing pages or no update + */ + if (cpu_buffer->nr_pages_to_update <= 0) + continue; + /* + * to add pages, make sure all new pages can be + * allocated without receiving ENOMEM + */ + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages, cpu)) { + /* not enough memory for new pages */ + err = -ENOMEM; + goto out_err; + } + } - rm_pages = buffer->pages - nr_pages; + get_online_cpus(); + /* + * Fire off all the required work handlers + * We can't schedule on offline CPUs, but it's not necessary + * since we can change their buffer sizes without any race. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!cpu_buffer->nr_pages_to_update) + continue; + + if (cpu_online(cpu)) + schedule_work_on(cpu, + &cpu_buffer->update_pages_work); + else + rb_update_pages(cpu_buffer); + } + /* wait for all the updates to complete */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - rb_remove_pages(cpu_buffer, rm_pages); + if (!cpu_buffer->nr_pages_to_update) + continue; + + if (cpu_online(cpu)) + wait_for_completion(&cpu_buffer->update_done); + cpu_buffer->nr_pages_to_update = 0; } - goto out; - } - /* - * This is a bit more difficult. We only want to add pages - * when we can allocate enough for all CPUs. We do this - * by allocating all the pages and storing them on a local - * link list. If we succeed in our allocation, then we - * add these pages to the cpu_buffers. Otherwise we just free - * them all and return -ENOMEM; - */ - if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) - goto out_fail; + put_online_cpus(); + } else { + cpu_buffer = buffer->buffers[cpu_id]; - new_pages = nr_pages - buffer->pages; + if (nr_pages == cpu_buffer->nr_pages) + goto out; - for_each_buffer_cpu(buffer, cpu) { - for (i = 0; i < new_pages; i++) { - struct page *page; - /* - * __GFP_NORETRY flag makes sure that the allocation - * fails gracefully without invoking oom-killer and - * the system is not destabilized. - */ - bpage = kzalloc_node(ALIGN(sizeof(*bpage), - cache_line_size()), - GFP_KERNEL | __GFP_NORETRY, - cpu_to_node(cpu)); - if (!bpage) - goto free_pages; - list_add(&bpage->list, &pages); - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - goto free_pages; - bpage->page = page_address(page); - rb_init_page(bpage->page); + cpu_buffer->nr_pages_to_update = nr_pages - + cpu_buffer->nr_pages; + + INIT_LIST_HEAD(&cpu_buffer->new_pages); + if (cpu_buffer->nr_pages_to_update > 0 && + __rb_allocate_pages(cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages, cpu_id)) { + err = -ENOMEM; + goto out_err; } - } - for_each_buffer_cpu(buffer, cpu) { - cpu_buffer = buffer->buffers[cpu]; - rb_insert_pages(cpu_buffer, &pages, new_pages); - } + get_online_cpus(); - if (RB_WARN_ON(buffer, !list_empty(&pages))) - goto out_fail; + if (cpu_online(cpu_id)) { + schedule_work_on(cpu_id, + &cpu_buffer->update_pages_work); + wait_for_completion(&cpu_buffer->update_done); + } else + rb_update_pages(cpu_buffer); + + cpu_buffer->nr_pages_to_update = 0; + put_online_cpus(); + } out: - buffer->pages = nr_pages; - put_online_cpus(); + /* + * The ring buffer resize can happen with the ring buffer + * enabled, so that the update disturbs the tracing as little + * as possible. But if the buffer is disabled, we do not need + * to worry about that, and we can take the time to verify + * that the buffer is not corrupt. + */ + if (atomic_read(&buffer->record_disabled)) { + atomic_inc(&buffer->record_disabled); + /* + * Even though the buffer was disabled, we must make sure + * that it is truly disabled before calling rb_check_pages. + * There could have been a race between checking + * record_disable and incrementing it. + */ + synchronize_sched(); + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_check_pages(cpu_buffer); + } + atomic_dec(&buffer->record_disabled); + } + mutex_unlock(&buffer->mutex); + return size; - atomic_dec(&buffer->record_disabled); + out_err: + for_each_buffer_cpu(buffer, cpu) { + struct buffer_page *bpage, *tmp; - return size; + cpu_buffer = buffer->buffers[cpu]; + cpu_buffer->nr_pages_to_update = 0; - free_pages: - list_for_each_entry_safe(bpage, tmp, &pages, list) { - list_del_init(&bpage->list); - free_buffer_page(bpage); - } - put_online_cpus(); - mutex_unlock(&buffer->mutex); - atomic_dec(&buffer->record_disabled); - return -ENOMEM; + if (list_empty(&cpu_buffer->new_pages)) + continue; - /* - * Something went totally wrong, and we are too paranoid - * to even clean up the mess. - */ - out_fail: - put_online_cpus(); + list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, + list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + } mutex_unlock(&buffer->mutex); - atomic_dec(&buffer->record_disabled); - return -1; + return err; } EXPORT_SYMBOL_GPL(ring_buffer_resize); @@ -1447,21 +1674,11 @@ rb_iter_head_event(struct ring_buffer_iter *iter) return __rb_page_index(iter->head_page, iter->head); } -static inline unsigned long rb_page_write(struct buffer_page *bpage) -{ - return local_read(&bpage->write) & RB_WRITE_MASK; -} - static inline unsigned rb_page_commit(struct buffer_page *bpage) { return local_read(&bpage->page->commit); } -static inline unsigned long rb_page_entries(struct buffer_page *bpage) -{ - return local_read(&bpage->entries) & RB_WRITE_MASK; -} - /* Size is determined by what has been committed */ static inline unsigned rb_page_size(struct buffer_page *bpage) { @@ -1510,7 +1727,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * assign the commit to the tail. */ again: - max_count = cpu_buffer->buffer->pages * 100; + max_count = cpu_buffer->nr_pages * 100; while (cpu_buffer->commit_page != cpu_buffer->tail_page) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) @@ -3486,6 +3703,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) iter->cpu_buffer = cpu_buffer; + atomic_inc(&buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); return iter; @@ -3548,7 +3766,14 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + /* + * Ring buffer is disabled from recording, here's a good place + * to check the integrity of the ring buffer. + */ + rb_check_pages(cpu_buffer); + atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&cpu_buffer->buffer->resize_disabled); kfree(iter); } EXPORT_SYMBOL_GPL(ring_buffer_read_finish); @@ -3588,9 +3813,18 @@ EXPORT_SYMBOL_GPL(ring_buffer_read); * ring_buffer_size - return the size of the ring buffer (in bytes) * @buffer: The ring buffer. */ -unsigned long ring_buffer_size(struct ring_buffer *buffer) +unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu) { - return BUF_PAGE_SIZE * buffer->pages; + /* + * Earlier, this method returned + * BUF_PAGE_SIZE * buffer->nr_pages + * Since the nr_pages field is now removed, we have converted this to + * return the per cpu buffer value. + */ + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages; } EXPORT_SYMBOL_GPL(ring_buffer_size); @@ -3611,6 +3845,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->commit_page = cpu_buffer->head_page; INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + INIT_LIST_HEAD(&cpu_buffer->new_pages); local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); @@ -3647,8 +3882,12 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&buffer->resize_disabled); atomic_inc(&cpu_buffer->record_disabled); + /* Make sure all commits have finished */ + synchronize_sched(); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) @@ -3664,6 +3903,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); + atomic_dec(&buffer->resize_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -3765,8 +4005,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, !cpumask_test_cpu(cpu, buffer_b->cpumask)) goto out; + cpu_buffer_a = buffer_a->buffers[cpu]; + cpu_buffer_b = buffer_b->buffers[cpu]; + /* At least make sure the two buffers are somewhat the same */ - if (buffer_a->pages != buffer_b->pages) + if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) goto out; ret = -EAGAIN; @@ -3780,9 +4023,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, if (atomic_read(&buffer_b->record_disabled)) goto out; - cpu_buffer_a = buffer_a->buffers[cpu]; - cpu_buffer_b = buffer_b->buffers[cpu]; - if (atomic_read(&cpu_buffer_a->record_disabled)) goto out; @@ -4071,6 +4311,8 @@ static int rb_cpu_notify(struct notifier_block *self, struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); long cpu = (long)hcpu; + int cpu_i, nr_pages_same; + unsigned int nr_pages; switch (action) { case CPU_UP_PREPARE: @@ -4078,8 +4320,23 @@ static int rb_cpu_notify(struct notifier_block *self, if (cpumask_test_cpu(cpu, buffer->cpumask)) return NOTIFY_OK; + nr_pages = 0; + nr_pages_same = 1; + /* check if all cpu sizes are same */ + for_each_buffer_cpu(buffer, cpu_i) { + /* fill in the size from first enabled cpu */ + if (nr_pages == 0) + nr_pages = buffer->buffers[cpu_i]->nr_pages; + if (nr_pages != buffer->buffers[cpu_i]->nr_pages) { + nr_pages_same = 0; + break; + } + } + /* allocate minimum pages, user can later expand it */ + if (!nr_pages_same) + nr_pages = 2; buffer->buffers[cpu] = - rb_allocate_cpu_buffer(buffer, cpu); + rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) { WARN(1, "failed to allocate ring buffer on CPU %ld\n", cpu); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2a22255c1010..68032c6177db 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -87,18 +87,6 @@ static int tracing_disabled = 1; DEFINE_PER_CPU(int, ftrace_cpu_disabled); -static inline void ftrace_disable_cpu(void) -{ - preempt_disable(); - __this_cpu_inc(ftrace_cpu_disabled); -} - -static inline void ftrace_enable_cpu(void) -{ - __this_cpu_dec(ftrace_cpu_disabled); - preempt_enable(); -} - cpumask_var_t __read_mostly tracing_buffer_mask; /* @@ -629,7 +617,6 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - void *ret; if (s->len <= s->readpos) return -EBUSY; @@ -637,9 +624,7 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) len = s->len - s->readpos; if (cnt > len) cnt = len; - ret = memcpy(buf, s->buffer + s->readpos, cnt); - if (!ret) - return -EFAULT; + memcpy(buf, s->buffer + s->readpos, cnt); s->readpos += cnt; return cnt; @@ -751,8 +736,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); - ftrace_disable_cpu(); - ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); if (ret == -EBUSY) { @@ -766,8 +749,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) "Failed to swap buffers due to commit in progress\n"); } - ftrace_enable_cpu(); - WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); __update_max_tr(tr, tsk, cpu); @@ -782,8 +763,6 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * Register a new plugin tracer. */ int register_tracer(struct tracer *type) -__releases(kernel_lock) -__acquires(kernel_lock) { struct tracer *t; int ret = 0; @@ -841,7 +820,8 @@ __acquires(kernel_lock) /* If we expanded the buffers, make sure the max is expanded too */ if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, trace_buf_size); + ring_buffer_resize(max_tr.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -857,7 +837,8 @@ __acquires(kernel_lock) /* Shrink the max buffer again */ if (ring_buffer_expanded && type->use_max_tr) - ring_buffer_resize(max_tr.buffer, 1); + ring_buffer_resize(max_tr.buffer, 1, + RING_BUFFER_ALL_CPUS); printk(KERN_CONT "PASSED\n"); } @@ -917,13 +898,6 @@ out: mutex_unlock(&trace_types_lock); } -static void __tracing_reset(struct ring_buffer *buffer, int cpu) -{ - ftrace_disable_cpu(); - ring_buffer_reset_cpu(buffer, cpu); - ftrace_enable_cpu(); -} - void tracing_reset(struct trace_array *tr, int cpu) { struct ring_buffer *buffer = tr->buffer; @@ -932,7 +906,7 @@ void tracing_reset(struct trace_array *tr, int cpu) /* Make sure all commits have finished */ synchronize_sched(); - __tracing_reset(buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -950,7 +924,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - __tracing_reset(buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -1498,25 +1472,119 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) #endif /* CONFIG_STACKTRACE */ +/* created for use with alloc_percpu */ +struct trace_buffer_struct { + char buffer[TRACE_BUF_SIZE]; +}; + +static struct trace_buffer_struct *trace_percpu_buffer; +static struct trace_buffer_struct *trace_percpu_sirq_buffer; +static struct trace_buffer_struct *trace_percpu_irq_buffer; +static struct trace_buffer_struct *trace_percpu_nmi_buffer; + +/* + * The buffer used is dependent on the context. There is a per cpu + * buffer for normal context, softirq contex, hard irq context and + * for NMI context. Thise allows for lockless recording. + * + * Note, if the buffers failed to be allocated, then this returns NULL + */ +static char *get_trace_buf(void) +{ + struct trace_buffer_struct *percpu_buffer; + struct trace_buffer_struct *buffer; + + /* + * If we have allocated per cpu buffers, then we do not + * need to do any locking. + */ + if (in_nmi()) + percpu_buffer = trace_percpu_nmi_buffer; + else if (in_irq()) + percpu_buffer = trace_percpu_irq_buffer; + else if (in_softirq()) + percpu_buffer = trace_percpu_sirq_buffer; + else + percpu_buffer = trace_percpu_buffer; + + if (!percpu_buffer) + return NULL; + + buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); + + return buffer->buffer; +} + +static int alloc_percpu_trace_buffer(void) +{ + struct trace_buffer_struct *buffers; + struct trace_buffer_struct *sirq_buffers; + struct trace_buffer_struct *irq_buffers; + struct trace_buffer_struct *nmi_buffers; + + buffers = alloc_percpu(struct trace_buffer_struct); + if (!buffers) + goto err_warn; + + sirq_buffers = alloc_percpu(struct trace_buffer_struct); + if (!sirq_buffers) + goto err_sirq; + + irq_buffers = alloc_percpu(struct trace_buffer_struct); + if (!irq_buffers) + goto err_irq; + + nmi_buffers = alloc_percpu(struct trace_buffer_struct); + if (!nmi_buffers) + goto err_nmi; + + trace_percpu_buffer = buffers; + trace_percpu_sirq_buffer = sirq_buffers; + trace_percpu_irq_buffer = irq_buffers; + trace_percpu_nmi_buffer = nmi_buffers; + + return 0; + + err_nmi: + free_percpu(irq_buffers); + err_irq: + free_percpu(sirq_buffers); + err_sirq: + free_percpu(buffers); + err_warn: + WARN(1, "Could not allocate percpu trace_printk buffer"); + return -ENOMEM; +} + +void trace_printk_init_buffers(void) +{ + static int buffers_allocated; + + if (buffers_allocated) + return; + + if (alloc_percpu_trace_buffer()) + return; + + pr_info("ftrace: Allocated trace_printk buffers\n"); + + buffers_allocated = 1; +} + /** * trace_vbprintk - write binary msg to tracing buffer * */ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) { - static arch_spinlock_t trace_buf_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - static u32 trace_buf[TRACE_BUF_SIZE]; - struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct ring_buffer *buffer; struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; - int disable; - int cpu, len = 0, size, pc; + char *tbuffer; + int len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -1526,43 +1594,36 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pc = preempt_count(); preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disable = atomic_inc_return(&data->disabled); - if (unlikely(disable != 1)) + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; goto out; + } - /* Lockdep uses trace_printk for lock tracing */ - local_irq_save(flags); - arch_spin_lock(&trace_buf_lock); - len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); + len = vbin_printf((u32 *)tbuffer, TRACE_BUF_SIZE/sizeof(int), fmt, args); - if (len > TRACE_BUF_SIZE || len < 0) - goto out_unlock; + if (len > TRACE_BUF_SIZE/sizeof(int) || len < 0) + goto out; + local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) - goto out_unlock; + goto out; entry = ring_buffer_event_data(event); entry->ip = ip; entry->fmt = fmt; - memcpy(entry->buf, trace_buf, sizeof(u32) * len); + memcpy(entry->buf, tbuffer, sizeof(u32) * len); if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } -out_unlock: - arch_spin_unlock(&trace_buf_lock); - local_irq_restore(flags); - out: - atomic_dec_return(&data->disabled); preempt_enable_notrace(); unpause_graph_tracing(); @@ -1588,58 +1649,53 @@ int trace_array_printk(struct trace_array *tr, int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { - static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; - static char trace_buf[TRACE_BUF_SIZE]; - struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; struct ring_buffer *buffer; - struct trace_array_cpu *data; - int cpu, len = 0, size, pc; + int len = 0, size, pc; struct print_entry *entry; - unsigned long irq_flags; - int disable; + unsigned long flags; + char *tbuffer; if (tracing_disabled || tracing_selftest_running) return 0; + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + pc = preempt_count(); preempt_disable_notrace(); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disable = atomic_inc_return(&data->disabled); - if (unlikely(disable != 1)) + + tbuffer = get_trace_buf(); + if (!tbuffer) { + len = 0; goto out; + } - pause_graph_tracing(); - raw_local_irq_save(irq_flags); - arch_spin_lock(&trace_buf_lock); - len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); + if (len > TRACE_BUF_SIZE) + goto out; + local_save_flags(flags); size = sizeof(*entry) + len + 1; buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - irq_flags, pc); + flags, pc); if (!event) - goto out_unlock; + goto out; entry = ring_buffer_event_data(event); entry->ip = ip; - memcpy(&entry->buf, trace_buf, len); + memcpy(&entry->buf, tbuffer, len); entry->buf[len] = '\0'; if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); - ftrace_trace_stack(buffer, irq_flags, 6, pc); + ftrace_trace_stack(buffer, flags, 6, pc); } - - out_unlock: - arch_spin_unlock(&trace_buf_lock); - raw_local_irq_restore(irq_flags); - unpause_graph_tracing(); out: - atomic_dec_return(&data->disabled); preempt_enable_notrace(); + unpause_graph_tracing(); return len; } @@ -1652,14 +1708,9 @@ EXPORT_SYMBOL_GPL(trace_vprintk); static void trace_iterator_increment(struct trace_iterator *iter) { - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); - iter->idx++; if (iter->buffer_iter[iter->cpu]) ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - - ftrace_enable_cpu(); } static struct trace_entry * @@ -1669,17 +1720,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); - if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else event = ring_buffer_peek(iter->tr->buffer, cpu, ts, lost_events); - ftrace_enable_cpu(); - if (event) { iter->ent_size = ring_buffer_event_length(event); return ring_buffer_event_data(event); @@ -1769,11 +1815,8 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { - /* Don't allow ftrace to trace into the ring buffers */ - ftrace_disable_cpu(); ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, &iter->lost_events); - ftrace_enable_cpu(); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1862,16 +1905,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->cpu = 0; iter->idx = -1; - ftrace_disable_cpu(); - if (cpu_file == TRACE_PIPE_ALL_CPU) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else tracing_iter_reset(iter, cpu_file); - ftrace_enable_cpu(); - iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; @@ -2332,15 +2371,13 @@ static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file) { long cpu_file = (long) inode->i_private; - void *fail_ret = ERR_PTR(-ENOMEM); struct trace_iterator *iter; - struct seq_file *m; - int cpu, ret; + int cpu; if (tracing_disabled) return ERR_PTR(-ENODEV); - iter = kzalloc(sizeof(*iter), GFP_KERNEL); + iter = __seq_open_private(file, &tracer_seq_ops, sizeof(*iter)); if (!iter) return ERR_PTR(-ENOMEM); @@ -2397,32 +2434,15 @@ __tracing_open(struct inode *inode, struct file *file) tracing_iter_reset(iter, cpu); } - ret = seq_open(file, &tracer_seq_ops); - if (ret < 0) { - fail_ret = ERR_PTR(ret); - goto fail_buffer; - } - - m = file->private_data; - m->private = iter; - mutex_unlock(&trace_types_lock); return iter; - fail_buffer: - for_each_tracing_cpu(cpu) { - if (iter->buffer_iter[cpu]) - ring_buffer_read_finish(iter->buffer_iter[cpu]); - } - free_cpumask_var(iter->started); - tracing_start(); fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); - kfree(iter); - - return fail_ret; + seq_release_private(inode, file); + return ERR_PTR(-ENOMEM); } int tracing_open_generic(struct inode *inode, struct file *filp) @@ -2458,11 +2478,10 @@ static int tracing_release(struct inode *inode, struct file *file) tracing_start(); mutex_unlock(&trace_types_lock); - seq_release(inode, file); mutex_destroy(&iter->mutex); free_cpumask_var(iter->started); kfree(iter->trace); - kfree(iter); + seq_release_private(inode, file); return 0; } @@ -2648,10 +2667,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (cpumask_test_cpu(cpu, tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&global_trace.data[cpu]->disabled); + ring_buffer_record_disable_cpu(global_trace.buffer, cpu); } if (!cpumask_test_cpu(cpu, tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&global_trace.data[cpu]->disabled); + ring_buffer_record_enable_cpu(global_trace.buffer, cpu); } } arch_spin_unlock(&ftrace_max_lock); @@ -2974,7 +2995,14 @@ int tracer_init(struct tracer *t, struct trace_array *tr) return t->init(tr); } -static int __tracing_resize_ring_buffer(unsigned long size) +static void set_buffer_entries(struct trace_array *tr, unsigned long val) +{ + int cpu; + for_each_tracing_cpu(cpu) + tr->data[cpu]->entries = val; +} + +static int __tracing_resize_ring_buffer(unsigned long size, int cpu) { int ret; @@ -2985,19 +3013,32 @@ static int __tracing_resize_ring_buffer(unsigned long size) */ ring_buffer_expanded = 1; - ret = ring_buffer_resize(global_trace.buffer, size); + ret = ring_buffer_resize(global_trace.buffer, size, cpu); if (ret < 0) return ret; if (!current_trace->use_max_tr) goto out; - ret = ring_buffer_resize(max_tr.buffer, size); + ret = ring_buffer_resize(max_tr.buffer, size, cpu); if (ret < 0) { - int r; + int r = 0; + + if (cpu == RING_BUFFER_ALL_CPUS) { + int i; + for_each_tracing_cpu(i) { + r = ring_buffer_resize(global_trace.buffer, + global_trace.data[i]->entries, + i); + if (r < 0) + break; + } + } else { + r = ring_buffer_resize(global_trace.buffer, + global_trace.data[cpu]->entries, + cpu); + } - r = ring_buffer_resize(global_trace.buffer, - global_trace.entries); if (r < 0) { /* * AARGH! We are left with different @@ -3019,43 +3060,39 @@ static int __tracing_resize_ring_buffer(unsigned long size) return ret; } - max_tr.entries = size; + if (cpu == RING_BUFFER_ALL_CPUS) + set_buffer_entries(&max_tr, size); + else + max_tr.data[cpu]->entries = size; + out: - global_trace.entries = size; + if (cpu == RING_BUFFER_ALL_CPUS) + set_buffer_entries(&global_trace, size); + else + global_trace.data[cpu]->entries = size; return ret; } -static ssize_t tracing_resize_ring_buffer(unsigned long size) +static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) { - int cpu, ret = size; + int ret = size; mutex_lock(&trace_types_lock); - tracing_stop(); - - /* disable all cpu buffers */ - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_inc(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_inc(&max_tr.data[cpu]->disabled); + if (cpu_id != RING_BUFFER_ALL_CPUS) { + /* make sure, this cpu is enabled in the mask */ + if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) { + ret = -EINVAL; + goto out; + } } - if (size != global_trace.entries) - ret = __tracing_resize_ring_buffer(size); - + ret = __tracing_resize_ring_buffer(size, cpu_id); if (ret < 0) ret = -ENOMEM; - for_each_tracing_cpu(cpu) { - if (global_trace.data[cpu]) - atomic_dec(&global_trace.data[cpu]->disabled); - if (max_tr.data[cpu]) - atomic_dec(&max_tr.data[cpu]->disabled); - } - - tracing_start(); +out: mutex_unlock(&trace_types_lock); return ret; @@ -3078,7 +3115,8 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size, + RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); return ret; @@ -3102,7 +3140,8 @@ static int tracing_set_tracer(const char *buf) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = __tracing_resize_ring_buffer(trace_buf_size); + ret = __tracing_resize_ring_buffer(trace_buf_size, + RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; ret = 0; @@ -3128,8 +3167,8 @@ static int tracing_set_tracer(const char *buf) * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ - ring_buffer_resize(max_tr.buffer, 1); - max_tr.entries = 1; + ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&max_tr, 1); } destroy_trace_option_files(topts); @@ -3137,10 +3176,17 @@ static int tracing_set_tracer(const char *buf) topts = create_trace_option_files(current_trace); if (current_trace->use_max_tr) { - ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); - if (ret < 0) - goto out; - max_tr.entries = global_trace.entries; + int cpu; + /* we need to make per cpu buffer sizes equivalent */ + for_each_tracing_cpu(cpu) { + ret = ring_buffer_resize(max_tr.buffer, + global_trace.data[cpu]->entries, + cpu); + if (ret < 0) + goto out; + max_tr.data[cpu]->entries = + global_trace.data[cpu]->entries; + } } if (t->init) { @@ -3642,30 +3688,82 @@ out_err: goto out; } +struct ftrace_entries_info { + struct trace_array *tr; + int cpu; +}; + +static int tracing_entries_open(struct inode *inode, struct file *filp) +{ + struct ftrace_entries_info *info; + + if (tracing_disabled) + return -ENODEV; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->tr = &global_trace; + info->cpu = (unsigned long)inode->i_private; + + filp->private_data = info; + + return 0; +} + static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_array *tr = filp->private_data; - char buf[96]; - int r; + struct ftrace_entries_info *info = filp->private_data; + struct trace_array *tr = info->tr; + char buf[64]; + int r = 0; + ssize_t ret; mutex_lock(&trace_types_lock); - if (!ring_buffer_expanded) - r = sprintf(buf, "%lu (expanded: %lu)\n", - tr->entries >> 10, - trace_buf_size >> 10); - else - r = sprintf(buf, "%lu\n", tr->entries >> 10); + + if (info->cpu == RING_BUFFER_ALL_CPUS) { + int cpu, buf_size_same; + unsigned long size; + + size = 0; + buf_size_same = 1; + /* check if all cpu sizes are same */ + for_each_tracing_cpu(cpu) { + /* fill in the size from first enabled cpu */ + if (size == 0) + size = tr->data[cpu]->entries; + if (size != tr->data[cpu]->entries) { + buf_size_same = 0; + break; + } + } + + if (buf_size_same) { + if (!ring_buffer_expanded) + r = sprintf(buf, "%lu (expanded: %lu)\n", + size >> 10, + trace_buf_size >> 10); + else + r = sprintf(buf, "%lu\n", size >> 10); + } else + r = sprintf(buf, "X\n"); + } else + r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); + mutex_unlock(&trace_types_lock); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + return ret; } static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct ftrace_entries_info *info = filp->private_data; unsigned long val; int ret; @@ -3680,7 +3778,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, /* value is in KB */ val <<= 10; - ret = tracing_resize_ring_buffer(val); + ret = tracing_resize_ring_buffer(val, info->cpu); if (ret < 0) return ret; @@ -3689,6 +3787,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } +static int +tracing_entries_release(struct inode *inode, struct file *filp) +{ + struct ftrace_entries_info *info = filp->private_data; + + kfree(info); + + return 0; +} + static ssize_t tracing_total_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3700,7 +3808,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { - size += tr->entries >> 10; + size += tr->data[cpu]->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } @@ -3734,7 +3842,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) if (trace_flags & TRACE_ITER_STOP_ON_FREE) tracing_off(); /* resize the ring buffer to 0 */ - tracing_resize_ring_buffer(0); + tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); return 0; } @@ -3749,14 +3857,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct print_entry *entry; unsigned long irq_flags; struct page *pages[2]; + void *map_page[2]; int nr_pages = 1; ssize_t written; - void *page1; - void *page2; int offset; int size; int len; int ret; + int i; if (tracing_disabled) return -EINVAL; @@ -3795,9 +3903,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, goto out; } - page1 = kmap_atomic(pages[0]); - if (nr_pages == 2) - page2 = kmap_atomic(pages[1]); + for (i = 0; i < nr_pages; i++) + map_page[i] = kmap_atomic(pages[i]); local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ @@ -3815,10 +3922,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (nr_pages == 2) { len = PAGE_SIZE - offset; - memcpy(&entry->buf, page1 + offset, len); - memcpy(&entry->buf[len], page2, cnt - len); + memcpy(&entry->buf, map_page[0] + offset, len); + memcpy(&entry->buf[len], map_page[1], cnt - len); } else - memcpy(&entry->buf, page1 + offset, cnt); + memcpy(&entry->buf, map_page[0] + offset, cnt); if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; @@ -3833,11 +3940,10 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, *fpos += written; out_unlock: - if (nr_pages == 2) - kunmap_atomic(page2); - kunmap_atomic(page1); - while (nr_pages > 0) - put_page(pages[--nr_pages]); + for (i = 0; i < nr_pages; i++){ + kunmap_atomic(map_page[i]); + put_page(pages[i]); + } out: return written; } @@ -3933,9 +4039,10 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_open_generic, + .open = tracing_entries_open, .read = tracing_entries_read, .write = tracing_entries_write, + .release = tracing_entries_release, .llseek = generic_file_llseek, }; @@ -4367,6 +4474,9 @@ static void tracing_init_debugfs_percpu(long cpu) struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ + if (!d_percpu) + return; + snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = debugfs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { @@ -4387,6 +4497,9 @@ static void tracing_init_debugfs_percpu(long cpu) trace_create_file("stats", 0444, d_cpu, (void *) cpu, &tracing_stats_fops); + + trace_create_file("buffer_size_kb", 0444, d_cpu, + (void *) cpu, &tracing_entries_fops); } #ifdef CONFIG_FTRACE_SELFTEST @@ -4718,7 +4831,7 @@ static __init int tracer_init_debugfs(void) (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); trace_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); + (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); trace_create_file("buffer_total_size_kb", 0444, d_tracer, &global_trace, &tracing_total_entries_fops); @@ -4957,6 +5070,10 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; + /* Only allocate trace_printk buffers if a trace_printk exists */ + if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) + trace_printk_init_buffers(); + /* To save memory, keep the ring buffer size to its minimum */ if (ring_buffer_expanded) ring_buf_size = trace_buf_size; @@ -4975,7 +5092,6 @@ __init static int tracer_alloc_buffers(void) WARN_ON(1); goto out_free_cpumask; } - global_trace.entries = ring_buffer_size(global_trace.buffer); if (global_trace.buffer_disabled) tracing_off(); @@ -4988,7 +5104,6 @@ __init static int tracer_alloc_buffers(void) ring_buffer_free(global_trace.buffer); goto out_free_cpumask; } - max_tr.entries = 1; #endif /* Allocate the first page for all buffers */ @@ -4997,6 +5112,12 @@ __init static int tracer_alloc_buffers(void) max_tr.data[i] = &per_cpu(max_tr_data, i); } + set_buffer_entries(&global_trace, + ring_buffer_size(global_trace.buffer, 0)); +#ifdef CONFIG_TRACER_MAX_TRACE + set_buffer_entries(&max_tr, 1); +#endif + trace_init_cmdlines(); register_tracer(&nop_trace); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f95d65da6db8..6c6f7933eede 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -131,6 +131,7 @@ struct trace_array_cpu { atomic_t disabled; void *buffer_page; /* ring buffer spare */ + unsigned long entries; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -152,7 +153,6 @@ struct trace_array_cpu { */ struct trace_array { struct ring_buffer *buffer; - unsigned long entries; int cpu; int buffer_disabled; cycle_t time_start; @@ -826,6 +826,8 @@ extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; +void trace_printk_init_buffers(void); + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 6fd4ffd042f9..a9077c1b4ad3 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -51,6 +51,10 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) const char **iter; char *fmt; + /* allocate the trace_printk per cpu buffers */ + if (start != end) + trace_printk_init_buffers(); + mutex_lock(&btrace_mutex); for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c deleted file mode 100644 index 209b379a4721..000000000000 --- a/kernel/trace/trace_workqueue.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Workqueue statistical tracer. - * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - - -#include <trace/events/workqueue.h> -#include <linux/list.h> -#include <linux/percpu.h> -#include <linux/slab.h> -#include <linux/kref.h> -#include "trace_stat.h" -#include "trace.h" - - -/* A cpu workqueue thread */ -struct cpu_workqueue_stats { - struct list_head list; - struct kref kref; - int cpu; - pid_t pid; -/* Can be inserted from interrupt or user context, need to be atomic */ - atomic_t inserted; -/* - * Don't need to be atomic, works are serialized in a single workqueue thread - * on a single CPU. - */ - unsigned int executed; -}; - -/* List of workqueue threads on one cpu */ -struct workqueue_global_stats { - struct list_head list; - spinlock_t lock; -}; - -/* Don't need a global lock because allocated before the workqueues, and - * never freed. - */ -static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); -#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) - -static void cpu_workqueue_stat_free(struct kref *kref) -{ - kfree(container_of(kref, struct cpu_workqueue_stats, kref)); -} - -/* Insertion of a work */ -static void -probe_workqueue_insertion(void *ignore, - struct task_struct *wq_thread, - struct work_struct *work) -{ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { - if (node->pid == wq_thread->pid) { - atomic_inc(&node->inserted); - goto found; - } - } - pr_debug("trace_workqueue: entry not found\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Execution of a work */ -static void -probe_workqueue_execution(void *ignore, - struct task_struct *wq_thread, - struct work_struct *work) -{ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { - if (node->pid == wq_thread->pid) { - node->executed++; - goto found; - } - } - pr_debug("trace_workqueue: entry not found\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(void *ignore, - struct task_struct *wq_thread, int cpu) -{ - struct cpu_workqueue_stats *cws; - unsigned long flags; - - WARN_ON(cpu < 0); - - /* Workqueues are sometimes created in atomic context */ - cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC); - if (!cws) { - pr_warning("trace_workqueue: not enough memory\n"); - return; - } - INIT_LIST_HEAD(&cws->list); - kref_init(&cws->kref); - cws->cpu = cpu; - cws->pid = wq_thread->pid; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); -} - -/* Destruction of a cpu workqueue thread */ -static void -probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) -{ - /* Workqueue only execute on one cpu */ - int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { - if (node->pid == wq_thread->pid) { - list_del(&node->list); - kref_put(&node->kref, cpu_workqueue_stat_free); - goto found; - } - } - - pr_debug("trace_workqueue: don't find workqueue to destroy\n"); -found: - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - -} - -static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) -{ - unsigned long flags; - struct cpu_workqueue_stats *ret = NULL; - - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - - if (!list_empty(&workqueue_cpu_stat(cpu)->list)) { - ret = list_entry(workqueue_cpu_stat(cpu)->list.next, - struct cpu_workqueue_stats, list); - kref_get(&ret->kref); - } - - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - - return ret; -} - -static void *workqueue_stat_start(struct tracer_stat *trace) -{ - int cpu; - void *ret = NULL; - - for_each_possible_cpu(cpu) { - ret = workqueue_stat_start_cpu(cpu); - if (ret) - return ret; - } - return NULL; -} - -static void *workqueue_stat_next(void *prev, int idx) -{ - struct cpu_workqueue_stats *prev_cws = prev; - struct cpu_workqueue_stats *ret; - int cpu = prev_cws->cpu; - unsigned long flags; - - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - do { - cpu = cpumask_next(cpu, cpu_possible_mask); - if (cpu >= nr_cpu_ids) - return NULL; - } while (!(ret = workqueue_stat_start_cpu(cpu))); - return ret; - } else { - ret = list_entry(prev_cws->list.next, - struct cpu_workqueue_stats, list); - kref_get(&ret->kref); - } - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - - return ret; -} - -static int workqueue_stat_show(struct seq_file *s, void *p) -{ - struct cpu_workqueue_stats *cws = p; - struct pid *pid; - struct task_struct *tsk; - - pid = find_get_pid(cws->pid); - if (pid) { - tsk = get_pid_task(pid, PIDTYPE_PID); - if (tsk) { - seq_printf(s, "%3d %6d %6u %s\n", cws->cpu, - atomic_read(&cws->inserted), cws->executed, - tsk->comm); - put_task_struct(tsk); - } - put_pid(pid); - } - - return 0; -} - -static void workqueue_stat_release(void *stat) -{ - struct cpu_workqueue_stats *node = stat; - - kref_put(&node->kref, cpu_workqueue_stat_free); -} - -static int workqueue_stat_headers(struct seq_file *s) -{ - seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); - seq_printf(s, "# | | | |\n"); - return 0; -} - -struct tracer_stat workqueue_stats __read_mostly = { - .name = "workqueues", - .stat_start = workqueue_stat_start, - .stat_next = workqueue_stat_next, - .stat_show = workqueue_stat_show, - .stat_release = workqueue_stat_release, - .stat_headers = workqueue_stat_headers -}; - - -int __init stat_workqueue_init(void) -{ - if (register_stat_tracer(&workqueue_stats)) { - pr_warning("Unable to register workqueue stat tracer\n"); - return 1; - } - - return 0; -} -fs_initcall(stat_workqueue_init); - -/* - * Workqueues are created very early, just after pre-smp initcalls. - * So we must register our tracepoints at this stage. - */ -int __init trace_workqueue_early_init(void) -{ - int ret, cpu; - - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - - ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); - if (ret) - goto out; - - ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); - if (ret) - goto no_insertion; - - ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); - if (ret) - goto no_execution; - - ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); - if (ret) - goto no_creation; - - return 0; - -no_creation: - unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); -no_execution: - unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); -no_insertion: - unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); -out: - pr_warning("trace_workqueue: unable to trace workqueues\n"); - - return 1; -} -early_initcall(trace_workqueue_early_init); diff --git a/kernel/uid16.c b/kernel/uid16.c index 51c6e89e8619..d7948eb10225 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -81,14 +81,19 @@ SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) return ret; } -SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) +SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) { const struct cred *cred = current_cred(); int retval; + old_uid_t ruid, euid, suid; - if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && - !(retval = put_user(high2lowuid(cred->euid), euid))) - retval = put_user(high2lowuid(cred->suid), suid); + ruid = high2lowuid(from_kuid_munged(cred->user_ns, cred->uid)); + euid = high2lowuid(from_kuid_munged(cred->user_ns, cred->euid)); + suid = high2lowuid(from_kuid_munged(cred->user_ns, cred->suid)); + + if (!(retval = put_user(ruid, ruidp)) && + !(retval = put_user(euid, euidp))) + retval = put_user(suid, suidp); return retval; } @@ -103,14 +108,19 @@ SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) } -SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) +SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egidp, old_gid_t __user *, sgidp) { const struct cred *cred = current_cred(); int retval; + old_gid_t rgid, egid, sgid; + + rgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->gid)); + egid = high2lowgid(from_kgid_munged(cred->user_ns, cred->egid)); + sgid = high2lowgid(from_kgid_munged(cred->user_ns, cred->sgid)); - if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && - !(retval = put_user(high2lowgid(cred->egid), egid))) - retval = put_user(high2lowgid(cred->sgid), sgid); + if (!(retval = put_user(rgid, rgidp)) && + !(retval = put_user(egid, egidp))) + retval = put_user(sgid, sgidp); return retval; } @@ -134,11 +144,14 @@ SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) static int groups16_to_user(old_gid_t __user *grouplist, struct group_info *group_info) { + struct user_namespace *user_ns = current_user_ns(); int i; old_gid_t group; + kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { - group = high2lowgid(GROUP_AT(group_info, i)); + kgid = GROUP_AT(group_info, i); + group = high2lowgid(from_kgid_munged(user_ns, kgid)); if (put_user(group, grouplist+i)) return -EFAULT; } @@ -149,13 +162,20 @@ static int groups16_to_user(old_gid_t __user *grouplist, static int groups16_from_user(struct group_info *group_info, old_gid_t __user *grouplist) { + struct user_namespace *user_ns = current_user_ns(); int i; old_gid_t group; + kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { if (get_user(group, grouplist+i)) return -EFAULT; - GROUP_AT(group_info, i) = low2highgid(group); + + kgid = make_kgid(user_ns, low2highgid(group)); + if (!gid_valid(kgid)) + return -EINVAL; + + GROUP_AT(group_info, i) = kgid; } return 0; @@ -211,20 +231,20 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) SYSCALL_DEFINE0(getuid16) { - return high2lowuid(current_uid()); + return high2lowuid(from_kuid_munged(current_user_ns(), current_uid())); } SYSCALL_DEFINE0(geteuid16) { - return high2lowuid(current_euid()); + return high2lowuid(from_kuid_munged(current_user_ns(), current_euid())); } SYSCALL_DEFINE0(getgid16) { - return high2lowgid(current_gid()); + return high2lowgid(from_kgid_munged(current_user_ns(), current_gid())); } SYSCALL_DEFINE0(getegid16) { - return high2lowgid(current_egid()); + return high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); } diff --git a/kernel/user.c b/kernel/user.c index 71dd2363ab0f..b815fefbe76f 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -22,10 +22,27 @@ * and 1 for... ? */ struct user_namespace init_user_ns = { + .uid_map = { + .nr_extents = 1, + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, + }, + .gid_map = { + .nr_extents = 1, + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, + }, .kref = { .refcount = ATOMIC_INIT(3), }, - .creator = &root_user, + .owner = GLOBAL_ROOT_UID, + .group = GLOBAL_ROOT_GID, }; EXPORT_SYMBOL_GPL(init_user_ns); @@ -34,11 +51,14 @@ EXPORT_SYMBOL_GPL(init_user_ns); * when changing user ID's (ie setuid() and friends). */ +#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) +#define UIDHASH_SZ (1 << UIDHASH_BITS) #define UIDHASH_MASK (UIDHASH_SZ - 1) #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) -#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) +#define uidhashentry(uid) (uidhash_table + __uidhashfn((__kuid_val(uid)))) static struct kmem_cache *uid_cachep; +struct hlist_head uidhash_table[UIDHASH_SZ]; /* * The uidhash_lock is mostly taken from process context, but it is @@ -51,14 +71,14 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); -/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ +/* root_user.__count is 1, for init task cred */ struct user_struct root_user = { - .__count = ATOMIC_INIT(2), + .__count = ATOMIC_INIT(1), .processes = ATOMIC_INIT(1), .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, - .user_ns = &init_user_ns, + .uid = GLOBAL_ROOT_UID, }; /* @@ -72,16 +92,15 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); - put_user_ns(up->user_ns); } -static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) +static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) { struct user_struct *user; struct hlist_node *h; hlist_for_each_entry(user, h, hashent, uidhash_node) { - if (user->uid == uid) { + if (uid_eq(user->uid, uid)) { atomic_inc(&user->__count); return user; } @@ -110,14 +129,13 @@ static void free_user(struct user_struct *up, unsigned long flags) * * If the user_struct could not be found, return NULL. */ -struct user_struct *find_user(uid_t uid) +struct user_struct *find_user(kuid_t uid) { struct user_struct *ret; unsigned long flags; - struct user_namespace *ns = current_user_ns(); spin_lock_irqsave(&uidhash_lock, flags); - ret = uid_hash_find(uid, uidhashentry(ns, uid)); + ret = uid_hash_find(uid, uidhashentry(uid)); spin_unlock_irqrestore(&uidhash_lock, flags); return ret; } @@ -136,9 +154,9 @@ void free_uid(struct user_struct *up) local_irq_restore(flags); } -struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) +struct user_struct *alloc_uid(kuid_t uid) { - struct hlist_head *hashent = uidhashentry(ns, uid); + struct hlist_head *hashent = uidhashentry(uid); struct user_struct *up, *new; spin_lock_irq(&uidhash_lock); @@ -153,8 +171,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) new->uid = uid; atomic_set(&new->__count, 1); - new->user_ns = get_user_ns(ns); - /* * Before adding this, check whether we raced * on adding the same user already.. @@ -162,7 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - put_user_ns(ns); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); @@ -187,11 +202,11 @@ static int __init uid_cache_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) - INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); + INIT_HLIST_HEAD(uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); - uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); + uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID)); spin_unlock_irq(&uidhash_lock); return 0; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 3b906e98b1db..86602316422d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -11,9 +11,20 @@ #include <linux/user_namespace.h> #include <linux/highuid.h> #include <linux/cred.h> +#include <linux/securebits.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> +#include <keys/user-type.h> +#include <linux/seq_file.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/ctype.h> static struct kmem_cache *user_ns_cachep __read_mostly; +static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, + struct uid_gid_map *map); + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -24,109 +35,565 @@ static struct kmem_cache *user_ns_cachep __read_mostly; */ int create_user_ns(struct cred *new) { - struct user_namespace *ns; - struct user_struct *root_user; - int n; + struct user_namespace *ns, *parent_ns = new->user_ns; + kuid_t owner = new->euid; + kgid_t group = new->egid; + + /* The creator needs a mapping in the parent user namespace + * or else we won't be able to reasonably tell userspace who + * created a user_namespace. + */ + if (!kuid_has_mapping(parent_ns, owner) || + !kgid_has_mapping(parent_ns, group)) + return -EPERM; - ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); + ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) return -ENOMEM; kref_init(&ns->kref); + ns->parent = parent_ns; + ns->owner = owner; + ns->group = group; - for (n = 0; n < UIDHASH_SZ; ++n) - INIT_HLIST_HEAD(ns->uidhash_table + n); - - /* Alloc new root user. */ - root_user = alloc_uid(ns, 0); - if (!root_user) { - kmem_cache_free(user_ns_cachep, ns); - return -ENOMEM; - } - - /* set the new root user in the credentials under preparation */ - ns->creator = new->user; - new->user = root_user; - new->uid = new->euid = new->suid = new->fsuid = 0; - new->gid = new->egid = new->sgid = new->fsgid = 0; - put_group_info(new->group_info); - new->group_info = get_group_info(&init_groups); + /* Start with the same capabilities as init but useless for doing + * anything as the capabilities are bound to the new user namespace. + */ + new->securebits = SECUREBITS_DEFAULT; + new->cap_inheritable = CAP_EMPTY_SET; + new->cap_permitted = CAP_FULL_SET; + new->cap_effective = CAP_FULL_SET; + new->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(new->request_key_auth); new->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ - /* root_user holds a reference to ns, our reference can be dropped */ - put_user_ns(ns); + /* Leave the new->user_ns reference with the new user namespace. */ + /* Leave the reference to our user_ns with the new cred. */ + new->user_ns = ns; return 0; } -/* - * Deferred destructor for a user namespace. This is required because - * free_user_ns() may be called with uidhash_lock held, but we need to call - * back to free_uid() which will want to take the lock again. - */ -static void free_user_ns_work(struct work_struct *work) +void free_user_ns(struct kref *kref) { - struct user_namespace *ns = - container_of(work, struct user_namespace, destroyer); - free_uid(ns->creator); + struct user_namespace *parent, *ns = + container_of(kref, struct user_namespace, kref); + + parent = ns->parent; kmem_cache_free(user_ns_cachep, ns); + put_user_ns(parent); } +EXPORT_SYMBOL(free_user_ns); -void free_user_ns(struct kref *kref) +static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { - struct user_namespace *ns = - container_of(kref, struct user_namespace, kref); + unsigned idx, extents; + u32 first, last, id2; + + id2 = id + count - 1; + + /* Find the matching extent */ + extents = map->nr_extents; + smp_read_barrier_depends(); + for (idx = 0; idx < extents; idx++) { + first = map->extent[idx].first; + last = first + map->extent[idx].count - 1; + if (id >= first && id <= last && + (id2 >= first && id2 <= last)) + break; + } + /* Map the id or note failure */ + if (idx < extents) + id = (id - first) + map->extent[idx].lower_first; + else + id = (u32) -1; - INIT_WORK(&ns->destroyer, free_user_ns_work); - schedule_work(&ns->destroyer); + return id; } -EXPORT_SYMBOL(free_user_ns); -uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +static u32 map_id_down(struct uid_gid_map *map, u32 id) { - struct user_namespace *tmp; + unsigned idx, extents; + u32 first, last; - if (likely(to == cred->user->user_ns)) - return uid; + /* Find the matching extent */ + extents = map->nr_extents; + smp_read_barrier_depends(); + for (idx = 0; idx < extents; idx++) { + first = map->extent[idx].first; + last = first + map->extent[idx].count - 1; + if (id >= first && id <= last) + break; + } + /* Map the id or note failure */ + if (idx < extents) + id = (id - first) + map->extent[idx].lower_first; + else + id = (u32) -1; + return id; +} - /* Is cred->user the creator of the target user_ns - * or the creator of one of it's parents? - */ - for ( tmp = to; tmp != &init_user_ns; - tmp = tmp->creator->user_ns ) { - if (cred->user == tmp->creator) { - return (uid_t)0; - } +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + unsigned idx, extents; + u32 first, last; + + /* Find the matching extent */ + extents = map->nr_extents; + smp_read_barrier_depends(); + for (idx = 0; idx < extents; idx++) { + first = map->extent[idx].lower_first; + last = first + map->extent[idx].count - 1; + if (id >= first && id <= last) + break; } + /* Map the id or note failure */ + if (idx < extents) + id = (id - first) + map->extent[idx].first; + else + id = (u32) -1; + + return id; +} + +/** + * make_kuid - Map a user-namespace uid pair into a kuid. + * @ns: User namespace that the uid is in + * @uid: User identifier + * + * Maps a user-namespace uid pair into a kernel internal kuid, + * and returns that kuid. + * + * When there is no mapping defined for the user-namespace uid + * pair INVALID_UID is returned. Callers are expected to test + * for and handle handle INVALID_UID being returned. INVALID_UID + * may be tested for using uid_valid(). + */ +kuid_t make_kuid(struct user_namespace *ns, uid_t uid) +{ + /* Map the uid to a global kernel uid */ + return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); +} +EXPORT_SYMBOL(make_kuid); + +/** + * from_kuid - Create a uid from a kuid user-namespace pair. + * @targ: The user namespace we want a uid in. + * @kuid: The kernel internal uid to start with. + * + * Map @kuid into the user-namespace specified by @targ and + * return the resulting uid. + * + * There is always a mapping into the initial user_namespace. + * + * If @kuid has no mapping in @targ (uid_t)-1 is returned. + */ +uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) +{ + /* Map the uid from a global kernel uid */ + return map_id_up(&targ->uid_map, __kuid_val(kuid)); +} +EXPORT_SYMBOL(from_kuid); - /* No useful relationship so no mapping */ - return overflowuid; +/** + * from_kuid_munged - Create a uid from a kuid user-namespace pair. + * @targ: The user namespace we want a uid in. + * @kuid: The kernel internal uid to start with. + * + * Map @kuid into the user-namespace specified by @targ and + * return the resulting uid. + * + * There is always a mapping into the initial user_namespace. + * + * Unlike from_kuid from_kuid_munged never fails and always + * returns a valid uid. This makes from_kuid_munged appropriate + * for use in syscalls like stat and getuid where failing the + * system call and failing to provide a valid uid are not an + * options. + * + * If @kuid has no mapping in @targ overflowuid is returned. + */ +uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) +{ + uid_t uid; + uid = from_kuid(targ, kuid); + + if (uid == (uid_t) -1) + uid = overflowuid; + return uid; } +EXPORT_SYMBOL(from_kuid_munged); -gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +/** + * make_kgid - Map a user-namespace gid pair into a kgid. + * @ns: User namespace that the gid is in + * @uid: group identifier + * + * Maps a user-namespace gid pair into a kernel internal kgid, + * and returns that kgid. + * + * When there is no mapping defined for the user-namespace gid + * pair INVALID_GID is returned. Callers are expected to test + * for and handle INVALID_GID being returned. INVALID_GID may be + * tested for using gid_valid(). + */ +kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { - struct user_namespace *tmp; + /* Map the gid to a global kernel gid */ + return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); +} +EXPORT_SYMBOL(make_kgid); - if (likely(to == cred->user->user_ns)) - return gid; +/** + * from_kgid - Create a gid from a kgid user-namespace pair. + * @targ: The user namespace we want a gid in. + * @kgid: The kernel internal gid to start with. + * + * Map @kgid into the user-namespace specified by @targ and + * return the resulting gid. + * + * There is always a mapping into the initial user_namespace. + * + * If @kgid has no mapping in @targ (gid_t)-1 is returned. + */ +gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) +{ + /* Map the gid from a global kernel gid */ + return map_id_up(&targ->gid_map, __kgid_val(kgid)); +} +EXPORT_SYMBOL(from_kgid); + +/** + * from_kgid_munged - Create a gid from a kgid user-namespace pair. + * @targ: The user namespace we want a gid in. + * @kgid: The kernel internal gid to start with. + * + * Map @kgid into the user-namespace specified by @targ and + * return the resulting gid. + * + * There is always a mapping into the initial user_namespace. + * + * Unlike from_kgid from_kgid_munged never fails and always + * returns a valid gid. This makes from_kgid_munged appropriate + * for use in syscalls like stat and getgid where failing the + * system call and failing to provide a valid gid are not options. + * + * If @kgid has no mapping in @targ overflowgid is returned. + */ +gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) +{ + gid_t gid; + gid = from_kgid(targ, kgid); - /* Is cred->user the creator of the target user_ns - * or the creator of one of it's parents? + if (gid == (gid_t) -1) + gid = overflowgid; + return gid; +} +EXPORT_SYMBOL(from_kgid_munged); + +static int uid_m_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + struct uid_gid_extent *extent = v; + struct user_namespace *lower_ns; + uid_t lower; + + lower_ns = current_user_ns(); + if ((lower_ns == ns) && lower_ns->parent) + lower_ns = lower_ns->parent; + + lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); + + seq_printf(seq, "%10u %10u %10u\n", + extent->first, + lower, + extent->count); + + return 0; +} + +static int gid_m_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + struct uid_gid_extent *extent = v; + struct user_namespace *lower_ns; + gid_t lower; + + lower_ns = current_user_ns(); + if ((lower_ns == ns) && lower_ns->parent) + lower_ns = lower_ns->parent; + + lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); + + seq_printf(seq, "%10u %10u %10u\n", + extent->first, + lower, + extent->count); + + return 0; +} + +static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) +{ + struct uid_gid_extent *extent = NULL; + loff_t pos = *ppos; + + if (pos < map->nr_extents) + extent = &map->extent[pos]; + + return extent; +} + +static void *uid_m_start(struct seq_file *seq, loff_t *ppos) +{ + struct user_namespace *ns = seq->private; + + return m_start(seq, ppos, &ns->uid_map); +} + +static void *gid_m_start(struct seq_file *seq, loff_t *ppos) +{ + struct user_namespace *ns = seq->private; + + return m_start(seq, ppos, &ns->gid_map); +} + +static void *m_next(struct seq_file *seq, void *v, loff_t *pos) +{ + (*pos)++; + return seq->op->start(seq, pos); +} + +static void m_stop(struct seq_file *seq, void *v) +{ + return; +} + +struct seq_operations proc_uid_seq_operations = { + .start = uid_m_start, + .stop = m_stop, + .next = m_next, + .show = uid_m_show, +}; + +struct seq_operations proc_gid_seq_operations = { + .start = gid_m_start, + .stop = m_stop, + .next = m_next, + .show = gid_m_show, +}; + +static DEFINE_MUTEX(id_map_mutex); + +static ssize_t map_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, + int cap_setid, + struct uid_gid_map *map, + struct uid_gid_map *parent_map) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + struct uid_gid_map new_map; + unsigned idx; + struct uid_gid_extent *extent, *last = NULL; + unsigned long page = 0; + char *kbuf, *pos, *next_line; + ssize_t ret = -EINVAL; + + /* + * The id_map_mutex serializes all writes to any given map. + * + * Any map is only ever written once. + * + * An id map fits within 1 cache line on most architectures. + * + * On read nothing needs to be done unless you are on an + * architecture with a crazy cache coherency model like alpha. + * + * There is a one time data dependency between reading the + * count of the extents and the values of the extents. The + * desired behavior is to see the values of the extents that + * were written before the count of the extents. + * + * To achieve this smp_wmb() is used on guarantee the write + * order and smp_read_barrier_depends() is guaranteed that we + * don't have crazy architectures returning stale data. + * */ - for ( tmp = to; tmp != &init_user_ns; - tmp = tmp->creator->user_ns ) { - if (cred->user == tmp->creator) { - return (gid_t)0; + mutex_lock(&id_map_mutex); + + ret = -EPERM; + /* Only allow one successful write to the map */ + if (map->nr_extents != 0) + goto out; + + /* Require the appropriate privilege CAP_SETUID or CAP_SETGID + * over the user namespace in order to set the id mapping. + */ + if (!ns_capable(ns, cap_setid)) + goto out; + + /* Get a buffer */ + ret = -ENOMEM; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!page) + goto out; + + /* Only allow <= page size writes at the beginning of the file */ + ret = -EINVAL; + if ((*ppos != 0) || (count >= PAGE_SIZE)) + goto out; + + /* Slurp in the user data */ + ret = -EFAULT; + if (copy_from_user(kbuf, buf, count)) + goto out; + kbuf[count] = '\0'; + + /* Parse the user data */ + ret = -EINVAL; + pos = kbuf; + new_map.nr_extents = 0; + for (;pos; pos = next_line) { + extent = &new_map.extent[new_map.nr_extents]; + + /* Find the end of line and ensure I don't look past it */ + next_line = strchr(pos, '\n'); + if (next_line) { + *next_line = '\0'; + next_line++; + if (*next_line == '\0') + next_line = NULL; } + + pos = skip_spaces(pos); + extent->first = simple_strtoul(pos, &pos, 10); + if (!isspace(*pos)) + goto out; + + pos = skip_spaces(pos); + extent->lower_first = simple_strtoul(pos, &pos, 10); + if (!isspace(*pos)) + goto out; + + pos = skip_spaces(pos); + extent->count = simple_strtoul(pos, &pos, 10); + if (*pos && !isspace(*pos)) + goto out; + + /* Verify there is not trailing junk on the line */ + pos = skip_spaces(pos); + if (*pos != '\0') + goto out; + + /* Verify we have been given valid starting values */ + if ((extent->first == (u32) -1) || + (extent->lower_first == (u32) -1 )) + goto out; + + /* Verify count is not zero and does not cause the extent to wrap */ + if ((extent->first + extent->count) <= extent->first) + goto out; + if ((extent->lower_first + extent->count) <= extent->lower_first) + goto out; + + /* For now only accept extents that are strictly in order */ + if (last && + (((last->first + last->count) > extent->first) || + ((last->lower_first + last->count) > extent->lower_first))) + goto out; + + new_map.nr_extents++; + last = extent; + + /* Fail if the file contains too many extents */ + if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && + (next_line != NULL)) + goto out; + } + /* Be very certaint the new map actually exists */ + if (new_map.nr_extents == 0) + goto out; + + ret = -EPERM; + /* Validate the user is allowed to use user id's mapped to. */ + if (!new_idmap_permitted(ns, cap_setid, &new_map)) + goto out; + + /* Map the lower ids from the parent user namespace to the + * kernel global id space. + */ + for (idx = 0; idx < new_map.nr_extents; idx++) { + u32 lower_first; + extent = &new_map.extent[idx]; + + lower_first = map_id_range_down(parent_map, + extent->lower_first, + extent->count); + + /* Fail if we can not map the specified extent to + * the kernel global id space. + */ + if (lower_first == (u32) -1) + goto out; + + extent->lower_first = lower_first; } - /* No useful relationship so no mapping */ - return overflowgid; + /* Install the map */ + memcpy(map->extent, new_map.extent, + new_map.nr_extents*sizeof(new_map.extent[0])); + smp_wmb(); + map->nr_extents = new_map.nr_extents; + + *ppos = count; + ret = count; +out: + mutex_unlock(&id_map_mutex); + if (page) + free_page(page); + return ret; +} + +ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + + if (!ns->parent) + return -EPERM; + + return map_write(file, buf, size, ppos, CAP_SETUID, + &ns->uid_map, &ns->parent->uid_map); +} + +ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + + if (!ns->parent) + return -EPERM; + + return map_write(file, buf, size, ppos, CAP_SETGID, + &ns->gid_map, &ns->parent->gid_map); +} + +static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, + struct uid_gid_map *new_map) +{ + /* Allow the specified ids if we have the appropriate capability + * (CAP_SETUID or CAP_SETGID) over the parent user namespace. + */ + if (ns_capable(ns->parent, cap_setid)) + return true; + + return false; } static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index 405caf91aad5..679d97a5d3fd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -43,7 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); up_read(&uts_sem); return ns; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5abf42f63c08..9a3128dc67df 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1032,7 +1032,10 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, cwq = get_cwq(gcwq->cpu, wq); trace_workqueue_queue_work(cpu, cwq, work); - BUG_ON(!list_empty(&work->entry)); + if (WARN_ON(!list_empty(&work->entry))) { + spin_unlock_irqrestore(&gcwq->lock, flags); + return; + } cwq->nr_in_flight[cwq->work_color]++; work_flags = work_color_to_flags(cwq->work_color); @@ -1210,8 +1213,13 @@ static void worker_enter_idle(struct worker *worker) } else wake_up_all(&gcwq->trustee_wait); - /* sanity check nr_running */ - WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && + /* + * Sanity check nr_running. Because trustee releases gcwq->lock + * between setting %WORKER_ROGUE and zapping nr_running, the + * warning may trigger spuriously. Check iff trustee is idle. + */ + WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && + gcwq->nr_workers == gcwq->nr_idle && atomic_read(get_gcwq_nr_running(gcwq->cpu))); } @@ -1810,7 +1818,9 @@ __acquires(&gcwq->lock) * lock freed" warnings as well as problems when looking into * work->lockdep_map, make a copy and use that here. */ - struct lockdep_map lockdep_map = work->lockdep_map; + struct lockdep_map lockdep_map; + + lockdep_copy_map(&lockdep_map, &work->lockdep_map); #endif /* * A single work shouldn't be executed concurrently by @@ -2506,6 +2516,9 @@ bool flush_work(struct work_struct *work) { struct wq_barrier barr; + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); + if (start_flush_work(work, &barr, true)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); |