diff options
Diffstat (limited to 'kernel')
33 files changed, 922 insertions, 228 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index a6605ca921b6..24f8c81fc48d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -588,16 +588,6 @@ out: } /** - * acct_init_pacct - initialize a new pacct_struct - * @pacct: per-process accounting info struct to initialize - */ -void acct_init_pacct(struct pacct_struct *pacct) -{ - memset(pacct, 0, sizeof(struct pacct_struct)); - pacct->ac_utime = pacct->ac_stime = cputime_zero; -} - -/** * acct_collect - collect accounting information into pacct_struct * @exitcode: task exit code * @group_dead: not 0, if this thread is the last one in the process. diff --git a/kernel/audit.c b/kernel/audit.c index 5feed232be9d..78f7f86aa238 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -398,7 +398,7 @@ static void kauditd_send_skb(struct sk_buff *skb) skb_get(skb); err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ + BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); audit_log_lost("auditd dissapeared\n"); audit_pid = 0; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4fd90e129772..ef909a329750 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4,6 +4,10 @@ * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * + * Notifications support + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. @@ -44,6 +48,7 @@ #include <linux/string.h> #include <linux/sort.h> #include <linux/kmod.h> +#include <linux/module.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> #include <linux/hash.h> @@ -52,15 +57,21 @@ #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ +#include <linux/eventfd.h> +#include <linux/poll.h> #include <asm/atomic.h> static DEFINE_MUTEX(cgroup_mutex); -/* Generate an array of cgroup subsystem pointers */ +/* + * Generate an array of cgroup subsystem pointers. At boot time, this is + * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are + * registered after that. The mutable section of this array is protected by + * cgroup_mutex. + */ #define SUBSYS(_x) &_x ## _subsys, - -static struct cgroup_subsys *subsys[] = { +static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; @@ -147,6 +158,35 @@ struct css_id { unsigned short stack[0]; /* Array of Length (depth+1) */ }; +/* + * cgroup_event represents events which userspace want to recieve. + */ +struct cgroup_event { + /* + * Cgroup which the event belongs to. + */ + struct cgroup *cgrp; + /* + * Control file which the event associated. + */ + struct cftype *cft; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; /* The list of hierarchy roots */ @@ -250,7 +290,8 @@ struct cg_cgroup_link { static struct css_set init_css_set; static struct cg_cgroup_link init_css_set_link; -static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); +static int cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *css); /* css_set_lock protects the list of css_set objects, and the * chain of tasks off each css_set. Nests outside task->alloc_lock @@ -448,8 +489,11 @@ static struct css_set *find_existing_css_set( struct hlist_node *node; struct css_set *cg; - /* Built the set of subsystem state objects that we want to - * see in the new css_set */ + /* + * Build the set of subsystem state objects that we want to see in the + * new css_set. while subsystems can change globally, the entries here + * won't change, so no need for locking. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { if (root->subsys_bits & (1UL << i)) { /* Subsystem is in this hierarchy. So we want @@ -696,6 +740,7 @@ void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } +EXPORT_SYMBOL_GPL(cgroup_lock); /** * cgroup_unlock - release lock on cgroup changes @@ -706,6 +751,7 @@ void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } +EXPORT_SYMBOL_GPL(cgroup_unlock); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -757,6 +803,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) if (ret) break; } + return ret; } @@ -884,7 +931,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) css_put(css); } - +/* + * Call with cgroup_mutex held. Drops reference counts on modules, including + * any duplicate ones that parse_cgroupfs_options took. If this function + * returns an error, no reference counts are touched. + */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -892,6 +943,8 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup *cgrp = &root->top_cgroup; int i; + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + removed_bits = root->actual_subsys_bits & ~final_bits; added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ @@ -900,6 +953,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; + /* + * Nobody should tell us to do a subsys that doesn't exist: + * parse_cgroupfs_options should catch that case and refcounts + * ensure that subsystems won't disappear once selected. + */ + BUG_ON(ss == NULL); if (ss->root != &rootnode) { /* Subsystem isn't free */ return -EBUSY; @@ -919,6 +978,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long bit = 1UL << i; if (bit & added_bits) { /* We're binding this subsystem to this hierarchy */ + BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); @@ -930,8 +990,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(ss, cgrp); mutex_unlock(&ss->hierarchy_mutex); + /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { /* We're removing this subsystem */ + BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); @@ -942,9 +1004,20 @@ static int rebind_subsystems(struct cgroupfs_root *root, subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); mutex_unlock(&ss->hierarchy_mutex); + /* subsystem is now free - drop reference on module */ + module_put(ss->module); } else if (bit & final_bits) { /* Subsystem state should already exist */ + BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); + /* + * a refcount was taken, but we already had one, so + * drop the extra reference. + */ + module_put(ss->module); +#ifdef CONFIG_MODULE_UNLOAD + BUG_ON(ss->module && !module_refcount(ss->module)); +#endif } else { /* Subsystem state shouldn't exist */ BUG_ON(cgrp->subsys[i]); @@ -986,13 +1059,20 @@ struct cgroup_sb_opts { }; -/* Convert a hierarchy specifier into a bitmask of subsystems and - * flags. */ -static int parse_cgroupfs_options(char *data, - struct cgroup_sb_opts *opts) +/* + * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call + * with cgroup_mutex held to protect the subsys[] array. This function takes + * refcounts on subsystems to be used, unless it returns error, in which case + * no refcounts are taken. + */ +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data ?: "all"; unsigned long mask = (unsigned long)-1; + int i; + bool module_pin_failed = false; + + BUG_ON(!mutex_is_locked(&cgroup_mutex)); #ifdef CONFIG_CPUSETS mask = ~(1UL << cpuset_subsys_id); @@ -1005,10 +1085,11 @@ static int parse_cgroupfs_options(char *data, return -EINVAL; if (!strcmp(token, "all")) { /* Add all non-disabled subsystems */ - int i; opts->subsys_bits = 0; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (!ss->disabled) opts->subsys_bits |= 1ul << i; } @@ -1026,7 +1107,6 @@ static int parse_cgroupfs_options(char *data, if (!opts->release_agent) return -ENOMEM; } else if (!strncmp(token, "name=", 5)) { - int i; const char *name = token + 5; /* Can't specify an empty name */ if (!strlen(name)) @@ -1050,9 +1130,10 @@ static int parse_cgroupfs_options(char *data, return -ENOMEM; } else { struct cgroup_subsys *ss; - int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; + if (ss == NULL) + continue; if (!strcmp(token, ss->name)) { if (!ss->disabled) set_bit(i, &opts->subsys_bits); @@ -1087,9 +1168,54 @@ static int parse_cgroupfs_options(char *data, if (!opts->subsys_bits && !opts->name) return -EINVAL; + /* + * Grab references on all the modules we'll need, so the subsystems + * don't dance around before rebind_subsystems attaches them. This may + * take duplicate reference counts on a subsystem that's already used, + * but rebind_subsystems handles this case. + */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + if (!try_module_get(subsys[i]->module)) { + module_pin_failed = true; + break; + } + } + if (module_pin_failed) { + /* + * oops, one of the modules was going away. this means that we + * raced with a module_delete call, and to the user this is + * essentially a "subsystem doesn't exist" case. + */ + for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { + /* drop refcounts only on the ones we took */ + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + module_put(subsys[i]->module); + } + return -ENOENT; + } + return 0; } +static void drop_parsed_module_refcounts(unsigned long subsys_bits) +{ + int i; + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & subsys_bits)) + continue; + module_put(subsys[i]->module); + } +} + static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; @@ -1106,21 +1232,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - /* Don't allow flags to change at remount */ - if (opts.flags != root->flags) { - ret = -EINVAL; - goto out_unlock; - } - - /* Don't allow name to change at remount */ - if (opts.name && strcmp(opts.name, root->name)) { + /* Don't allow flags or name to change at remount */ + if (opts.flags != root->flags || + (opts.name && strcmp(opts.name, root->name))) { ret = -EINVAL; + drop_parsed_module_refcounts(opts.subsys_bits); goto out_unlock; } ret = rebind_subsystems(root, opts.subsys_bits); - if (ret) + if (ret) { + drop_parsed_module_refcounts(opts.subsys_bits); goto out_unlock; + } /* (re)populate subsystem files */ cgroup_populate_dir(cgrp); @@ -1151,6 +1275,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + INIT_LIST_HEAD(&cgrp->event_list); + spin_lock_init(&cgrp->event_list_lock); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -1306,7 +1432,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ + mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); + mutex_unlock(&cgroup_mutex); if (ret) goto out_err; @@ -1317,7 +1445,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto out_err; + goto drop_modules; } opts.new_root = new_root; @@ -1326,7 +1454,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_drop_root(opts.new_root); - goto out_err; + goto drop_modules; } root = sb->s_fs_info; @@ -1382,6 +1510,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, free_cg_links(&tmp_cg_links); goto drop_new_super; } + /* + * There must be no failure case after here, since rebinding + * takes care of subsystems' refcounts, which are explicitly + * dropped in the failure exit path. + */ /* EBUSY should be the only error here */ BUG_ON(ret); @@ -1420,6 +1553,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, * any) is not needed */ cgroup_drop_root(opts.new_root); + /* no subsys rebinding, so refcounts don't change */ + drop_parsed_module_refcounts(opts.subsys_bits); } simple_set_mnt(mnt, sb); @@ -1429,6 +1564,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, drop_new_super: deactivate_locked_super(sb); + drop_modules: + drop_parsed_module_refcounts(opts.subsys_bits); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -1542,6 +1679,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) memmove(buf, start, buf + buflen - start); return 0; } +EXPORT_SYMBOL_GPL(cgroup_path); /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' @@ -1554,7 +1692,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { int retval = 0; - struct cgroup_subsys *ss; + struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct css_set *cg; struct css_set *newcg; @@ -1568,8 +1706,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { retval = ss->can_attach(ss, cgrp, tsk, false); - if (retval) - return retval; + if (retval) { + /* + * Remember on which subsystem the can_attach() + * failed, so that we only call cancel_attach() + * against the subsystems whose can_attach() + * succeeded. (See below) + */ + failed_ss = ss; + goto out; + } } } @@ -1583,14 +1729,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) */ newcg = find_css_set(cg, cgrp); put_css_set(cg); - if (!newcg) - return -ENOMEM; + if (!newcg) { + retval = -ENOMEM; + goto out; + } task_lock(tsk); if (tsk->flags & PF_EXITING) { task_unlock(tsk); put_css_set(newcg); - return -ESRCH; + retval = -ESRCH; + goto out; } rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1616,7 +1765,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) * is no longer empty. */ cgroup_wakeup_rmdir_waiter(cgrp); - return 0; +out: + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) + /* + * This subsystem was the one that failed the + * can_attach() check earlier, so we don't need + * to call cancel_attach() against it or any + * remaining subsystems. + */ + break; + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, tsk, false); + } + } + return retval; } /* @@ -1682,6 +1846,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp) } return true; } +EXPORT_SYMBOL_GPL(cgroup_lock_live_group); static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) @@ -1950,6 +2115,16 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; +/* + * Check if a file is a control file + */ +static inline struct cftype *__file_cft(struct file *file) +{ + if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) + return ERR_PTR(-EINVAL); + return __d_cft(file->f_dentry); +} + static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { @@ -2069,6 +2244,7 @@ int cgroup_add_file(struct cgroup *cgrp, error = PTR_ERR(dentry); return error; } +EXPORT_SYMBOL_GPL(cgroup_add_file); int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, @@ -2083,6 +2259,7 @@ int cgroup_add_files(struct cgroup *cgrp, } return 0; } +EXPORT_SYMBOL_GPL(cgroup_add_files); /** * cgroup_task_count - count the number of tasks in a cgroup. @@ -2468,7 +2645,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + struct pid_namespace *ns = current->nsproxy->pid_ns; + /* * We can't drop the pidlist_mutex before taking the l->mutex in case * the last ref-holder is trying to remove l from the list at the same @@ -2478,8 +2656,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, mutex_lock(&cgrp->pidlist_mutex); list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { - /* found a matching list - drop the extra refcount */ - put_pid_ns(ns); /* make sure l doesn't vanish out from under us */ down_write(&l->mutex); mutex_unlock(&cgrp->pidlist_mutex); @@ -2490,13 +2666,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) { mutex_unlock(&cgrp->pidlist_mutex); - put_pid_ns(ns); return l; } init_rwsem(&l->mutex); down_write(&l->mutex); l->key.type = type; - l->key.ns = ns; + l->key.ns = get_pid_ns(ns); l->use_count = 0; /* don't increment here */ l->list = NULL; l->owner = cgrp; @@ -2804,6 +2979,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, } /* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void cgroup_event_remove(struct work_struct *work) +{ + struct cgroup_event *event = container_of(work, struct cgroup_event, + remove); + struct cgroup *cgrp = event->cgrp; + + /* TODO: check return code */ + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + + eventfd_ctx_put(event->eventfd); + kfree(event); + dput(cgrp->dentry); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct cgroup_event *event = container_of(wait, + struct cgroup_event, wait); + struct cgroup *cgrp = event->cgrp; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + remove_wait_queue_locked(event->wqh, &event->wait); + spin_lock(&cgrp->event_list_lock); + list_del(&event->list); + spin_unlock(&cgrp->event_list_lock); + /* + * We are in atomic context, but cgroup_event_remove() may + * sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + + return 0; +} + +static void cgroup_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct cgroup_event *event = container_of(pt, + struct cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * Parse input and register new cgroup event handler. + * + * Input must be in format '<event_fd> <control_fd> <args>'. + * Interpretation of args is defined by control file implementation. + */ +static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + struct cgroup_event *event = NULL; + unsigned int efd, cfd; + struct file *efile = NULL; + struct file *cfile = NULL; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + event->cgrp = cgrp; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); + INIT_WORK(&event->remove, cgroup_event_remove); + + efile = eventfd_fget(efd); + if (IS_ERR(efile)) { + ret = PTR_ERR(efile); + goto fail; + } + + event->eventfd = eventfd_ctx_fileget(efile); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto fail; + } + + cfile = fget(cfd); + if (!cfile) { + ret = -EBADF; + goto fail; + } + + /* the process need read permission on control file */ + ret = file_permission(cfile, MAY_READ); + if (ret < 0) + goto fail; + + event->cft = __file_cft(cfile); + if (IS_ERR(event->cft)) { + ret = PTR_ERR(event->cft); + goto fail; + } + + if (!event->cft->register_event || !event->cft->unregister_event) { + ret = -EINVAL; + goto fail; + } + + ret = event->cft->register_event(cgrp, event->cft, + event->eventfd, buffer); + if (ret) + goto fail; + + if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + ret = 0; + goto fail; + } + + /* + * Events should be removed after rmdir of cgroup directory, but before + * destroying subsystem state objects. Let's take reference to cgroup + * directory dentry to do that. + */ + dget(cgrp->dentry); + + spin_lock(&cgrp->event_list_lock); + list_add(&event->list, &cgrp->event_list); + spin_unlock(&cgrp->event_list_lock); + + fput(cfile); + fput(efile); + + return 0; + +fail: + if (cfile) + fput(cfile); + + if (event && event->eventfd && !IS_ERR(event->eventfd)) + eventfd_ctx_put(event->eventfd); + + if (!IS_ERR_OR_NULL(efile)) + fput(efile); + + kfree(event); + + return ret; +} + +/* * for the common functions, 'private' gives the type of file */ /* for hysterical raisins, we can't put this on the older files */ @@ -2828,6 +3171,11 @@ static struct cftype files[] = { .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, + { + .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .write_string = cgroup_write_event_control, + .mode = S_IWUGO, + }, }; static struct cftype cft_release_agent = { @@ -2892,8 +3240,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) /* We need to take each hierarchy_mutex in a consistent order */ int i; + /* + * No worry about a race with rebind_subsystems that might mess up the + * locking order, since both parties are under cgroup_mutex. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (ss->root == root) mutex_lock(&ss->hierarchy_mutex); } @@ -2905,6 +3259,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (ss->root == root) mutex_unlock(&ss->hierarchy_mutex); } @@ -3028,11 +3384,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * synchronization other than RCU, and the subsystem linked * list isn't RCU-safe */ int i; + /* + * We won't need to lock the subsys array, because the subsystems + * we're concerned about aren't going anywhere since our cgroup root + * has a reference on them. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; struct cgroup_subsys_state *css; - /* Skip subsystems not in this hierarchy */ - if (ss->root != cgrp->root) + /* Skip subsystems not present or not in this hierarchy */ + if (ss == NULL || ss->root != cgrp->root) continue; css = cgrp->subsys[ss->subsys_id]; /* When called from check_for_release() it's possible @@ -3106,6 +3467,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct dentry *d; struct cgroup *parent; DEFINE_WAIT(wait); + struct cgroup_event *event, *tmp; int ret; /* the vfs holds both inode->i_mutex already */ @@ -3189,6 +3551,20 @@ again: set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del(&event->list); + remove_wait_queue(event->wqh, &event->wait); + eventfd_signal(event->eventfd, 1); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + mutex_unlock(&cgroup_mutex); return 0; } @@ -3223,7 +3599,196 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_init(&ss->hierarchy_mutex); lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; + + /* this function shouldn't be used with modular subsystems, since they + * need to register a subsys_id, among other things */ + BUG_ON(ss->module); +} + +/** + * cgroup_load_subsys: load and register a modular subsystem at runtime + * @ss: the subsystem to load + * + * This function should be called in a modular subsystem's initcall. If the + * subsytem is built as a module, it will be assigned a new subsys_id and set + * up for use. If the subsystem is built-in anyway, work is delegated to the + * simpler cgroup_init_subsys. + */ +int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) +{ + int i; + struct cgroup_subsys_state *css; + + /* check name and function validity */ + if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || + ss->create == NULL || ss->destroy == NULL) + return -EINVAL; + + /* + * we don't support callbacks in modular subsystems. this check is + * before the ss->module check for consistency; a subsystem that could + * be a module should still have no callbacks even if the user isn't + * compiling it as one. + */ + if (ss->fork || ss->exit) + return -EINVAL; + + /* + * an optionally modular subsystem is built-in: we want to do nothing, + * since cgroup_init_subsys will have already taken care of it. + */ + if (ss->module == NULL) { + /* a few sanity checks */ + BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); + BUG_ON(subsys[ss->subsys_id] != ss); + return 0; + } + + /* + * need to register a subsys id before anything else - for example, + * init_cgroup_css needs it. + */ + mutex_lock(&cgroup_mutex); + /* find the first empty slot in the array */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + if (subsys[i] == NULL) + break; + } + if (i == CGROUP_SUBSYS_COUNT) { + /* maximum number of subsystems already registered! */ + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + /* assign ourselves the subsys_id */ + ss->subsys_id = i; + subsys[i] = ss; + + /* + * no ss->create seems to need anything important in the ss struct, so + * this can happen first (i.e. before the rootnode attachment). + */ + css = ss->create(ss, dummytop); + if (IS_ERR(css)) { + /* failure case - need to deassign the subsys[] slot. */ + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return PTR_ERR(css); + } + + list_add(&ss->sibling, &rootnode.subsys_list); + ss->root = &rootnode; + + /* our new subsystem will be attached to the dummy hierarchy. */ + init_cgroup_css(css, ss, dummytop); + /* init_idr must be after init_cgroup_css because it sets css->id. */ + if (ss->use_id) { + int ret = cgroup_init_idr(ss, css); + if (ret) { + dummytop->subsys[ss->subsys_id] = NULL; + ss->destroy(ss, dummytop); + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return ret; + } + } + + /* + * Now we need to entangle the css into the existing css_sets. unlike + * in cgroup_init_subsys, there are now multiple css_sets, so each one + * will need a new pointer to it; done by iterating the css_set_table. + * furthermore, modifying the existing css_sets will corrupt the hash + * table state, so each changed css_set will need its hash recomputed. + * this is all done under the css_set_lock. + */ + write_lock(&css_set_lock); + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { + struct css_set *cg; + struct hlist_node *node, *tmp; + struct hlist_head *bucket = &css_set_table[i], *new_bucket; + + hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { + /* skip entries that we already rehashed */ + if (cg->subsys[ss->subsys_id]) + continue; + /* remove existing entry */ + hlist_del(&cg->hlist); + /* set new value */ + cg->subsys[ss->subsys_id] = css; + /* recompute hash and restore entry */ + new_bucket = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, new_bucket); + } + } + write_unlock(&css_set_lock); + + mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); + ss->active = 1; + + /* success! */ + mutex_unlock(&cgroup_mutex); + return 0; } +EXPORT_SYMBOL_GPL(cgroup_load_subsys); + +/** + * cgroup_unload_subsys: unload a modular subsystem + * @ss: the subsystem to unload + * + * This function should be called in a modular subsystem's exitcall. When this + * function is invoked, the refcount on the subsystem's module will be 0, so + * the subsystem will not be attached to any hierarchy. + */ +void cgroup_unload_subsys(struct cgroup_subsys *ss) +{ + struct cg_cgroup_link *link; + struct hlist_head *hhead; + + BUG_ON(ss->module == NULL); + + /* + * we shouldn't be called if the subsystem is in use, and the use of + * try_module_get in parse_cgroupfs_options should ensure that it + * doesn't start being used while we're killing it off. + */ + BUG_ON(ss->root != &rootnode); + + mutex_lock(&cgroup_mutex); + /* deassign the subsys_id */ + BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); + subsys[ss->subsys_id] = NULL; + + /* remove subsystem from rootnode's list of subsystems */ + list_del(&ss->sibling); + + /* + * disentangle the css from all css_sets attached to the dummytop. as + * in loading, we need to pay our respects to the hashtable gods. + */ + write_lock(&css_set_lock); + list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + + hlist_del(&cg->hlist); + BUG_ON(!cg->subsys[ss->subsys_id]); + cg->subsys[ss->subsys_id] = NULL; + hhead = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, hhead); + } + write_unlock(&css_set_lock); + + /* + * remove subsystem's css from the dummytop and free it - need to free + * before marking as null because ss->destroy needs the cgrp->subsys + * pointer to find their state. note that this also takes care of + * freeing the css_id. + */ + ss->destroy(ss, dummytop); + dummytop->subsys[ss->subsys_id] = NULL; + + mutex_unlock(&cgroup_mutex); +} +EXPORT_SYMBOL_GPL(cgroup_unload_subsys); /** * cgroup_init_early - cgroup initialization at system boot @@ -3253,7 +3818,8 @@ int __init cgroup_init_early(void) for (i = 0; i < CSS_SET_TABLE_SIZE; i++) INIT_HLIST_HEAD(&css_set_table[i]); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; BUG_ON(!ss->name); @@ -3288,12 +3854,13 @@ int __init cgroup_init(void) if (err) return err; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!ss->early_init) cgroup_init_subsys(ss); if (ss->use_id) - cgroup_subsys_init_idr(ss); + cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* Add init_css_set to the hash table */ @@ -3397,9 +3964,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. + */ mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); @@ -3457,7 +4031,12 @@ void cgroup_fork_callbacks(struct task_struct *child) { if (need_forkexit_callback) { int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * forkexit callbacks are only supported for builtin + * subsystems, and the builtin section of the subsys array is + * immutable, so we don't need to lock the subsys array here. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) ss->fork(ss, child); @@ -3526,7 +4105,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct css_set *cg; if (run_callbacks && need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->exit) ss->exit(ss, tsk); @@ -3720,12 +4303,13 @@ static void check_for_release(struct cgroup *cgrp) } } -void __css_put(struct cgroup_subsys_state *css) +/* Caller must verify that the css is not for root cgroup */ +void __css_put(struct cgroup_subsys_state *css, int count) { struct cgroup *cgrp = css->cgroup; int val; rcu_read_lock(); - val = atomic_dec_return(&css->refcnt); + val = atomic_sub_return(count, &css->refcnt); if (val == 1) { if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); @@ -3736,6 +4320,7 @@ void __css_put(struct cgroup_subsys_state *css) rcu_read_unlock(); WARN_ON_ONCE(val < 1); } +EXPORT_SYMBOL_GPL(__css_put); /* * Notify userspace when a cgroup is released, by running the @@ -3817,8 +4402,11 @@ static int __init cgroup_disable(char *str) while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * cgroup_disable, being at boot time, can't know about module + * subsystems, so we don't worry about them. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!strcmp(token, ss->name)) { @@ -3848,6 +4436,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) return cssid->id; return 0; } +EXPORT_SYMBOL_GPL(css_id); unsigned short css_depth(struct cgroup_subsys_state *css) { @@ -3857,6 +4446,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) return cssid->depth; return 0; } +EXPORT_SYMBOL_GPL(css_depth); bool css_is_ancestor(struct cgroup_subsys_state *child, const struct cgroup_subsys_state *root) @@ -3893,6 +4483,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) spin_unlock(&ss->id_lock); call_rcu(&id->rcu_head, __free_css_id_cb); } +EXPORT_SYMBOL_GPL(free_css_id); /* * This is called by init or create(). Then, calls to this function are @@ -3942,15 +4533,14 @@ err_out: } -static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) +static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *rootcss) { struct css_id *newid; - struct cgroup_subsys_state *rootcss; spin_lock_init(&ss->id_lock); idr_init(&ss->idr); - rootcss = init_css_set.subsys[ss->subsys_id]; newid = get_new_cssid(ss, 0); if (IS_ERR(newid)) return PTR_ERR(newid); @@ -4010,6 +4600,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) return rcu_dereference(cssid->css); } +EXPORT_SYMBOL_GPL(css_lookup); /** * css_get_next - lookup next cgroup under specified hierarchy. diff --git a/kernel/exit.c b/kernel/exit.c index ce1e48c2d93d..cce59cb5ee6a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -87,7 +87,7 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference_check(tsk->sighand, rcu_read_lock_held() || - lockdep_is_held(&tasklist_lock)); + lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index b0ec34abc0bb..4799c5f0e6d0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,7 +86,14 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -EXPORT_SYMBOL_GPL(tasklist_lock); + +#ifdef CONFIG_PROVE_RCU +int lockdep_tasklist_lock_is_held(void) +{ + return lockdep_is_held(&tasklist_lock); +} +EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); +#endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { @@ -833,17 +840,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) /* Thread group counters. */ thread_group_cputime_init(sig); - /* Expiration times and increments. */ - sig->it[CPUCLOCK_PROF].expires = cputime_zero; - sig->it[CPUCLOCK_PROF].incr = cputime_zero; - sig->it[CPUCLOCK_VIRT].expires = cputime_zero; - sig->it[CPUCLOCK_VIRT].incr = cputime_zero; - - /* Cached expiration times. */ - sig->cputime_expires.prof_exp = cputime_zero; - sig->cputime_expires.virt_exp = cputime_zero; - sig->cputime_expires.sched_exp = 0; - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); @@ -863,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_THREAD) return 0; - sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; @@ -871,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->count, 1); atomic_set(&sig->live, 1); init_waitqueue_head(&sig->wait_chldexit); - sig->flags = 0; if (clone_flags & CLONE_NEWPID) sig->flags |= SIGNAL_UNKILLABLE; - sig->group_exit_code = 0; - sig->group_exit_task = NULL; - sig->group_stop_count = 0; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->leader = 0; /* session leadership doesn't inherit */ - sig->tty_old_pgrp = NULL; - sig->tty = NULL; - - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; - sig->gtime = cputime_zero; - sig->cgtime = cputime_zero; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - sig->prev_utime = sig->prev_stime = cputime_zero; -#endif - sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; - sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; - sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->maxrss = sig->cmaxrss = 0; - task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; - taskstats_tgid_init(sig); - task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); - acct_init_pacct(&sig->pacct); - tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 967e66143e11..03808ed342a6 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); * * @return a set of per_cpu pointers to perf events */ -struct perf_event ** +struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered) { - struct perf_event **cpu_events, **pevent, *bp; + struct perf_event * __percpu *cpu_events, **pevent, *bp; long err; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) - return ERR_PTR(-ENOMEM); + return (void __percpu __force *)ERR_PTR(-ENOMEM); get_online_cpus(); for_each_online_cpu(cpu) { @@ -451,7 +451,7 @@ fail: put_online_cpus(); free_percpu(cpu_events); - return ERR_PTR(err); + return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ -void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; struct perf_event **pevent; @@ -489,5 +489,4 @@ struct pmu perf_ops_bp = { .enable = arch_install_hw_breakpoint, .disable = arch_uninstall_hw_breakpoint, .read = hw_breakpoint_pmu_read, - .unthrottle = hw_breakpoint_pmu_unthrottle }; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d70394f12ee9..42ec11b2af8a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -554,7 +554,7 @@ out: * signal. The occurence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one - * is handled by the assosiacted event handler. If this happens it + * is handled by the associated event handler. If this happens it * might be necessary to disable (mask) the interrupt depending on the * controller hardware. This requires to reenable the interrupt inside * of the loop which handles the interrupts which have arrived while diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index d06df9c41cba..1ef4ffcdfa55 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * automatically freed on driver detach. * * If an IRQ allocated with this function needs to be freed - * separately, dev_free_irq() must be used. + * separately, devm_free_irq() must be used. */ int devm_request_threaded_irq(struct device *dev, unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, @@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq); * Except for the extra @dev argument, this function takes the * same arguments and performs the same function as free_irq(). * This function instead of free_irq() should be used to manually - * free IRQs allocated with dev_request_irq(). + * free IRQs allocated with devm_request_irq(). */ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6b1ccc3f0205..21fe3c426948 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, } KERNEL_ATTR_RO(uevent_seqnum); -/* uevent helper program, used during early boo */ +/* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0c30d0455de1..681bc2e1e187 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3822,6 +3822,7 @@ void lockdep_rcu_dereference(const char *file, const int line) printk("%s:%d invoked rcu_dereference_check() without protection!\n", file, line); printk("\nother info that might help us debug this:\n\n"); + printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); diff --git a/kernel/module.c b/kernel/module.c index e5538d5f00ad..c968d3606dca 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1085,6 +1085,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, if (sattr->name == NULL) goto out; sect_attrs->nsections++; + sysfs_attr_init(&sattr->mattr.attr); sattr->mattr.show = module_sect_show; sattr->mattr.store = NULL; sattr->mattr.attr.name = sattr->name; @@ -1180,6 +1181,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, if (sect_empty(&sechdrs[i])) continue; if (sechdrs[i].sh_type == SHT_NOTE) { + sysfs_bin_attr_init(nattr); nattr->attr.name = mod->sect_attrs->attrs[loaded].name; nattr->attr.mode = S_IRUGO; nattr->size = sechdrs[i].sh_size; @@ -1252,6 +1254,7 @@ int module_add_modinfo_attrs(struct module *mod) if (!attr->test || (attr->test && attr->test(mod))) { memcpy(temp_attr, attr, sizeof(*temp_attr)); + sysfs_attr_init(&temp_attr->attr); error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); ++temp_attr; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 09b4ff9711b2..2ab67233ee8f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -24,7 +24,18 @@ static struct kmem_cache *nsproxy_cachep; -struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); +struct nsproxy init_nsproxy = { + .count = ATOMIC_INIT(1), + .uts_ns = &init_uts_ns, +#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) + .ipc_ns = &init_ipc_ns, +#endif + .mnt_ns = NULL, + .pid_ns = &init_pid_ns, +#ifdef CONFIG_NET + .net_ns = &init_net, +#endif +}; static inline struct nsproxy *create_nsproxy(void) { diff --git a/kernel/params.c b/kernel/params.c index 8d95f5451b22..0b30ecd53a52 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -401,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp) } /* sysfs output in /sys/modules/XYZ/parameters/ */ -#define to_module_attr(n) container_of(n, struct module_attribute, attr); -#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); +#define to_module_attr(n) container_of(n, struct module_attribute, attr) +#define to_module_kobject(n) container_of(n, struct module_kobject, kobj) extern struct kernel_param __start___param[], __stop___param[]; @@ -420,7 +420,7 @@ struct module_param_attrs }; #ifdef CONFIG_SYSFS -#define to_param_attr(n) container_of(n, struct param_attribute, mattr); +#define to_param_attr(n) container_of(n, struct param_attribute, mattr) static ssize_t param_attr_show(struct module_attribute *mattr, struct module *mod, char *buf) @@ -516,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, new->grp.attrs = attrs; /* Tack new one on the end. */ + sysfs_attr_init(&new->attrs[num].mattr.attr); new->attrs[num].param = kp; new->attrs[num].mattr.show = param_attr_show; new->attrs[num].mattr.store = param_attr_store; @@ -722,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj, return ret; } -static struct sysfs_ops module_sysfs_ops = { +static const struct sysfs_ops module_sysfs_ops = { .show = module_attr_show, .store = module_attr_store, }; @@ -736,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) return 0; } -static struct kset_uevent_ops module_uevent_ops = { +static const struct kset_uevent_ops module_uevent_ops = { .filter = uevent_filter, }; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8e352c756ba7..4393b9e73740 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly; */ int sysctl_perf_event_paranoid __read_mostly = 1; -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_event_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_event_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_event_paranoid > 1; -} - int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* @@ -4123,8 +4108,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, if (rctx < 0) return; - data.addr = addr; - data.raw = NULL; + perf_sample_data_init(&data, addr); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); @@ -4169,11 +4153,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) struct perf_event *event; u64 period; - event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); event->pmu->read(event); - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); /* @@ -4337,17 +4320,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { + struct pt_regs *regs = get_irq_regs(); + struct perf_sample_data data; struct perf_raw_record raw = { .size = entry_size, .data = record, }; - struct perf_sample_data data = { - .addr = addr, - .raw = &raw, - }; - - struct pt_regs *regs = get_irq_regs(); + perf_sample_data_init(&data, addr); + data.raw = &raw; if (!regs) regs = task_pt_regs(current); @@ -4463,8 +4444,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; - sample.raw = NULL; - sample.addr = bp->attr.bp_addr; + perf_sample_data_init(&sample, bp->attr.bp_addr); if (!perf_exclude_event(bp, regs)) perf_swevent_add(bp, 1, 1, &sample, regs); @@ -5481,13 +5461,16 @@ void __init perf_event_init(void) register_cpu_notifier(&perf_cpu_nb); } -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", perf_reserved_percpu); } static ssize_t perf_set_reserve_percpu(struct sysdev_class *class, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { @@ -5516,13 +5499,17 @@ perf_set_reserve_percpu(struct sysdev_class *class, return count; } -static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +static ssize_t perf_show_overcommit(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", perf_overcommit); } static ssize_t -perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +perf_set_overcommit(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + const char *buf, size_t count) { unsigned long val; int err; diff --git a/kernel/pid.c b/kernel/pid.c index 86b296943e5f..aebb30d9c233 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); + first = rcu_dereference_check(pid->tasks[type].first, + rcu_read_lock_held() || + lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pids[(type)].node); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 86b3796b0436..79aac93acf99 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -161,13 +161,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rcu_read_lock(); /* - * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring - * any nested-container's init processes don't ignore the - * signal + * Any nested-container's init processes won't ignore the + * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). */ task = pid_task(find_vpid(nr), PIDTYPE_PID); if (task) - force_sig(SIGKILL, task); + send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); rcu_read_unlock(); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1439eb504c22..4a525a30e08e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -246,12 +246,21 @@ struct rcu_data { #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ - /* to take at least one */ - /* scheduling clock irq */ - /* before ratting on them. */ + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) + /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) + /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 464ad2cdee00..79b53bda8943 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1010,6 +1010,10 @@ int rcu_needs_cpu(int cpu) int c = 0; int thatcpu; + /* Check for being in the holdoff period. */ + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) + return rcu_needs_cpu_quick_check(cpu); + /* Don't bother unless we are the last non-dyntick-idle CPU. */ for_each_cpu_not(thatcpu, nohz_cpu_mask) if (thatcpu != cpu) { @@ -1041,10 +1045,8 @@ int rcu_needs_cpu(int cpu) } /* If RCU callbacks are still pending, RCU still needs this CPU. */ - if (c) { + if (c) raise_softirq(RCU_SOFTIRQ); - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - } return c; } diff --git a/kernel/sched.c b/kernel/sched.c index b47ceeec1a91..9ab3cd7858d3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2359,7 +2359,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq, *orig_rq; + struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) wake_flags &= ~WF_SYNC; @@ -2367,7 +2367,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = orig_rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); update_rq_clock(rq); if (!(p->state & state)) goto out; @@ -7406,11 +7406,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) #ifdef CONFIG_SCHED_MC static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + struct sysdev_class_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); @@ -7422,11 +7424,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, #ifdef CONFIG_SCHED_SMT static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 82095bf2099f..fccf9fbb0d7b 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -56,7 +56,7 @@ static int convert_prio(int prio) * @lowest_mask: A mask to fill in with selected CPUs (or NULL) * * Note: This function returns the recommended CPUs as calculated during the - * current invokation. By the time the call returns, the CPUs may have in + * current invocation. By the time the call returns, the CPUs may have in * fact changed priorities any number of times. While not ideal, it is not * an issue of correctness since the normal rebalancer logic will correct * any discrepancies created by racing against the uncertainty of the current diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3e1fd96c6cf9..5a5ea2cd924f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3476,7 +3476,7 @@ static void run_rebalance_domains(struct softirq_action *h) static inline int on_null_domain(int cpu) { - return !rcu_dereference(cpu_rq(cpu)->sd); + return !rcu_dereference_sched(cpu_rq(cpu)->sd); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 5a6ed1f0990a..b5b920ae2ea7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1146,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) if (next && next->prio < idx) continue; list_for_each_entry(rt_se, array->queue + idx, run_list) { - struct task_struct *p = rt_task_of(rt_se); + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; break; diff --git a/kernel/sys.c b/kernel/sys.c index 9814e43fb23b..8298878f4f71 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,6 +33,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> #include <linux/cpu.h> +#include <linux/personality.h> #include <linux/ptrace.h> #include <linux/fs_struct.h> @@ -1114,6 +1115,15 @@ out: DECLARE_RWSEM(uts_sem); +#ifdef COMPAT_UTS_MACHINE +#define override_architecture(name) \ + (current->personality == PER_LINUX32 && \ + copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ + sizeof(COMPAT_UTS_MACHINE))) +#else +#define override_architecture(name) 0 +#endif + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1122,9 +1132,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); + + if (!errno && override_architecture(name)) + errno = -EFAULT; return errno; } +#ifdef __ARCH_WANT_SYS_OLD_UNAME +/* + * Old cruft + */ +SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) +{ + int error = 0; + + if (!name) + return -EFAULT; + + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof(*name))) + error = -EFAULT; + up_read(&uts_sem); + + if (!error && override_architecture(name)) + error = -EFAULT; + return error; +} + +SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) +{ + int error; + + if (!name) + return -EFAULT; + if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) + return -EFAULT; + + down_read(&uts_sem); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); + up_read(&uts_sem); + + if (!error && override_architecture(name)) + error = -EFAULT; + return error ? -EFAULT : 0; +} +#endif + SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 695384f12a7d..70f2ea758ffe 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(sys_ipc); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); cond_syscall(sys_flock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0ef19c614f6d..8686b0f5fc12 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include <linux/swap.h> #include <linux/slab.h> #include <linux/sysctl.h> +#include <linux/signal.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> @@ -60,13 +61,23 @@ #include <asm/stacktrace.h> #include <asm/io.h> #endif +#ifdef CONFIG_BSD_PROCESS_ACCT +#include <linux/acct.h> +#endif +#ifdef CONFIG_RT_MUTEXES +#include <linux/rtmutex.h> +#endif +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) +#include <linux/lockdep.h> +#endif +#ifdef CONFIG_CHR_DEV_SG +#include <scsi/sg.h> +#endif #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int C_A_D; -extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; @@ -88,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif -#ifdef CONFIG_RCU_TORTURE_TEST -extern int rcutorture_runnable; -#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ #ifdef CONFIG_BLOCK extern int blk_iopoll_enabled; #endif @@ -120,14 +128,6 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_MODULES -extern char modprobe_path[]; -extern int modules_disabled; -#endif -#ifdef CONFIG_CHR_DEV_SG -extern int sg_big_buff; -#endif - #ifdef CONFIG_SPARC #include <asm/system.h> #endif @@ -149,10 +149,6 @@ extern int sysctl_userprocess_debug; extern int spin_retry; #endif -#ifdef CONFIG_BSD_PROCESS_ACCT -extern int acct_parm[]; -#endif - #ifdef CONFIG_IA64 extern int no_unaligned_warning; extern int unaligned_dump_stack; @@ -160,10 +156,6 @@ extern int unaligned_dump_stack; extern struct ratelimit_state printk_ratelimit_state; -#ifdef CONFIG_RT_MUTEXES -extern int max_lock_depth; -#endif - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -202,9 +194,6 @@ extern struct ctl_table epoll_table[]; int sysctl_legacy_va_layout; #endif -extern int prove_locking; -extern int lock_stat; - /* The default sysctl tables: */ static struct ctl_table root_table[] = { diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1f663d23e85e..1f5dde637457 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -592,6 +592,10 @@ static inline void clocksource_select(void) { } */ static int __init clocksource_done_booting(void) { + mutex_lock(&clocksource_mutex); + curr_clocksource = clocksource_default_clock(); + mutex_unlock(&clocksource_mutex); + finished_booting = 1; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83783579378f..d9062f5cc0c0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include <linux/ctype.h> #include <linux/list.h> #include <linux/hash.h> +#include <linux/rcupdate.h> #include <trace/events/sched.h> @@ -84,22 +85,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); -#endif - +/* + * Traverse the ftrace_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { - struct ftrace_ops *op = ftrace_list; - - /* in case someone actually ports this to alpha! */ - read_barrier_depends(); + struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ while (op != &ftrace_list_end) { - /* silly alpha */ - read_barrier_depends(); op->func(ip, parent_ip); - op = op->next; + op = rcu_dereference_raw(op->next); /*see above*/ }; } @@ -154,8 +155,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) * the ops->next pointer is valid before another CPU sees * the ops pointer included into the ftrace_list. */ - smp_wmb(); - ftrace_list = ops; + rcu_assign_pointer(ftrace_list, ops); if (ftrace_enabled) { ftrace_func_t func; @@ -2276,6 +2276,8 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); + static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -3351,6 +3353,7 @@ void ftrace_graph_init_task(struct task_struct *t) { /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; + t->curr_ret_stack = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; @@ -3360,7 +3363,6 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0287f9f52f5a..05a9f83b8819 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2233,12 +2233,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; - if (atomic_read(&buffer->record_disabled)) - return NULL; - /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out_nocheck; + if (trace_recursive_lock()) goto out_nocheck; @@ -2470,11 +2470,11 @@ int ring_buffer_write(struct ring_buffer *buffer, if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - if (atomic_read(&buffer->record_disabled)) - return -EBUSY; - resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -2542,7 +2542,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct ring_buffer *buffer) { @@ -2578,7 +2578,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ed01fdba4a55..3ec2ee6f6560 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -374,6 +374,21 @@ static int __init set_buf_size(char *str) } __setup("trace_buf_size=", set_buf_size); +static int __init set_tracing_thresh(char *str) +{ + unsigned long threshhold; + int ret; + + if (!str) + return 0; + ret = strict_strtoul(str, 0, &threshhold); + if (ret < 0) + return 0; + tracing_thresh = threshhold * 1000; + return 1; +} +__setup("tracing_thresh=", set_tracing_thresh); + unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; @@ -579,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) static arch_spinlock_t ftrace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +unsigned long __read_mostly tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE unsigned long __read_mostly tracing_max_latency; -unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace @@ -592,7 +608,7 @@ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; - struct trace_array_cpu *max_data = tr->data[cpu]; + struct trace_array_cpu *max_data; max_tr.cpu = cpu; max_tr.time_start = data->preempt_timestamp; @@ -602,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; max_data->uid = task_uid(tsk); max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; @@ -824,10 +840,10 @@ out: mutex_unlock(&trace_types_lock); } -static void __tracing_reset(struct trace_array *tr, int cpu) +static void __tracing_reset(struct ring_buffer *buffer, int cpu) { ftrace_disable_cpu(); - ring_buffer_reset_cpu(tr->buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ftrace_enable_cpu(); } @@ -839,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu) /* Make sure all commits have finished */ synchronize_sched(); - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -857,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -934,6 +950,8 @@ void tracing_start(void) goto out; } + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); buffer = global_trace.buffer; if (buffer) @@ -943,6 +961,8 @@ void tracing_start(void) if (buffer) ring_buffer_record_enable(buffer); + arch_spin_unlock(&ftrace_max_lock); + ftrace_start(); out: spin_unlock_irqrestore(&tracing_start_lock, flags); @@ -964,6 +984,9 @@ void tracing_stop(void) if (trace_stop_count++) goto out; + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); + buffer = global_trace.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -972,6 +995,8 @@ void tracing_stop(void) if (buffer) ring_buffer_record_disable(buffer); + arch_spin_unlock(&ftrace_max_lock); + out: spin_unlock_irqrestore(&tracing_start_lock, flags); } @@ -1259,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; + /* + * NMIs can not handle page faults, even with fix ups. + * The save user stack can (and often does) fault. + */ + if (unlikely(in_nmi())) + return; + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) @@ -1703,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) ftrace_enable_cpu(); + iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; @@ -4248,10 +4281,10 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); +#endif trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); -#endif trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fd05bcaf91b0..2825ef2c0b15 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); extern unsigned long nsecs_to_usecs(unsigned long nsecs); +extern unsigned long tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE extern unsigned long tracing_max_latency; -extern unsigned long tracing_thresh; void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, @@ -550,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task) * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found * @buffer: holds the parsed user input - * @idx: user input lenght + * @idx: user input length * @size: buffer size */ struct trace_parser { diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 84a3a7ba072a..6fbfb8f417b9 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -13,6 +13,7 @@ * Tracer plugins will chose a default from these clocks. */ #include <linux/spinlock.h> +#include <linux/irqflags.h> #include <linux/hardirq.h> #include <linux/module.h> #include <linux/percpu.h> diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index f0d693005075..c1cc3ab633de 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -138,9 +138,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, cpu = smp_processor_id(); if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); + trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); else - trace_buf = rcu_dereference(perf_trace_buf); + trace_buf = rcu_dereference_sched(perf_trace_buf); if (!trace_buf) goto err; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 3fc2a575664f..e6989d9b44da 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -237,6 +237,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } +int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +{ + if (tracing_thresh) + return 1; + else + return trace_graph_entry(trace); +} + static void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -290,13 +298,26 @@ void set_graph_array(struct trace_array *tr) smp_mb(); } +void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +{ + if (tracing_thresh && + (trace->rettime - trace->calltime < tracing_thresh)) + return; + else + trace_graph_return(trace); +} + static int graph_trace_init(struct trace_array *tr) { int ret; set_graph_array(tr); - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + if (tracing_thresh) + ret = register_ftrace_graph(&trace_graph_thresh_return, + &trace_graph_thresh_entry); + else + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); @@ -920,7 +941,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; } else { - ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); + ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } |