diff options
Diffstat (limited to 'kernel')
160 files changed, 14152 insertions, 7495 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index bc010ee272b6..f2a8b6246ce9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -18,11 +18,13 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif +# cond_syscall is currently not LTO compatible +CFLAGS_sys_ni.o = $(DISABLE_LTO) + obj-y += sched/ obj-y += locking/ obj-y += power/ obj-y += printk/ -obj-y += cpu/ obj-y += irq/ obj-y += rcu/ @@ -93,6 +95,7 @@ obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o +obj-$(CONFIG_TORTURE_TEST) += torture.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/audit.c b/kernel/audit.c index 873b965fdc58..7c2893602d06 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -559,7 +559,7 @@ static int audit_send_reply_thread(void *arg) } /** * audit_send_reply - send an audit reply message via netlink - * @portid: netlink port to which to send reply + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag @@ -618,7 +618,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) * userspace will reject all logins. This should be removed when we * support non init namespaces!! */ - if ((current_user_ns() != &init_user_ns)) + if (current_user_ns() != &init_user_ns) return -ECONNREFUSED; switch (msg_type) { diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 43c307dc9453..135944a7b28a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -912,12 +912,13 @@ static void evict_chunk(struct audit_chunk *chunk) } static int audit_tree_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmonut_mark, - struct fsnotify_event *event) + struct fsnotify_mark *vfsmount_mark, + u32 mask, void *data, int data_type, + const unsigned char *file_name, u32 cookie) { - BUG(); - return -EOPNOTSUPP; + return 0; } static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) @@ -933,19 +934,8 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify BUG_ON(atomic_read(&entry->refcnt) < 1); } -static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return false; -} - static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, - .should_send_event = audit_tree_send_event, - .free_group_priv = NULL, - .free_event_priv = NULL, .freeing_mark = audit_tree_freeing_mark, }; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 22831c4d369c..70b4554d2fbe 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -465,35 +465,27 @@ void audit_remove_watch_rule(struct audit_krule *krule) } } -static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, int data_type) -{ - return true; -} - /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, + struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - struct fsnotify_event *event) + u32 mask, void *data, int data_type, + const unsigned char *dname, u32 cookie) { struct inode *inode; - __u32 mask = event->mask; - const char *dname = event->file_name; struct audit_parent *parent; parent = container_of(inode_mark, struct audit_parent, mark); BUG_ON(group != audit_watch_group); - switch (event->data_type) { + switch (data_type) { case (FSNOTIFY_EVENT_PATH): - inode = event->path.dentry->d_inode; + inode = ((struct path *)data)->dentry->d_inode; break; case (FSNOTIFY_EVENT_INODE): - inode = event->inode; + inode = (struct inode *)data; break; default: BUG(); @@ -512,11 +504,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, } static const struct fsnotify_ops audit_watch_fsnotify_ops = { - .should_send_event = audit_watch_should_send_event, .handle_event = audit_watch_handle_event, - .free_group_priv = NULL, - .freeing_mark = NULL, - .free_event_priv = NULL, }; static int __init audit_watch_init(void) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 70101e0b184a..8e9bc9c3dbb7 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1082,7 +1082,7 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, /** * audit_list_rules_send - list the audit rules - * @portid: target portid for netlink audit messages + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: netlink audit message sequence (serial) number */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 254ce2063d1d..f251a5e8d17a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1789,7 +1789,7 @@ void audit_putname(struct filename *name) struct audit_context *context = current->audit_context; BUG_ON(!context); - if (!context->in_syscall) { + if (!name->aname || !context->in_syscall) { #if AUDIT_DEBUG == 2 pr_err("%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); diff --git a/kernel/capability.c b/kernel/capability.c index 34019c57888d..a8d63df0c322 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,8 @@ * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> @@ -42,15 +44,10 @@ __setup("no_file_caps", file_caps_disable); static void warn_legacy_capability_use(void) { - static int warned; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" - " (legacy support in use)\n", - get_task_comm(name, current)); - warned = 1; - } + char name[sizeof(current->comm)]; + + pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", + get_task_comm(name, current)); } /* @@ -71,16 +68,10 @@ static void warn_legacy_capability_use(void) static void warn_deprecated_v2(void) { - static int warned; + char name[sizeof(current->comm)]; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses deprecated v2" - " capabilities in a way that may be insecure.\n", - get_task_comm(name, current)); - warned = 1; - } + pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", + get_task_comm(name, current)); } /* @@ -380,7 +371,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap) bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { - printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bc1dcabe9217..9fcdaa705b6c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -40,54 +40,74 @@ #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> -#include <linux/backing-dev.h> -#include <linux/seq_file.h> #include <linux/slab.h> -#include <linux/magic.h> #include <linux/spinlock.h> +#include <linux/rwsem.h> #include <linux/string.h> #include <linux/sort.h> #include <linux/kmod.h> -#include <linux/module.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> #include <linux/hashtable.h> -#include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ -#include <linux/eventfd.h> -#include <linux/poll.h> -#include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> -#include <linux/file.h> +#include <linux/delay.h> #include <linux/atomic.h> /* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ + MAX_CFTYPE_NAME + 2) + +/* + * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file + * creation/removal and hierarchy changing operations including cgroup + * creation, removal, css association and controller rebinding. This outer + * lock is needed mainly to resolve the circular dependency between kernfs + * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. + */ +static DEFINE_MUTEX(cgroup_tree_mutex); + +/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * - * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify - * cgroupfs_root of any cgroup hierarchy - subsys list, flags, - * release_agent_path and so on. Modifying requires both cgroup_mutex and - * cgroup_root_mutex. Readers can acquire either of the two. This is to - * break the following locking order cycle. + * css_set_rwsem protects task->cgroups pointer, the list of css_set + * objects, and the chain of tasks off each css_set. * - * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem - * B. namespace_sem -> cgroup_mutex - * - * B happens only through cgroup_show_options() and using cgroup_root_mutex - * breaks it. + * These locks are exported if CONFIG_PROVE_RCU so that accessors in + * cgroup.h can use them for lockdep annotations. */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ +DECLARE_RWSEM(css_set_rwsem); +EXPORT_SYMBOL_GPL(cgroup_mutex); +EXPORT_SYMBOL_GPL(css_set_rwsem); #else static DEFINE_MUTEX(cgroup_mutex); +static DECLARE_RWSEM(css_set_rwsem); #endif -static DEFINE_MUTEX(cgroup_root_mutex); +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); + +#define cgroup_assert_mutexes_or_rcu_locked() \ + rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_tree_mutex) || \ + lockdep_is_held(&cgroup_mutex), \ + "cgroup_[tree_]mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -98,84 +118,46 @@ static DEFINE_MUTEX(cgroup_root_mutex); static struct workqueue_struct *cgroup_destroy_wq; /* - * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated with the built in subsystems, and modular subsystems are - * registered after that. The mutable section of this array is protected by - * cgroup_mutex. + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. */ -#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* generate an array of cgroup subsystem pointers */ +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, +static struct cgroup_subsys *cgroup_subsys[] = { #include <linux/cgroup_subsys.h> }; +#undef SUBSYS + +/* array of cgroup subsystem names */ +#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, +static const char *cgroup_subsys_name[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS /* - * The dummy hierarchy, reserved for the subsystems that are otherwise + * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -static struct cgroupfs_root cgroup_dummy_root; - -/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ -static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; +struct cgroup_root cgrp_dfl_root; /* - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. + * The default hierarchy always exists but is hidden until mounted for the + * first time. This is for backward compatibility. */ -struct cfent { - struct list_head node; - struct dentry *dentry; - struct cftype *type; - struct cgroup_subsys_state *css; - - /* file xattrs */ - struct simple_xattrs xattrs; -}; - -/* - * cgroup_event represents events which userspace want to receive. - */ -struct cgroup_event { - /* - * css which the event belongs to. - */ - struct cgroup_subsys_state *css; - /* - * Control file which the event associated. - */ - struct cftype *cft; - /* - * eventfd to signal userspace about the event. - */ - struct eventfd_ctx *eventfd; - /* - * Each of these stored in a list by the cgroup. - */ - struct list_head list; - /* - * All fields below needed to unregister event when - * userspace closes eventfd. - */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; -}; +static bool cgrp_dfl_root_visible; /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); static int cgroup_root_count; -/* - * Hierarchy ID allocation and mapping. It follows the same exclusion - * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for - * writes, either for reads. - */ +/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); -static struct cgroup_name root_cgroup_name = { .name = "/" }; - /* * Assign a monotonically increasing serial number to cgroups. It * guarantees cgroups with bigger numbers are newer than those with smaller @@ -195,11 +177,14 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; +static void cgroup_put(struct cgroup *cgrp); +static int rebind_subsystems(struct cgroup_root *dst_root, + unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static int cgroup_file_release(struct inode *inode, struct file *file); +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -216,8 +201,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (ss) - return rcu_dereference_check(cgrp->subsys[ss->subsys_id], - lockdep_is_held(&cgroup_mutex)); + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_tree_mutex) || + lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; } @@ -228,6 +214,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } +struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = seq_cft(seq); + + /* + * This is open and unprotected implementation of cgroup_css(). + * seq_css() is only called from a kernfs file operation which has + * an active reference on the file. Because all the subsystem + * files are drained before a css is disassociated with a cgroup, + * the matching css from the cgroup's subsys table is guaranteed to + * be and stay valid until the enclosing operation is complete. + */ + if (cft->ss) + return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); + else + return &cgrp->dummy_css; +} +EXPORT_SYMBOL_GPL(seq_css); + /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested @@ -246,7 +253,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) } return false; } -EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -262,53 +268,34 @@ static int notify_on_release(const struct cgroup *cgrp) } /** - * for_each_subsys - iterate all loaded cgroup subsystems - * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * for_each_css - iterate all css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of * * Should be called under cgroup_mutex. */ -#define for_each_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - !((ss) = cgroup_subsys[i]); })) { } \ +#define for_each_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = rcu_dereference_check( \ + (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_tree_mutex) || \ + lockdep_is_held(&cgroup_mutex)))) { } \ else /** - * for_each_builtin_subsys - iterate all built-in cgroup subsystems + * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end - * - * Bulit-in subsystems are always present and iteration itself doesn't - * require any synchronization. + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end */ -#define for_each_builtin_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[i]) || true); (i)++) - -/* iterate each subsystem attached to a hierarchy */ -#define for_each_root_subsys(root, ss) \ - list_for_each_entry((ss), &(root)->subsys_list, sibling) +#define for_each_subsys(ss, ssid) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) -/* iterate across the active hierarchies */ -#define for_each_active_root(root) \ +/* iterate across the hierarchies */ +#define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cfent *__d_cfe(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return __d_cfe(dentry)->type; -} - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -354,23 +341,23 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; -/* The default css_set - used by init and its children prior to any +/* + * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ +static struct css_set init_css_set = { + .refcount = ATOMIC_INIT(1), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), + .tasks = LIST_HEAD_INIT(init_css_set.tasks), + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), +}; -static struct css_set init_css_set; -static struct cgrp_cset_link init_cgrp_cset_link; - -/* - * css_set_lock protects the list of css_set objects, and the chain of - * tasks off each css_set. Nests outside task->alloc_lock due to - * css_task_iter_start(). - */ -static DEFINE_RWLOCK(css_set_lock); -static int css_set_count; +static int css_set_count = 1; /* 1 for init_css_set */ /* * hash table for cgroup groups. This improves the performance to find @@ -393,30 +380,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* - * We don't maintain the lists running through each css_set to its task - * until after the first call to css_task_iter_start(). This reduces the - * fork()/exit() overhead for people who have cgroups compiled into their - * kernel but not actually in use. - */ -static int use_task_css_set_links __read_mostly; - -static void __put_css_set(struct css_set *cset, int taskexit) +static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - write_lock(&css_set_lock); - if (!atomic_dec_and_test(&cset->refcount)) { - write_unlock(&css_set_lock); + lockdep_assert_held(&css_set_rwsem); + + if (!atomic_dec_and_test(&cset->refcount)) return; - } /* This css_set is dead. unlink it and release cgroup refcounts */ hash_del(&cset->hlist); @@ -428,7 +399,7 @@ static void __put_css_set(struct css_set *cset, int taskexit) list_del(&link->cset_link); list_del(&link->cgrp_link); - /* @cgrp can't go away while we're holding css_set_lock */ + /* @cgrp can't go away while we're holding css_set_rwsem */ if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); @@ -438,10 +409,24 @@ static void __put_css_set(struct css_set *cset, int taskexit) kfree(link); } - write_unlock(&css_set_lock); kfree_rcu(cset, rcu_head); } +static void put_css_set(struct css_set *cset, bool taskexit) +{ + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + down_write(&css_set_rwsem); + put_css_set_locked(cset, taskexit); + up_write(&css_set_rwsem); +} + /* * refcounted get/put for css_set objects */ @@ -450,16 +435,6 @@ static inline void get_css_set(struct css_set *cset) atomic_inc(&cset->refcount); } -static inline void put_css_set(struct css_set *cset) -{ - __put_css_set(cset, 0); -} - -static inline void put_css_set_taskexit(struct css_set *cset) -{ - __put_css_set(cset, 1); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -542,7 +517,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state *template[]) { - struct cgroupfs_root *root = cgrp->root; + struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; @@ -554,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->subsys_mask & (1UL << i)) { + if (root->cgrp.subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -659,11 +634,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (cset) return cset; @@ -681,13 +656,16 @@ static struct css_set *find_css_set(struct css_set *old_cset, atomic_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); + INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_node); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - write_lock(&css_set_lock); + down_write(&css_set_rwsem); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -705,31 +683,105 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); return cset; } -/* - * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex held. - */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroupfs_root *root) +static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct css_set *cset; - struct cgroup *res = NULL; + struct cgroup *root_cgrp = kf_root->kn->priv; + + return root_cgrp->root; +} + +static int cgroup_init_root_id(struct cgroup_root *root) +{ + int id; + + lockdep_assert_held(&cgroup_mutex); + + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; + return 0; +} + +static void cgroup_exit_root_id(struct cgroup_root *root) +{ + lockdep_assert_held(&cgroup_mutex); + + if (root->hierarchy_id) { + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); + root->hierarchy_id = 0; + } +} + +static void cgroup_free_root(struct cgroup_root *root) +{ + if (root) { + /* hierarhcy ID shoulid already have been released */ + WARN_ON_ONCE(root->hierarchy_id); + + idr_destroy(&root->cgroup_idr); + kfree(root); + } +} + +static void cgroup_destroy_root(struct cgroup_root *root) +{ + struct cgroup *cgrp = &root->cgrp; + struct cgrp_cset_link *link, *tmp_link; + + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + BUG_ON(atomic_read(&root->nr_cgrps)); + BUG_ON(!list_empty(&cgrp->children)); + + /* Rebind all subsystems back to the default hierarchy */ + rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - read_lock(&css_set_lock); /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. + * Release all the links from cset_links to this hierarchy's + * root cgroup */ - cset = task_css_set(task); + down_write(&css_set_rwsem); + + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + kfree(link); + } + up_write(&css_set_rwsem); + + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + cgroup_root_count--; + } + + cgroup_exit_root_id(root); + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + + kernfs_destroy_root(root->kf_root); + cgroup_free_root(root); +} + +/* look up cgroup associated with given css_set on the specified hierarchy */ +static struct cgroup *cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) +{ + struct cgroup *res = NULL; + + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + if (cset == &init_css_set) { - res = &root->top_cgroup; + res = &root->cgrp; } else { struct cgrp_cset_link *link; @@ -742,16 +794,27 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, } } } - read_unlock(&css_set_lock); + BUG_ON(!res); return res; } /* - * There is one global cgroup mutex. We also require taking - * task_lock() when dereferencing a task's cgroup subsys pointers. - * See "The task_lock() exception", at the end of this comment. - * + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex and css_set_rwsem held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroup_root *root) +{ + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + return cset_cgroup_from_root(task_css_set(task), root); +} + +/* * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. @@ -777,102 +840,79 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cgroup + * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't - * need a special hack to ensure that top_cgroup cannot be deleted. - * - * The task_lock() exception - * - * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one task's cgroup pointer with - * another. It does so using cgroup_mutex, however there are - * several performance critical places that need to reference - * task->cgroup without the expense of grabbing a system global - * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task's cgroup pointer we use - * task_lock(), which acts on a spinlock (task->alloc_lock) already in - * the task_struct routinely used for such matters. + * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> - * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations - * -> cgroup_mkdir. - */ - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); -static const struct inode_operations cgroup_dir_inode_operations; +static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct backing_dev_info cgroup_backing_dev_info = { - .name = "cgroup", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) { - struct inode *inode = new_inode(sb); - - if (inode) { - inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; - } - return inode; + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", + cft->ss->name, cft->name); + else + strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + return buf; } -static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write hander + */ +static umode_t cgroup_file_mode(const struct cftype *cft) { - struct cgroup_name *name; + umode_t mode = 0; - name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); - if (!name) - return NULL; - strcpy(name->name, dentry->d_name.name); - return name; + if (cft->mode) + return cft->mode; + + if (cft->read_u64 || cft->read_s64 || cft->seq_show) + mode |= S_IRUGO; + + if (cft->write_u64 || cft->write_s64 || cft->write_string || + cft->trigger) + mode |= S_IWUSR; + + return mode; } static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - mutex_lock(&cgroup_mutex); - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); - - /* - * We get a ref to the parent's dentry, and put the ref when - * this cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - dput(cgrp->parent->dentry); - - /* - * Drop the active superblock reference that we took when we - * created the cgroup. This will free cgrp->root, if we are - * holding the last reference to @sb. - */ - deactivate_super(cgrp->root->sb); - - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); - - simple_xattrs_free(&cgrp->xattrs); + atomic_dec(&cgrp->root->nr_cgrps); + cgroup_pidlist_destroy_all(cgrp); - kfree(rcu_dereference_raw(cgrp->name)); - kfree(cgrp); + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when this + * cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is root cgroup's refcnt reaching zero, which + * indicates that the root should be released. + */ + cgroup_destroy_root(cgrp->root); + } } static void cgroup_free_rcu(struct rcu_head *head) @@ -883,71 +923,40 @@ static void cgroup_free_rcu(struct rcu_head *head) queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } -static void cgroup_diput(struct dentry *dentry, struct inode *inode) +static void cgroup_get(struct cgroup *cgrp) { - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - - BUG_ON(!(cgroup_is_dead(cgrp))); - - /* - * XXX: cgrp->id is only used to look up css's. As cgroup - * and css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. - */ - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); - } else { - struct cfent *cfe = __d_cfe(dentry); - struct cgroup *cgrp = dentry->d_parent->d_fsdata; - - WARN_ONCE(!list_empty(&cfe->node) && - cgrp != &cgrp->root->top_cgroup, - "cfe still linked for %s\n", cfe->type->name); - simple_xattrs_free(&cfe->xattrs); - kfree(cfe); - } - iput(inode); -} - -static void remove_dir(struct dentry *d) -{ - struct dentry *parent = dget(d->d_parent); - - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); + atomic_inc(&cgrp->refcnt); } -static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_put(struct cgroup *cgrp) { - struct cfent *cfe; - - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + if (!atomic_dec_and_test(&cgrp->refcnt)) + return; + if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) + return; /* - * If we're doing cleanup due to failure of cgroup_create(), - * the corresponding @cfe may not exist. + * XXX: cgrp->id is only used to look up css's. As cgroup and + * css's lifetimes will be decoupled, it should be made + * per-subsystem and moved to css->id so that lookups are + * successful until the target css is released. */ - list_for_each_entry(cfe, &cgrp->files, node) { - struct dentry *d = cfe->dentry; + mutex_lock(&cgroup_mutex); + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); + cgrp->id = -1; - if (cft && cfe->type != cft) - continue; + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); +} - dget(d); - d_delete(d); - simple_unlink(cgrp->dentry->d_inode, d); - list_del_init(&cfe->node); - dput(d); +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +{ + char name[CGROUP_FILE_NAME_MAX]; - break; - } + lockdep_assert_held(&cgroup_tree_mutex); + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** @@ -961,158 +970,123 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) int i; for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, set->cfts, false); + list_for_each_entry(cfts, &ss->cfts, node) + cgroup_addrm_files(cgrp, cfts, false); } } -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cgroup_d_remove_dir(struct dentry *dentry) -{ - struct dentry *parent; - - parent = dentry->d_parent; - spin_lock(&parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - remove_dir(dentry); -} - -/* - * Call with cgroup_mutex held. Drops reference counts on modules, including - * any duplicate ones that parse_cgroupfs_options took. If this function - * returns an error, no reference counts are touched. - */ -static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long added_mask, unsigned removed_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, + unsigned long ss_mask) { - struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - unsigned long pinned = 0; - int i, ret; + int ssid, ret; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); - /* Check that any added subsystems are currently free */ - for_each_subsys(ss, i) { - if (!(added_mask & (1 << i))) + for_each_subsys(ss, ssid) { + if (!(ss_mask & (1 << ssid))) continue; - /* is the subsystem mounted elsewhere? */ - if (ss->root != &cgroup_dummy_root) { - ret = -EBUSY; - goto out_put; - } + /* if @ss is on the dummy_root, we can always move it */ + if (ss->root == &cgrp_dfl_root) + continue; - /* pin the module */ - if (!try_module_get(ss->module)) { - ret = -ENOENT; - goto out_put; - } - pinned |= 1 << i; - } + /* if @ss has non-root cgroups attached to it, can't move */ + if (!list_empty(&ss->root->cgrp.children)) + return -EBUSY; - /* subsys could be missing if unloaded between parsing and here */ - if (added_mask != pinned) { - ret = -ENOENT; - goto out_put; + /* can't move between two non-dummy roots either */ + if (dst_root != &cgrp_dfl_root) + return -EBUSY; } - ret = cgroup_populate_dir(cgrp, added_mask); - if (ret) - goto out_put; + ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); + if (ret) { + if (dst_root != &cgrp_dfl_root) + return ret; + + /* + * Rebinding back to the default root is not allowed to + * fail. Using both default and non-default roots should + * be rare. Moving subsystems back and forth even more so. + * Just warn about it and continue. + */ + if (cgrp_dfl_root_visible) { + pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + } + } /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ - cgroup_clear_dir(cgrp, removed_mask); + mutex_unlock(&cgroup_mutex); + for_each_subsys(ss, ssid) + if (ss_mask & (1 << ssid)) + cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); + mutex_lock(&cgroup_mutex); - for_each_subsys(ss, i) { - unsigned long bit = 1UL << i; - - if (bit & added_mask) { - /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgroup_css(cgrp, ss)); - BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); - BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); - - rcu_assign_pointer(cgrp->subsys[i], - cgroup_css(cgroup_dummy_top, ss)); - cgroup_css(cgrp, ss)->cgroup = cgrp; - - list_move(&ss->sibling, &root->subsys_list); - ss->root = root; - if (ss->bind) - ss->bind(cgroup_css(cgrp, ss)); - - /* refcount was already taken, and we're keeping it */ - root->subsys_mask |= bit; - } else if (bit & removed_mask) { - /* We're removing this subsystem */ - BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); - BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); - - if (ss->bind) - ss->bind(cgroup_css(cgroup_dummy_top, ss)); - - cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; - RCU_INIT_POINTER(cgrp->subsys[i], NULL); - - cgroup_subsys[i]->root = &cgroup_dummy_root; - list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); - - /* subsystem is now free - drop reference on module */ - module_put(ss->module); - root->subsys_mask &= ~bit; - } - } + for_each_subsys(ss, ssid) { + struct cgroup_root *src_root; + struct cgroup_subsys_state *css; - /* - * Mark @root has finished binding subsystems. @root->subsys_mask - * now matches the bound subsystems. - */ - root->flags |= CGRP_ROOT_SUBSYS_BOUND; + if (!(ss_mask & (1 << ssid))) + continue; - return 0; + src_root = ss->root; + css = cgroup_css(&src_root->cgrp, ss); -out_put: - for_each_subsys(ss, i) - if (pinned & (1 << i)) - module_put(ss->module); - return ret; + WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); + + RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); + rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); + ss->root = dst_root; + css->cgroup = &dst_root->cgrp; + + src_root->cgrp.subsys_mask &= ~(1 << ssid); + dst_root->cgrp.subsys_mask |= 1 << ssid; + + if (ss->bind) + ss->bind(css); + } + + kernfs_activate(dst_root->cgrp.kn); + return 0; } -static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) { - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; + int ssid; - mutex_lock(&cgroup_root_mutex); - for_each_root_subsys(root, ss) - seq_printf(seq, ",%s", ss->name); + for_each_subsys(ss, ssid) + if (root->cgrp.subsys_mask & (1 << ssid)) + seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) + spin_unlock(&release_agent_path_lock); + + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_root_mutex); return 0; } @@ -1124,9 +1098,6 @@ struct cgroup_sb_opts { char *name; /* User explicitly requested empty subsystem */ bool none; - - struct cgroupfs_root *new_root; - }; /* @@ -1146,7 +1117,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) BUG_ON(!mutex_is_locked(&cgroup_mutex)); #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_subsys_id); + mask = ~(1UL << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1236,30 +1207,34 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options - * were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - set_bit(i, &opts->subsys_mask); - /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (opts->flags & CGRP_ROOT_NOPREFIX) { - pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || + opts->cpuset_clone_children || opts->release_agent || + opts->name) { + pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } + } else { + /* + * If the 'all' option was specified select all the + * subsystems, otherwise if 'none', 'name=' and a subsystem + * name options were not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + set_bit(i, &opts->subsys_mask); - if (opts->cpuset_clone_children) { - pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); + /* + * We either have to specify by name or by subsystems. (So + * all empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) return -EINVAL; - } } /* @@ -1275,21 +1250,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->subsys_mask && opts->none) return -EINVAL; - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - return 0; } -static int cgroup_remount(struct super_block *sb, int *flags, char *data) +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; @@ -1298,21 +1265,20 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); if (ret) goto out_unlock; - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; + removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1325,424 +1291,332 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (root->number_of_cgroups > 1) { + if (!list_empty(&root->cgrp.children)) { ret = -EBUSY; goto out_unlock; } - ret = rebind_subsystems(root, added_mask, removed_mask); + ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; - if (opts.release_agent) + rebind_subsystems(&cgrp_dfl_root, removed_mask); + + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } out_unlock: kfree(opts.release_agent); kfree(opts.name); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + mutex_unlock(&cgroup_tree_mutex); return ret; } -static const struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, -}; +/* + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first mount. + */ +static bool use_task_css_set_links __read_mostly; + +static void cgroup_enable_task_cg_lists(void) +{ + struct task_struct *p, *g; + + down_write(&css_set_rwsem); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); + do_each_thread(g, p) { + WARN_ON_ONCE(!list_empty(&p->cg_list) || + task_css_set(p) != &init_css_set); + + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + * Do it while holding siglock so that we don't end up + * racing against cgroup_exit(). + */ + spin_lock_irq(&p->sighand->siglock); + if (!(p->flags & PF_EXITING)) { + struct css_set *cset = task_css_set(p); + + list_add(&p->cg_list, &cset->tasks); + get_css_set(cset); + } + spin_unlock_irq(&p->sighand->siglock); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +out_unlock: + up_write(&css_set_rwsem); +} static void init_cgroup_housekeeping(struct cgroup *cgrp) { + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->event_list); - spin_lock_init(&cgrp->event_list_lock); - simple_xattrs_init(&cgrp->xattrs); } -static void init_cgroup_root(struct cgroupfs_root *root) +static void init_cgroup_root(struct cgroup_root *root, + struct cgroup_sb_opts *opts) { - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *cgrp = &root->cgrp; - INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); - root->number_of_cgroups = 1; + atomic_set(&root->nr_cgrps, 1); cgrp->root = root; - RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); -} -static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) -{ - int id; - - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); - - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, - GFP_KERNEL); - if (id < 0) - return id; - - root->hierarchy_id = id; - return 0; -} - -static void cgroup_exit_root_id(struct cgroupfs_root *root) -{ - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); - - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } -} - -static int cgroup_test_super(struct super_block *sb, void *data) -{ - struct cgroup_sb_opts *opts = data; - struct cgroupfs_root *root = sb->s_fs_info; - - /* If we asked for a name then it must match */ - if (opts->name && strcmp(opts->name, root->name)) - return 0; - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match - */ - if ((opts->subsys_mask || opts->none) - && (opts->subsys_mask != root->subsys_mask)) - return 0; - - return 1; -} - -static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) -{ - struct cgroupfs_root *root; - - if (!opts->subsys_mask && !opts->none) - return NULL; - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) - return ERR_PTR(-ENOMEM); - - init_cgroup_root(root); - - /* - * We need to set @root->subsys_mask now so that @root can be - * matched by cgroup_test_super() before it finishes - * initialization; otherwise, competing mounts with the same - * options may try to bind the same subsystems instead of waiting - * for the first one leading to unexpected mount errors. - * SUBSYS_BOUND will be set once actual binding is complete. - */ - root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); if (opts->cpuset_clone_children) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); - return root; + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static void cgroup_free_root(struct cgroupfs_root *root) +static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) { - if (root) { - /* hierarhcy ID shoulid already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - - idr_destroy(&root->cgroup_idr); - kfree(root); - } -} + LIST_HEAD(tmp_links); + struct cgroup *root_cgrp = &root->cgrp; + struct css_set *cset; + int i, ret; -static int cgroup_set_super(struct super_block *sb, void *data) -{ - int ret; - struct cgroup_sb_opts *opts = data; + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); - /* If we don't have a new root, we can't set up a new sb */ - if (!opts->new_root) - return -EINVAL; + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + if (ret < 0) + goto out; + root_cgrp->id = ret; - BUG_ON(!opts->subsys_mask && !opts->none); + /* + * We're accessing css_set_count without locking css_set_rwsem here, + * but that's OK - it can only be increased by someone holding + * cgroup_lock, and that's us. The worst that can happen is that we + * have some link structures left over + */ + ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + if (ret) + goto out; - ret = set_anon_super(sb, NULL); + ret = cgroup_init_root_id(root); if (ret) - return ret; + goto out; - sb->s_fs_info = opts->new_root; - opts->new_root->sb = sb; + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + root_cgrp); + if (IS_ERR(root->kf_root)) { + ret = PTR_ERR(root->kf_root); + goto exit_root_id; + } + root_cgrp->kn = root->kf_root->kn; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = CGROUP_SUPER_MAGIC; - sb->s_op = &cgroup_ops; + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (ret) + goto destroy_root; - return 0; -} + ret = rebind_subsystems(root, ss_mask); + if (ret) + goto destroy_root; -static int cgroup_get_rootdir(struct super_block *sb) -{ - static const struct dentry_operations cgroup_dops = { - .d_iput = cgroup_diput, - .d_delete = always_delete_dentry, - }; + /* + * There must be no failure case after here, since rebinding takes + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ + list_add(&root->root_list, &cgroup_roots); + cgroup_root_count++; - struct inode *inode = - cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); + /* + * Link the root cgroup in this hierarchy into all the css_set + * objects. + */ + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + link_css_set(&tmp_links, cset, root_cgrp); + up_write(&css_set_rwsem); - if (!inode) - return -ENOMEM; + BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); - inode->i_fop = &simple_dir_operations; - inode->i_op = &cgroup_dir_inode_operations; - /* directories start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; - /* for everything else we want ->d_op set */ - sb->s_d_op = &cgroup_dops; - return 0; + kernfs_activate(root_cgrp->kn); + ret = 0; + goto out; + +destroy_root: + kernfs_destroy_root(root->kf_root); + root->kf_root = NULL; +exit_root_id: + cgroup_exit_root_id(root); +out: + free_cgrp_cset_links(&tmp_links); + return ret; } static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct cgroup_root *root; struct cgroup_sb_opts opts; - struct cgroupfs_root *root; - int ret = 0; - struct super_block *sb; - struct cgroupfs_root *new_root; - struct list_head tmp_links; - struct inode *inode; - const struct cred *cred; + struct dentry *dentry; + int ret; + bool new_sb; - /* First find the desired set of subsystems */ + /* + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. + */ + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); +retry: + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); + + /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - mutex_unlock(&cgroup_mutex); if (ret) - goto out_err; - - /* - * Allocate a new cgroup root. We may not need it if we're - * reusing an existing hierarchy. - */ - new_root = cgroup_root_from_opts(&opts); - if (IS_ERR(new_root)) { - ret = PTR_ERR(new_root); - goto out_err; - } - opts.new_root = new_root; + goto out_unlock; - /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - cgroup_free_root(opts.new_root); - goto out_err; + /* look for a matching existing root */ + if (!opts.subsys_mask && !opts.none && !opts.name) { + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + ret = 0; + goto out_unlock; } - root = sb->s_fs_info; - BUG_ON(!root); - if (root == opts.new_root) { - /* We used the new root structure, so this is a new hierarchy */ - struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; - int i; - struct css_set *cset; - - BUG_ON(sb->s_root != NULL); - - ret = cgroup_get_rootdir(sb); - if (ret) - goto drop_new_super; - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, - 0, 1, GFP_KERNEL); - if (root_cgrp->id < 0) - goto unlock_drop; - - /* Check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto unlock_drop; - - /* - * We're accessing css_set_count without locking - * css_set_lock here, but that's OK - it can only be - * increased by someone holding cgroup_lock, and - * that's us. The worst that can happen is that we - * have some link structures left over - */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); - if (ret) - goto unlock_drop; - - /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ - ret = cgroup_init_root_id(root, 2, 0); - if (ret) - goto unlock_drop; + for_each_root(root) { + bool name_match = false; - sb->s_root->d_fsdata = root_cgrp; - root_cgrp->dentry = sb->s_root; - - /* - * We're inside get_sb() and will call lookup_one_len() to - * create the root files, which doesn't work if SELinux is - * in use. The following cred dancing somehow works around - * it. See 2ce9738ba ("cgroupfs: use init_cred when - * populating new cgroupfs mount") for more details. - */ - cred = override_creds(&init_cred); - - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); - if (ret) - goto rm_base_files; - - ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret) - goto rm_base_files; - - revert_creds(cred); + if (root == &cgrp_dfl_root) + continue; /* - * There must be no failure case after here, since rebinding - * takes care of subsystems' refcounts, which are explicitly - * dropped in the failure exit path. + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } - list_add(&root->root_list, &cgroup_roots); - cgroup_root_count++; - - /* Link the top cgroup in this hierarchy into all - * the css_set objects */ - write_lock(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) - link_css_set(&tmp_links, cset, root_cgrp); - write_unlock(&css_set_lock); - - free_cgrp_cset_links(&tmp_links); - - BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - } else { /* - * We re-used an existing hierarchy - the new root (if - * any) is not needed + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. */ - cgroup_free_root(opts.new_root); + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->cgrp.subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; - goto drop_new_super; + goto out_unlock; } else { pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } - } - - kfree(opts.release_agent); - kfree(opts.name); - return dget(sb->s_root); - - rm_base_files: - free_cgrp_cset_links(&tmp_links); - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); - unlock_drop: - cgroup_exit_root_id(root); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - drop_new_super: - deactivate_locked_super(sb); - out_err: - kfree(opts.release_agent); - kfree(opts.name); - return ERR_PTR(ret); -} -static void cgroup_kill_sb(struct super_block *sb) { - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - struct cgrp_cset_link *link, *tmp_link; - int ret; - - BUG_ON(!root); - - BUG_ON(root->number_of_cgroups != 1); - BUG_ON(!list_empty(&cgrp->children)); - - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); + /* + * A root's lifetime is governed by its root cgroup. Zero + * ref indicate that the root is being destroyed. Wait for + * destruction to complete so that the subsystems are free. + * We can use wait_queue for the wait but this path is + * super cold. Let's just sleep for a bit and retry. + */ + if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + kfree(opts.release_agent); + kfree(opts.name); + msleep(10); + goto retry; + } - /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); + ret = 0; + goto out_unlock; } /* - * Release all the links from cset_links to this hierarchy's - * root cgroup + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. */ - write_lock(&css_set_lock); - - list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { - list_del(&link->cset_link); - list_del(&link->cgrp_link); - kfree(link); + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; } - write_unlock(&css_set_lock); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; + goto out_unlock; } - cgroup_exit_root_id(root); + init_cgroup_root(root, &opts); + + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); - mutex_unlock(&cgroup_root_mutex); +out_unlock: mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + mutex_unlock(&cgroup_tree_mutex); + + kfree(opts.release_agent); + kfree(opts.name); - simple_xattrs_free(&cgrp->xattrs); + if (ret) + return ERR_PTR(ret); - kill_litter_super(sb); - cgroup_free_root(root); + dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); + if (IS_ERR(dentry) || !new_sb) + cgroup_put(&root->cgrp); + return dentry; +} + +static void cgroup_kill_sb(struct super_block *sb) +{ + struct kernfs_root *kf_root = kernfs_root_from_sb(sb); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); + + cgroup_put(&root->cgrp); + kernfs_kill_sb(sb); } static struct file_system_type cgroup_fs_type = { @@ -1754,57 +1628,6 @@ static struct file_system_type cgroup_fs_type = { static struct kobject *cgroup_kobj; /** - * cgroup_path - generate the path of a cgroup - * @cgrp: the cgroup in question - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Writes path of cgroup into buf. Returns 0 on success, -errno on error. - * - * We can't generate cgroup path using dentry->d_name, as accessing - * dentry->name must be protected by irq-unsafe dentry->d_lock or parent - * inode's i_mutex, while on the other hand cgroup_path() can be called - * with some irq-safe spinlocks held. - */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) -{ - int ret = -ENAMETOOLONG; - char *start; - - if (!cgrp->parent) { - if (strlcpy(buf, "/", buflen) >= buflen) - return -ENAMETOOLONG; - return 0; - } - - start = buf + buflen - 1; - *start = '\0'; - - rcu_read_lock(); - do { - const char *name = cgroup_name(cgrp); - int len; - - len = strlen(name); - if ((start -= len) < buf) - goto out; - memcpy(start, name, len); - - if (--start < buf) - goto out; - *start = '/'; - - cgrp = cgrp->parent; - } while (cgrp->parent); - ret = 0; - memmove(buf, start, buf + buflen - start); -out: - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_path); - -/** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task * @buf: the buffer to write the path into @@ -1815,49 +1638,55 @@ EXPORT_SYMBOL_GPL(cgroup_path); * function grabs cgroup_mutex and shouldn't be used inside locks used by * cgroup controller callbacks. * - * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. + * Return value is the same as kernfs_path(). */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { - struct cgroupfs_root *root; + struct cgroup_root *root; struct cgroup *cgrp; - int hierarchy_id = 1, ret = 0; - - if (buflen < 2) - return -ENAMETOOLONG; + int hierarchy_id = 1; + char *path = NULL; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); if (root) { cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path(cgrp, buf, buflen); + path = cgroup_path(cgrp, buf, buflen); } else { /* if no hierarchy exists, everyone is in "/" */ - memcpy(buf, "/", 2); + if (strlcpy(buf, "/", buflen) < buflen) + path = buf; } + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); - return ret; + return path; } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* - * Control Group taskset - */ -struct task_and_cgroup { - struct task_struct *task; - struct cgroup *cgrp; - struct css_set *cset; -}; - +/* used to track tasks and other necessary states during migration */ struct cgroup_taskset { - struct task_and_cgroup single; - struct flex_array *tc_array; - int tc_array_len; - int idx; - struct cgroup *cur_cgrp; + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. + */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; }; /** @@ -1868,15 +1697,11 @@ struct cgroup_taskset { */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) { - if (tset->tc_array) { - tset->idx = 0; - return cgroup_taskset_next(tset); - } else { - tset->cur_cgrp = tset->single.cgrp; - return tset->single.task; - } + tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); + tset->cur_task = NULL; + + return cgroup_taskset_next(tset); } -EXPORT_SYMBOL_GPL(cgroup_taskset_first); /** * cgroup_taskset_next - iterate to the next task in taskset @@ -1887,48 +1712,36 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_first); */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) { - struct task_and_cgroup *tc; + struct css_set *cset = tset->cur_cset; + struct task_struct *task = tset->cur_task; - if (!tset->tc_array || tset->idx >= tset->tc_array_len) - return NULL; + while (&cset->mg_node != tset->csets) { + if (!task) + task = list_first_entry(&cset->mg_tasks, + struct task_struct, cg_list); + else + task = list_next_entry(task, cg_list); - tc = flex_array_get(tset->tc_array, tset->idx++); - tset->cur_cgrp = tc->cgrp; - return tc->task; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_next); + if (&task->cg_list != &cset->mg_tasks) { + tset->cur_cset = cset; + tset->cur_task = task; + return task; + } -/** - * cgroup_taskset_cur_css - return the matching css for the current task - * @tset: taskset of interest - * @subsys_id: the ID of the target subsystem - * - * Return the css for the current (last returned) task of @tset for - * subsystem specified by @subsys_id. This function must be preceded by - * either cgroup_taskset_first() or cgroup_taskset_next(). - */ -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id) -{ - return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); -} -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); + cset = list_next_entry(cset, mg_node); + task = NULL; + } -/** - * cgroup_taskset_size - return the number of tasks in taskset - * @tset: taskset of interest - */ -int cgroup_taskset_size(struct cgroup_taskset *tset) -{ - return tset->tc_array ? tset->tc_array_len : 1; + return NULL; } -EXPORT_SYMBOL_GPL(cgroup_taskset_size); - -/* +/** * cgroup_task_migrate - move a task from one cgroup to another. + * @old_cgrp; the cgroup @tsk is being migrated from + * @tsk: the task being migrated + * @new_cset: the new css_set @tsk is being attached to * - * Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. */ static void cgroup_task_migrate(struct cgroup *old_cgrp, struct task_struct *tsk, @@ -1936,6 +1749,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, { struct css_set *old_cset; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * We are synchronized through threadgroup_lock() against PF_EXITING * setting such that we can't race against cgroup_exit() changing the @@ -1944,15 +1760,16 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); - task_lock(tsk); + get_css_set(new_cset); rcu_assign_pointer(tsk->cgroups, new_cset); - task_unlock(tsk); - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &new_cset->tasks); - write_unlock(&css_set_lock); + /* + * Use move_tail so that cgroup_taskset_first() still returns the + * leader after migration. This works because cgroup_migrate() + * ensures that the dst_cset of the leader is the first on the + * tset's dst_csets list. + */ + list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); /* * We just gained a reference on old_cset by taking it from the @@ -1960,178 +1777,291 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * we're safe to drop it here; it will be freed under RCU. */ set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set(old_cset); + put_css_set_locked(old_cset, false); } /** - * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup - * @cgrp: the cgroup to attach to - * @tsk: the task or the leader of the threadgroup to be attached - * @threadgroup: attach the whole threadgroup? + * cgroup_migrate_finish - cleanup after attach + * @preloaded_csets: list of preloaded css_sets * - * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of @tsk or each thread in the threadgroup individually in turn. + * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See + * those functions for details. */ -static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, - bool threadgroup) -{ - int retval, i, group_size; - struct cgroup_subsys *ss, *failed_ss = NULL; - struct cgroupfs_root *root = cgrp->root; - /* threadgroup list cursor and array */ - struct task_struct *leader = tsk; - struct task_and_cgroup *tc; - struct flex_array *group; - struct cgroup_taskset tset = { }; +static void cgroup_migrate_finish(struct list_head *preloaded_csets) +{ + struct css_set *cset, *tmp_cset; - /* - * step 0: in order to do expensive, possibly blocking operations for - * every thread, we cannot iterate the thread group list, since it needs - * rcu or tasklist locked. instead, build an array of all threads in the - * group - group_rwsem prevents new threads from appearing, and if - * threads exit, this will just be an over-estimate. - */ - if (threadgroup) - group_size = get_nr_threads(tsk); - else - group_size = 1; - /* flex_array supports very large thread-groups better than kmalloc. */ - group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); - if (!group) - return -ENOMEM; - /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); - if (retval) - goto out_free_group_list; + lockdep_assert_held(&cgroup_mutex); + + down_write(&css_set_rwsem); + list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_preload_node); + put_css_set_locked(cset, false); + } + up_write(&css_set_rwsem); +} + +/** + * cgroup_migrate_add_src - add a migration source css_set + * @src_cset: the source css_set to add + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded css_sets + * + * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin + * @src_cset and add it to @preloaded_csets, which should later be cleaned + * up by cgroup_migrate_finish(). + * + * This function may be called without holding threadgroup_lock even if the + * target is a process. Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations. + */ +static void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + struct cgroup *src_cgrp; + + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + + /* nothing to do if this cset already belongs to the cgroup */ + if (src_cgrp == dst_cgrp) + return; + + if (!list_empty(&src_cset->mg_preload_node)) + return; + + WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(!list_empty(&src_cset->mg_tasks)); + WARN_ON(!list_empty(&src_cset->mg_node)); + + src_cset->mg_src_cgrp = src_cgrp; + get_css_set(src_cset); + list_add(&src_cset->mg_preload_node, preloaded_csets); +} + +/** + * cgroup_migrate_prepare_dst - prepare destination css_sets for migration + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded source css_sets + * + * Tasks are about to be moved to @dst_cgrp and all the source css_sets + * have been preloaded to @preloaded_csets. This function looks up and + * pins all destination css_sets, links each to its source, and put them on + * @preloaded_csets. + * + * This function must be called after cgroup_migrate_add_src() has been + * called on each migration source css_set. After migration is performed + * using cgroup_migrate(), cgroup_migrate_finish() must be called on + * @preloaded_csets. + */ +static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + LIST_HEAD(csets); + struct css_set *src_cset; + + lockdep_assert_held(&cgroup_mutex); + + /* look up the dst cset for each src cset and link it to src */ + list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { + struct css_set *dst_cset; + + dst_cset = find_css_set(src_cset, dst_cgrp); + if (!dst_cset) + goto err; + + WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + src_cset->mg_dst_cset = dst_cset; + + if (list_empty(&dst_cset->mg_preload_node)) + list_add(&dst_cset->mg_preload_node, &csets); + else + put_css_set(dst_cset, false); + } + + list_splice(&csets, preloaded_csets); + return 0; +err: + cgroup_migrate_finish(&csets); + return -ENOMEM; +} + +/** + * cgroup_migrate - migrate a process or task to a cgroup + * @cgrp: the destination cgroup + * @leader: the leader of the process or the task to migrate + * @threadgroup: whether @leader points to the whole process or a single task + * + * Migrate a process or task denoted by @leader to @cgrp. If migrating a + * process, the caller must be holding threadgroup_lock of @leader. The + * caller is also responsible for invoking cgroup_migrate_add_src() and + * cgroup_migrate_prepare_dst() on the targets before invoking this + * function and following up with cgroup_migrate_finish(). + * + * As long as a controller's ->can_attach() doesn't fail, this function is + * guaranteed to succeed. This means that, excluding ->can_attach() + * failure, when migrating multiple targets, the success or failure can be + * decided for all targets by invoking group_migrate_prepare_dst() before + * actually starting migrating. + */ +static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, + bool threadgroup) +{ + struct cgroup_taskset tset = { + .src_csets = LIST_HEAD_INIT(tset.src_csets), + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), + .csets = &tset.src_csets, + }; + struct cgroup_subsys_state *css, *failed_css = NULL; + struct css_set *cset, *tmp_cset; + struct task_struct *task, *tmp_task; + int i, ret; - i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ + down_write(&css_set_rwsem); rcu_read_lock(); + task = leader; do { - struct task_and_cgroup ent; + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) + goto next; - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) + /* leave @task alone if post_fork() hasn't linked it yet */ + if (list_empty(&task->cg_list)) goto next; - /* as per above, nr_threads may decrease, but not increase. */ - BUG_ON(i >= group_size); - ent.task = tsk; - ent.cgrp = task_cgroup_from_root(tsk, root); - /* nothing to do if this task is already in the cgroup */ - if (ent.cgrp == cgrp) + cset = task_css_set(task); + if (!cset->mg_src_cgrp) goto next; + /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. + * cgroup_taskset_first() must always return the leader. + * Take care to avoid disturbing the ordering. */ - retval = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(retval != 0); - i++; + list_move_tail(&task->cg_list, &cset->mg_tasks); + if (list_empty(&cset->mg_node)) + list_add_tail(&cset->mg_node, &tset.src_csets); + if (list_empty(&cset->mg_dst_cset->mg_node)) + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset.dst_csets); next: if (!threadgroup) break; - } while_each_thread(leader, tsk); + } while_each_thread(leader, task); rcu_read_unlock(); - /* remember the number of threads in the array for later. */ - group_size = i; - tset.tc_array = group; - tset.tc_array_len = group_size; + up_write(&css_set_rwsem); /* methods shouldn't be called if no task is actually migrating */ - retval = 0; - if (!group_size) - goto out_free_group_list; - - /* - * step 1: check that we can legitimately attach to the cgroup. - */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + if (list_empty(&tset.src_csets)) + return 0; - if (ss->can_attach) { - retval = ss->can_attach(css, &tset); - if (retval) { - failed_ss = ss; + /* check that we can legitimately attach to the cgroup */ + for_each_css(css, i, cgrp) { + if (css->ss->can_attach) { + ret = css->ss->can_attach(css, &tset); + if (ret) { + failed_css = css; goto out_cancel_attach; } } } /* - * step 2: make sure css_sets exist for all threads to be migrated. - * we use find_css_set, which allocates a new one if necessary. + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. */ - for (i = 0; i < group_size; i++) { - struct css_set *old_cset; - - tc = flex_array_get(group, i); - old_cset = task_css_set(tc->task); - tc->cset = find_css_set(old_cset, cgrp); - if (!tc->cset) { - retval = -ENOMEM; - goto out_put_css_set_refs; - } + down_write(&css_set_rwsem); + list_for_each_entry(cset, &tset.src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) + cgroup_task_migrate(cset->mg_src_cgrp, task, + cset->mg_dst_cset); } + up_write(&css_set_rwsem); /* - * step 3: now that we're guaranteed success wrt the css_sets, - * proceed to move all tasks to the new cgroup. There are no - * failure cases after here, so this is the commit point. + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. */ - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); - } - /* nothing is sensitive to fork() after this point. */ + tset.csets = &tset.dst_csets; - /* - * step 4: do subsystem attach callbacks. - */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); + for_each_css(css, i, cgrp) + if (css->ss->attach) + css->ss->attach(css, &tset); - if (ss->attach) - ss->attach(css, &tset); - } + ret = 0; + goto out_release_tset; - /* - * step 5: success! and cleanup - */ - retval = 0; -out_put_css_set_refs: - if (retval) { - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - if (!tc->cset) - break; - put_css_set(tc->cset); - } - } out_cancel_attach: - if (retval) { - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss == failed_ss) - break; - if (ss->cancel_attach) - ss->cancel_attach(css, &tset); - } + for_each_css(css, i, cgrp) { + if (css == failed_css) + break; + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, &tset); } -out_free_group_list: - flex_array_free(group); - return retval; +out_release_tset: + down_write(&css_set_rwsem); + list_splice_init(&tset.dst_csets, &tset.src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { + list_splice_tail_init(&cset->mg_tasks, &cset->tasks); + list_del_init(&cset->mg_node); + } + up_write(&css_set_rwsem); + return ret; +} + +/** + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup + * @dst_cgrp: the cgroup to attach to + * @leader: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? + * + * Call holding cgroup_mutex and threadgroup_lock of @leader. + */ +static int cgroup_attach_task(struct cgroup *dst_cgrp, + struct task_struct *leader, bool threadgroup) +{ + LIST_HEAD(preloaded_csets); + struct task_struct *task; + int ret; + + /* look up all src csets */ + down_read(&css_set_rwsem); + rcu_read_lock(); + task = leader; + do { + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, + &preloaded_csets); + if (!threadgroup) + break; + } while_each_thread(leader, task); + rcu_read_unlock(); + up_read(&css_set_rwsem); + + /* prepare dst csets and commit */ + ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + if (!ret) + ret = cgroup_migrate(dst_cgrp, leader, threadgroup); + + cgroup_migrate_finish(&preloaded_csets); + return ret; } /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup; may take task_lock of task. + * cgroup_mutex and threadgroup. */ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { @@ -2148,7 +2078,7 @@ retry_find_task: tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - ret= -ESRCH; + ret = -ESRCH; goto out_unlock_cgroup; } /* @@ -2216,12 +2146,19 @@ out_unlock_cgroup: */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { - struct cgroupfs_root *root; + struct cgroup_root *root; int retval = 0; mutex_lock(&cgroup_mutex); - for_each_active_root(root) { - struct cgroup *from_cgrp = task_cgroup_from_root(from, root); + for_each_root(root) { + struct cgroup *from_cgrp; + + if (root == &cgrp_dfl_root) + continue; + + down_read(&css_set_rwsem); + from_cgrp = task_cgroup_from_root(from, root); + up_read(&css_set_rwsem); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2246,24 +2183,24 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, } static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { - BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); - if (strlen(buffer) >= PATH_MAX) - return -EINVAL; + struct cgroup_root *root = css->cgroup->root; + + BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; - mutex_lock(&cgroup_root_mutex); - strcpy(css->cgroup->root->release_agent_path, buffer); - mutex_unlock(&cgroup_root_mutex); + spin_lock(&release_agent_path_lock); + strlcpy(root->release_agent_path, buffer, + sizeof(root->release_agent_path)); + spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); return 0; } -static int cgroup_release_agent_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_release_agent_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = css->cgroup; + struct cgroup *cgrp = seq_css(seq)->cgroup; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2273,264 +2210,114 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css, return 0; } -static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); - return 0; -} - -/* A buffer size big enough for numbers or short strings */ -#define CGROUP_LOCAL_BUFFER_SIZE 64 + struct cgroup *cgrp = seq_css(seq)->cgroup; -static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) -{ - char buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - char *end; - - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) - return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - - buffer[nbytes] = 0; /* nul-terminate */ - if (cft->write_u64) { - u64 val = simple_strtoull(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_u64(css, cft, val); - } else { - s64 val = simple_strtoll(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_s64(css, cft, val); - } - if (!retval) - retval = nbytes; - return retval; + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + return 0; } -static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) +static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - size_t max_bytes = cft->max_write_len; - char *buffer = local_buffer; - - if (!max_bytes) - max_bytes = sizeof(local_buffer) - 1; - if (nbytes >= max_bytes) - return -E2BIG; - /* Allocate a dynamic buffer if we need one */ - if (nbytes >= sizeof(local_buffer)) { - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out; - } - - buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(css, cft, strstrip(buffer)); - if (!retval) - retval = nbytes; -out: - if (buffer != local_buffer) - kfree(buffer); - return retval; -} + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of->kn->priv; + struct cgroup_subsys_state *css; + int ret; -static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; + /* + * kernfs guarantees that a file isn't deleted with operations in + * flight, which means that the matching css is and stays alive and + * doesn't need to be pinned. The RCU locking is not necessary + * either. It's just for the convenience of using cgroup_css(). + */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + rcu_read_unlock(); - if (cft->write) - return cft->write(css, cft, file, buf, nbytes, ppos); - if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); - if (cft->write_string) - return cgroup_write_string(css, cft, file, buf, nbytes, ppos); - if (cft->trigger) { - int ret = cft->trigger(css, (unsigned int)cft->private); - return ret ? ret : nbytes; + if (cft->write_string) { + ret = cft->write_string(css, cft, strstrip(buf)); + } else if (cft->write_u64) { + unsigned long long v; + ret = kstrtoull(buf, 0, &v); + if (!ret) + ret = cft->write_u64(css, cft, v); + } else if (cft->write_s64) { + long long v; + ret = kstrtoll(buf, 0, &v); + if (!ret) + ret = cft->write_s64(css, cft, v); + } else if (cft->trigger) { + ret = cft->trigger(css, (unsigned int)cft->private); + } else { + ret = -EINVAL; } - return -EINVAL; -} -static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) -{ - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(css, cft); - int len = sprintf(tmp, "%llu\n", (unsigned long long) val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); + return ret ?: nbytes; } -static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(css, cft); - int len = sprintf(tmp, "%lld\n", (long long) val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); + return seq_cft(seq)->seq_start(seq, ppos); } -static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - - if (cft->read) - return cft->read(css, cft, file, buf, nbytes, ppos); - if (cft->read_u64) - return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); - if (cft->read_s64) - return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); - return -EINVAL; + return seq_cft(seq)->seq_next(seq, v, ppos); } -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. - */ - -static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) +static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - struct seq_file *sf = cb->state; - return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); + seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - struct cfent *cfe = m->private; - struct cftype *cft = cfe->type; - struct cgroup_subsys_state *css = cfe->css; - - if (cft->read_map) { - struct cgroup_map_cb cb = { - .fill = cgroup_map_add, - .state = m, - }; - return cft->read_map(css, cft, &cb); - } - return cft->read_seq_string(css, cft, m); -} - -static const struct file_operations cgroup_seqfile_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = seq_lseek, - .release = cgroup_file_release, -}; - -static int cgroup_file_open(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - struct cgroup_subsys_state *css; - int err; + struct cftype *cft = seq_cft(m); + struct cgroup_subsys_state *css = seq_css(m); - err = generic_file_open(inode, file); - if (err) - return err; + if (cft->seq_show) + return cft->seq_show(m, arg); - /* - * If the file belongs to a subsystem, pin the css. Will be - * unpinned either on open failure or release. This ensures that - * @css stays alive for all file operations. - */ - rcu_read_lock(); - css = cgroup_css(cgrp, cft->ss); - if (cft->ss && !css_tryget(css)) - css = NULL; - rcu_read_unlock(); - - if (!css) - return -ENODEV; - - /* - * @cfe->css is used by read/write/close to determine the - * associated css. @file->private_data would be a better place but - * that's already used by seqfile. Multiple accessors may use it - * simultaneously which is okay as the association never changes. - */ - WARN_ON_ONCE(cfe->css && cfe->css != css); - cfe->css = css; - - if (cft->read_map || cft->read_seq_string) { - file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, cfe); - } else if (cft->open) { - err = cft->open(inode, file); - } - - if (css->ss && err) - css_put(css); - return err; + if (cft->read_u64) + seq_printf(m, "%llu\n", cft->read_u64(css, cft)); + else if (cft->read_s64) + seq_printf(m, "%lld\n", cft->read_s64(css, cft)); + else + return -EINVAL; + return 0; } -static int cgroup_file_release(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - int ret = 0; +static struct kernfs_ops cgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_show = cgroup_seqfile_show, +}; - if (cft->release) - ret = cft->release(inode, file); - if (css->ss) - css_put(css); - if (file->f_op == &cgroup_seqfile_operations) - single_release(inode, file); - return ret; -} +static struct kernfs_ops cgroup_kf_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_start = cgroup_seqfile_start, + .seq_next = cgroup_seqfile_next, + .seq_stop = cgroup_seqfile_stop, + .seq_show = cgroup_seqfile_show, +}; /* * cgroup_rename - Only allow simple rename of directories in place. */ -static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { + struct cgroup *cgrp = kn->priv; int ret; - struct cgroup_name *name, *old_name; - struct cgroup *cgrp; - - /* - * It's convinient to use parent dir's i_mutex to protected - * cgrp->name. - */ - lockdep_assert_held(&old_dir->i_mutex); - if (!S_ISDIR(old_dentry->d_inode->i_mode)) + if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; - if (new_dentry->d_inode) - return -EEXIST; - if (old_dir != new_dir) + if (kn->parent != new_parent) return -EIO; - cgrp = __d_cgrp(old_dentry); - /* * This isn't a proper migration and its usefulness is very * limited. Disallow if sane_behavior. @@ -2538,229 +2325,61 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_dentry); - if (!name) - return -ENOMEM; - - ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); - if (ret) { - kfree(name); - return ret; - } - - old_name = rcu_dereference_protected(cgrp->name, true); - rcu_assign_pointer(cgrp->name, name); - - kfree_rcu(old_name, rcu_head); - return 0; -} - -static struct simple_xattrs *__d_xattrs(struct dentry *dentry) -{ - if (S_ISDIR(dentry->d_inode->i_mode)) - return &__d_cgrp(dentry)->xattrs; - else - return &__d_cfe(dentry)->xattrs; -} - -static inline int xattr_enabled(struct dentry *dentry) -{ - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return root->flags & CGRP_ROOT_XATTR; -} - -static bool is_valid_xattr(const char *name) -{ - if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) - return true; - return false; -} - -static int cgroup_setxattr(struct dentry *dentry, const char *name, - const void *val, size_t size, int flags) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); -} - -static int cgroup_removexattr(struct dentry *dentry, const char *name) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_remove(__d_xattrs(dentry), name); -} - -static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, - void *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_get(__d_xattrs(dentry), name, buf, size); -} - -static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - return simple_xattr_list(__d_xattrs(dentry), buf, size); -} - -static const struct file_operations cgroup_file_operations = { - .read = cgroup_file_read, - .write = cgroup_file_write, - .llseek = generic_file_llseek, - .open = cgroup_file_open, - .release = cgroup_file_release, -}; - -static const struct inode_operations cgroup_file_inode_operations = { - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, -}; - -static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .rename = cgroup_rename, - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, -}; - -/* - * Check if a file is a control file - */ -static inline struct cftype *__file_cft(struct file *file) -{ - if (file_inode(file)->i_fop != &cgroup_file_operations) - return ERR_PTR(-EINVAL); - return __d_cft(file->f_dentry); -} - -static int cgroup_create_file(struct dentry *dentry, umode_t mode, - struct super_block *sb) -{ - struct inode *inode; - - if (!dentry) - return -ENOENT; - if (dentry->d_inode) - return -EEXIST; + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_tree_mutex. + */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); - inode = cgroup_new_inode(mode, sb); - if (!inode) - return -ENOMEM; + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); - if (S_ISDIR(mode)) { - inode->i_op = &cgroup_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + ret = kernfs_rename(kn, new_parent, new_name_str); - /* start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - inc_nlink(dentry->d_parent->d_inode); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); - /* - * Control reaches here with cgroup_mutex held. - * @inode->i_mutex should nest outside cgroup_mutex but we - * want to populate it immediately without releasing - * cgroup_mutex. As @inode isn't visible to anyone else - * yet, trylock will always succeed without affecting - * lockdep checks. - */ - WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); - } else if (S_ISREG(mode)) { - inode->i_size = 0; - inode->i_fop = &cgroup_file_operations; - inode->i_op = &cgroup_file_inode_operations; - } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - return 0; + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); + return ret; } -/** - * cgroup_file_mode - deduce file mode of a control file - * @cft: the control file in question - * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander - */ -static umode_t cgroup_file_mode(const struct cftype *cft) +/* set uid and gid of cgroup dirs and files to that of the creator */ +static int cgroup_kn_set_ugid(struct kernfs_node *kn) { - umode_t mode = 0; - - if (cft->mode) - return cft->mode; - - if (cft->read || cft->read_u64 || cft->read_s64 || - cft->read_map || cft->read_seq_string) - mode |= S_IRUGO; + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; - if (cft->write || cft->write_u64 || cft->write_s64 || - cft->write_string || cft->trigger) - mode |= S_IWUSR; + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; - return mode; + return kernfs_setattr(kn, &iattr); } static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { - struct dentry *dir = cgrp->dentry; - struct cgroup *parent = __d_cgrp(dir); - struct dentry *dentry; - struct cfent *cfe; - int error; - umode_t mode; - char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - - if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, cft->ss->name); - strcat(name, "."); - } - strcat(name, cft->name); - - BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); - - cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); - if (!cfe) - return -ENOMEM; - - dentry = lookup_one_len(name, dir, strlen(name)); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out; - } + char name[CGROUP_FILE_NAME_MAX]; + struct kernfs_node *kn; + struct lock_class_key *key = NULL; + int ret; - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + key = &cft->lockdep_key; +#endif + kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), + cgroup_file_mode(cft), 0, cft->kf_ops, cft, + NULL, false, key); + if (IS_ERR(kn)) + return PTR_ERR(kn); - mode = cgroup_file_mode(cft); - error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); - if (!error) { - list_add_tail(&cfe->node, &parent->files); - cfe = NULL; - } - dput(dentry); -out: - kfree(cfe); - return error; + ret = cgroup_kn_set_ugid(kn); + if (ret) + kernfs_remove(kn); + return ret; } /** @@ -2780,11 +2399,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_tree_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) @@ -2806,115 +2426,96 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], return 0; } -static void cgroup_cfts_prepare(void) - __acquires(&cgroup_mutex) -{ - /* - * Thanks to the entanglement with vfs inode locking, we can't walk - * the existing cgroups under cgroup_mutex and create files. - * Instead, we use css_for_each_descendant_pre() and drop RCU read - * lock before calling cgroup_addrm_files(). - */ - mutex_lock(&cgroup_mutex); -} - -static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) - __releases(&cgroup_mutex) +static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *root = &ss->root->top_cgroup; - struct super_block *sb = ss->root->sb; - struct dentry *prev = NULL; - struct inode *inode; + struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; - u64 update_before; int ret = 0; - /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (!cfts || ss->root == &cgroup_dummy_root || - !atomic_inc_not_zero(&sb->s_active)) { - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* - * All cgroups which are created after we drop cgroup_mutex will - * have the updated set of files, so we only need to update the - * cgroups created before the current @cgroup_serial_nr_next. - */ - update_before = cgroup_serial_nr_next; + lockdep_assert_held(&cgroup_tree_mutex); - mutex_unlock(&cgroup_mutex); + /* don't bother if @ss isn't attached */ + if (ss->root == &cgrp_dfl_root) + return 0; /* add/rm files for all cgroups created before */ - rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (cgroup_is_dead(cgrp)) continue; - inode = cgrp->dentry->d_inode; - dget(cgrp->dentry); - rcu_read_unlock(); - - dput(prev); - prev = cgrp->dentry; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - - rcu_read_lock(); + ret = cgroup_addrm_files(cgrp, cfts, is_add); if (ret) break; } - rcu_read_unlock(); - dput(prev); - deactivate_super(sb); + + if (is_add && !ret) + kernfs_activate(root->kn); return ret; } -/** - * cgroup_add_cftypes - add an array of cftypes to a subsystem - * @ss: target cgroup subsystem - * @cfts: zero-length name terminated array of cftypes - * - * Register @cfts to @ss. Files described by @cfts are created for all - * existing cgroups to which @ss is attached and all future cgroups will - * have them too. This function can be called anytime whether @ss is - * attached or not. - * - * Returns 0 on successful registration, -errno on failure. Note that this - * function currently returns 0 as long as @cfts registration is successful - * even if some file creation attempts on existing cgroups fail. - */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static void cgroup_exit_cftypes(struct cftype *cfts) { - struct cftype_set *set; struct cftype *cft; - int ret; - set = kzalloc(sizeof(*set), GFP_KERNEL); - if (!set) - return -ENOMEM; + for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* free copy for custom atomic_write_len, see init_cftypes() */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) + kfree(cft->kf_ops); + cft->kf_ops = NULL; + cft->ss = NULL; + } +} + +static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft->name[0] != '\0'; cft++) { + struct kernfs_ops *kf_ops; + + WARN_ON(cft->ss || cft->kf_ops); + + if (cft->seq_start) + kf_ops = &cgroup_kf_ops; + else + kf_ops = &cgroup_kf_single_ops; + + /* + * Ugh... if @cft wants a custom max_write_len, we need to + * make a copy of kf_ops to set its atomic_write_len. + */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { + kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); + if (!kf_ops) { + cgroup_exit_cftypes(cfts); + return -ENOMEM; + } + kf_ops->atomic_write_len = cft->max_write_len; + } - for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->kf_ops = kf_ops; cft->ss = ss; + } - cgroup_cfts_prepare(); - set->cfts = cfts; - list_add_tail(&set->node, &ss->cftsets); - ret = cgroup_cfts_commit(cfts, true); - if (ret) - cgroup_rm_cftypes(cfts); - return ret; + return 0; +} + +static int cgroup_rm_cftypes_locked(struct cftype *cfts) +{ + lockdep_assert_held(&cgroup_tree_mutex); + + if (!cfts || !cfts[0].ss) + return -ENOENT; + + list_del(&cfts->node); + cgroup_apply_cftypes(cfts, false); + cgroup_exit_cftypes(cfts); + return 0; } -EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem @@ -2929,24 +2530,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); */ int cgroup_rm_cftypes(struct cftype *cfts) { - struct cftype_set *set; + int ret; - if (!cfts || !cfts[0].ss) - return -ENOENT; + mutex_lock(&cgroup_tree_mutex); + ret = cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_tree_mutex); + return ret; +} - cgroup_cfts_prepare(); +/** + * cgroup_add_cftypes - add an array of cftypes to a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Register @cfts to @ss. Files described by @cfts are created for all + * existing cgroups to which @ss is attached and all future cgroups will + * have them too. This function can be called anytime whether @ss is + * attached or not. + * + * Returns 0 on successful registration, -errno on failure. Note that this + * function currently returns 0 as long as @cfts registration is successful + * even if some file creation attempts on existing cgroups fail. + */ +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + int ret; - list_for_each_entry(set, &cfts[0].ss->cftsets, node) { - if (set->cfts == cfts) { - list_del(&set->node); - kfree(set); - cgroup_cfts_commit(cfts, false); - return 0; - } - } + if (!cfts || cfts[0].name[0] == '\0') + return 0; + + ret = cgroup_init_cftypes(ss, cfts); + if (ret) + return ret; + + mutex_lock(&cgroup_tree_mutex); + + list_add_tail(&cfts->node, &ss->cfts); + ret = cgroup_apply_cftypes(cfts, true); + if (ret) + cgroup_rm_cftypes_locked(cfts); - cgroup_cfts_commit(NULL, false); - return -ENOENT; + mutex_unlock(&cgroup_tree_mutex); + return ret; } /** @@ -2955,61 +2580,27 @@ int cgroup_rm_cftypes(struct cftype *cfts) * * Return the number of tasks in the cgroup. */ -int cgroup_task_count(const struct cgroup *cgrp) +static int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return count; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first call to css_task_iter_start(). - */ -static void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - write_lock(&css_set_lock); - use_task_css_set_links = 1; - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - */ - if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) - list_add(&p->cg_list, &task_css_set(p)->tasks); - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - write_unlock(&css_set_lock); -} - /** * css_next_child - find the next child of a given css * @pos_css: the current position (%NULL to initiate traversal) * @parent_css: css whose children to walk * * This function returns the next child of @parent_css and should be called - * under RCU read lock. The only requirement is that @parent_css and - * @pos_css are accessible. The next sibling is guaranteed to be returned - * regardless of their states. + * under either cgroup_mutex or RCU read lock. The only requirement is + * that @parent_css and @pos_css are accessible. The next sibling is + * guaranteed to be returned regardless of their states. */ struct cgroup_subsys_state * css_next_child(struct cgroup_subsys_state *pos_css, @@ -3019,7 +2610,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutexes_or_rcu_locked(); /* * @pos could already have been removed. Once a cgroup is removed, @@ -3055,7 +2646,6 @@ css_next_child(struct cgroup_subsys_state *pos_css, return cgroup_css(next, parent_css->ss); } -EXPORT_SYMBOL_GPL(css_next_child); /** * css_next_descendant_pre - find the next descendant for pre-order walk @@ -3066,10 +2656,10 @@ EXPORT_SYMBOL_GPL(css_next_child); * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @root are accessible and @pos is a descendant of @root. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @root are accessible and @pos is a descendant of @root. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -3077,7 +2667,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutexes_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -3098,7 +2688,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } -EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css @@ -3108,17 +2697,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct rightmost descendant as long as @pos is - * accessible. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct rightmost descendant as + * long as @pos is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutexes_or_rcu_locked(); do { last = pos; @@ -3130,7 +2719,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) return last; } -EXPORT_SYMBOL_GPL(css_rightmost_descendant); static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) @@ -3154,10 +2742,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @cgroup are accessible and @pos is a descendant of + * @cgroup. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, @@ -3165,7 +2754,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutexes_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -3183,7 +2772,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* no sibling left, visit parent */ return css_parent(pos); } -EXPORT_SYMBOL_GPL(css_next_descendant_post); /** * css_advance_task_iter - advance a task itererator to the next css_set @@ -3206,9 +2794,14 @@ static void css_advance_task_iter(struct css_task_iter *it) } link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; - } while (list_empty(&cset->tasks)); + } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); + it->cset_link = l; - it->task = cset->tasks.next; + + if (!list_empty(&cset->tasks)) + it->task = cset->tasks.next; + else + it->task = cset->mg_tasks.next; } /** @@ -3227,17 +2820,12 @@ static void css_advance_task_iter(struct css_task_iter *it) */ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) - __acquires(css_set_lock) + __acquires(css_set_rwsem) { - /* - * The first time anyone tries to iterate across a css, we need to - * enable the list linking each css_set to its tasks, and fix up - * all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); + /* no one should try to iterate before mounting cgroups */ + WARN_ON_ONCE(!use_task_css_set_links); - read_lock(&css_set_lock); + down_read(&css_set_rwsem); it->origin_css = css; it->cset_link = &css->cgroup->cset_links; @@ -3257,24 +2845,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; - struct cgrp_cset_link *link; + struct cgrp_cset_link *link = list_entry(it->cset_link, + struct cgrp_cset_link, cset_link); /* If the iterator cg is NULL, we have no tasks */ if (!it->cset_link) return NULL; res = list_entry(l, struct task_struct, cg_list); - /* Advance iterator to find next entry */ + + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ l = l->next; - link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); - if (l == &link->cset->tasks) { - /* - * We reached the end of this task list - move on to the - * next cgrp_cset_link. - */ + + if (l == &link->cset->tasks) + l = link->cset->mg_tasks.next; + + if (l == &link->cset->mg_tasks) css_advance_task_iter(it); - } else { + else it->task = l; - } + return res; } @@ -3285,191 +2878,62 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) - __releases(css_set_lock) -{ - read_unlock(&css_set_lock); -} - -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) + __releases(css_set_rwsem) { - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) simultaneously. - */ - return t1 > t2; - } -} - -/* - * This function is a callback from heap_insert() and is used to order - * the heap. - * In this case we order the heap in descending task start time. - */ -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); + up_read(&css_set_rwsem); } /** - * css_scan_tasks - iterate though all the tasks in a css - * @css: the css to iterate tasks of - * @test: optional test callback - * @process: process callback - * @data: data passed to @test and @process - * @heap: optional pre-allocated heap used for task iteration - * - * Iterate through all the tasks in @css, calling @test for each, and if it - * returns %true, call @process for it also. - * - * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates css_task_iter_{start,next,end}() but does not - * lock css_set_lock for the call to @process. - * - * It is guaranteed that @process will act on every task that is a member - * of @css for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different css during the - * call, or are forked or move into the css during the call. - * - * Note that @test may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should be - * cheap. + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside * - * If @heap is non-NULL, a heap has been pre-allocated and will be used for - * heap operations (and its "gt" member will be overwritten), else a - * temporary heap will be used (allocation of which may cause this function - * to fail). + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. */ -int css_scan_tasks(struct cgroup_subsys_state *css, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - int retval, i; + LIST_HEAD(preloaded_csets); + struct cgrp_cset_link *link; struct css_task_iter it; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; - struct ptr_heap tmp_heap; - struct timespec latest_time = { 0, 0 }; - - if (heap) { - /* The caller supplied our heap and pre-allocated its memory */ - heap->gt = &started_after; - } else { - /* We need to allocate our own heap memory */ - heap = &tmp_heap; - retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - /* cannot allocate the heap */ - return retval; - } + struct task_struct *task; + int ret; - again: - /* - * Scan tasks in the css, using the @test callback to determine - * which are of interest, and invoking @process callback on the - * ones which need an update. Since we don't want to hold any - * locks during the task updates, gather tasks to be processed in a - * heap structure. The heap is sorted by descending task start - * time. If the statically-sized heap fills up, we overflow tasks - * that started later, and in future iterations only consider tasks - * that started after the latest task in the previous pass. This - * guarantees forward progress and that we don't miss any tasks. - */ - heap->size = 0; - css_task_iter_start(css, &it); - while ((p = css_task_iter_next(&it))) { - /* - * Only affect tasks that qualify per the caller's callback, - * if he provided one - */ - if (test && !test(p, data)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(heap, p); - if (dropped == NULL) { - /* - * The new task was inserted; the heap wasn't - * previously full - */ - get_task_struct(p); - } else if (dropped != p) { - /* - * The new task was inserted, and pushed out a - * different task - */ - get_task_struct(p); - put_task_struct(dropped); - } - /* - * Else the new task was newer than anything already in - * the heap and wasn't inserted - */ - } - css_task_iter_end(&it); + mutex_lock(&cgroup_mutex); - if (heap->size) { - for (i = 0; i < heap->size; i++) { - struct task_struct *q = heap->ptrs[i]; - if (i == 0) { - latest_time = q->start_time; - latest_task = q; - } - /* Process the task per the caller's callback */ - process(q, data); - put_task_struct(q); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't get processed. - * Not the most efficient way to do it, but it avoids - * having to take callback_mutex in the fork path - */ - goto again; - } - if (heap == &tmp_heap) - heap_free(&tmp_heap); - return 0; -} + /* all tasks in @from are being moved, all csets are source */ + down_read(&css_set_rwsem); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + up_read(&css_set_rwsem); -static void cgroup_transfer_one_task(struct task_struct *task, void *data) -{ - struct cgroup *new_cgroup = data; + ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + if (ret) + goto out_err; - mutex_lock(&cgroup_mutex); - cgroup_attach_task(new_cgroup, task, false); + /* + * Migrate tasks one-by-one until @form is empty. This fails iff + * ->can_attach() fails. + */ + do { + css_task_iter_start(&from->dummy_css, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + ret = cgroup_migrate(to, task, false); + put_task_struct(task); + } + } while (task && !ret); +out_err: + cgroup_migrate_finish(&preloaded_csets); mutex_unlock(&cgroup_mutex); -} - -/** - * cgroup_trasnsfer_tasks - move tasks from one cgroup to another - * @to: cgroup to which the tasks will be moved - * @from: cgroup in which the tasks currently reside - */ -int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) -{ - return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, - to, NULL); + return ret; } /* @@ -3504,14 +2968,12 @@ struct cgroup_pidlist { pid_t *list; /* how many elements the above list has */ int length; - /* how many files are using the current array */ - int use_count; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; - /* protects the other fields */ - struct rw_semaphore rwsem; + /* for delayed destruction */ + struct delayed_work destroy_dwork; }; /* @@ -3527,6 +2989,7 @@ static void *pidlist_allocate(int count) else return kmalloc(count * sizeof(pid_t), GFP_KERNEL); } + static void pidlist_free(void *p) { if (is_vmalloc_addr(p)) @@ -3536,6 +2999,47 @@ static void pidlist_free(void *p) } /* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ @@ -3565,52 +3069,92 @@ after: return dest; } +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + * + * All this extra complexity was caused by the original implementation + * committing to an entirely unnecessary property. In the long term, we + * want to do away with it. Explicitly scramble sort order if + * sane_behavior so that no such expectation exists in the new interface. + * + * Scrambling is done by swapping every two consecutive bits, which is + * non-identity one-to-one mapping which disturbs sort order sufficiently. + */ +static pid_t pid_fry(pid_t pid) +{ + unsigned a = pid & 0x55555555; + unsigned b = pid & 0xAAAAAAAA; + + return (a << 1) | (b >> 1); +} + +static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) +{ + if (cgroup_sane_behavior(cgrp)) + return pid_fry(pid); + else + return pid; +} + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } +static int fried_cmppid(const void *a, const void *b) +{ + return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. */ -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) { struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - /* - * We can't drop the pidlist_mutex before taking the l->rwsem in case - * the last ref-holder is trying to remove l from the list at the same - * time. Holding the pidlist_mutex precludes somebody taking whichever - * list we find out from under us - compare release_pid_array(). - */ - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry(l, &cgrp->pidlists, links) { - if (l->key.type == type && l->key.ns == ns) { - /* make sure l doesn't vanish out from under us */ - down_write(&l->rwsem); - mutex_unlock(&cgrp->pidlist_mutex); - return l; - } - } + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); + if (!l) return l; - } - init_rwsem(&l->rwsem); - down_write(&l->rwsem); + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; - l->key.ns = get_pid_ns(ns); + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); - mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3627,6 +3171,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct task_struct *tsk; struct cgroup_pidlist *l; + lockdep_assert_held(&cgrp->pidlist_mutex); + /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the @@ -3653,20 +3199,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); + if (cgroup_sane_behavior(cgrp)) + sort(array, length, sizeof(pid_t), fried_cmppid, NULL); + else + sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); - l = cgroup_pidlist_find(cgrp, type); + + l = cgroup_pidlist_find_create(cgrp, type); if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); pidlist_free(array); return -ENOMEM; } - /* store array, freeing old if necessary - lock already held */ + + /* store array, freeing old if necessary */ pidlist_free(l->list); l->list = array; l->length = length; - l->use_count++; - up_write(&l->rwsem); *lp = l; return 0; } @@ -3682,21 +3232,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { - int ret = -EINVAL; + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + + mutex_lock(&cgroup_mutex); + /* - * Validate dentry by checking the superblock operations, - * and make sure it's a directory. + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. */ - if (dentry->d_sb->s_op != &cgroup_ops || - !S_ISDIR(dentry->d_inode->i_mode)) - goto err; - - ret = 0; - cgrp = dentry->d_fsdata; + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); css_task_iter_start(&cgrp->dummy_css, &it); while ((tsk = css_task_iter_next(&it))) { @@ -3721,8 +3281,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); -err: - return ret; + mutex_unlock(&cgroup_mutex); + return 0; } @@ -3740,20 +3300,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pidlist *l = s->private; + struct kernfs_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; - int *iter; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; - down_read(&l->rwsem); if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (l->list[mid] == pid) { + if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) index = mid + 1; else end = mid; @@ -3764,19 +3349,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; - *pos = *iter; + *pos = cgroup_pid_fry(cgrp, *iter); return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pidlist *l = s->private; - up_read(&l->rwsem); + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pidlist *l = s->private; + struct kernfs_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; /* @@ -3787,7 +3378,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) if (p >= end) { return NULL; } else { - *pos = *p; + *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); return p; } } @@ -3808,92 +3399,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { .show = cgroup_pidlist_show, }; -static void cgroup_release_pid_array(struct cgroup_pidlist *l) -{ - /* - * the case where we're the last user of this particular pidlist will - * have us remove it from the cgroup's list, which entails taking the - * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> - * pidlist_mutex, we have to take pidlist_mutex first. - */ - mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->rwsem); - BUG_ON(!l->use_count); - if (!--l->use_count) { - /* we're the last user if refcount is 0; remove and free */ - list_del(&l->links); - mutex_unlock(&l->owner->pidlist_mutex); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - up_write(&l->rwsem); - kfree(l); - return; - } - mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->rwsem); -} - -static int cgroup_pidlist_release(struct inode *inode, struct file *file) -{ - struct cgroup_pidlist *l; - if (!(file->f_mode & FMODE_READ)) - return 0; - /* - * the seq_file will only be initialized if the file was opened for - * reading; hence we check if it's not null only in that case. - */ - l = ((struct seq_file *)file->private_data)->private; - cgroup_release_pid_array(l); - return seq_release(inode, file); -} - -static const struct file_operations cgroup_pidlist_operations = { - .read = seq_read, - .llseek = seq_lseek, - .write = cgroup_file_write, - .release = cgroup_pidlist_release, -}; - -/* - * The following functions handle opens on a file that displays a pidlist - * (tasks or procs). Prepare an array of the process/thread IDs of whoever's - * in the cgroup. - */ -/* helper function for the two below it */ -static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) -{ - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct cgroup_pidlist *l; - int retval; - - /* Nothing to do for write-only files */ - if (!(file->f_mode & FMODE_READ)) - return 0; - - /* have the array populated */ - retval = pidlist_array_load(cgrp, type, &l); - if (retval) - return retval; - /* configure file information */ - file->f_op = &cgroup_pidlist_operations; - - retval = seq_open(file, &cgroup_pidlist_seq_operations); - if (retval) { - cgroup_release_pid_array(l); - return retval; - } - ((struct seq_file *)file->private_data)->private = l; - return 0; -} -static int cgroup_tasks_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); -} -static int cgroup_procs_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); -} - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3911,219 +3416,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, return 0; } -/* - * When dput() is called asynchronously, if umount has been done and - * then deactivate_super() in cgroup_free_fn() kills the superblock, - * there's a small window that vfs will see the root dentry with non-zero - * refcnt and trigger BUG(). - * - * That's why we hold a reference before dput() and drop it right after. - */ -static void cgroup_dput(struct cgroup *cgrp) -{ - struct super_block *sb = cgrp->root->sb; - - atomic_inc(&sb->s_active); - dput(cgrp->dentry); - deactivate_super(sb); -} - -/* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ -static void cgroup_event_remove(struct work_struct *work) -{ - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); - struct cgroup_subsys_state *css = event->css; - - remove_wait_queue(event->wqh, &event->wait); - - event->cft->unregister_event(css, event->cft, event->eventfd); - - /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); - - eventfd_ctx_put(event->eventfd); - kfree(event); - css_put(css); -} - -/* - * Gets called on POLLHUP on eventfd when user closes it. - * - * Called with wqh->lock held and interrupts disabled. - */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; - unsigned long flags = (unsigned long)key; - - if (flags & POLLHUP) { - /* - * If the event has been detached at cgroup removal, we - * can simply return knowing the other side will cleanup - * for us. - * - * We can't race against event freeing since the other - * side will require wqh->lock via remove_wait_queue(), - * which we hold. - */ - spin_lock(&cgrp->event_list_lock); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - /* - * We are in atomic context, but cgroup_event_remove() - * may sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - } - - return 0; -} - -static void cgroup_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) -{ - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); -} - -/* - * Parse input and register new cgroup event handler. - * - * Input must be in format '<event_fd> <control_fd> <args>'. - * Interpretation of args is defined by control file implementation. - */ -static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, const char *buffer) -{ - struct cgroup *cgrp = dummy_css->cgroup; - struct cgroup_event *event; - struct cgroup_subsys_state *cfile_css; - unsigned int efd, cfd; - struct fd efile; - struct fd cfile; - char *endp; - int ret; - - efd = simple_strtoul(buffer, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buffer = endp + 1; - - cfd = simple_strtoul(buffer, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buffer = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); - - efile = fdget(efd); - if (!efile.file) { - ret = -EBADF; - goto out_kfree; - } - - event->eventfd = eventfd_ctx_fileget(efile.file); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto out_put_efile; - } - - cfile = fdget(cfd); - if (!cfile.file) { - ret = -EBADF; - goto out_put_eventfd; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile.file), MAY_READ); - if (ret < 0) - goto out_put_cfile; - - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. - */ - rcu_read_lock(); - - ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) - ret = 0; - - rcu_read_unlock(); - if (ret) - goto out_put_cfile; - - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(event->css, event->cft, - event->eventfd, buffer); - if (ret) - goto out_put_css; - - efile.file->f_op->poll(efile.file, &event->pt); - - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); - - fdput(cfile); - fdput(efile); - - return 0; - -out_put_css: - css_put(event->css); -out_put_cfile: - fdput(cfile); -out_put_eventfd: - eventfd_ctx_put(event->eventfd); -out_put_efile: - fdput(efile); -out_kfree: - kfree(event); - - return ret; -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4143,17 +3435,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", - .open = cgroup_procs_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, .write_u64 = cgroup_procs_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, - .mode = S_IWUGO, - }, - { .name = "cgroup.clone_children", .flags = CFTYPE_INSANE, .read_u64 = cgroup_clone_children_read, @@ -4162,7 +3452,7 @@ static struct cftype cgroup_base_files[] = { { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_sane_behavior_show, + .seq_show = cgroup_sane_behavior_show, }, /* @@ -4173,9 +3463,12 @@ static struct cftype cgroup_base_files[] = { { .name = "tasks", .flags = CFTYPE_INSANE, /* use "procs" instead */ - .open = cgroup_tasks_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { @@ -4187,9 +3480,9 @@ static struct cftype cgroup_base_files[] = { { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_release_agent_show, + .seq_show = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, + .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; @@ -4208,13 +3501,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* process cftsets of each subsystem */ for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, set->cfts, true); + list_for_each_entry(cfts, &ss->cfts, node) { + ret = cgroup_addrm_files(cgrp, cfts, true); if (ret < 0) goto err; } @@ -4257,7 +3550,7 @@ static void css_free_work_fn(struct work_struct *work) css_put(css->parent); css->ss->css_free(css); - cgroup_dput(cgrp); + cgroup_put(cgrp); } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4265,10 +3558,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) struct cgroup_subsys_state *css = container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context which we don't have. - */ INIT_WORK(&css->destroy_work, css_free_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } @@ -4278,7 +3567,7 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); + RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4303,6 +3592,7 @@ static int online_css(struct cgroup_subsys_state *css) struct cgroup_subsys *ss = css->ss; int ret = 0; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (ss->css_online) @@ -4310,7 +3600,7 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; css->cgroup->nr_css++; - rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; } @@ -4320,6 +3610,7 @@ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) @@ -4330,45 +3621,97 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); } -/* +/** + * create_css - create a cgroup_subsys_state + * @cgrp: the cgroup new css will be associated with + * @ss: the subsys of new css + * + * Create a new css associated with @cgrp - @ss pair. On success, the new + * css is online and installed in @cgrp with all interface files created. + * Returns 0 on success, -errno on failure. + */ +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *css; + int err; + + lockdep_assert_held(&cgroup_mutex); + + css = ss->css_alloc(cgroup_css(parent, ss)); + if (IS_ERR(css)) + return PTR_ERR(css); + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_free_css; + + init_css(css, ss, cgrp); + + err = cgroup_populate_dir(cgrp, 1 << ss->id); + if (err) + goto err_free_percpu_ref; + + err = online_css(css); + if (err) + goto err_clear_dir; + + cgroup_get(cgrp); + css_get(css->parent); + + cgrp->subsys_mask |= 1 << ss->id; + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } + + return 0; + +err_clear_dir: + cgroup_clear_dir(css->cgroup, 1 << css->ss->id); +err_free_percpu_ref: + percpu_ref_cancel_init(&css->refcnt); +err_free_css: + ss->css_free(css); + return err; +} + +/** * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup - * @dentry: dentry of the new cgroup - * @mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held + * @name: name of the new cgroup + * @mode: mode to set on new cgroup */ -static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - umode_t mode) +static long cgroup_create(struct cgroup *parent, const char *name, + umode_t mode) { - struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; - struct cgroup_name *name; - struct cgroupfs_root *root = parent->root; - int err = 0; + struct cgroup_root *root = parent->root; + int ssid, err; struct cgroup_subsys *ss; - struct super_block *sb = root->sb; + struct kernfs_node *kn; + + /* + * XXX: The default hierarchy isn't fully implemented yet. Block + * !root cgroup creation on it for now. + */ + if (root == &cgrp_dfl_root) + return -EINVAL; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) return -ENOMEM; - name = cgroup_alloc_name(dentry); - if (!name) - goto err_free_cgrp; - rcu_assign_pointer(cgrp->name, name); - - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) - goto err_free_name; + mutex_lock(&cgroup_tree_mutex); /* * Only live parents can have children. Note that the liveliness @@ -4379,21 +3722,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_id; + goto err_unlock_tree; } - /* Grab a reference on the superblock so the hierarchy doesn't - * get deleted on unmount if there are child cgroups. This - * can be done outside cgroup_mutex, since the sb can't - * disappear while someone has an open control file on the - * fs */ - atomic_inc(&sb->s_active); + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) { + err = -ENOMEM; + goto err_unlock; + } init_cgroup_housekeeping(cgrp); - dentry->d_fsdata = cgrp; - cgrp->dentry = dentry; - cgrp->parent = parent; cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; @@ -4404,123 +3747,93 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css; - - css = ss->css_alloc(cgroup_css(parent, ss)); - if (IS_ERR(css)) { - err = PTR_ERR(css); - goto err_free_all; - } - css_ar[ss->subsys_id] = css; - - err = percpu_ref_init(&css->refcnt, css_release); - if (err) - goto err_free_all; - - init_css(css, ss, cgrp); + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + err = PTR_ERR(kn); + goto err_free_id; } + cgrp->kn = kn; /* - * Create directory. cgroup_create_file() returns with the new - * directory locked on success so that it can be populated without - * dropping cgroup_mutex. + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. */ - err = cgroup_create_file(dentry, S_IFDIR | mode, sb); - if (err < 0) - goto err_free_all; - lockdep_assert_held(&dentry->d_inode->i_mutex); + kernfs_get(kn); cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; - - /* hold a ref to the parent's dentry */ - dget(parent->dentry); - - /* creation succeeded, notify subsystems */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - err = online_css(css); - if (err) - goto err_destroy; - - /* each css holds a ref to the cgroup's dentry and parent css */ - dget(dentry); - css_get(css->parent); - - /* mark it consumed for error path */ - css_ar[ss->subsys_id] = NULL; - - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); - ss->warned_broken_hierarchy = true; - } - } + atomic_inc(&root->nr_cgrps); + cgroup_get(parent); + /* + * @cgrp is now fully operational. If something fails after this + * point, it'll be released via the normal destruction path. + */ idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_addrm_files(cgrp, cgroup_base_files, true); + err = cgroup_kn_set_ugid(kn); if (err) goto err_destroy; - err = cgroup_populate_dir(cgrp, root->subsys_mask); + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; + /* let's create and online css's */ + for_each_subsys(ss, ssid) { + if (root->cgrp.subsys_mask & (1 << ssid)) { + err = create_css(cgrp, ss); + if (err) + goto err_destroy; + } + } + + kernfs_activate(kn); + mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + mutex_unlock(&cgroup_tree_mutex); return 0; -err_free_all: - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - if (css) { - percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); - } - } - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); -err_free_name: - kfree(rcu_dereference_raw(cgrp->name)); -err_free_cgrp: +err_unlock: + mutex_unlock(&cgroup_mutex); +err_unlock_tree: + mutex_unlock(&cgroup_tree_mutex); kfree(cgrp); return err; err_destroy: - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - if (css) { - percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); - } - } cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&cgroup_tree_mutex); return err; } -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *c_parent = dentry->d_parent->d_fsdata; + struct cgroup *parent = parent_kn->priv; + int ret; + + /* + * cgroup_create() grabs cgroup_tree_mutex which nests outside + * kernfs active_ref and cgroup_create() already synchronizes + * properly against removal through cgroup_lock_live_group(). + * Break it before calling cgroup_create(). + */ + cgroup_get(parent); + kernfs_break_active_protection(parent_kn); + + ret = cgroup_create(parent, name, mode); - /* the vfs holds inode->i_mutex already */ - return cgroup_create(c_parent, dentry, mode | S_IFDIR); + kernfs_unbreak_active_protection(parent_kn); + cgroup_put(parent); + return ret; } /* @@ -4533,6 +3846,7 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* @@ -4550,6 +3864,7 @@ static void css_killed_work_fn(struct work_struct *work) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); /* * Put the css refs from kill_css(). Each css holds an extra @@ -4571,18 +3886,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -/** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. - */ -static void kill_css(struct cgroup_subsys_state *css) +static void __kill_css(struct cgroup_subsys_state *css) { - cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + lockdep_assert_held(&cgroup_tree_mutex); + + /* + * This must happen before css is disassociated with its cgroup. + * See seq_css() for details. + */ + cgroup_clear_dir(css->cgroup, 1 << css->ss->id); /* * Killing would put the base ref, but we need to keep it alive @@ -4604,6 +3916,28 @@ static void kill_css(struct cgroup_subsys_state *css) } /** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + struct cgroup *cgrp = css->cgroup; + + lockdep_assert_held(&cgroup_tree_mutex); + + /* if already killed, noop */ + if (cgrp->subsys_mask & (1 << css->ss->id)) { + cgrp->subsys_mask &= ~(1 << css->ss->id); + __kill_css(css); + } +} + +/** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed * @@ -4630,22 +3964,21 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct dentry *d = cgrp->dentry; - struct cgroup_event *event, *tmp; - struct cgroup_subsys *ss; struct cgroup *child; + struct cgroup_subsys_state *css; bool empty; + int ssid; - lockdep_assert_held(&d->d_inode->i_mutex); + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* - * css_set_lock synchronizes access to ->cset_links and prevents - * @cgrp from being removed while __put_css_set() is in progress. + * css_set_rwsem synchronizes access to ->cset_links and prevents + * @cgrp from being removed while put_css_set() is in progress. */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); empty = list_empty(&cgrp->cset_links); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (!empty) return -EBUSY; @@ -4666,18 +3999,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Initiate massacre of all css's. cgroup_destroy_css_killed() - * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. - */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (css) - kill_css(css); - } - - /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that * CGRP_DEAD assertion is depended upon by css_next_child() to @@ -4686,6 +4007,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ set_bit(CGRP_DEAD, &cgrp->flags); + /* + * Initiate massacre of all css's. cgroup_destroy_css_killed() + * will be invoked to perform the rest of destruction once the + * percpu refs of all css's are confirmed to be killed. This + * involves removing the subsystem's files, drop cgroup_mutex. + */ + mutex_unlock(&cgroup_mutex); + for_each_css(css, ssid, cgrp) + kill_css(css); + mutex_lock(&cgroup_mutex); + /* CGRP_DEAD is set, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) @@ -4701,26 +4033,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!cgrp->nr_css) cgroup_destroy_css_killed(cgrp); - /* - * Clear the base files and remove @cgrp directory. The removal - * puts the base ref but we aren't quite done with @cgrp yet, so - * hold onto it. - */ - cgroup_addrm_files(cgrp, cgroup_base_files, false); - dget(d); - cgroup_d_remove_dir(d); + /* remove @cgrp directory along with the base files */ + mutex_unlock(&cgroup_mutex); /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_from_dir(). Those are supported by RCU protecting + * clearing of cgrp->kn->priv backpointer, which should happen + * after all files under it have been removed. */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); + kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + + mutex_lock(&cgroup_mutex); return 0; }; @@ -4737,73 +4063,82 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; - struct dentry *d = cgrp->dentry; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - dput(d); + cgroup_put(cgrp); set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); } -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +static int cgroup_rmdir(struct kernfs_node *kn) { - int ret; - - mutex_lock(&cgroup_mutex); - ret = cgroup_destroy_locked(dentry->d_fsdata); - mutex_unlock(&cgroup_mutex); + struct cgroup *cgrp = kn->priv; + int ret = 0; - return ret; -} + /* + * This is self-destruction but @kn can't be removed while this + * callback is in progress. Let's break active protection. Once + * the protection is broken, @cgrp can be destroyed at any point. + * Pin it so that it stays accessible. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(kn); -static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) -{ - INIT_LIST_HEAD(&ss->cftsets); + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); /* - * base_cftset is embedded in subsys itself, no need to worry about - * deregistration. + * @cgrp might already have been destroyed while we're trying to + * grab the mutexes. */ - if (ss->base_cftypes) { - struct cftype *cft; + if (!cgroup_is_dead(cgrp)) + ret = cgroup_destroy_locked(cgrp); - for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) - cft->ss = ss; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); - ss->base_cftset.cfts = ss->base_cftypes; - list_add_tail(&ss->base_cftset.node, &ss->cftsets); - } + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); + return ret; } +static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .remount_fs = cgroup_remount, + .show_options = cgroup_show_options, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .rename = cgroup_rename, +}; + static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - /* init base cftset */ - cgroup_init_cftsets(ss); + INIT_LIST_HEAD(&ss->cfts); - /* Create the top cgroup state for this subsystem */ - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); - ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); + /* Create the root cgroup state for this subsystem */ + ss->root = &cgrp_dfl_root; + css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, cgroup_dummy_top); + init_css(css, ss, &cgrp_dfl_root.cgrp); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the - * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = css; + * init_css_set is in the subsystem's root cgroup. */ + init_css_set.subsys[ss->id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4814,177 +4149,11 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - mutex_unlock(&cgroup_mutex); - - /* this function shouldn't be used with modular subsystems, since they - * need to register a subsys_id, among other things */ - BUG_ON(ss->module); -} - -/** - * cgroup_load_subsys: load and register a modular subsystem at runtime - * @ss: the subsystem to load - * - * This function should be called in a modular subsystem's initcall. If the - * subsystem is built as a module, it will be assigned a new subsys_id and set - * up for use. If the subsystem is built-in anyway, work is delegated to the - * simpler cgroup_init_subsys. - */ -int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - int i, ret; - struct hlist_node *tmp; - struct css_set *cset; - unsigned long key; - - /* check name and function validity */ - if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->css_alloc == NULL || ss->css_free == NULL) - return -EINVAL; - - /* - * we don't support callbacks in modular subsystems. this check is - * before the ss->module check for consistency; a subsystem that could - * be a module should still have no callbacks even if the user isn't - * compiling it as one. - */ - if (ss->fork || ss->exit) - return -EINVAL; - - /* - * an optionally modular subsystem is built-in: we want to do nothing, - * since cgroup_init_subsys will have already taken care of it. - */ - if (ss->module == NULL) { - /* a sanity check */ - BUG_ON(cgroup_subsys[ss->subsys_id] != ss); - return 0; - } - - /* init base cftset */ - cgroup_init_cftsets(ss); - - mutex_lock(&cgroup_mutex); - cgroup_subsys[ss->subsys_id] = ss; - - /* - * no ss->css_alloc seems to need anything important in the ss - * struct, so this can happen first (i.e. before the dummy root - * attachment). - */ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); - if (IS_ERR(css)) { - /* failure case - need to deassign the cgroup_subsys[] slot. */ - cgroup_subsys[ss->subsys_id] = NULL; - mutex_unlock(&cgroup_mutex); - return PTR_ERR(css); - } - - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); - ss->root = &cgroup_dummy_root; - - /* our new subsystem will be attached to the dummy hierarchy. */ - init_css(css, ss, cgroup_dummy_top); - - /* - * Now we need to entangle the css into the existing css_sets. unlike - * in cgroup_init_subsys, there are now multiple css_sets, so each one - * will need a new pointer to it; done by iterating the css_set_table. - * furthermore, modifying the existing css_sets will corrupt the hash - * table state, so each changed css_set will need its hash recomputed. - * this is all done under the css_set_lock. - */ - write_lock(&css_set_lock); - hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { - /* skip entries that we already rehashed */ - if (cset->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hash_del(&cset->hlist); - /* set new value */ - cset->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - ret = online_css(css); - if (ret) - goto err_unload; - - /* success! */ - mutex_unlock(&cgroup_mutex); - return 0; - -err_unload: - mutex_unlock(&cgroup_mutex); - /* @ss can't be mounted here as try_module_get() would fail */ - cgroup_unload_subsys(ss); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_load_subsys); - -/** - * cgroup_unload_subsys: unload a modular subsystem - * @ss: the subsystem to unload - * - * This function should be called in a modular subsystem's exitcall. When this - * function is invoked, the refcount on the subsystem's module will be 0, so - * the subsystem will not be attached to any hierarchy. - */ -void cgroup_unload_subsys(struct cgroup_subsys *ss) -{ - struct cgrp_cset_link *link; - - BUG_ON(ss->module == NULL); - - /* - * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get() in rebind_subsystems() should ensure that it - * doesn't start being used while we're killing it off. - */ - BUG_ON(ss->root != &cgroup_dummy_root); - - mutex_lock(&cgroup_mutex); - - offline_css(cgroup_css(cgroup_dummy_top, ss)); - - /* deassign the subsys_id */ - cgroup_subsys[ss->subsys_id] = NULL; - - /* remove subsystem from the dummy root's list of subsystems */ - list_del_init(&ss->sibling); - - /* - * disentangle the css from all css_sets attached to the dummy - * top. as in loading, we need to pay our respects to the hashtable - * gods. - */ - write_lock(&css_set_lock); - list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { - struct css_set *cset = link->cset; - unsigned long key; - - hash_del(&cset->hlist); - cset->subsys[ss->subsys_id] = NULL; - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - /* - * remove subsystem's css from the cgroup_dummy_top and free it - - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. - */ - ss->css_free(cgroup_css(cgroup_dummy_top, ss)); - RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); + cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); } -EXPORT_SYMBOL_GPL(cgroup_unload_subsys); /** * cgroup_init_early - cgroup initialization at system boot @@ -4994,34 +4163,24 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys); */ int __init cgroup_init_early(void) { + static struct cgroup_sb_opts __initdata opts = + { .flags = CGRP_ROOT_SANE_BEHAVIOR }; struct cgroup_subsys *ss; int i; - atomic_set(&init_css_set.refcount, 1); - INIT_LIST_HEAD(&init_css_set.cgrp_links); - INIT_LIST_HEAD(&init_css_set.tasks); - INIT_HLIST_NODE(&init_css_set.hlist); - css_set_count = 1; - init_cgroup_root(&cgroup_dummy_root); - cgroup_root_count = 1; + init_cgroup_root(&cgrp_dfl_root, &opts); RCU_INIT_POINTER(init_task.cgroups, &init_css_set); - init_cgrp_cset_link.cset = &init_css_set; - init_cgrp_cset_link.cgrp = cgroup_dummy_top; - list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); - list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); - - /* at bootup time, we don't worry about modular subsystems */ - for_each_builtin_subsys(ss, i) { - BUG_ON(!ss->name); - BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->css_alloc); - BUG_ON(!ss->css_free); - if (ss->subsys_id != i) { - printk(KERN_ERR "cgroup: Subsys %s id == %d\n", - ss->name, ss->subsys_id); - BUG(); - } + for_each_subsys(ss, i) { + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, + ss->id, ss->name); + WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, + "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + + ss->id = i; + ss->name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss); @@ -5039,53 +4198,46 @@ int __init cgroup_init(void) { struct cgroup_subsys *ss; unsigned long key; - int i, err; + int ssid, err; - err = bdi_init(&cgroup_backing_dev_info); - if (err) - return err; + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - for_each_builtin_subsys(ss, i) { - if (!ss->early_init) - cgroup_init_subsys(ss); - } - - /* allocate id for the dummy hierarchy */ + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); - - err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, - 0, 1, GFP_KERNEL); - BUG_ON(err < 0); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); - cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); - if (!cgroup_kobj) { - err = -ENOMEM; - goto out; + for_each_subsys(ss, ssid) { + if (!ss->early_init) + cgroup_init_subsys(ss); + + /* + * cftype registration needs kmalloc and can't be done + * during early_init. Register base cftypes separately. + */ + if (ss->base_cftypes) + WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); } + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); + if (!cgroup_kobj) + return -ENOMEM; + err = register_filesystem(&cgroup_fs_type); if (err < 0) { kobject_put(cgroup_kobj); - goto out; + return err; } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); - -out: - if (err) - bdi_destroy(&cgroup_backing_dev_info); - - return err; + return 0; } static int __init cgroup_wq_init(void) @@ -5100,6 +4252,15 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); + + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; } core_initcall(cgroup_wq_init); @@ -5108,12 +4269,6 @@ core_initcall(cgroup_wq_init); * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc/<pid>/cgroup. - * - No need to task_lock(tsk) on this tsk->cgroup reference, as it - * doesn't really matter if tsk->cgroup changes after we read it, - * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it - * anyway. No need to check that tsk->cgroup != NULL, thanks to - * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks - * cgroup to top_cgroup. */ /* TODO: Use a proper seq_file iterator */ @@ -5121,12 +4276,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *path; int retval; - struct cgroupfs_root *root; + struct cgroup_root *root; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -5139,28 +4294,36 @@ int proc_cgroup_show(struct seq_file *m, void *v) retval = 0; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); - for_each_active_root(root) { + for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int count = 0; + int ssid, count = 0; + + if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) + continue; seq_printf(m, "%d:", root->hierarchy_id); - for_each_root_subsys(root, ss) - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + for_each_subsys(ss, ssid) + if (root->cgrp.subsys_mask & (1 << ssid)) + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); - retval = cgroup_path(cgrp, buf, PAGE_SIZE); - if (retval < 0) + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; goto out_unlock; - seq_puts(m, buf); + } + seq_puts(m, path); seq_putc(m, '\n'); } out_unlock: + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); put_task_struct(tsk); out_free: @@ -5186,7 +4349,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, - ss->root->number_of_cgroups, !ss->disabled); + atomic_read(&ss->root->nr_cgrps), !ss->disabled); mutex_unlock(&cgroup_mutex); return 0; @@ -5205,27 +4368,16 @@ static const struct file_operations proc_cgroupstats_operations = { }; /** - * cgroup_fork - attach newly forked task to its parents cgroup. + * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * - * Description: A task inherits its parent's cgroup at fork(). - * - * A pointer to the shared css_set was automatically copied in - * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. cgroup_attach_task() might - * have already changed current->cgroups, allowing the previously - * referenced cgroup group to be removed and freed. - * - * At the point that cgroup_fork() is called, 'current' is the parent - * task, and the passed argument 'child' points to the child task. + * A task is associated with the init_css_set until cgroup_post_fork() + * attaches it to the parent's css_set. Empty cg_list indicates that + * @child isn't holding reference to its css_set. */ void cgroup_fork(struct task_struct *child) { - task_lock(current); - get_css_set(task_css_set(current)); - child->cgroups = current->cgroups; - task_unlock(current); + RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } @@ -5245,23 +4397,37 @@ void cgroup_post_fork(struct task_struct *child) int i; /* - * use_task_css_set_links is set to 1 before we walk the tasklist - * under the tasklist_lock and we read it here after we added the child - * to the tasklist under the tasklist_lock as well. If the child wasn't - * yet in the tasklist when we walked through it from - * cgroup_enable_task_cg_lists(), then use_task_css_set_links value - * should be visible now due to the paired locking and barriers implied - * by LOCK/UNLOCK: it is written before the tasklist_lock unlock - * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock - * lock on fork. + * This may race against cgroup_enable_task_cg_links(). As that + * function sets use_task_css_set_links before grabbing + * tasklist_lock and we just went through tasklist_lock to add + * @child, it's guaranteed that either we see the set + * use_task_css_set_links or cgroup_enable_task_cg_lists() sees + * @child during its iteration. + * + * If we won the race, @child is associated with %current's + * css_set. Grabbing css_set_rwsem guarantees both that the + * association is stable, and, on completion of the parent's + * migration, @child is visible in the source of migration or + * already in the destination cgroup. This guarantee is necessary + * when implementing operations which need to migrate all tasks of + * a cgroup to another. + * + * Note that if we lose to cgroup_enable_task_cg_links(), @child + * will remain in init_css_set. This is safe because all tasks are + * in the init_css_set before cg_links is enabled and there's no + * operation which transfers all tasks out of init_css_set. */ if (use_task_css_set_links) { - write_lock(&css_set_lock); - task_lock(child); - if (list_empty(&child->cg_list)) - list_add(&child->cg_list, &task_css_set(child)->tasks); - task_unlock(child); - write_unlock(&css_set_lock); + struct css_set *cset; + + down_write(&css_set_rwsem); + cset = task_css_set(current); + if (list_empty(&child->cg_list)) { + rcu_assign_pointer(child->cgroups, cset); + list_add(&child->cg_list, &cset->tasks); + get_css_set(cset); + } + up_write(&css_set_rwsem); } /* @@ -5270,15 +4436,7 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ if (need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, and the builtin section of the subsys - * array is immutable, so we don't need to lock the - * subsys array here. On the other hand, modular section - * of the array can be freed at module unload, so we - * can't touch that. - */ - for_each_builtin_subsys(ss, i) + for_each_subsys(ss, i) if (ss->fork) ss->fork(child); } @@ -5287,7 +4445,6 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process - * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * @@ -5297,57 +4454,38 @@ void cgroup_post_fork(struct task_struct *child) * use notify_on_release cgroups where very high task exit scaling * is required on large systems. * - * the_top_cgroup_hack: - * - * Set the exiting tasks cgroup to the root cgroup (top_cgroup). - * - * We call cgroup_exit() while the task is still competent to - * handle notify_on_release(), then leave the task attached to the - * root cgroup in each hierarchy for the remainder of its exit. - * - * To do this properly, we would increment the reference count on - * top_cgroup, and near the very end of the kernel/exit.c do_exit() - * code we would add a second cgroup function call, to drop that - * reference. This would just create an unnecessary hot spot on - * the top_cgroup reference count, to no avail. - * - * Normally, holding a reference to a cgroup without bumping its - * count is unsafe. The cgroup could go away, or someone could - * attach us to a different cgroup, decrementing the count on - * the first cgroup that we never incremented. But in this case, - * top_cgroup isn't going away, and either task has PF_EXITING set, - * which wards off any cgroup_attach_task() attempts, or task is a failed - * fork, never visible to cgroup_attach_task. + * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We + * call cgroup_exit() while the task is still competent to handle + * notify_on_release(), then leave the task attached to the root cgroup in + * each hierarchy for the remainder of its exit. No need to bother with + * init_css_set refcnting. init_css_set never goes away and we can't race + * with migration path - PF_EXITING is visible to migration path. */ -void cgroup_exit(struct task_struct *tsk, int run_callbacks) +void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; + bool put_cset = false; int i; /* - * Unlink from the css_set task list if necessary. - * Optimistically check cg_list before taking - * css_set_lock + * Unlink from @tsk from its css_set. As migration path can't race + * with us, we can check cg_list without grabbing css_set_rwsem. */ if (!list_empty(&tsk->cg_list)) { - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_del_init(&tsk->cg_list); - write_unlock(&css_set_lock); + down_write(&css_set_rwsem); + list_del_init(&tsk->cg_list); + up_write(&css_set_rwsem); + put_cset = true; } /* Reassign the task to the init_css_set. */ - task_lock(tsk); cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); - if (run_callbacks && need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, see cgroup_post_fork() for details. - */ - for_each_builtin_subsys(ss, i) { + if (need_forkexit_callback) { + /* see cgroup_post_fork() for details */ + for_each_subsys(ss, i) { if (ss->exit) { struct cgroup_subsys_state *old_css = cset->subsys[i]; struct cgroup_subsys_state *css = task_css(tsk, i); @@ -5356,9 +4494,9 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } } } - task_unlock(tsk); - put_css_set_taskexit(cset); + if (put_cset) + put_css_set(cset, true); } static void check_for_release(struct cgroup *cgrp) @@ -5415,16 +4553,17 @@ static void cgroup_release_agent(struct work_struct *work) while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf = NULL, *agentbuf = NULL, *path; struct cgroup *cgrp = list_entry(release_list.next, struct cgroup, release_list); list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) goto continue_free; - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + path = cgroup_path(cgrp, pathbuf, PATH_MAX); + if (!path) goto continue_free; agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); if (!agentbuf) @@ -5432,7 +4571,7 @@ static void cgroup_release_agent(struct work_struct *work) i = 0; argv[i++] = agentbuf; - argv[i++] = pathbuf; + argv[i++] = path; argv[i] = NULL; i = 0; @@ -5466,11 +4605,7 @@ static int __init cgroup_disable(char *str) if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about - * module subsystems, so we don't worry about them. - */ - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" @@ -5484,28 +4619,42 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under RCU read lock. The caller is responsible for - * pinning the returned css if it needs to be accessed outside the RCU - * critical section. + * If @dentry is a directory for a cgroup which has @ss enabled on it, try + * to get the corresponding css and return it. If such css doesn't exist + * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); - /* is @dentry a cgroup dir? */ - if (!dentry->d_inode || - dentry->d_inode->i_op != &cgroup_dir_inode_operations) + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); - cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); + rcu_read_lock(); + + /* + * This path doesn't originate from kernfs and @kn could already + * have been or be removed at any point. @kn->priv is RCU + * protected for this access. See destroy_locked() for details. + */ + cgrp = rcu_dereference(kn->priv); + if (cgrp) + css = cgroup_css(cgrp, ss); + + if (!css || !css_tryget(css)) + css = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + return css; } /** @@ -5520,9 +4669,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&cgroup_mutex), - "css_from_id() needs proper protection"); + cgroup_assert_mutexes_or_rcu_locked(); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) @@ -5570,55 +4717,62 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, return count; } -static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, - struct seq_file *seq) +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; - const char *name; - if (c->dentry) - name = c->dentry->d_name.name; - else - name = "?"; + cgroup_name(c, name_buf, NAME_MAX + 1); seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name); + c->root->hierarchy_id, name_buf); } rcu_read_unlock(); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); + kfree(name_buf); return 0; } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_css_links_read(struct seq_file *seq, void *v) { + struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; + seq_printf(seq, "css_set %p\n", cset); + list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) { - seq_puts(seq, " ...\n"); - break; - } else { - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); } + continue; + overflow: + seq_puts(seq, " ...\n"); } - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return 0; } @@ -5645,12 +4799,12 @@ static struct cftype debug_files[] = { { .name = "current_css_set_cg_links", - .read_seq_string = current_css_set_cg_links_read, + .seq_show = current_css_set_cg_links_read, }, { .name = "cgroup_css_links", - .read_seq_string = cgroup_css_links_read, + .seq_show = cgroup_css_links_read, }, { @@ -5661,11 +4815,9 @@ static struct cftype debug_files[] = { { } /* terminate */ }; -struct cgroup_subsys debug_subsys = { - .name = "debug", +struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f0ff64d0ebaa..2bc4a2256444 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) static inline struct freezer *task_freezer(struct task_struct *task) { - return css_freezer(task_css(task, freezer_subsys_id)); + return css_freezer(task_css(task, freezer_cgrp_id)); } static struct freezer *parent_freezer(struct freezer *freezer) @@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state) return "THAWED"; }; -struct cgroup_subsys freezer_subsys; - static struct cgroup_subsys_state * freezer_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -189,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_css, tset) { + cgroup_taskset_for_each(task, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { @@ -216,6 +214,16 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, } } +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; @@ -224,14 +232,26 @@ static void freezer_fork(struct task_struct *task) freezer = task_freezer(task); /* - * The root cgroup is non-freezable, so we can skip the - * following check. + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. */ if (!parent_freezer(freezer)) goto out; + /* + * Grab @freezer->lock and freeze @task after verifying @task still + * belongs to @freezer and it's freezing. The former is for the + * case where we have raced against task migration and lost and + * @task is already in a different cgroup which may not be frozen. + * This isn't strictly necessary as freeze_task() is allowed to be + * called spuriously but let's do it anyway for, if nothing else, + * documentation. + */ spin_lock_irq(&freezer->lock); - if (freezer->state & CGROUP_FREEZING) + if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) freeze_task(task); spin_unlock_irq(&freezer->lock); out: @@ -301,10 +321,9 @@ out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *m) +static int freezer_read(struct seq_file *m, void *v) { - struct cgroup_subsys_state *pos; + struct cgroup_subsys_state *css = seq_css(m), *pos; rcu_read_lock(); @@ -423,7 +442,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) } static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { bool freeze; @@ -458,7 +477,7 @@ static struct cftype files[] = { { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = freezer_read, + .seq_show = freezer_read, .write_string = freezer_write, }, { @@ -474,13 +493,11 @@ static struct cftype files[] = { { } /* terminate */ }; -struct cgroup_subsys freezer_subsys = { - .name = "freezer", +struct cgroup_subsys freezer_cgrp_subsys = { .css_alloc = freezer_css_alloc, .css_online = freezer_css_online, .css_offline = freezer_css_offline, .css_free = freezer_css_free, - .subsys_id = freezer_subsys_id, .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, diff --git a/kernel/compat.c b/kernel/compat.c index 0a09e481b70b..e40b0430b562 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -30,28 +30,6 @@ #include <asm/uaccess.h> -/* - * Get/set struct timeval with struct timespec on the native side - */ -static int compat_get_timeval_convert(struct timespec *o, - struct compat_timeval __user *i) -{ - long usec; - - if (get_user(o->tv_sec, &i->tv_sec) || - get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec = usec * 1000; - return 0; -} - -static int compat_put_timeval_convert(struct compat_timeval __user *o, - struct timeval *i) -{ - return (put_user(i->tv_sec, &o->tv_sec) || - put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; -} - static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) { memset(txc, 0, sizeof(struct timex)); @@ -110,13 +88,13 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) return 0; } -asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) { if (tv) { struct timeval ktv; do_gettimeofday(&ktv); - if (compat_put_timeval_convert(tv, &ktv)) + if (compat_put_timeval(&ktv, tv)) return -EFAULT; } if (tz) { @@ -127,62 +105,61 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, return 0; } -asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) +COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) { - struct timespec kts; - struct timezone ktz; + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; if (tv) { - if (compat_get_timeval_convert(&kts, tv)) + if (compat_get_timeval(&user_tv, tv)) return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) + if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(get_compat_timeval); -int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(put_compat_timeval); -int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(get_compat_timespec); -int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts) { return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(put_compat_timespec); int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; else - return get_compat_timeval(tv, utv); + return __compat_get_timeval(tv, utv); } EXPORT_SYMBOL_GPL(compat_get_timeval); @@ -191,7 +168,7 @@ int compat_put_timeval(const struct timeval *tv, void __user *utv) if (COMPAT_USE_64BIT_TIME) return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; else - return put_compat_timeval(tv, utv); + return __compat_put_timeval(tv, utv); } EXPORT_SYMBOL_GPL(compat_put_timeval); @@ -200,7 +177,7 @@ int compat_get_timespec(struct timespec *ts, const void __user *uts) if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; else - return get_compat_timespec(ts, uts); + return __compat_get_timespec(ts, uts); } EXPORT_SYMBOL_GPL(compat_get_timespec); @@ -209,10 +186,33 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; else - return put_compat_timespec(ts, uts); + return __compat_put_timespec(ts, uts); } EXPORT_SYMBOL_GPL(compat_put_timespec); +int compat_convert_timespec(struct timespec __user **kts, + const void __user *cts) +{ + struct timespec ts; + struct timespec __user *uts; + + if (!cts || COMPAT_USE_64BIT_TIME) { + *kts = (struct timespec __user *)cts; + return 0; + } + + uts = compat_alloc_user_space(sizeof(ts)); + if (!uts) + return -EFAULT; + if (compat_get_timespec(&ts, cts)) + return -EFAULT; + if (copy_to_user(uts, &ts, sizeof(ts))) + return -EFAULT; + + *kts = uts; + return 0; +} + static long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; @@ -229,21 +229,21 @@ static long compat_nanosleep_restart(struct restart_block *restart) if (ret) { rmtp = restart->nanosleep.compat_rmtp; - if (rmtp && put_compat_timespec(&rmt, rmtp)) + if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } return ret; } -asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) +COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { struct timespec tu, rmt; mm_segment_t oldfs; long ret; - if (get_compat_timespec(&tu, rqtp)) + if (compat_get_timespec(&tu, rqtp)) return -EFAULT; if (!timespec_valid(&tu)) @@ -263,7 +263,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; - if (rmtp && put_compat_timespec(&rmt, rmtp)) + if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } @@ -328,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x) return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); } -asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) +COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) { if (tbuf) { struct tms tms; @@ -354,7 +354,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) * types that can be passed to put_user()/get_user(). */ -asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) +COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set) { old_sigset_t s; long ret; @@ -424,8 +424,8 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, #endif -asmlinkage long compat_sys_setrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; @@ -443,15 +443,15 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, #ifdef COMPAT_RLIM_OLD_INFINITY -asmlinkage long compat_sys_old_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, &r); + ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r); set_fs(old_fs); if (!ret) { @@ -470,8 +470,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, #endif -asmlinkage long compat_sys_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; @@ -596,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, return compat_get_bitmap(k, user_mask_ptr, len * 8); } -asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, - unsigned int len, - compat_ulong_t __user *user_mask_ptr) +COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid, + unsigned int, len, + compat_ulong_t __user *, user_mask_ptr) { cpumask_var_t new_mask; int retval; @@ -616,8 +616,8 @@ out: return retval; } -asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, - compat_ulong_t __user *user_mask_ptr) +COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, + compat_ulong_t __user *, user_mask_ptr) { int ret; cpumask_var_t mask; @@ -647,8 +647,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, int get_compat_itimerspec(struct itimerspec *dst, const struct compat_itimerspec __user *src) { - if (get_compat_timespec(&dst->it_interval, &src->it_interval) || - get_compat_timespec(&dst->it_value, &src->it_value)) + if (__compat_get_timespec(&dst->it_interval, &src->it_interval) || + __compat_get_timespec(&dst->it_value, &src->it_value)) return -EFAULT; return 0; } @@ -656,15 +656,15 @@ int get_compat_itimerspec(struct itimerspec *dst, int put_compat_itimerspec(struct compat_itimerspec __user *dst, const struct itimerspec *src) { - if (put_compat_timespec(&src->it_interval, &dst->it_interval) || - put_compat_timespec(&src->it_value, &dst->it_value)) + if (__compat_put_timespec(&src->it_interval, &dst->it_interval) || + __compat_put_timespec(&src->it_value, &dst->it_value)) return -EFAULT; return 0; } -long compat_sys_timer_create(clockid_t which_clock, - struct compat_sigevent __user *timer_event_spec, - timer_t __user *created_timer_id) +COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, + struct compat_sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) { struct sigevent __user *event = NULL; @@ -680,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -long compat_sys_timer_settime(timer_t timer_id, int flags, - struct compat_itimerspec __user *new, - struct compat_itimerspec __user *old) +COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + struct compat_itimerspec __user *, new, + struct compat_itimerspec __user *, old) { long err; mm_segment_t oldfs; @@ -703,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags, return err; } -long compat_sys_timer_gettime(timer_t timer_id, - struct compat_itimerspec __user *setting) +COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct compat_itimerspec __user *, setting) { long err; mm_segment_t oldfs; @@ -720,14 +720,14 @@ long compat_sys_timer_gettime(timer_t timer_id, return err; } -long compat_sys_clock_settime(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; struct timespec ts; - if (get_compat_timespec(&ts, tp)) + if (compat_get_timespec(&ts, tp)) return -EFAULT; oldfs = get_fs(); set_fs(KERNEL_DS); @@ -737,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock, return err; } -long compat_sys_clock_gettime(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -749,13 +749,13 @@ long compat_sys_clock_gettime(clockid_t which_clock, err = sys_clock_gettime(which_clock, (struct timespec __user *) &ts); set_fs(oldfs); - if (!err && put_compat_timespec(&ts, tp)) + if (!err && compat_put_timespec(&ts, tp)) return -EFAULT; return err; } -long compat_sys_clock_adjtime(clockid_t which_clock, - struct compat_timex __user *utp) +COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, + struct compat_timex __user *, utp) { struct timex txc; mm_segment_t oldfs; @@ -777,8 +777,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock, return ret; } -long compat_sys_clock_getres(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -789,7 +789,7 @@ long compat_sys_clock_getres(clockid_t which_clock, err = sys_clock_getres(which_clock, (struct timespec __user *) &ts); set_fs(oldfs); - if (!err && tp && put_compat_timespec(&ts, tp)) + if (!err && tp && compat_put_timespec(&ts, tp)) return -EFAULT; return err; } @@ -799,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) long err; mm_segment_t oldfs; struct timespec tu; - struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; + struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp; restart->nanosleep.rmtp = (struct timespec __user *) &tu; oldfs = get_fs(); @@ -808,7 +808,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) set_fs(oldfs); if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&tu, rmtp)) + compat_put_timespec(&tu, rmtp)) return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { @@ -818,16 +818,16 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) return err; } -long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, - struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { long err; mm_segment_t oldfs; struct timespec in, out; struct restart_block *restart; - if (get_compat_timespec(&in, rqtp)) + if (compat_get_timespec(&in, rqtp)) return -EFAULT; oldfs = get_fs(); @@ -838,7 +838,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, set_fs(oldfs); if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&out, rmtp)) + compat_put_timespec(&out, rmtp)) return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { @@ -1010,7 +1010,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, /* compat_time_t is a 32 bit "long" and needs to get converted. */ -asmlinkage long compat_sys_time(compat_time_t __user * tloc) +COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) { compat_time_t i; struct timeval tv; @@ -1026,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) return i; } -asmlinkage long compat_sys_stime(compat_time_t __user *tptr) +COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) { struct timespec tv; int err; @@ -1046,7 +1046,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) +COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) { struct timex txc; int err, ret; @@ -1065,11 +1065,11 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) } #ifdef CONFIG_NUMA -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - compat_uptr_t __user *pages32, - const int __user *nodes, - int __user *status, - int flags) +COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, + compat_uptr_t __user *, pages32, + const int __user *, nodes, + int __user *, status, + int, flags) { const void __user * __user *pages; int i; @@ -1085,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, - const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes) +COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, + compat_ulong_t, maxnode, + const compat_ulong_t __user *, old_nodes, + const compat_ulong_t __user *, new_nodes) { unsigned long __user *old = NULL; unsigned long __user *new = NULL; @@ -1130,7 +1130,7 @@ COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, set_fs(KERNEL_DS); ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); set_fs(old_fs); - if (put_compat_timespec(&t, interval)) + if (compat_put_timespec(&t, interval)) return -EFAULT; return ret; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e5f3917aa05b..6cb20d2e7ee0 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -53,10 +53,10 @@ void context_tracking_user_enter(void) /* * Repeat the user_enter() check here because some archs may be calling * this from asm and if no CPU needs context tracking, they shouldn't - * go further. Repeat the check here until they support the static key - * check. + * go further. Repeat the check here until they support the inline static + * key check. */ - if (!static_key_false(&context_tracking_enabled)) + if (!context_tracking_is_enabled()) return; /* @@ -160,7 +160,7 @@ void context_tracking_user_exit(void) { unsigned long flags; - if (!static_key_false(&context_tracking_enabled)) + if (!context_tracking_is_enabled()) return; if (in_interrupt()) diff --git a/kernel/cpu.c b/kernel/cpu.c index deff2e693766..a9e710eef0e2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -19,6 +19,7 @@ #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/suspend.h> +#include <linux/lockdep.h> #include "smpboot.h" @@ -27,18 +28,23 @@ static DEFINE_MUTEX(cpu_add_remove_lock); /* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. + * The following two APIs (cpu_maps_update_begin/done) must be used when + * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. + * The APIs cpu_notifier_register_begin/done() must be used to protect CPU + * hotplug callback (un)registration performed using __register_cpu_notifier() + * or __unregister_cpu_notifier(). */ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_begin); void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_done); static RAW_NOTIFIER_HEAD(cpu_chain); @@ -57,17 +63,30 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } cpu_hotplug = { .active_writer = NULL, .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "cpu_hotplug.lock" }, +#endif }; +/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ +#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) +#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) +#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) + void get_online_cpus(void) { might_sleep(); if (cpu_hotplug.active_writer == current) return; + cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); @@ -87,6 +106,7 @@ void put_online_cpus(void) if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -117,6 +137,7 @@ void cpu_hotplug_begin(void) { cpu_hotplug.active_writer = current; + cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); if (likely(!cpu_hotplug.refcount)) @@ -131,6 +152,7 @@ void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } /* @@ -166,6 +188,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +int __ref __register_cpu_notifier(struct notifier_block *nb) +{ + return raw_notifier_chain_register(&cpu_chain, nb); +} + static int __cpu_notify(unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -189,6 +216,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) BUG_ON(cpu_notify(val, v)); } EXPORT_SYMBOL(register_cpu_notifier); +EXPORT_SYMBOL(__register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) { @@ -198,6 +226,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +void __ref __unregister_cpu_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&cpu_chain, nb); +} +EXPORT_SYMBOL(__unregister_cpu_notifier); + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile deleted file mode 100644 index 59ab052ef7a0..000000000000 --- a/kernel/cpu/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-y = idle.o diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c deleted file mode 100644 index 988573a9a387..000000000000 --- a/kernel/cpu/idle.c +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Generic entry point for the idle threads - */ -#include <linux/sched.h> -#include <linux/cpu.h> -#include <linux/tick.h> -#include <linux/mm.h> -#include <linux/stackprotector.h> - -#include <asm/tlb.h> - -#include <trace/events/power.h> - -static int __read_mostly cpu_idle_force_poll; - -void cpu_idle_poll_ctrl(bool enable) -{ - if (enable) { - cpu_idle_force_poll++; - } else { - cpu_idle_force_poll--; - WARN_ON_ONCE(cpu_idle_force_poll < 0); - } -} - -#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP -static int __init cpu_idle_poll_setup(char *__unused) -{ - cpu_idle_force_poll = 1; - return 1; -} -__setup("nohlt", cpu_idle_poll_setup); - -static int __init cpu_idle_nopoll_setup(char *__unused) -{ - cpu_idle_force_poll = 0; - return 1; -} -__setup("hlt", cpu_idle_nopoll_setup); -#endif - -static inline int cpu_idle_poll(void) -{ - rcu_idle_enter(); - trace_cpu_idle_rcuidle(0, smp_processor_id()); - local_irq_enable(); - while (!tif_need_resched()) - cpu_relax(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - rcu_idle_exit(); - return 1; -} - -/* Weak implementations for optional arch specific functions */ -void __weak arch_cpu_idle_prepare(void) { } -void __weak arch_cpu_idle_enter(void) { } -void __weak arch_cpu_idle_exit(void) { } -void __weak arch_cpu_idle_dead(void) { } -void __weak arch_cpu_idle(void) -{ - cpu_idle_force_poll = 1; - local_irq_enable(); -} - -/* - * Generic idle loop implementation - */ -static void cpu_idle_loop(void) -{ - while (1) { - tick_nohz_idle_enter(); - - while (!need_resched()) { - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(smp_processor_id())) - arch_cpu_idle_dead(); - - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { - cpu_idle_poll(); - } else { - if (!current_clr_polling_and_test()) { - stop_critical_timings(); - rcu_idle_enter(); - arch_cpu_idle(); - WARN_ON_ONCE(irqs_disabled()); - rcu_idle_exit(); - start_critical_timings(); - } else { - local_irq_enable(); - } - __current_set_polling(); - } - arch_cpu_idle_exit(); - /* - * We need to test and propagate the TIF_NEED_RESCHED - * bit here because we might not have send the - * reschedule IPI to idle tasks. - */ - if (tif_need_resched()) - set_preempt_need_resched(); - } - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} - -void cpu_startup_entry(enum cpuhp_state state) -{ - /* - * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (arm and sh have never invoked the canary - * init for the non boot cpus!). Will be fixed in 3.11 - */ -#ifdef CONFIG_X86 - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. The boot CPU already has it initialized but no harm - * in doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); -#endif - __current_set_polling(); - arch_cpu_idle_prepare(); - cpu_idle_loop(); -} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4772034b4b17..3d54c418bd06 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return css_cs(task_css(task, cpuset_subsys_id)); + return css_cs(task_css(task, cpuset_cgrp_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) @@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { + if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -829,55 +829,36 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) } /** - * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's - * @tsk: task to test - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed - * mask needs to be changed. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. - */ -static void cpuset_change_cpumask(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - - set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); -} - -/** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() - * - * Called with cpuset_mutex held - * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL. + * Iterate through each task of @cs updating its cpus_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. */ -static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_cpumask(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); + css_task_iter_end(&it); } /* * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. * @root_cs: the root cpuset of the hierarchy * @update_root: update root cpuset or not? - * @heap: the heap used by css_scan_tasks() * * This will update cpumasks of tasks in @root_cs and all other empty cpusets * which take on cpumask of @root_cs. * * Called with cpuset_mutex held */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, continue; rcu_read_unlock(); - update_tasks_cpumask(cp, heap); + update_tasks_cpumask(cp); rcu_read_lock(); css_put(&cp->css); @@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - struct ptr_heap heap; int retval; int is_load_balanced; @@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; - is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - update_tasks_cpumask_hier(cs, true, &heap); - - heap_free(&heap); + update_tasks_cpumask_hier(cs, true); if (is_load_balanced) rebuild_sched_domains_locked(); @@ -974,12 +948,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cpuset_mutex, so current's cpuset won't change - * during this call, as manage_mutex holds off any cpuset_attach() - * calls. Therefore we don't need to take task_lock around the - * call to guarantee_online_mems(), as we know no one is changing - * our task's cpuset. - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -996,8 +964,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &tsk->mems_allowed); + rcu_read_unlock(); } /* @@ -1026,7 +996,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing - * get_mems_allowed(). If at least one node remains unchanged and + * read_mems_allowed_begin(). If at least one node remains unchanged and * tsk does not have a mempolicy, then an empty nodemask will not be * possible when mems_allowed is larger than a word. */ @@ -1052,53 +1022,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } -struct cpuset_change_nodemask_arg { - struct cpuset *cs; - nodemask_t *newmems; -}; - -/* - * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy - * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cpuset_mutex held. - */ -static void cpuset_change_nodemask(struct task_struct *p, void *data) -{ - struct cpuset_change_nodemask_arg *arg = data; - struct cpuset *cs = arg->cs; - struct mm_struct *mm; - int migrate; - - cpuset_change_task_nodemask(p, arg->newmems); - - mm = get_task_mm(p); - if (!mm) - return; - - migrate = is_memory_migrate(cs); - - mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); - mmput(mm); -} - static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held. No return value. It's guaranteed that - * css_scan_tasks() always returns 0 if @heap != NULL. + * Iterate through each task of @cs updating its mems_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct cpuset *mems_cs = effective_nodemask_cpuset(cs); - struct cpuset_change_nodemask_arg arg = { .cs = cs, - .newmems = &newmems }; + struct css_task_iter it; + struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ @@ -1114,7 +1053,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) { + struct mm_struct *mm; + bool migrate; + + cpuset_change_task_nodemask(task, &newmems); + + mm = get_task_mm(task); + if (!mm) + continue; + + migrate = is_memory_migrate(cs); + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) + cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); + mmput(mm); + } + css_task_iter_end(&it); /* * All the tasks' nodemasks have been updated, update @@ -1130,15 +1087,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. * @cs: the root cpuset of the hierarchy * @update_root: update the root cpuset or not? - * @heap: the heap used by css_scan_tasks() * * This will update nodemasks of tasks in @root_cs and all other empty cpusets * which take on nodemask of @root_cs. * * Called with cpuset_mutex held */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1159,7 +1114,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, continue; rcu_read_unlock(); - update_tasks_nodemask(cp, heap); + update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -1184,7 +1139,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; - struct ptr_heap heap; /* * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; @@ -1223,17 +1177,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval < 0) - goto done; - mutex_lock(&callback_mutex); cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask_hier(cs, true, &heap); - - heap_free(&heap); + update_tasks_nodemask_hier(cs, true); done: return retval; } @@ -1261,38 +1209,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) } /** - * cpuset_change_flag - make a task's spread flags the same as its cpuset's - * @tsk: task to be updated - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. - */ -static void cpuset_change_flag(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - - cpuset_update_task_spread_flag(cs, tsk); -} - -/** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held - * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL. + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. */ -static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_flags(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + cpuset_update_task_spread_flag(cs, task); + css_task_iter_end(&it); } /* @@ -1310,7 +1242,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, struct cpuset *trialcs; int balance_flag_changed; int spread_flag_changed; - struct ptr_heap heap; int err; trialcs = alloc_trial_cpuset(cs); @@ -1326,10 +1257,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) goto out; - err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (err < 0) - goto out; - balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); @@ -1344,8 +1271,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, rebuild_sched_domains_locked(); if (spread_flag_changed) - update_tasks_flags(cs, &heap); - heap_free(&heap); + update_tasks_flags(cs); out: free_trial_cpuset(trialcs); return err; @@ -1449,6 +1375,8 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +static struct cpuset *cpuset_attach_old_cs; + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) @@ -1457,6 +1385,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct task_struct *task; int ret; + /* used later by cpuset_attach() */ + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + mutex_lock(&cpuset_mutex); /* @@ -1468,7 +1399,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1520,10 +1451,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, - cpuset_subsys_id); struct cpuset *cs = css_cs(css); - struct cpuset *oldcs = css_cs(oldcss); + struct cpuset *oldcs = cpuset_attach_old_cs; struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1537,7 +1466,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1677,7 +1606,7 @@ out_unlock: * Common handling for a write to a "cpus" or "mems" file. */ static int cpuset_write_resmask(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf) + struct cftype *cft, char *buf) { struct cpuset *cs = css_cs(css); struct cpuset *trialcs; @@ -1731,66 +1660,41 @@ out_unlock: * used, list of ranges of sequential numbers, is variable length, * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. - * A single large read to a buffer that crosses a page boundary is - * ok, because the result being copied to user land is not recomputed - * across a page fault. */ - -static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static int cpuset_common_seq_show(struct seq_file *sf, void *v) { - size_t count; + struct cpuset *cs = css_cs(seq_css(sf)); + cpuset_filetype_t type = seq_cft(sf)->private; + ssize_t count; + char *buf, *s; + int ret = 0; - mutex_lock(&callback_mutex); - count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); - mutex_unlock(&callback_mutex); - - return count; -} - -static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) -{ - size_t count; + count = seq_get_buf(sf, &buf); + s = buf; mutex_lock(&callback_mutex); - count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); - mutex_unlock(&callback_mutex); - - return count; -} - -static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - char *page; - ssize_t retval = 0; - char *s; - - if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - s = page; switch (type) { case FILE_CPULIST: - s += cpuset_sprintf_cpulist(s, cs); + s += cpulist_scnprintf(s, count, cs->cpus_allowed); break; case FILE_MEMLIST: - s += cpuset_sprintf_memlist(s, cs); + s += nodelist_scnprintf(s, count, cs->mems_allowed); break; default: - retval = -EINVAL; - goto out; + ret = -EINVAL; + goto out_unlock; } - *s++ = '\n'; - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; + if (s < buf + count - 1) { + *s++ = '\n'; + seq_commit(sf, s - buf); + } else { + seq_commit(sf, -1); + } +out_unlock: + mutex_unlock(&callback_mutex); + return ret; } static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -1847,7 +1751,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) static struct cftype files[] = { { .name = "cpus", - .read = cpuset_common_file_read, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, @@ -1855,7 +1759,7 @@ static struct cftype files[] = { { .name = "mems", - .read = cpuset_common_file_read, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, @@ -2049,8 +1953,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) kfree(cs); } -struct cgroup_subsys cpuset_subsys = { - .name = "cpuset", +struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, @@ -2058,7 +1961,6 @@ struct cgroup_subsys cpuset_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .subsys_id = cpuset_subsys_id, .base_cftypes = files, .early_init = 1, }; @@ -2115,10 +2017,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - rcu_read_lock(); - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", - cgroup_name(cs->css.cgroup)); - rcu_read_unlock(); + printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); } } @@ -2166,7 +2067,7 @@ retry: */ if ((sane && cpumask_empty(cs->cpus_allowed)) || (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) - update_tasks_cpumask(cs, NULL); + update_tasks_cpumask(cs); mutex_lock(&callback_mutex); nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); @@ -2180,7 +2081,7 @@ retry: */ if ((sane && nodes_empty(cs->mems_allowed)) || (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) - update_tasks_nodemask(cs, NULL); + update_tasks_nodemask(cs); is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); @@ -2242,7 +2143,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) mutex_lock(&callback_mutex); top_cpuset.mems_allowed = new_mems; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, NULL); + update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); @@ -2334,10 +2235,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) struct cpuset *cpus_cs; mutex_lock(&callback_mutex); - task_lock(tsk); + rcu_read_lock(); cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); guarantee_online_cpus(cpus_cs, pmask); - task_unlock(tsk); + rcu_read_unlock(); mutex_unlock(&callback_mutex); } @@ -2390,10 +2291,10 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) nodemask_t mask; mutex_lock(&callback_mutex); - task_lock(tsk); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &mask); - task_unlock(tsk); + rcu_read_unlock(); mutex_unlock(&callback_mutex); return mask; @@ -2509,11 +2410,11 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ mutex_lock(&callback_mutex); - task_lock(current); + rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); - task_unlock(current); - allowed = node_isset(node, cs->mems_allowed); + rcu_read_unlock(); + mutex_unlock(&callback_mutex); return allowed; } @@ -2638,27 +2539,27 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, * @task: pointer to task_struct of some task. * * Description: Prints @task's name, cpuset name, and cached copy of its - * mems_allowed to the kernel log. Must hold task_lock(task) to allow - * dereferencing task_cs(task). + * mems_allowed to the kernel log. */ void cpuset_print_task_mems_allowed(struct task_struct *tsk) { /* Statically allocated to prevent using excess stack. */ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); + struct cgroup *cgrp; - struct cgroup *cgrp = task_cs(tsk)->css.cgroup; - - rcu_read_lock(); spin_lock(&cpuset_buffer_lock); + rcu_read_lock(); + cgrp = task_cs(tsk)->css.cgroup; nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cgroup_name(cgrp), cpuset_nodelist); + printk(KERN_INFO "%s cpuset=", tsk->comm); + pr_cont_cgroup_name(cgrp); + pr_cont(" mems_allowed=%s\n", cpuset_nodelist); - spin_unlock(&cpuset_buffer_lock); rcu_read_unlock(); + spin_unlock(&cpuset_buffer_lock); } /* @@ -2689,9 +2590,9 @@ int cpuset_memory_pressure_enabled __read_mostly; void __cpuset_memory_pressure_bump(void) { - task_lock(current); + rcu_read_lock(); fmeter_markevent(&task_cs(current)->fmeter); - task_unlock(current); + rcu_read_unlock(); } #ifdef CONFIG_PROC_PID_CPUSET @@ -2708,12 +2609,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *p; struct cgroup_subsys_state *css; int retval; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -2723,14 +2624,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; + retval = -ENAMETOOLONG; rcu_read_lock(); - css = task_css(tsk, cpuset_subsys_id); - retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + css = task_css(tsk, cpuset_cgrp_id); + p = cgroup_path(css->cgroup, buf, PATH_MAX); rcu_read_unlock(); - if (retval < 0) + if (!p) goto out_put_task; - seq_puts(m, buf); + seq_puts(m, p); seq_putc(m, '\n'); + retval = 0; out_put_task: put_task_struct(tsk); out_free: diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7d2f35e5df2f..2956c8da1605 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -49,6 +49,7 @@ #include <linux/pid.h> #include <linux/smp.h> #include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/rcupdate.h> #include <asm/cacheflush.h> @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); + if (current->mm) { + int i; + + for (i = 0; i < VMACACHE_SIZE; i++) { + if (!current->vmacache[i]) + continue; + flush_cache_range(current->vmacache[i], + addr, addr + BREAK_INSTR_SIZE); + } } + /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } @@ -736,7 +744,8 @@ int kgdb_nmicallback(int cpu, void *regs) return 1; } -int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) +int kgdb_nmicallin(int cpu, int trapnr, void *regs, int err_code, + atomic_t *send_ready) { #ifdef CONFIG_SMP if (!kgdb_io_ready(0) || !send_ready) @@ -750,7 +759,7 @@ int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) ks->cpu = cpu; ks->ex_vector = trapnr; ks->signo = SIGTRAP; - ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI; + ks->err_code = err_code; ks->linux_regs = regs; ks->send_ready = send_ready; kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); @@ -1034,7 +1043,7 @@ int dbg_io_get_char(void) * otherwise as a quick means to stop program execution and "break" into * the debugger. */ -void kgdb_breakpoint(void) +noinline void kgdb_breakpoint(void) { atomic_inc(&kgdb_setting_breakpoint); wmb(); /* Sync point before breakpoint */ diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 572aa4f5677c..127d9bc49fb4 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -75,13 +75,11 @@ extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); extern int kdb_common_init_state(struct kgdb_state *ks); extern int kdb_common_deinit_state(void); -#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { return DBG_PASS_EVENT; } -#define KGDB_KDB_REASON_SYSTEM_NMI 0 #endif /* CONFIG_KGDB_KDB */ #endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/events/core.c b/kernel/events/core.c index f5744010a8d2..f83a71a3e46d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -119,7 +119,8 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ - PERF_FLAG_PID_CGROUP) + PERF_FLAG_PID_CGROUP |\ + PERF_FLAG_FD_CLOEXEC) /* * branch priv levels that need permission checks @@ -230,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); -void perf_sample_event_took(u64 sample_len_ns) +static void perf_duration_warn(struct irq_work *w) { + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); u64 avg_local_sample_len; u64 local_samples_len; + + local_samples_len = __get_cpu_var(running_sample_length); + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + + printk_ratelimited(KERN_WARNING + "perf interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, allowed_ns >> 1, + sysctl_perf_event_sample_rate); +} + +static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); + +void perf_sample_event_took(u64 sample_len_ns) +{ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); + u64 avg_local_sample_len; + u64 local_samples_len; if (allowed_ns == 0) return; @@ -262,13 +281,14 @@ void perf_sample_event_took(u64 sample_len_ns) sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; - printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns, - sysctl_perf_event_sample_rate); - update_perf_cpu_limits(); + + if (!irq_work_queue(&perf_duration_work)) { + early_printk("perf interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, allowed_ns >> 1, + sysctl_perf_event_sample_rate); + } } static atomic64_t perf_event_id; @@ -341,7 +361,7 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_css(task, perf_subsys_id), + return container_of(task_css(task, perf_event_cgrp_id), struct perf_cgroup, css); } @@ -369,11 +389,6 @@ perf_cgroup_match(struct perf_event *event) event->cgrp->css.cgroup); } -static inline bool perf_tryget_cgroup(struct perf_event *event) -{ - return css_tryget(&event->cgrp->css); -} - static inline void perf_put_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); @@ -592,9 +607,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - rcu_read_lock(); - - css = css_from_dir(f.file->f_dentry, &perf_subsys); + css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -603,13 +616,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; - /* must be done before we fput() the file */ - if (!perf_tryget_cgroup(event)) { - event->cgrp = NULL; - ret = -ENOENT; - goto out; - } - /* * all events in a group must monitor * the same cgroup because a task belongs @@ -620,7 +626,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: - rcu_read_unlock(); fdput(f); return ret; } @@ -1713,7 +1718,7 @@ group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = group_event->pmu; + struct pmu *pmu = ctx->pmu; u64 now = ctx->time; bool simulate = false; @@ -2562,8 +2567,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev, if (cpuctx->ctx.nr_branch_stack > 0 && pmu->flush_branch_stack) { - pmu = cpuctx->ctx.pmu; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); @@ -3542,7 +3545,7 @@ static void perf_event_for_each(struct perf_event *event, static int perf_event_period(struct perf_event *event, u64 __user *arg) { struct perf_event_context *ctx = event->ctx; - int ret = 0; + int ret = 0, active; u64 value; if (!is_sampling_event(event)) @@ -3566,6 +3569,20 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) event->attr.sample_period = value; event->hw.sample_period = value; } + + active = (event->state == PERF_EVENT_STATE_ACTIVE); + if (active) { + perf_pmu_disable(ctx->pmu); + event->pmu->stop(event, PERF_EF_UPDATE); + } + + local64_set(&event->hw.period_left, 0); + + if (active) { + event->pmu->start(event, PERF_EF_RELOAD); + perf_pmu_enable(ctx->pmu); + } + unlock: raw_spin_unlock_irq(&ctx->lock); @@ -6279,7 +6296,7 @@ static int perf_event_idx_default(struct perf_event *event) * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. */ -static void *find_pmu_context(int ctxn) +static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) { struct pmu *pmu; @@ -6670,6 +6687,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); INIT_LIST_HEAD(&event->rb_entry); + INIT_LIST_HEAD(&event->active_entry); + INIT_HLIST_NODE(&event->hlist_entry); + init_waitqueue_head(&event->waitq); init_irq_work(&event->pending, perf_pending_event); @@ -6980,6 +7000,7 @@ SYSCALL_DEFINE5(perf_event_open, int event_fd; int move_group = 0; int err; + int f_flags = O_RDWR; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) @@ -7008,7 +7029,10 @@ SYSCALL_DEFINE5(perf_event_open, if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) return -EINVAL; - event_fd = get_unused_fd(); + if (flags & PERF_FLAG_FD_CLOEXEC) + f_flags |= O_CLOEXEC; + + event_fd = get_unused_fd_flags(f_flags); if (event_fd < 0) return event_fd; @@ -7130,7 +7154,8 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, + f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); goto err_context; @@ -7833,14 +7858,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { struct perf_event_context *ctx = __info; - struct perf_event *event, *tmp; + struct perf_event *event; perf_pmu_rotate_stop(ctx->pmu); - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_remove_from_context(event); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) __perf_remove_from_context(event); + rcu_read_unlock(); } static void perf_event_exit_cpu_context(int cpu) @@ -7864,11 +7889,11 @@ static void perf_event_exit_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + perf_event_exit_cpu_context(cpu); + mutex_lock(&swhash->hlist_mutex); swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); - - perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } @@ -8013,7 +8038,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) task_function_call(task, __perf_cgroup_move, task); } @@ -8032,9 +8057,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css, task_function_call(task, __perf_cgroup_move, task); } -struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, +struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index e8b168af135b..146a5792b1d2 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -61,19 +61,20 @@ again: * * kernel user * - * READ ->data_tail READ ->data_head - * smp_mb() (A) smp_rmb() (C) - * WRITE $data READ $data - * smp_wmb() (B) smp_mb() (D) - * STORE ->data_head WRITE ->data_tail + * if (LOAD ->data_tail) { LOAD ->data_head + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head STORE ->data_tail + * } * * Where A pairs with D, and B pairs with C. * - * I don't think A needs to be a full barrier because we won't in fact - * write data until we see the store from userspace. So we simply don't - * issue the data WRITE until we observe it. Be conservative for now. + * In our case (A) is a control dependency that separates the load of + * the ->data_tail and the stores of $data. In case ->data_tail + * indicates there is no room in the buffer to store $data we do not. * - * OTOH, D needs to be a full barrier since it separates the data READ + * D needs to be a full barrier since it separates the data READ * from the tail WRITE. * * For B a WMB is sufficient since it separates two WRITEs, and for C @@ -81,7 +82,7 @@ again: * * See perf_output_begin(). */ - smp_wmb(); + smp_wmb(); /* B, matches C */ rb->user_page->data_head = head; /* @@ -144,17 +145,26 @@ int perf_output_begin(struct perf_output_handle *handle, if (!rb->overwrite && unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) goto fail; + + /* + * The above forms a control dependency barrier separating the + * @tail load above from the data stores below. Since the @tail + * load is required to compute the branch to fail below. + * + * A, matches D; the full memory barrier userspace SHOULD issue + * after reading the data and before storing the new tail + * position. + * + * See perf_output_put_handle(). + */ + head += size; } while (local_cmpxchg(&rb->head, offset, head) != offset); /* - * Separate the userpage->tail read from the data stores below. - * Matches the MB userspace SHOULD issue after reading the data - * and before storing the new tail position. - * - * See perf_output_put_handle(). + * We rely on the implied barrier() by local_cmpxchg() to ensure + * none of the data stores below can be lifted up by the compiler. */ - smp_mb(); if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 24b7d6ca871b..04709b66369d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -73,6 +73,17 @@ struct uprobe { struct inode *inode; /* Also hold a ref to inode */ loff_t offset; unsigned long flags; + + /* + * The generic code assumes that it has two members of unknown type + * owned by the arch-specific code: + * + * insn - copy_insn() saves the original instruction here for + * arch_uprobe_analyze_insn(). + * + * ixol - potentially modified instruction to execute out of + * line, copied to xol_area by xol_get_insn_slot(). + */ struct arch_uprobe arch; }; @@ -86,6 +97,29 @@ struct return_instance { }; /* + * Execute out of line area: anonymous executable mapping installed + * by the probed task to execute the copy of the original instruction + * mangled by set_swbp(). + * + * On a breakpoint hit, thread contests for a slot. It frees the + * slot after singlestep. Currently a fixed number of slots are + * allocated. + */ +struct xol_area { + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct page *page; + + /* + * We keep the vma's vm_start rather than a pointer to the vma + * itself. The probed process or a naughty kernel module could make + * the vma go away, and we must handle that reasonably gracefully. + */ + unsigned long vaddr; /* Page(s) of instruction slots */ +}; + +/* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have * changed after breakpoint was inserted. @@ -330,7 +364,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); + return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -529,8 +563,8 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping = uprobe->inode->i_mapping; loff_t offs = uprobe->offset; - void *insn = uprobe->arch.insn; - int size = MAX_UINSN_BYTES; + void *insn = &uprobe->arch.insn; + int size = sizeof(uprobe->arch.insn); int len, err = -EIO; /* Copy only available bytes, -EIO if nothing was read */ @@ -569,7 +603,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -1264,7 +1298,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) /* Initialize the slot */ copy_to_page(area->page, xol_vaddr, - uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); + &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. @@ -1403,12 +1437,10 @@ static void uprobe_warn(struct task_struct *t, const char *msg) static void dup_xol_work(struct callback_head *work) { - kfree(work); - if (current->flags & PF_EXITING) return; - if (!__create_xol_area(current->utask->vaddr)) + if (!__create_xol_area(current->utask->dup_xol_addr)) uprobe_warn(current, "dup xol area"); } @@ -1419,7 +1451,6 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; - struct callback_head *work; struct xol_area *area; t->utask = NULL; @@ -1441,14 +1472,9 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags) if (mm == t->mm) return; - /* TODO: move it into the union in uprobe_task */ - work = kmalloc(sizeof(*work), GFP_KERNEL); - if (!work) - return uprobe_warn(t, "dup xol area"); - - t->utask->vaddr = area->vaddr; - init_task_work(work, dup_xol_work); - task_work_add(t, work, true); + t->utask->dup_xol_addr = area->vaddr; + init_task_work(&t->utask->dup_xol_work, dup_xol_work); + task_work_add(t, &t->utask->dup_xol_work, true); } /* @@ -1778,6 +1804,11 @@ static bool handle_trampoline(struct pt_regs *regs) return true; } +bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) +{ + return false; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1828,7 +1859,15 @@ static void handle_swbp(struct pt_regs *regs) if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; + /* Tracing handlers use ->utask to communicate with fetch methods */ + if (!get_utask()) + goto out; + + if (arch_uprobe_ignore(&uprobe->arch, regs)) + goto out; + handler_chain(uprobe, regs); + if (can_skip_sstep(uprobe, regs)) goto out; diff --git a/kernel/exit.c b/kernel/exit.c index a949819055d5..6ed6a1d552b5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -74,6 +74,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead) __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_group); + list_del_rcu(&p->thread_node); } /* @@ -569,7 +570,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* We don't want people slaying init. */ + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ @@ -783,9 +784,10 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + if (group_dead) + disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - check_stack_usage(); exit_thread(); /* @@ -796,21 +798,17 @@ void do_exit(long code) */ perf_event_exit_task(tsk); - cgroup_exit(tsk, 1); - - if (group_dead) - disassociate_ctty(1); + cgroup_exit(tsk); module_put(task_thread_info(tsk)->exec_domain->module); - proc_exit_connector(tsk); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); + proc_exit_connector(tsk); #ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); @@ -843,6 +841,7 @@ void do_exit(long code) validate_creds_for_do_exit(tsk); + check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); @@ -1037,17 +1036,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return wait_noreap_copyout(wo, p, pid, uid, why, status); } + traced = ptrace_reparented(p); /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: + * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); + state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; + if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; - } - - traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check * thread_group_leader() to filter out sub-threads. @@ -1108,7 +1103,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. + * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); @@ -1145,22 +1140,19 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (!retval) retval = pid; - if (traced) { + if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); - /* - * If this is not a sub-thread, notify the parent. - * If parent wants a zombie, don't release it now. - */ - if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + + /* If parent wants a zombie, don't release it now */ + state = EXIT_ZOMBIE; + if (do_notify_parent(p, p->exit_signal)) + state = EXIT_DEAD; + p->exit_state = state; write_unlock_irq(&tasklist_lock); } - if (p != NULL) + if (state == EXIT_DEAD) release_task(p); return retval; @@ -1337,7 +1329,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { - int ret = eligible_child(wo, p); + int ret; + + if (unlikely(p->exit_state == EXIT_DEAD)) + return 0; + + ret = eligible_child(wo, p); if (!ret) return ret; @@ -1355,33 +1352,44 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { + if (unlikely(p->exit_state == EXIT_TRACE)) { /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). + * ptrace == 0 means we are the natural parent. In this case + * we should clear notask_error, debugger will notify us. */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + if (likely(!ptrace)) wo->notask_error = 0; return 0; } - /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* - * A zombie ptracee is only visible to its ptracer. - * Notification and reaping will be cascaded to the real - * parent when the ptracer detaches. + * If it is traced by its real parent's group, just pretend + * the caller is ptrace_do_wait() and reap this child if it + * is zombie. + * + * This also hides group stop state from real parent; otherwise + * a single stop can be reported twice as group and ptrace stop. + * If a ptracer wants to distinguish these two events for its + * own children it should create a separate process which takes + * the role of real parent. */ - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* it will become visible, clear notask_error */ - wo->notask_error = 0; - return 0; - } + if (!ptrace_reparented(p)) + ptrace = 1; + } + /* slay zombie? */ + if (p->exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ - if (!delay_group_leader(p)) - return wait_task_zombie(wo, p); + if (!delay_group_leader(p)) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the + * real parent when the ptracer detaches. + */ + if (unlikely(ptrace) || likely(!p->ptrace)) + return wait_task_zombie(wo, p); + } /* * Allow access to stopped/continued state via zombie by @@ -1407,19 +1415,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, wo->notask_error = 0; } else { /* - * If @p is ptraced by a task in its real parent's group, - * hide group stop/continued state when looking at @p as - * the real parent; otherwise, a single stop can be - * reported twice as group and ptrace stops. - * - * If a ptracer wants to distinguish the two events for its - * own children, it should create a separate process which - * takes the role of real parent. - */ - if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) - return 0; - - /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ diff --git a/kernel/extable.c b/kernel/extable.c index 763faf037ec1..d8a6446adbcb 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -36,7 +36,7 @@ extern struct exception_table_entry __start___ex_table[]; extern struct exception_table_entry __stop___ex_table[]; /* Cleared by build time tools if the table is already sorted. */ -u32 __initdata main_extable_sort_needed = 1; +u32 __initdata __visible main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) diff --git a/kernel/fork.c b/kernel/fork.c index dfa736c98d17..54a8d26f612f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -28,6 +28,8 @@ #include <linux/mman.h> #include <linux/mmu_notifier.h> #include <linux/fs.h> +#include <linux/mm.h> +#include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> @@ -71,6 +73,7 @@ #include <linux/signalfd.h> #include <linux/uprobes.h> #include <linux/aio.h> +#include <linux/compiler.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -237,6 +240,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + task_numa_free(tsk); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); @@ -283,7 +287,7 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } -int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, +int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -363,7 +367,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; - mm->mmap_cache = NULL; + mm->vmacache_seqnum = 0; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -529,8 +533,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? - (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); @@ -539,8 +541,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm_init_owner(mm, p); clear_tlb_flush_pending(mm); - if (likely(!mm_alloc_pgd(mm))) { + if (current->mm) { + mm->flags = current->mm->flags & MMF_INIT_MASK; + mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; + } else { + mm->flags = default_dump_filter; mm->def_flags = 0; + } + + if (likely(!mm_alloc_pgd(mm))) { mmu_notifier_mm_init(mm); return mm; } @@ -800,14 +809,11 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. */ -struct mm_struct *dup_mm(struct task_struct *tsk) +static struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; - if (!oldmm) - return NULL; - mm = allocate_mm(); if (!mm) goto fail_nomem; @@ -879,6 +885,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; + /* initialize the new vmacache entries */ + vmacache_flush(tsk); + if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; @@ -1035,6 +1044,11 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); + + /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ + sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); + tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); + init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); @@ -1067,15 +1081,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static void copy_flags(unsigned long clone_flags, struct task_struct *p) -{ - unsigned long new_flags = p->flags; - - new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); - new_flags |= PF_FORKNOEXEC; - p->flags = new_flags; -} - SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1087,8 +1092,10 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init(&p->pi_waiters); + p->pi_waiters = RB_ROOT; + p->pi_waiters_leftmost = NULL; p->pi_blocked_on = NULL; + p->pi_top_task = NULL; #endif } @@ -1222,9 +1229,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - copy_flags(clone_flags, p); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); @@ -1268,9 +1275,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; + goto bad_fork_cleanup_threadgroup_lock; } - mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; @@ -1311,7 +1317,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(clone_flags, p); + retval = sched_fork(clone_flags, p); + if (retval) + goto bad_fork_cleanup_policy; retval = perf_event_init_task(p); if (retval) @@ -1403,13 +1411,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->tgid = p->pid; } - p->pdeath_signal = 0; - p->exit_state = 0; - p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; + p->pdeath_signal = 0; INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; @@ -1472,6 +1478,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, atomic_inc(¤t->signal->sigcnt); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); + list_add_tail_rcu(&p->thread_node, + &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; @@ -1519,11 +1527,10 @@ bad_fork_cleanup_policy: perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); -bad_fork_cleanup_cgroup: +bad_fork_cleanup_threadgroup_lock: #endif if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); - cgroup_exit(p, 0); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: @@ -1645,7 +1652,7 @@ SYSCALL_DEFINE0(fork) return do_fork(SIGCHLD, 0, 0, NULL, NULL); #else /* can not support in nommu mode */ - return(-EINVAL); + return -EINVAL; #endif } #endif @@ -1653,7 +1660,7 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL); } #endif diff --git a/kernel/futex.c b/kernel/futex.c index f6ff0191ecf7..6801b3751a95 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -63,14 +63,103 @@ #include <linux/sched/rt.h> #include <linux/hugetlb.h> #include <linux/freezer.h> +#include <linux/bootmem.h> #include <asm/futex.h> #include "locking/rtmutex_common.h" -int __read_mostly futex_cmpxchg_enabled; +/* + * Basic futex operation and ordering guarantees: + * + * The waiter reads the futex value in user space and calls + * futex_wait(). This function computes the hash bucket and acquires + * the hash bucket lock. After that it reads the futex user space value + * again and verifies that the data has not changed. If it has not changed + * it enqueues itself into the hash bucket, releases the hash bucket lock + * and schedules. + * + * The waker side modifies the user space value of the futex and calls + * futex_wake(). This function computes the hash bucket and acquires the + * hash bucket lock. Then it looks for waiters on that futex in the hash + * bucket and wakes them. + * + * In futex wake up scenarios where no tasks are blocked on a futex, taking + * the hb spinlock can be avoided and simply return. In order for this + * optimization to work, ordering guarantees must exist so that the waiter + * being added to the list is acknowledged when the list is concurrently being + * checked by the waker, avoiding scenarios like the following: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * uval = *futex; + * *futex = newval; + * sys_futex(WAKE, futex); + * futex_wake(futex); + * if (queue_empty()) + * return; + * if (uval == val) + * lock(hash_bucket(futex)); + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); + * + * This would cause the waiter on CPU 0 to wait forever because it + * missed the transition of the user space value from val to newval + * and the waker did not find the waiter in the hash bucket queue. + * + * The correct serialization ensures that a waiter either observes + * the changed user space value before blocking or is woken by a + * concurrent waker: + * + * CPU 0 CPU 1 + * val = *futex; + * sys_futex(WAIT, futex, val); + * futex_wait(futex, val); + * + * waiters++; + * mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `-------> mb(); (B) + * if (uval == val) + * queue(); + * unlock(hash_bucket(futex)); + * schedule(); if (waiters) + * lock(hash_bucket(futex)); + * wake_waiters(futex); + * unlock(hash_bucket(futex)); + * + * Where (A) orders the waiters increment and the futex value read -- this + * is guaranteed by the head counter in the hb spinlock; and where (B) + * orders the write to futex and the waiters read -- this is done by the + * barriers in get_futex_key_refs(), through either ihold or atomic_inc, + * depending on the futex type. + * + * This yields the following case (where X:=waiters, Y:=futex): + * + * X = Y = 0 + * + * w[X]=1 w[Y]=1 + * MB MB + * r[Y]=y r[X]=x + * + * Which guarantees that x==0 && y==0 is impossible; which translates back into + * the guarantee that we cannot both miss the futex variable change and the + * enqueue. + */ -#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG +int __read_mostly futex_cmpxchg_enabled; +#endif /* * Futex flags used to encode options to functions and preserve them across @@ -147,11 +236,59 @@ static const struct futex_q futex_q_init = { * waiting on a futex. */ struct futex_hash_bucket { + atomic_t waiters; spinlock_t lock; struct plist_head chain; -}; +} ____cacheline_aligned_in_smp; + +static unsigned long __read_mostly futex_hashsize; -static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; +static struct futex_hash_bucket *futex_queues; + +static inline void futex_get_mm(union futex_key *key) +{ + atomic_inc(&key->private.mm->mm_count); + /* + * Ensure futex_get_mm() implies a full barrier such that + * get_futex_key() implies a full barrier. This is relied upon + * as full barrier (B), see the ordering comment above. + */ + smp_mb__after_atomic_inc(); +} + +/* + * Reflects a new waiter being added to the waitqueue. + */ +static inline void hb_waiters_inc(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_inc(&hb->waiters); + /* + * Full barrier (A), see the ordering comment above. + */ + smp_mb__after_atomic_inc(); +#endif +} + +/* + * Reflects a waiter being removed from the waitqueue by wakeup + * paths. + */ +static inline void hb_waiters_dec(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_dec(&hb->waiters); +#endif +} + +static inline int hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + return atomic_read(&hb->waiters); +#else + return 1; +#endif +} /* * We hash on the keys returned from get_futex_key (see below). @@ -161,7 +298,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) u32 hash = jhash2((u32*)&key->both.word, (sizeof(key->both.word)+sizeof(key->both.ptr))/4, key->both.offset); - return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; + return &futex_queues[hash & (futex_hashsize - 1)]; } /* @@ -187,10 +324,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); + ihold(key->shared.inode); /* implies MB (B) */ break; case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); + futex_get_mm(key); /* implies MB (B) */ break; } } @@ -264,7 +401,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); + get_futex_key_refs(key); /* implies MB (B) */ return 0; } @@ -371,7 +508,7 @@ again: key->shared.pgoff = basepage_index(page); } - get_futex_key_refs(key); + get_futex_key_refs(key); /* implies MB (B) */ out: unlock_page(page_head); @@ -598,13 +735,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; - struct plist_head *head; struct task_struct *p; pid_t pid = uval & FUTEX_TID_MASK; - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex(&this->key, key)) { /* * Another waiter already exists - bump up @@ -838,6 +972,7 @@ static void __unqueue_futex(struct futex_q *q) hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); } /* @@ -986,7 +1121,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; int ret; @@ -998,10 +1132,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) goto out; hb = hash_futex(&key); + + /* Make sure we really have tasks to wakeup */ + if (!hb_waiters_pending(hb)) + goto out_put_key; + spin_lock(&hb->lock); - head = &hb->chain; - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1019,6 +1157,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); +out_put_key: put_futex_key(&key); out: return ret; @@ -1034,7 +1173,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head; struct futex_q *this, *next; int ret, op_ret; @@ -1082,9 +1220,7 @@ retry_private: goto retry; } - head = &hb1->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (match_futex (&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1097,10 +1233,8 @@ retry_private: } if (op_ret > 0) { - head = &hb2->chain; - op_ret = 0; - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (match_futex (&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; @@ -1142,7 +1276,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, */ if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); + hb_waiters_dec(hb1); plist_add(&q->list, &hb2->chain); + hb_waiters_inc(hb2); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); @@ -1270,7 +1406,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, int drop_count = 0, task_count = 0, ret; struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; - struct plist_head *head1; struct futex_q *this, *next; u32 curval2; @@ -1317,6 +1452,7 @@ retry: hb2 = hash_futex(&key2); retry_private: + hb_waiters_inc(hb2); double_lock_hb(hb1, hb2); if (likely(cmpval != NULL)) { @@ -1326,6 +1462,7 @@ retry_private: if (unlikely(ret)) { double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); ret = get_user(curval, uaddr1); if (ret) @@ -1375,6 +1512,7 @@ retry_private: break; case -EFAULT: double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); @@ -1384,6 +1522,7 @@ retry_private: case -EAGAIN: /* The owner was exiting, try again. */ double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); put_futex_key(&key2); put_futex_key(&key1); cond_resched(); @@ -1393,8 +1532,7 @@ retry_private: } } - head1 = &hb1->chain; - plist_for_each_entry_safe(this, next, head1, list) { + plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (task_count - nr_wake >= nr_requeue) break; @@ -1460,6 +1598,7 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); + hb_waiters_dec(hb2); /* * drop_futex_key_refs() must be called outside the spinlocks. During @@ -1487,17 +1626,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) struct futex_hash_bucket *hb; hb = hash_futex(&q->key); + + /* + * Increment the counter before taking the lock so that + * a potential waker won't miss a to-be-slept task that is + * waiting for the spinlock. This is safe as all queue_lock() + * users end up calling queue_me(). Similarly, for housekeeping, + * decrement the counter at queue_unlock() when some error has + * occurred and we don't end up adding the task to the list. + */ + hb_waiters_inc(hb); + q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); + spin_lock(&hb->lock); /* implies MB (A) */ return hb; } static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) +queue_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); + hb_waiters_dec(hb); } /** @@ -1867,7 +2018,7 @@ retry_private: ret = get_futex_value_locked(&uval, uaddr); if (ret) { - queue_unlock(q, *hb); + queue_unlock(*hb); ret = get_user(uval, uaddr); if (ret) @@ -1881,7 +2032,7 @@ retry_private: } if (uval != val) { - queue_unlock(q, *hb); + queue_unlock(*hb); ret = -EWOULDBLOCK; } @@ -2029,7 +2180,7 @@ retry_private: * Task is exiting and we just wait for the * exit to complete. */ - queue_unlock(&q, hb); + queue_unlock(hb); put_futex_key(&q.key); cond_resched(); goto retry; @@ -2081,7 +2232,7 @@ retry_private: goto out_put_key; out_unlock_put_key: - queue_unlock(&q, hb); + queue_unlock(hb); out_put_key: put_futex_key(&q.key); @@ -2091,7 +2242,7 @@ out: return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: - queue_unlock(&q, hb); + queue_unlock(hb); ret = fault_in_user_writeable(uaddr); if (ret) @@ -2113,7 +2264,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; u32 uval, vpid = task_pid_vnr(current); int ret; @@ -2153,9 +2303,7 @@ retry: * Ok, other tasks may need to be woken up - check waiters * and do the wakeup if necessary: */ - head = &hb->chain; - - plist_for_each_entry_safe(this, next, head, list) { + plist_for_each_entry_safe(this, next, &hb->chain, list) { if (!match_futex (&this->key, &key)) continue; ret = wake_futex_pi(uaddr, uval, this); @@ -2232,6 +2380,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; @@ -2316,6 +2465,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * code while we sleep on uaddr. */ debug_rt_mutex_init_waiter(&rt_waiter); + RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); + RB_CLEAR_NODE(&rt_waiter.tree_entry); rt_waiter.task = NULL; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); @@ -2731,10 +2882,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int __init futex_init(void) +static void __init futex_detect_cmpxchg(void) { +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG u32 curval; - int i; /* * This will fail and we want it. Some arch implementations do @@ -2748,8 +2899,31 @@ static int __init futex_init(void) */ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ + unsigned int futex_shift; + unsigned long i; + +#if CONFIG_BASE_SMALL + futex_hashsize = 16; +#else + futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); +#endif + + futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), + futex_hashsize, 0, + futex_hashsize < 256 ? HASH_SMALL : 0, + &futex_shift, NULL, + futex_hashsize, futex_hashsize); + futex_hashsize = 1UL << futex_shift; + + futex_detect_cmpxchg(); - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + for (i = 0; i < futex_hashsize; i++) { + atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index f9f44fd4d34d..55c8c9349cfe 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -183,7 +183,7 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { - if (get_compat_timespec(&ts, utime)) + if (compat_get_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) return -EINVAL; diff --git a/kernel/groups.c b/kernel/groups.c index 90cf1c38c8ea..451698f86cfa 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -157,17 +157,13 @@ int groups_search(const struct group_info *group_info, kgid_t grp) * set_groups - Change a group subscription in a set of credentials * @new: The newly prepared set of credentials to alter * @group_info: The group list to install - * - * Validate a group subscription and, if valid, insert it into a set - * of credentials. */ -int set_groups(struct cred *new, struct group_info *group_info) +void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; - return 0; } EXPORT_SYMBOL(set_groups); @@ -182,18 +178,12 @@ EXPORT_SYMBOL(set_groups); int set_current_groups(struct group_info *group_info) { struct cred *new; - int ret; new = prepare_creds(); if (!new) return -ENOMEM; - ret = set_groups(new, group_info); - if (ret < 0) { - abort_creds(new); - return ret; - } - + set_groups(new, group_info); return commit_creds(new); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 383319bae3f7..d55092ceee29 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -46,6 +46,7 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/timer.h> #include <linux/freezer.h> @@ -167,19 +168,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } - -/* - * Get the preferred target CPU for NOHZ - */ -static int hrtimer_get_target(int this_cpu, int pinned) -{ -#ifdef CONFIG_NO_HZ_COMMON - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) - return get_nohz_timer_target(); -#endif - return this_cpu; -} - /* * With HIGHRES=y we do not migrate the timer when it is expiring * before the next event on the target cpu because we cannot reprogram @@ -213,7 +201,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); - int cpu = hrtimer_get_target(this_cpu, pinned); + int cpu = get_nohz_timer_target(pinned); int basenum = base->index; again: @@ -1610,7 +1598,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, unsigned long slack; slack = current->timer_slack_ns; - if (rt_task(current)) + if (dl_task(current) || rt_task(current)) slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9328b80eaf14..06bb1417b063 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -37,7 +37,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; -unsigned long __read_mostly sysctl_hung_task_warnings = 10; +int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; @@ -98,7 +98,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (!sysctl_hung_task_warnings) return; - sysctl_hung_task_warnings--; + + if (sysctl_hung_task_warnings > 0) + sysctl_hung_task_warnings--; /* * Ok, the task did not get scheduled for more than 2 minutes, @@ -244,5 +246,4 @@ static int __init hung_task_init(void) return 0; } - -module_init(hung_task_init); +subsys_initcall(hung_task_init); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 4a1fef09f658..07cbdfea9ae2 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -40,6 +40,7 @@ config IRQ_EDGE_EOI_HANDLER # Generic configurable interrupt chip implementation config GENERIC_IRQ_CHIP bool + select IRQ_DOMAIN # Generic irq_domain hw <--> linux irq number translation config IRQ_DOMAIN diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc04c166c54d..6397df2d6945 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -281,6 +281,19 @@ void unmask_irq(struct irq_desc *desc) } } +void unmask_threaded_irq(struct irq_desc *desc) +{ + struct irq_chip *chip = desc->irq_data.chip; + + if (chip->flags & IRQCHIP_EOI_THREADED) + chip->irq_eoi(&desc->irq_data); + + if (chip->irq_unmask) { + chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); + } +} + /* * handle_nested_irq - Handle a nested irq from a irq thread * @irq: the interrupt number @@ -435,6 +448,27 @@ static inline void preflow_handler(struct irq_desc *desc) static inline void preflow_handler(struct irq_desc *desc) { } #endif +static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) +{ + if (!(desc->istate & IRQS_ONESHOT)) { + chip->irq_eoi(&desc->irq_data); + return; + } + /* + * We need to unmask in the following cases: + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). + */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) { + chip->irq_eoi(&desc->irq_data); + unmask_irq(desc); + } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) { + chip->irq_eoi(&desc->irq_data); + } +} + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -448,6 +482,8 @@ static inline void preflow_handler(struct irq_desc *desc) { } void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { + struct irq_chip *chip = desc->irq_data.chip; + raw_spin_lock(&desc->lock); if (unlikely(irqd_irq_inprogress(&desc->irq_data))) @@ -473,18 +509,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) preflow_handler(desc); handle_irq_event(desc); - if (desc->istate & IRQS_ONESHOT) - cond_unmask_irq(desc); + cond_unmask_eoi_irq(desc, chip); -out_eoi: - desc->irq_data.chip->irq_eoi(&desc->irq_data); -out_unlock: raw_spin_unlock(&desc->lock); return; out: - if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) - goto out_eoi; - goto out_unlock; + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); } /** diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index bd8e788d71e0..1ef0606797c9 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -73,6 +73,51 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, EXPORT_SYMBOL(devm_request_threaded_irq); /** + * devm_request_any_context_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_any_context_irq(). IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); + if (rc) { + devres_free(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_any_context_irq); + +/** * devm_free_irq - free an interrupt * @dev: device to free interrupt for * @irq: Interrupt line to free diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 131ca176b497..635480270858 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -41,6 +41,7 @@ irqreturn_t no_action(int cpl, void *dev_id) { return IRQ_NONE; } +EXPORT_SYMBOL_GPL(no_action); static void warn_no_thread(unsigned int irq, struct irqaction *action) { @@ -51,7 +52,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) { /* * In case the thread crashed and was killed we just pretend that @@ -157,7 +158,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - irq_wake_thread(desc, action); + __irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 001fa5bab490..ddf1ffeb79f1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -6,6 +6,7 @@ * of this file for your non core code. */ #include <linux/irqdesc.h> +#include <linux/kernel_stat.h> #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) @@ -73,6 +74,7 @@ extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); +extern void unmask_threaded_irq(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); @@ -82,6 +84,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); bool irq_wait_for_poll(struct irq_desc *desc); +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); @@ -179,3 +182,9 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) { return d->state_use_accessors & mask; } + +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +{ + __this_cpu_inc(*desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302d6cfd..a7174617616b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -274,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } +EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { @@ -488,6 +489,11 @@ void dynamic_irq_cleanup(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } +void kstat_incr_irq_this_cpu(unsigned int irq) +{ + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); +} + unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf68bb36fe58..f14033700c25 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -10,6 +10,7 @@ #include <linux/mutex.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/of_irq.h> #include <linux/topology.h> #include <linux/seq_file.h> #include <linux/slab.h> diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b17..2486a4c1a710 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -32,24 +32,10 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif -/** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for - * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void synchronize_irq(unsigned int irq) +static void __synchronize_hardirq(struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); bool inprogress; - if (!desc) - return; - do { unsigned long flags; @@ -67,12 +53,56 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (inprogress); +} - /* - * We made sure that no hardirq handler is running. Now verify - * that no threaded handlers are active. - */ - wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); +/** + * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending hard IRQ handlers for this + * interrupt to complete before returning. If you use this + * function while holding a resource the IRQ handler may need you + * will deadlock. It does not take associated threaded handlers + * into account. + * + * Do not use this for shutdown scenarios where you must be sure + * that all parts (hardirq and threaded handler) have completed. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_hardirq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) + __synchronize_hardirq(desc); +} +EXPORT_SYMBOL(synchronize_hardirq); + +/** + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + __synchronize_hardirq(desc); + /* + * We made sure that no hardirq handler is + * running. Now verify that no threaded handlers are + * active. + */ + wait_event(desc->wait_for_threads, + !atomic_read(&desc->threads_active)); + } } EXPORT_SYMBOL(synchronize_irq); @@ -718,7 +748,7 @@ again: if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) - unmask_irq(desc); + unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -727,7 +757,7 @@ out_unlock: #ifdef CONFIG_SMP /* - * Check whether we need to chasnge the affinity of the interrupt thread. + * Check whether we need to change the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) @@ -802,8 +832,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, static void wake_threads_waitq(struct irq_desc *desc) { - if (atomic_dec_and_test(&desc->threads_active) && - waitqueue_active(&desc->wait_for_threads)) + if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } @@ -881,6 +910,33 @@ static int irq_thread(void *data) return 0; } +/** + * irq_wake_thread - wake the irq thread for the action identified by dev_id + * @irq: Interrupt line + * @dev_id: Device identity for which the thread should be woken + * + */ +void irq_wake_thread(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + unsigned long flags; + + if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return; + + raw_spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action; action; action = action->next) { + if (action->dev_id == dev_id) { + if (action->thread) + __irq_wake_thread(desc, action); + break; + } + } + raw_spin_unlock_irqrestore(&desc->lock, flags); +} +EXPORT_SYMBOL_GPL(irq_wake_thread); + static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) @@ -897,6 +953,23 @@ static void irq_setup_forced_threading(struct irqaction *new) } } +static int irq_request_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + return c->irq_request_resources ? c->irq_request_resources(d) : 0; +} + +static void irq_release_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + if (c->irq_release_resources) + c->irq_release_resources(d); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -1092,6 +1165,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { + ret = irq_request_resources(desc); + if (ret) { + pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", + new->name, irq, desc->irq_data.chip->name); + goto out_mask; + } + init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -1262,8 +1342,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) *action_ptr = action->next; /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) + if (!desc->action) { irq_shutdown(desc); + irq_release_resources(desc); + } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 36f6ee181b0c..ac1ba2f11032 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -324,15 +324,15 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", 0600, desc->dir, + proc_create_data("smp_affinity", 0644, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); /* create /proc/irq/<irq>/affinity_hint */ - proc_create_data("affinity_hint", 0400, desc->dir, + proc_create_data("affinity_hint", 0444, desc->dir, &irq_affinity_hint_proc_fops, (void *)(long)irq); /* create /proc/irq/<irq>/smp_affinity_list */ - proc_create_data("smp_affinity_list", 0600, desc->dir, + proc_create_data("smp_affinity_list", 0644, desc->dir, &irq_affinity_list_proc_fops, (void *)(long)irq); proc_create_data("node", 0444, desc->dir, @@ -372,7 +372,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP - proc_create("irq/default_smp_affinity", 0600, NULL, + proc_create("irq/default_smp_affinity", 0644, NULL, &default_affinity_proc_fops); #endif } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 55fcce6065cf..a82170e2fa78 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void) * * Can be re-enqueued while the callback is still in progress. */ -void irq_work_queue(struct irq_work *work) +bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) - return; + return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); @@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work) } preempt_enable(); + + return true; } EXPORT_SYMBOL_GPL(irq_work_queue); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3127ad52cdb2..cb0cf37dac3a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include <linux/mm.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/compiler.h> #include <asm/sections.h> @@ -36,8 +37,8 @@ * These will be re-linked against their real values * during the second link stage. */ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __weak; +extern const u8 kallsyms_names[] __weak; /* * Tell the compiler that the count isn't in the small data section if the arch @@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak)); extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __weak; static inline int is_kernel_inittext(unsigned long addr) { diff --git a/kernel/kexec.c b/kernel/kexec.c index 9c970167e402..c8380ad203bc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,7 @@ #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/syscore_ops.h> +#include <linux/compiler.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -932,6 +933,7 @@ static int kimage_load_segment(struct kimage *image, */ struct kimage *kexec_image; struct kimage *kexec_crash_image; +int kexec_load_disabled; static DEFINE_MUTEX(kexec_mutex); @@ -942,7 +944,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, int result; /* We only trust the superuser with rebooting the system. */ - if (!capable(CAP_SYS_BOOT)) + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) return -EPERM; /* @@ -1038,10 +1040,10 @@ void __weak crash_unmap_reserved_pages(void) {} #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, - struct compat_kexec_segment __user *segments, - unsigned long flags) +COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, + compat_ulong_t, nr_segments, + struct compat_kexec_segment __user *, segments, + compat_ulong_t, flags) { struct compat_kexec_segment in; struct kexec_segment out, __user *ksegments; @@ -1234,7 +1236,7 @@ static int __init crash_notes_memory_init(void) } return 0; } -module_init(crash_notes_memory_init) +subsys_initcall(crash_notes_memory_init); /* @@ -1536,7 +1538,7 @@ void vmcoreinfo_append_str(const char *fmt, ...) size_t r; va_start(args, fmt); - r = vsnprintf(buf, sizeof(buf), fmt, args); + r = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); @@ -1550,10 +1552,10 @@ void vmcoreinfo_append_str(const char *fmt, ...) * provide an empty default implementation here -- architecture * code may override this */ -void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) +void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) +unsigned long __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } @@ -1628,7 +1630,7 @@ static int __init crash_save_vmcoreinfo_init(void) return 0; } -module_init(crash_save_vmcoreinfo_init) +subsys_initcall(crash_save_vmcoreinfo_init); /* * Move into place and start executing a preloaded standalone diff --git a/kernel/kmod.c b/kernel/kmod.c index b086006c59e7..6b375af4958d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -239,7 +239,7 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = do_execve(sub_info->path, + retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); if (!retval) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 9659d38e008f..2495a9b14ac8 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -18,6 +18,9 @@ #include <linux/stat.h> #include <linux/sched.h> #include <linux/capability.h> +#include <linux/compiler.h> + +#include <linux/rcupdate.h> /* rcu_expedited */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -126,7 +129,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, { return sprintf(buf, "%lx %x\n", paddr_vmcoreinfo_note(), - (unsigned int)vmcoreinfo_max_size); + (unsigned int)sizeof(vmcoreinfo_note)); } KERNEL_ATTR_RO(vmcoreinfo); @@ -160,8 +163,8 @@ KERNEL_ATTR_RW(rcu_expedited); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ -extern const void __start_notes __attribute__((weak)); -extern const void __stop_notes __attribute__((weak)); +extern const void __start_notes __weak; +extern const void __stop_notes __weak; #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct file *filp, struct kobject *kobj, diff --git a/kernel/kthread.c b/kernel/kthread.c index b5ae3ee860a9..9a130ec06f7a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -217,7 +217,7 @@ int tsk_fork_get_node(struct task_struct *tsk) if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif - return numa_node_id(); + return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), { struct task_struct *p; - p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, cpu); if (IS_ERR(p)) return p; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index baab8e5e7f66..b8bdcd4785b7 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o +obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o @@ -23,3 +24,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 576ba756a32d..b0e9467922e1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -590,6 +590,7 @@ static int very_verbose(struct lock_class *class) /* * Is this the address of a static object: */ +#ifdef __KERNEL__ static int static_obj(void *obj) { unsigned long start = (unsigned long) &_stext, @@ -616,6 +617,7 @@ static int static_obj(void *obj) */ return is_module_address(addr) || is_module_percpu_address(addr); } +#endif /* * To make lock name printouts unique, we calculate a unique @@ -1934,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; + hlock = curr->held_locks + depth - 1; /* * Only non-recursive-read entries get new dependencies * added: */ - if (hlock->read != 2) { + if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, distance, trylock_loop)) return 0; @@ -2096,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * (If lookup_chain_cache() returns with 1 it acquires * graph_lock for us) */ - if (!hlock->trylock && (hlock->check == 2) && + if (!hlock->trylock && hlock->check && lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: @@ -2515,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); - if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) + if (!hlock->check) continue; if (!mark_lock(curr, hlock, usage_bit)) @@ -2555,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) debug_atomic_inc(hardirqs_on_events); } -void trace_hardirqs_on_caller(unsigned long ip) +__visible void trace_hardirqs_on_caller(unsigned long ip) { time_hardirqs_on(CALLER_ADDR0, ip); @@ -2608,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off_caller(unsigned long ip) +__visible void trace_hardirqs_off_caller(unsigned long ip) { struct task_struct *curr = current; @@ -3053,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int class_idx; u64 chain_key; - if (!prove_locking) - check = 1; - if (unlikely(!debug_locks)) return 0; @@ -3067,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (lock->key == &__lockdep_no_validate__) - check = 1; + if (!prove_locking || lock->key == &__lockdep_no_validate__) + check = 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -3136,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->holdtime_stamp = lockstat_clock(); #endif - if (check == 2 && !mark_irqflags(curr, hlock)) + if (check && !mark_irqflags(curr, hlock)) return 0; /* mark it as used: */ @@ -4115,6 +4114,7 @@ void debug_check_no_locks_held(void) } EXPORT_SYMBOL_GPL(debug_check_no_locks_held); +#ifdef __KERNEL__ void debug_show_all_locks(void) { struct task_struct *g, *p; @@ -4172,6 +4172,7 @@ retry: read_unlock(&tasklist_lock); } EXPORT_SYMBOL_GPL(debug_show_all_locks); +#endif /* * Careful: only use this function if you are sure that @@ -4187,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task) } EXPORT_SYMBOL_GPL(debug_show_held_locks); -void lockdep_sys_exit(void) +asmlinkage void lockdep_sys_exit(void) { struct task_struct *curr = current; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c new file mode 100644 index 000000000000..f26b1a18e34e --- /dev/null +++ b/kernel/locking/locktorture.c @@ -0,0 +1,452 @@ +/* + * Module-based torture test facility for locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Based on kernel/rcu/torture.c. + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/trace_clock.h> +#include <asm/byteorder.h> +#include <linux/torture.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); + +torture_param(int, nwriters_stress, -1, + "Number of write-locking stress-test threads"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, + "Number of jiffies between shuffles, 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); + +static char *torture_type = "spin_lock"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, + "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); + +static atomic_t n_lock_torture_errors; + +static struct task_struct *stats_task; +static struct task_struct **writer_tasks; + +static int nrealwriters_stress; +static bool lock_is_write_held; + +struct lock_writer_stress_stats { + long n_write_lock_fail; + long n_write_lock_acquired; +}; +static struct lock_writer_stress_stats *lwsa; + +#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) +#define LOCKTORTURE_RUNNABLE_INIT 1 +#else +#define LOCKTORTURE_RUNNABLE_INIT 0 +#endif +int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; +module_param(locktorture_runnable, int, 0444); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); + +/* Forward reference. */ +static void lock_torture_cleanup(void); + +/* + * Operations vector for selecting different types of tests. + */ +struct lock_torture_ops { + void (*init)(void); + int (*writelock)(void); + void (*write_delay)(struct torture_random_state *trsp); + void (*writeunlock)(void); + unsigned long flags; + const char *name; +}; + +static struct lock_torture_ops *cur_ops; + +/* + * Definitions for lock torture testing. + */ + +static int torture_lock_busted_write_lock(void) +{ + return 0; /* BUGGY, do not use in real life!!! */ +} + +static void torture_lock_busted_write_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_us = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_lock_busted_write_unlock(void) +{ + /* BUGGY, do not use in real life!!! */ +} + +static struct lock_torture_ops lock_busted_ops = { + .writelock = torture_lock_busted_write_lock, + .write_delay = torture_lock_busted_write_delay, + .writeunlock = torture_lock_busted_write_unlock, + .name = "lock_busted" +}; + +static DEFINE_SPINLOCK(torture_spinlock); + +static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) +{ + spin_lock(&torture_spinlock); + return 0; +} + +static void torture_spin_lock_write_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_us = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); + if (!(torture_random(trsp) % + (nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) +{ + spin_unlock(&torture_spinlock); +} + +static struct lock_torture_ops spin_lock_ops = { + .writelock = torture_spin_lock_write_lock, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_spin_lock_write_unlock, + .name = "spin_lock" +}; + +static int torture_spin_lock_write_lock_irq(void) +__acquires(torture_spinlock_irq) +{ + unsigned long flags; + + spin_lock_irqsave(&torture_spinlock, flags); + cur_ops->flags = flags; + return 0; +} + +static void torture_lock_spin_write_unlock_irq(void) +__releases(torture_spinlock) +{ + spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); +} + +static struct lock_torture_ops spin_lock_irq_ops = { + .writelock = torture_spin_lock_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_lock_spin_write_unlock_irq, + .name = "spin_lock_irq" +}; + +/* + * Lock torture writer kthread. Repeatedly acquires and releases + * the lock, checking for duplicate acquisitions. + */ +static int lock_torture_writer(void *arg) +{ + struct lock_writer_stress_stats *lwsp = arg; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_TOROUT_STRING("lock_torture_writer task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1); + cur_ops->writelock(); + if (WARN_ON_ONCE(lock_is_write_held)) + lwsp->n_write_lock_fail++; + lock_is_write_held = 1; + lwsp->n_write_lock_acquired++; + cur_ops->write_delay(&rand); + lock_is_write_held = 0; + cur_ops->writeunlock(); + stutter_wait("lock_torture_writer"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_writer"); + return 0; +} + +/* + * Create an lock-torture-statistics message in the specified buffer. + */ +static void lock_torture_printk(char *page) +{ + bool fail = 0; + int i; + long max = 0; + long min = lwsa[0].n_write_lock_acquired; + long long sum = 0; + + for (i = 0; i < nrealwriters_stress; i++) { + if (lwsa[i].n_write_lock_fail) + fail = true; + sum += lwsa[i].n_write_lock_acquired; + if (max < lwsa[i].n_write_lock_fail) + max = lwsa[i].n_write_lock_fail; + if (min > lwsa[i].n_write_lock_fail) + min = lwsa[i].n_write_lock_fail; + } + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, + "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + sum, max, min, max / 2 > min ? "???" : "", + fail, fail ? "!!!" : ""); + if (fail) + atomic_inc(&n_lock_torture_errors); +} + +/* + * Print torture statistics. Caller must ensure that there is only one + * call to this function at a given time!!! This is normally accomplished + * by relying on the module system to only have one copy of the module + * loaded, and then by giving the lock_torture_stats kthread full control + * (or the init/cleanup functions when lock_torture_stats thread is not + * running). + */ +static void lock_torture_stats_print(void) +{ + int size = nrealwriters_stress * 200 + 8192; + char *buf; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("lock_torture_stats_print: Out of memory, need: %d", + size); + return; + } + lock_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. + */ +static int lock_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("lock_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + lock_torture_stats_print(); + torture_shutdown_absorb("lock_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_stats"); + return 0; +} + +static inline void +lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, + const char *tag) +{ + pr_alert("%s" TORTURE_FLAG + "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealwriters_stress, stat_interval, verbose, + shuffle_interval, stutter, shutdown_secs, + onoff_interval, onoff_holdoff); +} + +static void lock_torture_cleanup(void) +{ + int i; + + if (torture_cleanup()) + return; + + if (writer_tasks) { + for (i = 0; i < nrealwriters_stress; i++) + torture_stop_kthread(lock_torture_writer, + writer_tasks[i]); + kfree(writer_tasks); + writer_tasks = NULL; + } + + torture_stop_kthread(lock_torture_stats, stats_task); + lock_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (atomic_read(&n_lock_torture_errors)) + lock_torture_print_module_parms(cur_ops, + "End of test: FAILURE"); + else if (torture_onoff_failures()) + lock_torture_print_module_parms(cur_ops, + "End of test: LOCK_HOTPLUG"); + else + lock_torture_print_module_parms(cur_ops, + "End of test: SUCCESS"); +} + +static int __init lock_torture_init(void) +{ + int i; + int firsterr = 0; + static struct lock_torture_ops *torture_ops[] = { + &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, + }; + + torture_init_begin(torture_type, verbose, &locktorture_runnable); + + /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { + pr_alert("lock-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("lock-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); + torture_init_end(); + return -EINVAL; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + + if (nwriters_stress >= 0) + nrealwriters_stress = nwriters_stress; + else + nrealwriters_stress = 2 * num_online_cpus(); + lock_torture_print_module_parms(cur_ops, "Start of test"); + + /* Initialize the statistics so that each run gets its own numbers. */ + + lock_is_write_held = 0; + lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); + if (lwsa == NULL) { + VERBOSE_TOROUT_STRING("lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + lwsa[i].n_write_lock_fail = 0; + lwsa[i].n_write_lock_acquired = 0; + } + + /* Start up the kthreads. */ + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, + onoff_interval * HZ); + if (firsterr) + goto unwind; + } + if (shuffle_interval > 0) { + firsterr = torture_shuffle_init(shuffle_interval); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, + lock_torture_cleanup); + if (firsterr) + goto unwind; + } + if (stutter > 0) { + firsterr = torture_stutter_init(stutter); + if (firsterr) + goto unwind; + } + + writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], + writer_tasks[i]); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(lock_torture_stats, NULL, + stats_task); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + lock_torture_cleanup(); + return firsterr; +} + +module_init(lock_torture_init); +module_exit(lock_torture_cleanup); diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c new file mode 100644 index 000000000000..838dc9e00669 --- /dev/null +++ b/kernel/locking/mcs_spinlock.c @@ -0,0 +1,178 @@ + +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include "mcs_spinlock.h" + +#ifdef CONFIG_SMP + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_queue * +osq_wait_next(struct optimistic_spin_queue **lock, + struct optimistic_spin_queue *node, + struct optimistic_spin_queue *prev) +{ + struct optimistic_spin_queue *next = NULL; + + for (;;) { + if (*lock == node && cmpxchg(lock, node, prev) == node) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + arch_mutex_cpu_relax(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *prev, *next; + + node->locked = 0; + node->next = NULL; + + node->prev = prev = xchg(lock, node); + if (likely(prev == NULL)) + return true; + + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + arch_mutex_cpu_relax(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + arch_mutex_cpu_relax(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *next; + + /* + * Fast path for the uncontended case. + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + + /* + * Second most likely case. + */ + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} + +#endif + diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h new file mode 100644 index 000000000000..a2dbac4aca6b --- /dev/null +++ b/kernel/locking/mcs_spinlock.h @@ -0,0 +1,129 @@ +/* + * MCS lock defines + * + * This file contains the main data structure and API definitions of MCS lock. + * + * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock + * with the desirable properties of being fair, and with each cpu trying + * to acquire the lock spinning on a local variable. + * It avoids expensive cache bouncings that common test-and-set spin-lock + * implementations incur. + */ +#ifndef __LINUX_MCS_SPINLOCK_H +#define __LINUX_MCS_SPINLOCK_H + +#include <asm/mcs_spinlock.h> + +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ +}; + +#ifndef arch_mcs_spin_lock_contended +/* + * Using smp_load_acquire() provides a memory barrier that ensures + * subsequent operations happen after the lock is acquired. + */ +#define arch_mcs_spin_lock_contended(l) \ +do { \ + while (!(smp_load_acquire(l))) \ + arch_mutex_cpu_relax(); \ +} while (0) +#endif + +#ifndef arch_mcs_spin_unlock_contended +/* + * smp_store_release() provides a memory barrier to ensure all + * operations in the critical section has been completed before + * unlocking. + */ +#define arch_mcs_spin_unlock_contended(l) \ + smp_store_release((l), 1) +#endif + +/* + * Note: the smp_load_acquire/smp_store_release pair is not + * sufficient to form a full memory barrier across + * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. + * For applications that need a full barrier across multiple cpus + * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be + * used after mcs_lock. + */ + +/* + * In order to acquire the lock, the caller should declare a local node and + * pass a reference of the node to this function in addition to the lock. + * If the lock has already been acquired, then this will proceed to spin + * on this node->locked until the previous lock holder sets the node->locked + * in mcs_spin_unlock(). + * + * We don't inline mcs_spin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +static inline +void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* + * Lock acquired, don't need to set node->locked to 1. Threads + * only spin on its own node->locked value for lock acquisition. + * However, since this thread can immediately acquire the lock + * and does not proceed to spin on its own node->locked, this + * value won't be used. If a debug mode is needed to + * audit lock status, then set node->locked value here. + */ + return; + } + ACCESS_ONCE(prev->next) = node; + + /* Wait until the lock holder passes the lock down. */ + arch_mcs_spin_lock_contended(&node->locked); +} + +/* + * Releases the lock. The caller should pass in the corresponding node that + * was used to acquire the lock. + */ +static inline +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + + /* Pass lock to next waiter. */ + arch_mcs_spin_unlock_contended(&next->locked); +} + +/* + * Cancellable version of the MCS lock above. + * + * Intended for adaptive spinning of sleeping locks: + * mutex_lock()/rwsem_down_{read,write}() etc. + */ + +struct optimistic_spin_queue { + struct optimistic_spin_queue *next, *prev; + int locked; /* 1 if lock acquired */ +}; + +extern bool osq_lock(struct optimistic_spin_queue **lock); +extern void osq_unlock(struct optimistic_spin_queue **lock); + +#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 7e3443fe1f48..e1191c996c59 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -75,9 +75,20 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current); + + if (!lock->owner) + DEBUG_LOCKS_WARN_ON(!lock->owner); + else + DEBUG_LOCKS_WARN_ON(lock->owner != current); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); + + /* + * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug + * mutexes so that we can do it here after we've verified state. + */ + atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4dd6e4c219de..bc73d33c6760 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,6 +25,7 @@ #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/debug_locks.h> +#include "mcs_spinlock.h" /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -33,6 +34,13 @@ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" # include <asm-generic/mutex-null.h> +/* + * Must be 0 for the debug case so we do not do the unlock outside of the + * wait_lock region. debug_mutex_unlock() will do the actual unlock in this + * case. + */ +# undef __mutex_slowpath_needs_to_unlock +# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" # include <asm/mutex.h> @@ -52,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->spin_mlock = NULL; + lock->osq = NULL; #endif debug_mutex_init(lock, name, key); @@ -67,8 +75,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static __used noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count); +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); /** * mutex_lock - acquire the mutex @@ -111,54 +118,7 @@ EXPORT_SYMBOL(mutex_lock); * more or less simultaneously, the spinners need to acquire a MCS lock * first before spinning on the owner field. * - * We don't inline mspin_lock() so that perf can correctly account for the - * time spent in this lock function. */ -struct mspin_node { - struct mspin_node *next ; - int locked; /* 1 if lock acquired */ -}; -#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) - -static noinline -void mspin_lock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *prev; - - /* Init node */ - node->locked = 0; - node->next = NULL; - - prev = xchg(lock, node); - if (likely(prev == NULL)) { - /* Lock acquired */ - node->locked = 1; - return; - } - ACCESS_ONCE(prev->next) = node; - smp_wmb(); - /* Wait until the lock holder passes the lock down */ - while (!ACCESS_ONCE(node->locked)) - arch_mutex_cpu_relax(); -} - -static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *next = ACCESS_ONCE(node->next); - - if (likely(!next)) { - /* - * Release the lock by setting it to NULL - */ - if (cmpxchg(lock, node, NULL) == node) - return; - /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); - } - ACCESS_ONCE(next->locked) = 1; - smp_wmb(); -} /* * Mutex spinning code migrated from kernel/sched/core.c @@ -212,6 +172,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + if (need_resched()) + return 0; + rcu_read_lock(); owner = ACCESS_ONCE(lock->owner); if (owner) @@ -225,7 +188,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) } #endif -static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +__visible __used noinline +void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** * mutex_unlock - release the mutex @@ -446,9 +410,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; + if (!osq_lock(&lock->osq)) + goto slowpath; + for (;;) { struct task_struct *owner; - struct mspin_node node; if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; @@ -463,19 +429,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * performed the optimistic spinning cannot be done. */ if (ACCESS_ONCE(ww->ctx)) - goto slowpath; + break; } /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ - mspin_lock(MLOCK(lock), &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) { - mspin_unlock(MLOCK(lock), &node); - goto slowpath; - } + if (owner && !mutex_spin_on_owner(lock, owner)) + break; if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { @@ -488,11 +451,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mspin_unlock(MLOCK(lock), &node); + osq_unlock(&lock->osq); preempt_enable(); return 0; } - mspin_unlock(MLOCK(lock), &node); /* * When there's no owner, we might have preempted between the @@ -501,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * the owner complete. */ if (!owner && (need_resched() || rt_task(task))) - goto slowpath; + break; /* * The cpu_relax() call is a compiler barrier which forces @@ -511,7 +473,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } + osq_unlock(&lock->osq); slowpath: + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock the mutex. This avoids getting + * scheduled out right after we obtained the mutex. + */ + if (need_resched()) + schedule_preempt_disabled(); #endif spin_lock_mutex(&lock->wait_lock, flags); @@ -717,10 +687,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; - spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); - debug_mutex_unlock(lock); - /* * some architectures leave the lock unlocked in the fastpath failure * case, others need to leave it locked. In the later case we have to @@ -729,6 +695,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); + spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); + debug_mutex_unlock(lock); + if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = @@ -746,7 +716,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) /* * Release the lock, slowpath: */ -static __used noinline void +__visible void __mutex_unlock_slowpath(atomic_t *lock_count) { __mutex_unlock_common_slowpath(lock_count, 1); @@ -803,7 +773,7 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); -static __used noinline void __sched +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..49b2ed3dced8 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -24,7 +24,7 @@ #include <linux/kallsyms.h> #include <linux/syscalls.h> #include <linux/interrupt.h> -#include <linux/plist.h> +#include <linux/rbtree.h> #include <linux/fs.h> #include <linux/debug_locks.h> @@ -57,7 +57,7 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } @@ -154,16 +154,12 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { memset(waiter, 0x11, sizeof(*waiter)); - plist_node_init(&waiter->list_entry, MAX_PRIO); - plist_node_init(&waiter->pi_list_entry, MAX_PRIO); waiter->deadlock_task_pid = NULL; } void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..aa4dff04b594 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -14,6 +14,7 @@ #include <linux/export.h> #include <linux/sched.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/timer.h> #include "rtmutex_common.h" @@ -91,10 +92,107 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) } #endif +static inline int +rt_mutex_waiter_less(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) +{ + if (left->prio < right->prio) + return 1; + + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. + * If left waiter has a dl_prio(), and we didn't return 1 above, + * then right waiter has a dl_prio() too. + */ + if (dl_prio(left->prio)) + return (left->task->dl.deadline < right->task->dl.deadline); + + return 0; +} + +static void +rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +{ + struct rb_node **link = &lock->waiters.rb_node; + struct rb_node *parent = NULL; + struct rt_mutex_waiter *entry; + int leftmost = 1; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); + if (rt_mutex_waiter_less(waiter, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + lock->waiters_leftmost = &waiter->tree_entry; + + rb_link_node(&waiter->tree_entry, parent, link); + rb_insert_color(&waiter->tree_entry, &lock->waiters); +} + +static void +rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) +{ + if (RB_EMPTY_NODE(&waiter->tree_entry)) + return; + + if (lock->waiters_leftmost == &waiter->tree_entry) + lock->waiters_leftmost = rb_next(&waiter->tree_entry); + + rb_erase(&waiter->tree_entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree_entry); +} + +static void +rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) +{ + struct rb_node **link = &task->pi_waiters.rb_node; + struct rb_node *parent = NULL; + struct rt_mutex_waiter *entry; + int leftmost = 1; + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); + if (rt_mutex_waiter_less(waiter, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + task->pi_waiters_leftmost = &waiter->pi_tree_entry; + + rb_link_node(&waiter->pi_tree_entry, parent, link); + rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); +} + +static void +rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) +{ + if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) + return; + + if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) + task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); + + rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree_entry); +} + /* - * Calculate task priority from the waiter list priority + * Calculate task priority from the waiter tree priority * - * Return task->normal_prio when the waiter list is empty or when + * Return task->normal_prio when the waiter tree is empty or when * the waiter is not allowed to do priority boosting */ int rt_mutex_getprio(struct task_struct *task) @@ -102,10 +200,30 @@ int rt_mutex_getprio(struct task_struct *task) if (likely(!task_has_pi_waiters(task))) return task->normal_prio; - return min(task_top_pi_waiter(task)->pi_list_entry.prio, + return min(task_top_pi_waiter(task)->prio, task->normal_prio); } +struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return NULL; + + return task_top_pi_waiter(task)->task; +} + +/* + * Called by sched_setscheduler() to check whether the priority change + * is overruled by a possible priority boosting. + */ +int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + if (!task_has_pi_waiters(task)) + return 0; + + return task_top_pi_waiter(task)->task->prio <= newprio; +} + /* * Adjust the priority of a task, after its pi_waiters got modified. * @@ -115,7 +233,7 @@ static void __rt_mutex_adjust_prio(struct task_struct *task) { int prio = rt_mutex_getprio(task); - if (task->prio != prio) + if (task->prio != prio || dl_prio(prio)) rt_mutex_setprio(task, prio); } @@ -233,7 +351,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * When deadlock detection is off then we check, if further * priority adjustment is necessary. */ - if (!detect_deadlock && waiter->list_entry.prio == task->prio) + if (!detect_deadlock && waiter->prio == task->prio) goto out_unlock_pi; lock = waiter->lock; @@ -254,9 +372,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* Requeue the waiter */ - plist_del(&waiter->list_entry, &lock->wait_list); - waiter->list_entry.prio = task->prio; - plist_add(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); + waiter->prio = task->prio; + rt_mutex_enqueue(lock, waiter); /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -280,17 +398,15 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, if (waiter == rt_mutex_top_waiter(lock)) { /* Boost the owner */ - plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_dequeue_pi(task, top_waiter); + rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); } else if (top_waiter == waiter) { /* Deboost the owner */ - plist_del(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); + rt_mutex_enqueue_pi(task, waiter); __rt_mutex_adjust_prio(task); } @@ -355,7 +471,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * 3) it is top waiter */ if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (task->prio >= rt_mutex_top_waiter(lock)->prio) { if (!waiter || waiter != rt_mutex_top_waiter(lock)) return 0; } @@ -369,7 +485,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, /* remove the queued waiter. */ if (waiter) { - plist_del(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); task->pi_blocked_on = NULL; } @@ -379,8 +495,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) { top = rt_mutex_top_waiter(lock); - top->pi_list_entry.prio = top->list_entry.prio; - plist_add(&top->pi_list_entry, &task->pi_waiters); + rt_mutex_enqueue_pi(task, top); } raw_spin_unlock_irqrestore(&task->pi_lock, flags); } @@ -416,13 +531,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, task->prio); - plist_node_init(&waiter->pi_list_entry, task->prio); + waiter->prio = task->prio; /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); + rt_mutex_enqueue(lock, waiter); task->pi_blocked_on = waiter; @@ -433,8 +547,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + rt_mutex_dequeue_pi(owner, top_waiter); + rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) @@ -486,7 +600,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * boosted mode and go back to normal after releasing * lock->wait_lock. */ - plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + rt_mutex_dequeue_pi(current, waiter); rt_mutex_set_owner(lock, NULL); @@ -510,7 +624,7 @@ static void remove_waiter(struct rt_mutex *lock, int chain_walk = 0; raw_spin_lock_irqsave(¤t->pi_lock, flags); - plist_del(&waiter->list_entry, &lock->wait_list); + rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); @@ -521,13 +635,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + rt_mutex_dequeue_pi(owner, waiter); if (rt_mutex_has_waiters(lock)) { struct rt_mutex_waiter *next; next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &owner->pi_waiters); + rt_mutex_enqueue_pi(owner, next); } __rt_mutex_adjust_prio(owner); @@ -537,8 +651,6 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&owner->pi_lock, flags); } - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - if (!chain_walk) return; @@ -565,7 +677,8 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || waiter->list_entry.prio == task->prio) { + if (!waiter || (waiter->prio == task->prio && + !dl_prio(task->prio))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -638,6 +751,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); + RB_CLEAR_NODE(&waiter.pi_tree_entry); + RB_CLEAR_NODE(&waiter.tree_entry); raw_spin_lock(&lock->wait_lock); @@ -904,7 +1019,8 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; raw_spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list); + lock->waiters = RB_ROOT; + lock->waiters_leftmost = NULL; debug_rt_mutex_init(lock, name); } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..7431a9c86f35 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -40,13 +40,13 @@ extern void schedule_rt_mutex_test(struct rt_mutex *lock); * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * - * @list_entry: pi node to enqueue into the mutex waiters list - * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @tree_entry: pi node to enqueue into the mutex waiters tree + * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task */ struct rt_mutex_waiter { - struct plist_node list_entry; - struct plist_node pi_list_entry; + struct rb_node tree_entry; + struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES @@ -54,14 +54,15 @@ struct rt_mutex_waiter { struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif + int prio; }; /* - * Various helpers to access the waiters-plist: + * Various helpers to access the waiters-tree: */ static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { - return !plist_head_empty(&lock->wait_list); + return !RB_EMPTY_ROOT(&lock->waiters); } static inline struct rt_mutex_waiter * @@ -69,8 +70,8 @@ rt_mutex_top_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *w; - w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, - list_entry); + w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, + tree_entry); BUG_ON(w->lock != lock); return w; @@ -78,14 +79,14 @@ rt_mutex_top_waiter(struct rt_mutex *lock) static inline int task_has_pi_waiters(struct task_struct *p) { - return !plist_head_empty(&p->pi_waiters); + return !RB_EMPTY_ROOT(&p->pi_waiters); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { - return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, - pi_list_entry); + return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, + pi_tree_entry); } /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 19c5fa95e0b4..1d66e08e897d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -143,6 +143,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) /* * wait for the read lock to be granted */ +__visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; @@ -190,6 +191,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait until we successfully acquire the write lock */ +__visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; @@ -252,6 +254,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ +__visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; @@ -272,6 +275,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ +__visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; diff --git a/kernel/module.c b/kernel/module.c index f5a3b1e8ec51..11869408f79b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -640,7 +640,7 @@ static int module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. */ - __this_cpu_write(mod->refptr->incs, 1); + raw_cpu_write(mod->refptr->incs, 1); return 0; } @@ -815,10 +815,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (!(flags & O_NONBLOCK)) { - printk(KERN_WARNING - "waiting module removal not supported: please upgrade"); - } + if (!(flags & O_NONBLOCK)) + pr_warn("waiting module removal not supported: please upgrade\n"); if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; @@ -1015,9 +1013,11 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'F'; if (mod->taints & (1 << TAINT_CRAP)) buf[l++] = 'C'; + if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) + buf[l++] = 'E'; /* * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't * apply to modules. */ return l; @@ -1950,6 +1950,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) switch (sym[i].st_shndx) { case SHN_COMMON: + /* Ignore common symbols */ + if (!strncmp(name, "__gnu_lto", 9)) + break; + /* We compiled with -fno-common. These are not supposed to happen. */ pr_debug("Common symbol: %s\n", name); @@ -3216,7 +3220,7 @@ static int load_module(struct load_info *info, const char __user *uargs, pr_notice_once("%s: module verification failed: signature " "and/or required key missing - tainting " "kernel\n", mod->name); - add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); + add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); } #endif @@ -3811,12 +3815,12 @@ void print_modules(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - printk(" %s%s", mod->name, module_flags(mod, buf)); + pr_cont(" %s%s", mod->name, module_flags(mod, buf)); } preempt_enable(); if (last_unloaded_module[0]) - printk(" [last unloaded: %s]", last_unloaded_module); - printk("\n"); + pr_cont(" [last unloaded: %s]", last_unloaded_module); + pr_cont("\n"); } #ifdef CONFIG_MODVERSIONS diff --git a/kernel/notifier.c b/kernel/notifier.c index 2d5cc4ccff7f..db4c8b08a50c 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ - if (rcu_dereference_raw(nh->head)) { + if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); diff --git a/kernel/padata.c b/kernel/padata.c index 2abd25d79cc8..161402f0b517 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -112,7 +112,7 @@ int padata_do_parallel(struct padata_instance *pinst, rcu_read_lock_bh(); - pd = rcu_dereference(pinst->pd); + pd = rcu_dereference_bh(pinst->pd); err = -EINVAL; if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) diff --git a/kernel/panic.c b/kernel/panic.c index c00b4ceb39e8..d02fa9fef46a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -33,7 +33,7 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -int panic_timeout; +int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -100,7 +100,7 @@ void panic(const char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing @@ -141,7 +141,7 @@ void panic(const char *fmt, ...) * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. */ - printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + pr_emerg("Rebooting in %d seconds..", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); @@ -165,7 +165,7 @@ void panic(const char *fmt, ...) extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) @@ -176,6 +176,7 @@ void panic(const char *fmt, ...) disabled_wait(caller); } #endif + pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @@ -199,7 +200,7 @@ struct tnt { static const struct tnt tnts[] = { { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' }, { TAINT_FORCED_RMMOD, 'R', ' ' }, { TAINT_MACHINE_CHECK, 'M', ' ' }, { TAINT_BAD_PAGE, 'B', ' ' }, @@ -210,6 +211,7 @@ static const struct tnt tnts[] = { { TAINT_CRAP, 'C', ' ' }, { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, + { TAINT_UNSIGNED_MODULE, 'E', ' ' }, }; /** @@ -228,6 +230,7 @@ static const struct tnt tnts[] = { * 'C' - modules from drivers/staging are loaded. * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. + * 'E' - Unsigned module has been loaded. * * The string is overwritten by the next call to print_tainted(). */ @@ -274,8 +277,7 @@ unsigned long get_taint(void) void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) - printk(KERN_WARNING - "Disabling lock debugging due to kernel taint\n"); + pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -380,8 +382,7 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - printk(KERN_WARNING "---[ end trace %016llx ]---\n", - (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } /* @@ -459,7 +460,7 @@ EXPORT_SYMBOL(warn_slowpath_null); * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ -void __stack_chk_fail(void) +__visible void __stack_chk_fail(void) { panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); diff --git a/kernel/params.c b/kernel/params.c index c00d5b502aa4..b00142e7f3ba 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -227,17 +227,10 @@ int parse_args(const char *doing, } /* Lazy bastard, eh? */ -#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ +#define STANDARD_PARAM_DEF(name, type, format, strtolfn) \ int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ - tmptype l; \ - int ret; \ - \ - ret = strtolfn(val, 0, &l); \ - if (ret < 0 || ((type)l != l)) \ - return ret < 0 ? ret : -EINVAL; \ - *((type *)kp->arg) = l; \ - return 0; \ + return strtolfn(val, 0, (type *)kp->arg); \ } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ @@ -253,13 +246,13 @@ int parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8); +STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16); +STANDARD_PARAM_DEF(int, int, "%i", kstrtoint); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint); +STANDARD_PARAM_DEF(long, long, "%li", kstrtol); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); int param_set_charp(const char *val, const struct kernel_param *kp) { diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 06c62de9c711..db95d8eb761b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -318,7 +318,9 @@ static void *pidns_get(struct task_struct *task) struct pid_namespace *ns; rcu_read_lock(); - ns = get_pid_ns(task_active_pid_ns(task)); + ns = task_active_pid_ns(task); + if (ns) + get_pid_ns(ns); rcu_read_unlock(); return ns; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c7f31aa272f7..3b8946416a5f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -233,7 +233,8 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) /* * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, @@ -260,30 +261,53 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return 0; } +static int posix_cpu_clock_get_task(struct task_struct *tsk, + const clockid_t which_clock, + struct timespec *tp) +{ + int err = -EINVAL; + unsigned long long rtn; + + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (same_thread_group(tsk, current)) + err = cpu_clock_sample(which_clock, tsk, &rtn); + } else { + unsigned long flags; + struct sighand_struct *sighand; + + /* + * while_each_thread() is not yet entirely RCU safe, + * keep locking the group while sampling process + * clock for now. + */ + sighand = lock_task_sighand(tsk, &flags); + if (!sighand) + return err; + + if (tsk == current || thread_group_leader(tsk)) + err = cpu_clock_sample_group(which_clock, tsk, &rtn); + + unlock_task_sighand(tsk, &flags); + } + + if (!err) + sample_to_timespec(which_clock, rtn, tp); + + return err; +} + static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); - int error = -EINVAL; - unsigned long long rtn; + int err = -EINVAL; if (pid == 0) { /* * Special case constant value for our own clocks. * We don't have to do any lookup to find ourselves. */ - if (CPUCLOCK_PERTHREAD(which_clock)) { - /* - * Sampling just ourselves we can do with no locking. - */ - error = cpu_clock_sample(which_clock, - current, &rtn); - } else { - read_lock(&tasklist_lock); - error = cpu_clock_sample_group(which_clock, - current, &rtn); - read_unlock(&tasklist_lock); - } + err = posix_cpu_clock_get_task(current, which_clock, tp); } else { /* * Find the given PID, and validate that the caller @@ -292,29 +316,12 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) struct task_struct *p; rcu_read_lock(); p = find_task_by_vpid(pid); - if (p) { - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (same_thread_group(p, current)) { - error = cpu_clock_sample(which_clock, - p, &rtn); - } - } else { - read_lock(&tasklist_lock); - if (thread_group_leader(p) && p->sighand) { - error = - cpu_clock_sample_group(which_clock, - p, &rtn); - } - read_unlock(&tasklist_lock); - } - } + if (p) + err = posix_cpu_clock_get_task(p, which_clock, tp); rcu_read_unlock(); } - if (error) - return error; - sample_to_timespec(which_clock, rtn, tp); - return 0; + return err; } @@ -371,36 +378,40 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) */ static int posix_cpu_timer_del(struct k_itimer *timer) { - struct task_struct *p = timer->it.cpu.task; int ret = 0; + unsigned long flags; + struct sighand_struct *sighand; + struct task_struct *p = timer->it.cpu.task; - if (likely(p != NULL)) { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { - /* - * We raced with the reaping of the task. - * The deletion should have cleared us off the list. - */ - BUG_ON(!list_empty(&timer->it.cpu.entry)); - } else { - spin_lock(&p->sighand->siglock); - if (timer->it.cpu.firing) - ret = TIMER_RETRY; - else - list_del(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); - } - read_unlock(&tasklist_lock); + WARN_ON_ONCE(p == NULL); - if (!ret) - put_task_struct(p); + /* + * Protect against sighand release/switch in exit/exec and process/ + * thread timer list entry concurrent read/writes. + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. + */ + WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); + } else { + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); + + unlock_task_sighand(p, &flags); } + if (!ret) + put_task_struct(p); + return ret; } -static void cleanup_timers_list(struct list_head *head, - unsigned long long curr) +static void cleanup_timers_list(struct list_head *head) { struct cpu_timer_list *timer, *next; @@ -414,16 +425,11 @@ static void cleanup_timers_list(struct list_head *head, * time for later timer_gettime calls to return. * This must be called with the siglock held. */ -static void cleanup_timers(struct list_head *head, - cputime_t utime, cputime_t stime, - unsigned long long sum_exec_runtime) +static void cleanup_timers(struct list_head *head) { - - cputime_t ptime = utime + stime; - - cleanup_timers_list(head, cputime_to_expires(ptime)); - cleanup_timers_list(++head, cputime_to_expires(utime)); - cleanup_timers_list(++head, sum_exec_runtime); + cleanup_timers_list(head); + cleanup_timers_list(++head); + cleanup_timers_list(++head); } /* @@ -433,41 +439,14 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { - cputime_t utime, stime; - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); - task_cputime(tsk, &utime, &stime); - cleanup_timers(tsk->cpu_timers, - utime, stime, tsk->se.sum_exec_runtime); + cleanup_timers(tsk->cpu_timers); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - struct signal_struct *const sig = tsk->signal; - cputime_t utime, stime; - - task_cputime(tsk, &utime, &stime); - cleanup_timers(tsk->signal->cpu_timers, - utime + sig->utime, stime + sig->stime, - tsk->se.sum_exec_runtime + sig->sum_sched_runtime); -} - -static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) -{ - struct cpu_timer_list *timer = &itimer->it.cpu; - - /* - * That's all for this thread or process. - * We leave our residual in expires to be reported. - */ - put_task_struct(timer->task); - timer->task = NULL; - if (timer->expires < now) { - timer->expires = 0; - } else { - timer->expires -= now; - } + cleanup_timers(tsk->signal->cpu_timers); } static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -477,8 +456,7 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp) /* * Insert the timer on the appropriate list before any timers that - * expire later. This must be called with the tasklist_lock held - * for reading, interrupts disabled and p->sighand->siglock taken. + * expire later. This must be called with the sighand lock held. */ static void arm_timer(struct k_itimer *timer) { @@ -569,7 +547,8 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * Sample a process (thread group) timer for the given group_leader task. - * Must be called with tasklist_lock held for reading. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, @@ -608,7 +587,8 @@ static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); */ static void posix_cpu_timer_kick_nohz(void) { - schedule_work(&nohz_kick_work); + if (context_tracking_is_enabled()) + schedule_work(&nohz_kick_work); } bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) @@ -631,43 +611,39 @@ static inline void posix_cpu_timer_kick_nohz(void) { } * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -static int posix_cpu_timer_set(struct k_itimer *timer, int flags, +static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, struct itimerspec *new, struct itimerspec *old) { + unsigned long flags; + struct sighand_struct *sighand; struct task_struct *p = timer->it.cpu.task; unsigned long long old_expires, new_expires, old_incr, val; int ret; - if (unlikely(p == NULL)) { - /* - * Timer refers to a dead task's clock. - */ - return -ESRCH; - } + WARN_ON_ONCE(p == NULL); new_expires = timespec_to_sample(timer->it_clock, &new->it_value); - read_lock(&tasklist_lock); /* - * We need the tasklist_lock to protect against reaping that - * clears p->sighand. If p has just been reaped, we can no + * Protect against sighand release/switch in exit/exec and p->cpu_timers + * and p->signal->cpu_timers read/write in arm_timer() + */ + sighand = lock_task_sighand(p, &flags); + /* + * If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(p->sighand == NULL)) { - read_unlock(&tasklist_lock); - put_task_struct(p); - timer->it.cpu.task = NULL; + if (unlikely(sighand == NULL)) { return -ESRCH; } /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + WARN_ON_ONCE(!irqs_disabled()); ret = 0; old_incr = timer->it.cpu.incr; - spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { timer->it.cpu.firing = -1; @@ -724,12 +700,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * disable this firing since we are already reporting * it as an overrun (thanks to bump_cpu_timer above). */ - spin_unlock(&p->sighand->siglock); - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); goto out; } - if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { new_expires += val; } @@ -743,9 +718,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, arm_timer(timer); } - spin_unlock(&p->sighand->siglock); - read_unlock(&tasklist_lock); - + unlock_task_sighand(p, &flags); /* * Install the new reload setting, and * set up the signal and overrun bookkeeping. @@ -787,7 +760,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { unsigned long long now; struct task_struct *p = timer->it.cpu.task; - int clear_dead; + + WARN_ON_ONCE(p == NULL); /* * Easy part: convert the reload time. @@ -800,52 +774,34 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) return; } - if (unlikely(p == NULL)) { - /* - * This task already died and the timer will never fire. - * In this case, expires is actually the dead value. - */ - dead: - sample_to_timespec(timer->it_clock, timer->it.cpu.expires, - &itp->it_value); - return; - } - /* * Sample the clock to take the difference with the expiry time. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &now); - clear_dead = p->exit_state; } else { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { + struct sighand_struct *sighand; + unsigned long flags; + + /* + * Protect against sighand release/switch in exit/exec and + * also make timer sampling safe if it ends up calling + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. * Call the timer disarmed, nothing else to do. */ - put_task_struct(p); - timer->it.cpu.task = NULL; timer->it.cpu.expires = 0; - read_unlock(&tasklist_lock); - goto dead; + sample_to_timespec(timer->it_clock, timer->it.cpu.expires, + &itp->it_value); } else { cpu_timer_sample_group(timer->it_clock, p, &now); - clear_dead = (unlikely(p->exit_state) && - thread_group_empty(p)); + unlock_task_sighand(p, &flags); } - read_unlock(&tasklist_lock); - } - - if (unlikely(clear_dead)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - clear_dead_task(timer, now); - goto dead; } if (now < timer->it.cpu.expires) { @@ -1059,14 +1015,12 @@ static void check_process_timers(struct task_struct *tsk, */ void posix_cpu_timer_schedule(struct k_itimer *timer) { + struct sighand_struct *sighand; + unsigned long flags; struct task_struct *p = timer->it.cpu.task; unsigned long long now; - if (unlikely(p == NULL)) - /* - * The task was cleaned up already, no future firings. - */ - goto out; + WARN_ON_ONCE(p == NULL); /* * Fetch the current sample and update the timer's expiry time. @@ -1074,49 +1028,45 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &now); bump_cpu_timer(timer, now); - if (unlikely(p->exit_state)) { - clear_dead_task(timer, now); + if (unlikely(p->exit_state)) + goto out; + + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (!sighand) goto out; - } - read_lock(&tasklist_lock); /* arm_timer needs it. */ - spin_lock(&p->sighand->siglock); } else { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { + /* + * Protect arm_timer() and timer sampling in case of call to + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. */ - put_task_struct(p); - timer->it.cpu.task = p = NULL; timer->it.cpu.expires = 0; - goto out_unlock; + goto out; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - cpu_timer_sample_group(timer->it_clock, p, &now); - clear_dead_task(timer, now); - goto out_unlock; + unlock_task_sighand(p, &flags); + /* Optimizations: if the process is dying, no need to rearm */ + goto out; } - spin_lock(&p->sighand->siglock); cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); - /* Leave the tasklist_lock locked for the call below. */ + /* Leave the sighand locked for the call below. */ } /* * Now re-arm for the new expiry time. */ - BUG_ON(!irqs_disabled()); + WARN_ON_ONCE(!irqs_disabled()); arm_timer(timer); - spin_unlock(&p->sighand->siglock); - -out_unlock: - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); + /* Kick full dynticks CPUs in case they need to tick on the new timer */ + posix_cpu_timer_kick_nohz(); out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; @@ -1200,7 +1150,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) struct k_itimer *timer, *next; unsigned long flags; - BUG_ON(!irqs_disabled()); + WARN_ON_ONCE(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1256,13 +1206,6 @@ void run_posix_cpu_timers(struct task_struct *tsk) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } - - /* - * In case some timers were rescheduled after the queue got emptied, - * wake up full dynticks CPUs. - */ - if (tsk->signal->cputimer.running) - posix_cpu_timer_kick_nohz(); } /* @@ -1274,7 +1217,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, { unsigned long long now; - BUG_ON(clock_idx == CPUCLOCK_SCHED); + WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index d09dd10c5a5e..9a58bc258810 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -32,7 +32,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_sector = sector; + bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio->bi_end_io = end_swap_bio_read; diff --git a/kernel/power/console.c b/kernel/power/console.c index eacb8bd8cab4..aba9c545a0e3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -9,6 +9,7 @@ #include <linux/kbd_kern.h> #include <linux/vt.h> #include <linux/module.h> +#include <linux/slab.h> #include "power.h" #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0121dab83f43..f4f2073711d3 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -82,6 +82,7 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops) unlock_system_sleep(); } +EXPORT_SYMBOL_GPL(hibernation_set_ops); static bool entering_platform_hibernation; @@ -293,10 +294,10 @@ static int create_image(int platform_mode) error); /* Restore control flow magically appears here */ restore_processor_state(); - if (!in_suspend) { + if (!in_suspend) events_check_enabled = false; - platform_leave(platform_mode); - } + + platform_leave(platform_mode); Power_up: syscore_resume(); @@ -972,16 +973,20 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - unsigned int maj, min; dev_t res; - int ret = -EINVAL; + int len = n; + char *name; - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; + if (len && buf[len-1] == '\n') + len--; + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; + res = name_to_dev_t(name); + kfree(name); + if (!res) + return -EINVAL; lock_system_sleep(); swsusp_resume_device = res; @@ -989,9 +994,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, printk(KERN_INFO "PM: Starting manual resume from disk\n"); noresume = 0; software_resume(); - ret = n; - out: - return ret; + return n; } power_attr(resume); diff --git a/kernel/power/main.c b/kernel/power/main.c index 1d1bf630e6e9..6271bc4073ef 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -282,8 +282,8 @@ struct kobject *power_kobj; * state - control system power state. * * show() returns what states are supported, which is hard-coded to - * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and - * 'disk' (Suspend-to-Disk). + * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), + * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). * * store() accepts one of those strings, translates it into the * proper enumerated value, and initiates a suspend transition. diff --git a/kernel/power/power.h b/kernel/power/power.h index 7d4b7ffb3c1d..15f37ea08719 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -2,6 +2,7 @@ #include <linux/suspend_ioctls.h> #include <linux/utsname.h> #include <linux/freezer.h> +#include <linux/compiler.h> struct swsusp_info { struct new_utsname uts; @@ -11,7 +12,7 @@ struct swsusp_info { unsigned long image_pages; unsigned long pages; unsigned long size; -} __attribute__((aligned(PAGE_SIZE))); +} __aligned(PAGE_SIZE); #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ @@ -49,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) +asmlinkage int swsusp_save(void); + /* kernel/power/hibernate.c */ extern bool freezer_test_done; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8dff9b48075a..884b77058864 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -66,6 +66,7 @@ static struct pm_qos_constraints cpu_dma_constraints = { .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, .notifiers = &cpu_dma_lat_notifier, }; @@ -79,6 +80,7 @@ static struct pm_qos_constraints network_lat_constraints = { .list = PLIST_HEAD_INIT(network_lat_constraints.list), .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, .notifiers = &network_lat_notifier, }; @@ -93,6 +95,7 @@ static struct pm_qos_constraints network_tput_constraints = { .list = PLIST_HEAD_INIT(network_tput_constraints.list), .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, .notifiers = &network_throughput_notifier, }; @@ -128,7 +131,7 @@ static const struct file_operations pm_qos_power_fops = { static inline int pm_qos_get_value(struct pm_qos_constraints *c) { if (plist_head_empty(&c->list)) - return c->default_value; + return c->no_constraint_value; switch (c->type) { case PM_QOS_MIN: @@ -170,6 +173,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, { unsigned long flags; int prev_value, curr_value, new_value; + int ret; spin_lock_irqsave(&pm_qos_lock, flags); prev_value = pm_qos_get_value(c); @@ -205,13 +209,15 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, trace_pm_qos_update_target(action, prev_value, curr_value); if (prev_value != curr_value) { - blocking_notifier_call_chain(c->notifiers, - (unsigned long)curr_value, - NULL); - return 1; + ret = 1; + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, + (unsigned long)curr_value, + NULL); } else { - return 0; + ret = 0; } + return ret; } /** diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b38109e204af..18fb7a2fb14b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -27,6 +27,7 @@ #include <linux/highmem.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/compiler.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free) struct linked_page { struct linked_page *next; char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); +} __packed; static inline void free_list_of_pages(struct linked_page *list, int clear_page_nosave) @@ -637,7 +638,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem(sizeof(struct nosave_region)); + region = memblock_virt_alloc(sizeof(struct nosave_region), 0); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); @@ -1268,7 +1269,7 @@ static void free_unnecessary_pages(void) * [number of saveable pages] - [number of pages that can be freed in theory] * * where the second term is the sum of (1) reclaimable slab pages, (2) active - * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages, * minus mapped file pages. */ static unsigned long minimum_image_size(unsigned long saveable) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 62ee437b5c7e..c3ad9cafe930 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -26,6 +26,7 @@ #include <linux/syscore_ops.h> #include <linux/ftrace.h> #include <trace/events/power.h> +#include <linux/compiler.h> #include "power.h" @@ -39,7 +40,7 @@ static const struct platform_suspend_ops *suspend_ops; static bool need_suspend_ops(suspend_state_t state) { - return !!(state > PM_SUSPEND_FREEZE); + return state > PM_SUSPEND_FREEZE; } static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); @@ -156,13 +157,13 @@ static int suspend_prepare(suspend_state_t state) } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +void __weak arch_suspend_disable_irqs(void) { local_irq_disable(); } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +void __weak arch_suspend_enable_irqs(void) { local_irq_enable(); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c33ed200410..8c9a4819f798 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -101,7 +101,7 @@ struct swsusp_header { unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; -} __attribute__((packed)); +} __packed; static struct swsusp_header *swsusp_header; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 8f50de394d22..019069c84ff6 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -18,6 +18,8 @@ #include <linux/rbtree.h> #include <linux/slab.h> +#include "power.h" + static DEFINE_MUTEX(wakelocks_lock); struct wakelock { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index be7c86bae576..a45b50962295 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -319,7 +319,7 @@ static void log_store(int facility, int level, else free = log_first_idx - log_next_idx; - if (free > size + sizeof(struct printk_log)) + if (free >= size + sizeof(struct printk_log)) break; /* drop old messages until we have enough contiuous space */ @@ -327,7 +327,7 @@ static void log_store(int facility, int level, log_first_seq++; } - if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { + if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* * This message + an additional empty header does not fit * at the end of the buffer. Add an empty header with len == 0 @@ -351,7 +351,7 @@ static void log_store(int facility, int level, else msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; + msg->len = size; /* insert message */ log_next_idx += msg->len; @@ -757,14 +757,10 @@ void __init setup_log_buf(int early) return; if (early) { - unsigned long mem; - - mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (!mem) - return; - new_log_buf = __va(mem); + new_log_buf = + memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); } else { - new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); } if (unlikely(!new_log_buf)) { @@ -1080,7 +1076,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) next_seq = log_next_seq; len = 0; - prev = 0; while (len >= 0 && seq < next_seq) { struct printk_log *msg = log_from_idx(idx); int textlen; @@ -1565,9 +1560,12 @@ asmlinkage int vprintk_emit(int facility, int level, level = kern_level - '0'; case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; - case 'c': /* KERN_CONT */ - break; } + /* + * No need to check length here because vscnprintf + * put '\0' at the end of the string. Only valid and + * newly printed level is detected. + */ text_len -= end_of_header - text; text = (char *)end_of_header; } @@ -1599,10 +1597,13 @@ asmlinkage int vprintk_emit(int facility, int level, * either merge it with the current buffer and flush, or if * there was a race with interrupts (prefix == true) then just * flush it out and store this line separately. + * If the preceding printk was from a different task and missed + * a newline, flush and append the newline. */ - if (cont.len && cont.owner == current) { - if (!(lflags & LOG_PREFIX)) - stored = cont_add(facility, level, text, text_len); + if (cont.len) { + if (cont.owner == current && !(lflags & LOG_PREFIX)) + stored = cont_add(facility, level, text, + text_len); cont_flush(LOG_NEWLINE); } @@ -1882,6 +1883,7 @@ void suspend_console(void) console_lock(); console_suspended = 1; up(&console_sem); + mutex_release(&console_lock_dep_map, 1, _RET_IP_); } void resume_console(void) @@ -1889,6 +1891,7 @@ void resume_console(void) if (!console_suspend_enabled) return; down(&console_sem); + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); console_suspended = 0; console_unlock(); } @@ -2789,7 +2792,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, next_idx = idx; l = 0; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); diff --git a/kernel/profile.c b/kernel/profile.c index 6631e1ef55ab..cb980f0c731b 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -549,14 +549,14 @@ static int create_hash_tables(void) struct page *page; page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; @@ -591,18 +591,28 @@ out_cleanup: int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ { struct proc_dir_entry *entry; + int err = 0; if (!prof_on) return 0; - if (create_hash_tables()) - return -ENOMEM; + + cpu_notifier_register_begin(); + + if (create_hash_tables()) { + err = -ENOMEM; + goto out; + } + entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) - return 0; + goto out; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - hotcpu_notifier(profile_cpu_callback, 0); - return 0; + __hotcpu_notifier(profile_cpu_callback, 0); + +out: + cpu_notifier_register_done(); + return err; } -module_init(create_proc_profile); +subsys_initcall(create_proc_profile); #endif /* CONFIG_PROC_FS */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f4bcb3cc21c..adf98622cb32 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1180,8 +1180,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, return ret; } -asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, - compat_long_t addr, compat_long_t data) +COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, + compat_long_t, addr, compat_long_t, data) { struct task_struct *child; long ret; diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 01e9ec37a3e3..807ccfbf69b3 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,5 +1,5 @@ obj-y += update.o srcu.o -obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7859a0a3951e..bfda2726ca45 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2011 * @@ -23,6 +23,7 @@ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include <trace/events/rcu.h> #ifdef CONFIG_RCU_TRACE #define RCU_TRACE(stmt) stmt #else /* #ifdef CONFIG_RCU_TRACE */ @@ -96,25 +97,26 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -extern void kfree(const void *); +void kfree(const void *); static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; + rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); + rcu_lock_release(&rcu_callback_map); return 1; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); + rcu_lock_release(&rcu_callback_map); return 0; } } -extern int rcu_expedited; - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_suppress; diff --git a/kernel/rcu/torture.c b/kernel/rcu/rcutorture.c index 3929cd451511..bd30bc61bc05 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/rcutorture.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2005, 2006 * @@ -48,112 +48,58 @@ #include <linux/slab.h> #include <linux/trace_clock.h> #include <asm/byteorder.h> +#include <linux/torture.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); -MODULE_ALIAS("rcutorture"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutorture." - -static int fqs_duration; -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); -static int fqs_holdoff; -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -static int fqs_stutter = 3; -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -static bool gp_exp; -module_param(gp_exp, bool, 0444); -MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); -static bool gp_normal; -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); -static int irqreader = 1; -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -static int n_barrier_cbs; -module_param(n_barrier_cbs, int, 0444); -MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -static int nfakewriters = 4; -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -static int nreaders = -1; -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -static int object_debug; -module_param(object_debug, int, 0444); -MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -static int onoff_holdoff; -module_param(onoff_holdoff, int, 0444); -MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); -static int onoff_interval; -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -static int shuffle_interval = 3; -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -static int shutdown_secs; -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); -static int stall_cpu; -module_param(stall_cpu, int, 0444); -MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); -static int stall_cpu_holdoff = 10; -module_param(stall_cpu_holdoff, int, 0444); -MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); -static int stat_interval = 60; -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -static int stutter = 5; -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -static int test_boost = 1; -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -static int test_boost_duration = 4; -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -static int test_boost_interval = 7; -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -static bool test_no_idle_hz = true; -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); + +torture_param(int, fqs_duration, 0, + "Duration of fqs bursts (us), 0 to disable"); +torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); +torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); +torture_param(bool, gp_normal, false, + "Use normal (non-expedited) GP wait primitives"); +torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, n_barrier_cbs, 0, + "# of callbacks/kthreads for barrier testing"); +torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, object_debug, 0, + "Enable debug-object double call_rcu() testing"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); +torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); +torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); +torture_param(int, stall_cpu_holdoff, 10, + "Time to wait before starting stall (s)."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of seconds to run/halt test"); +torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +torture_param(int, test_boost_duration, 4, + "Duration of each boost test, seconds."); +torture_param(int, test_boost_interval, 7, + "Interval between boost tests, seconds."); +torture_param(bool, test_no_idle_hz, true, + "Test support for tickless idle CPUs"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); + static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -static bool verbose; -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); - -#define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ - do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - -static char printk_buf[4096]; static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; -static struct task_struct *shuffler_task; -static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static struct task_struct *stall_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -172,10 +118,10 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = - { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = - { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -188,22 +134,9 @@ static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static unsigned long sum_offline; -static int min_offline = -1; -static int max_offline; -static long n_online_attempts; -static long n_online_successes; -static unsigned long sum_online; -static int min_online = -1; -static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; - -static int stutter_pause_test; #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -234,7 +167,6 @@ static u64 notrace rcu_trace_clock_local(void) } #endif /* #else #ifdef CONFIG_RCU_TRACE */ -static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -244,51 +176,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - -/* Forward reference. */ -static void rcu_torture_cleanup(void); - -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(const char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - /* * Allocate an element from the rcu_tortures pool. */ @@ -322,44 +209,6 @@ rcu_torture_free(struct rcu_torture *p) spin_unlock_bh(&rcu_torture_lock); } -struct rcu_random_state { - unsigned long rrs_state; - long rrs_count; -}; - -#define RCU_RANDOM_MULT 39916801 /* prime */ -#define RCU_RANDOM_ADD 479001701 /* prime */ -#define RCU_RANDOM_REFRESH 10000 - -#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } - -/* - * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from cpu_clock(). - */ -static unsigned long -rcu_random(struct rcu_random_state *rrsp) -{ - if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += (unsigned long)local_clock(); - rrsp->rrs_count = RCU_RANDOM_REFRESH; - } - rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; - return swahw32(rrsp->rrs_state); -} - -static void -rcu_stutter_wait(const char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); - } -} - /* * Operations vector for selecting different types of tests. */ @@ -367,7 +216,7 @@ rcu_stutter_wait(const char *title) struct rcu_torture_ops { void (*init)(void); int (*readlock)(void); - void (*read_delay)(struct rcu_random_state *rrsp); + void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); int (*completed)(void); void (*deferred_free)(struct rcu_torture *p); @@ -376,7 +225,7 @@ struct rcu_torture_ops { void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); - int (*stats)(char *page); + void (*stats)(char *page); int irq_capable; int can_boost; const char *name; @@ -394,7 +243,7 @@ static int rcu_torture_read_lock(void) __acquires(RCU) return 0; } -static void rcu_read_delay(struct rcu_random_state *rrsp) +static void rcu_read_delay(struct torture_random_state *rrsp) { const unsigned long shortdelay_us = 200; const unsigned long longdelay_ms = 50; @@ -403,12 +252,13 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) * period, and we want a long delay occasionally to trigger * force_quiescent_state. */ - if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) mdelay(longdelay_ms); - if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); #ifdef CONFIG_PREEMPT - if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + if (!preempt_count() && + !(torture_random(rrsp) % (nrealreaders * 20000))) preempt_schedule(); /* No QS if preempt_disable() in effect */ #endif } @@ -429,7 +279,7 @@ rcu_torture_cb(struct rcu_head *p) int i; struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - if (fullstop != FULLSTOP_DONTSTOP) { + if (torture_must_stop_irq()) { /* Test is ending, just drop callbacks on the floor. */ /* The next initialization will pick up the pieces. */ return; @@ -522,6 +372,48 @@ static struct rcu_torture_ops rcu_bh_ops = { }; /* + * Don't even think about trying any of these in real life!!! + * The names includes "busted", and they really means it! + * The only purpose of these functions is to provide a buggy RCU + * implementation to make sure that rcutorture correctly emits + * buggy-RCU error messages. + */ +static void rcu_busted_torture_deferred_free(struct rcu_torture *p) +{ + /* This is a deliberate bug for testing purposes only! */ + rcu_torture_cb(&p->rtort_rcu); +} + +static void synchronize_rcu_busted(void) +{ + /* This is a deliberate bug for testing purposes only! */ +} + +static void +call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + /* This is a deliberate bug for testing purposes only! */ + func(head); +} + +static struct rcu_torture_ops rcu_busted_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_busted_torture_deferred_free, + .sync = synchronize_rcu_busted, + .exp_sync = synchronize_rcu_busted, + .call = call_rcu_busted, + .cb_barrier = NULL, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_busted" +}; + +/* * Definitions for srcu torture testing. */ @@ -532,7 +424,7 @@ static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) return srcu_read_lock(&srcu_ctl); } -static void srcu_read_delay(struct rcu_random_state *rrsp) +static void srcu_read_delay(struct torture_random_state *rrsp) { long delay; const long uspertick = 1000000 / HZ; @@ -540,7 +432,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) /* We want there to be long-running readers, but not all the time. */ - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); + delay = torture_random(rrsp) % + (nrealreaders * 2 * longdelay * uspertick); if (!delay) schedule_timeout_interruptible(longdelay); else @@ -578,21 +471,19 @@ static void srcu_torture_barrier(void) srcu_barrier(&srcu_ctl); } -static int srcu_torture_stats(char *page) +static void srcu_torture_stats(char *page) { - int cnt = 0; int cpu; int idx = srcu_ctl.completed & 0x1; - cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", + page += sprintf(page, "%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, + page += sprintf(page, " %d(%lu,%lu)", cpu, per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); } - cnt += sprintf(&page[cnt], "\n"); - return cnt; + sprintf(page, "\n"); } static void srcu_torture_synchronize_expedited(void) @@ -681,12 +572,12 @@ static int rcu_torture_boost(void *arg) struct rcu_boost_inflight rbi = { .inflight = 0 }; struct sched_param sp; - VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ sp.sched_priority = 1; if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); n_rcu_torture_boost_rterror++; } @@ -697,9 +588,8 @@ static int rcu_torture_boost(void *arg) oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + stutter_wait("rcu_torture_boost"); + if (torture_must_stop()) goto checkwait; } @@ -714,15 +604,14 @@ static int rcu_torture_boost(void *arg) call_rcu(&rbi.rcu, rcu_torture_boost_cb); if (jiffies - call_rcu_time > test_boost_duration * HZ - HZ / 2) { - VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; } call_rcu_time = jiffies; } cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + stutter_wait("rcu_torture_boost"); + if (torture_must_stop()) goto checkwait; } @@ -746,16 +635,17 @@ static int rcu_torture_boost(void *arg) } /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); +checkwait: stutter_wait("rcu_torture_boost"); + } while (!torture_must_stop()); /* Clean up and exit. */ - VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) + while (!kthread_should_stop() || rbi.inflight) { + torture_shutdown_absorb("rcu_torture_boost"); schedule_timeout_uninterruptible(1); + } smp_mb(); /* order accesses to ->inflight before stack-frame death. */ destroy_rcu_head_on_stack(&rbi.rcu); + torture_kthread_stopping("rcu_torture_boost"); return 0; } @@ -770,7 +660,7 @@ rcu_torture_fqs(void *arg) unsigned long fqs_resume_time; int fqs_burst_remaining; - VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; while (ULONG_CMP_LT(jiffies, fqs_resume_time) && @@ -784,12 +674,9 @@ rcu_torture_fqs(void *arg) udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; } - rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + stutter_wait("rcu_torture_fqs"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_fqs"); return 0; } @@ -806,10 +693,10 @@ rcu_torture_writer(void *arg) struct rcu_torture *rp; struct rcu_torture *rp1; struct rcu_torture *old_rp; - static DEFINE_RCU_RANDOM(rand); + static DEFINE_TORTURE_RANDOM(rand); - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); + VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); + set_user_nice(current, MAX_NICE); do { schedule_timeout_uninterruptible(1); @@ -817,7 +704,7 @@ rcu_torture_writer(void *arg) if (rp == NULL) continue; rp->rtort_pipe_count = 0; - udelay(rcu_random(&rand) & 0x3ff); + udelay(torture_random(&rand) & 0x3ff); old_rp = rcu_dereference_check(rcu_torture_current, current == writer_task); rp->rtort_mbtest = 1; @@ -830,7 +717,7 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; if (gp_normal == gp_exp) - exp = !!(rcu_random(&rand) & 0x80); + exp = !!(torture_random(&rand) & 0x80); else exp = gp_exp; if (!exp) { @@ -856,12 +743,9 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); - rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + stutter_wait("rcu_torture_writer"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_writer"); return 0; } @@ -872,19 +756,19 @@ rcu_torture_writer(void *arg) static int rcu_torture_fakewriter(void *arg) { - DEFINE_RCU_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); + VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); + set_user_nice(current, MAX_NICE); do { - schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); - udelay(rcu_random(&rand) & 0x3ff); + schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); + udelay(torture_random(&rand) & 0x3ff); if (cur_ops->cb_barrier != NULL && - rcu_random(&rand) % (nfakewriters * 8) == 0) { + torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); } else if (gp_normal == gp_exp) { - if (rcu_random(&rand) & 0x80) + if (torture_random(&rand) & 0x80) cur_ops->sync(); else cur_ops->exp_sync(); @@ -893,13 +777,10 @@ rcu_torture_fakewriter(void *arg) } else { cur_ops->exp_sync(); } - rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + stutter_wait("rcu_torture_fakewriter"); + } while (!torture_must_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_fakewriter"); return 0; } @@ -925,7 +806,7 @@ static void rcu_torture_timer(unsigned long unused) int idx; int completed; int completed_end; - static DEFINE_RCU_RANDOM(rand); + static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; int pipe_count; @@ -984,14 +865,14 @@ rcu_torture_reader(void *arg) int completed; int completed_end; int idx; - DEFINE_RCU_RANDOM(rand); + DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; int pipe_count; struct timer_list t; unsigned long long ts; - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); + VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); + set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) setup_timer_on_stack(&t, rcu_torture_timer, 0); @@ -1038,24 +919,20 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); - rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); + stutter_wait("rcu_torture_reader"); + } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_reader"); return 0; } /* * Create an RCU-torture statistics message in the specified buffer. */ -static int +static void rcu_torture_printk(char *page) { - int cnt = 0; int cpu; int i; long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; @@ -1071,8 +948,8 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, rcu_torture_current_version, @@ -1080,53 +957,46 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", + page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); - cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", + page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - cnt += sprintf(&page[cnt], - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); - cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", + page = torture_onoff_stats(page); + page += sprintf(page, "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, n_rcu_torture_barrier_error); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) != 0 || n_rcu_torture_barrier_error != 0 || n_rcu_torture_boost_ktrerror != 0 || n_rcu_torture_boost_rterror != 0 || n_rcu_torture_boost_failure != 0 || i > 1) { - cnt += sprintf(&page[cnt], "!!! "); + page += sprintf(page, "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(1); } - cnt += sprintf(&page[cnt], "Reader Pipe: "); + page += sprintf(page, "Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Reader Batch: "); + page += sprintf(page, " %ld", pipesummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Reader Batch: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + page += sprintf(page, " %ld", batchsummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - cnt += sprintf(&page[cnt], " %d", + page += sprintf(page, " %d", atomic_read(&rcu_torture_wcount[i])); } - cnt += sprintf(&page[cnt], "\n"); + page += sprintf(page, "\n"); if (cur_ops->stats) - cnt += cur_ops->stats(&page[cnt]); - return cnt; + cur_ops->stats(page); } /* @@ -1140,132 +1010,33 @@ rcu_torture_printk(char *page) static void rcu_torture_stats_print(void) { - int cnt; + int size = nr_cpu_ids * 200 + 8192; + char *buf; - cnt = rcu_torture_printk(printk_buf); - pr_alert("%s", printk_buf); + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("rcu-torture: Out of memory, need: %d", size); + return; + } + rcu_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); } /* * Periodically prints torture statistics, if periodic statistics printing * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. */ static int rcu_torture_stats(void *arg) { - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + VERBOSE_TOROUT_STRING("rcu_torture_stats task started"); do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); - return 0; -} - -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - if (stutter_task) - set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); - if (fqs_task) - set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); - if (shutdown_task) - set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); -#ifdef CONFIG_HOTPLUG_CPU - if (onoff_task) - set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - if (stall_task) - set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); - if (barrier_cbs_tasks) - for (i = 0; i < n_barrier_cbs; i++) - if (barrier_cbs_tasks[i]) - set_cpus_allowed_ptr(barrier_cbs_tasks[i], - shuffle_tmp_mask); - if (barrier_task) - set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. - */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. - */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + torture_shutdown_absorb("rcu_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_stats"); return 0; } @@ -1292,10 +1063,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) onoff_interval, onoff_holdoff); } -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - static void rcutorture_booster_cleanup(int cpu) { struct task_struct *t; @@ -1303,14 +1070,12 @@ static void rcutorture_booster_cleanup(int cpu) if (boost_tasks[cpu] == NULL) return; mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); /* This must be outside of the mutex, otherwise deadlock! */ - kthread_stop(t); - boost_tasks[cpu] = NULL; + torture_stop_kthread(rcu_torture_boost, t); } static int rcutorture_booster_init(int cpu) @@ -1322,13 +1087,13 @@ static int rcutorture_booster_init(int cpu) /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, cpu_to_node(cpu), "rcu_torture_boost"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); - VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); n_rcu_torture_boost_ktrerror++; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); @@ -1341,175 +1106,6 @@ static int rcutorture_booster_init(int cpu) } /* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. */ - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. - */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - unsigned long delta; - int maxcpu = -1; - DEFINE_RCU_RANDOM(rand); - int ret; - unsigned long starttime; - - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - if (onoff_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); - } - while (!kthread_should_stop()) { - cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - int ret; - - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); - onoff_task = NULL; -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static int -rcu_torture_onoff_init(void) -{ - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -/* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. */ @@ -1517,11 +1113,11 @@ static int rcu_torture_stall(void *args) { unsigned long stop_at; - VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); if (stall_cpu_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; @@ -1535,7 +1131,7 @@ static int rcu_torture_stall(void *args) rcu_read_unlock(); pr_alert("rcu_torture_stall end.\n"); } - rcutorture_shutdown_absorb("rcu_torture_stall"); + torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); return 0; @@ -1544,27 +1140,9 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - int ret; - if (stall_cpu <= 0) return 0; - stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); - if (IS_ERR(stall_task)) { - ret = PTR_ERR(stall_task); - stall_task = NULL; - return ret; - } - return 0; -} - -/* Clean up after the CPU-stall kthread, if one was spawned. */ -static void rcu_torture_stall_cleanup(void) -{ - if (stall_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); - kthread_stop(stall_task); - stall_task = NULL; + return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } /* Callback function for RCU barrier testing. */ @@ -1578,30 +1156,28 @@ static int rcu_torture_barrier_cbs(void *arg) { long myid = (long)arg; bool lastphase = 0; + bool newphase; struct rcu_head rcu; init_rcu_head_on_stack(&rcu); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); - set_user_nice(current, 19); + VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started"); + set_user_nice(current, MAX_NICE); do { wait_event(barrier_cbs_wq[myid], - barrier_phase != lastphase || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - lastphase = barrier_phase; + (newphase = + ACCESS_ONCE(barrier_phase)) != lastphase || + torture_must_stop()); + lastphase = newphase; smp_mb(); /* ensure barrier_phase load before ->call(). */ - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); + } while (!torture_must_stop()); cur_ops->cb_barrier(); destroy_rcu_head_on_stack(&rcu); + torture_kthread_stopping("rcu_torture_barrier_cbs"); return 0; } @@ -1610,7 +1186,7 @@ static int rcu_torture_barrier(void *arg) { int i; - VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting"); do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); @@ -1620,23 +1196,19 @@ static int rcu_torture_barrier(void *arg) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, atomic_read(&barrier_cbs_count) == 0 || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + torture_must_stop()); + if (torture_must_stop()) break; n_barrier_attempts++; - cur_ops->cb_barrier(); + cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */ if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { n_rcu_torture_barrier_error++; WARN_ON_ONCE(1); } n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_barrier"); return 0; } @@ -1669,24 +1241,13 @@ static int rcu_torture_barrier_init(void) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); - barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)(long)i, - "rcu_torture_barrier_cbs"); - if (IS_ERR(barrier_cbs_tasks[i])) { - ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); - barrier_cbs_tasks[i] = NULL; + ret = torture_create_kthread(rcu_torture_barrier_cbs, + (void *)(long)i, + barrier_cbs_tasks[i]); + if (ret) return ret; - } - } - barrier_task = kthread_run(rcu_torture_barrier, NULL, - "rcu_torture_barrier"); - if (IS_ERR(barrier_task)) { - ret = PTR_ERR(barrier_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); - barrier_task = NULL; } - return 0; + return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task); } /* Clean up after RCU barrier testing. */ @@ -1694,19 +1255,11 @@ static void rcu_torture_barrier_cleanup(void) { int i; - if (barrier_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); - kthread_stop(barrier_task); - barrier_task = NULL; - } + torture_stop_kthread(rcu_torture_barrier, barrier_task); if (barrier_cbs_tasks != NULL) { - for (i = 0; i < n_barrier_cbs; i++) { - if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); - kthread_stop(barrier_cbs_tasks[i]); - barrier_cbs_tasks[i] = NULL; - } - } + for (i = 0; i < n_barrier_cbs; i++) + torture_stop_kthread(rcu_torture_barrier_cbs, + barrier_cbs_tasks[i]); kfree(barrier_cbs_tasks); barrier_cbs_tasks = NULL; } @@ -1744,90 +1297,42 @@ rcu_torture_cleanup(void) { int i; - mutex_lock(&fullstop_mutex); rcutorture_record_test_transition(); - if (fullstop == FULLSTOP_SHUTDOWN) { - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); + if (torture_cleanup()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_shutdown_nb); - rcu_torture_barrier_cleanup(); - rcu_torture_stall_cleanup(); - if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; - if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; - if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; + rcu_torture_barrier_cleanup(); + torture_stop_kthread(rcu_torture_stall, stall_task); + torture_stop_kthread(rcu_torture_writer, writer_task); if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_torture_reader, + reader_tasks[i]); kfree(reader_tasks); - reader_tasks = NULL; } rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; + torture_stop_kthread(rcu_torture_fakewriter, + fakewriter_tasks[i]); } kfree(fakewriter_tasks); fakewriter_tasks = NULL; } - if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; + torture_stop_kthread(rcu_torture_stats, stats_task); + torture_stop_kthread(rcu_torture_fqs, fqs_task); if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { unregister_cpu_notifier(&rcutorture_cpu_nb); for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } - if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; - rcu_torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. */ @@ -1838,8 +1343,7 @@ rcu_torture_cleanup(void) if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else if (n_online_successes != n_online_attempts || - n_offline_successes != n_offline_attempts) + else if (torture_onoff_failures()) rcu_torture_print_module_parms(cur_ops, "End of test: RCU_HOTPLUG"); else @@ -1908,12 +1412,11 @@ rcu_torture_init(void) int i; int cpu; int firsterr = 0; - int retval; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, }; - mutex_lock(&fullstop_mutex); + torture_init_begin(torture_type, verbose, &rcutorture_runnable); /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -1928,7 +1431,7 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { @@ -1943,7 +1446,6 @@ rcu_torture_init(void) else nrealreaders = 2 * num_online_cpus(); rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ @@ -1979,108 +1481,61 @@ rcu_torture_init(void) /* Start up the kthreads. */ - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_create(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); - writer_task = NULL; + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (firsterr) goto unwind; - } - wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); + VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_fakewriter, + NULL, fakewriter_tasks[i]); + if (firsterr) goto unwind; - } } reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); + VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_reader, NULL, + reader_tasks[i]); + if (firsterr) goto unwind; - } } if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); - stats_task = NULL; + firsterr = torture_create_kthread(rcu_torture_stats, NULL, + stats_task); + if (firsterr) goto unwind; - } } if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); + firsterr = torture_shuffle_init(shuffle_interval * HZ); + if (firsterr) goto unwind; - } - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } } if (stutter < 0) stutter = 0; if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; + firsterr = torture_stutter_init(stutter * HZ); + if (firsterr) goto unwind; - } } if (fqs_duration < 0) fqs_duration = 0; if (fqs_duration) { - /* Create the stutter thread */ - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; + /* Create the fqs thread */ + torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); + if (firsterr) goto unwind; - } } if (test_boost_interval < 1) test_boost_interval = 1; @@ -2094,49 +1549,31 @@ rcu_torture_init(void) for_each_possible_cpu(i) { if (cpu_is_offline(i)) continue; /* Heuristic: CPU can go offline. */ - retval = rcutorture_booster_init(i); - if (retval < 0) { - firsterr = retval; + firsterr = rcutorture_booster_init(i); + if (firsterr) goto unwind; - } } } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - wake_up_process(shutdown_task); - } - i = rcu_torture_onoff_init(); - if (i != 0) { - firsterr = i; + firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); + if (firsterr) goto unwind; - } - register_reboot_notifier(&rcutorture_shutdown_nb); - i = rcu_torture_stall_init(); - if (i != 0) { - firsterr = i; + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); + if (firsterr) goto unwind; - } - retval = rcu_torture_barrier_init(); - if (retval != 0) { - firsterr = retval; + firsterr = rcu_torture_stall_init(); + if (firsterr) + goto unwind; + firsterr = rcu_torture_barrier_init(); + if (firsterr) goto unwind; - } if (object_debug) rcu_test_debug_objects(); rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return 0; unwind: - mutex_unlock(&fullstop_mutex); + torture_init_end(); rcu_torture_cleanup(); return firsterr; } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 01d5ccb8bfe3..c639556f3fa0 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 @@ -36,8 +36,6 @@ #include <linux/delay.h> #include <linux/srcu.h> -#include <trace/events/rcu.h> - #include "rcu.h" /* @@ -363,6 +361,29 @@ static void srcu_flip(struct srcu_struct *sp) /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing SRCU read-side critical section. On systems with + * more than one CPU, this means that when "func()" is invoked, each CPU + * is guaranteed to have executed a full memory barrier since the end of + * its last corresponding SRCU read-side critical section whose beginning + * preceded the call to call_rcu(). It also means that each CPU executing + * an SRCU read-side critical section that continues beyond the start of + * "func()" must have executed a memory barrier after the call_rcu() + * but before the beginning of that SRCU read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting SRCU callback function "func()", then both CPU A and CPU + * B are guaranteed to execute a full memory barrier during the time + * interval between the call to call_rcu() and the invocation of "func()". + * This guarantee applies even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + * + * Of course, these guarantees apply only for invocations of call_srcu(), + * srcu_read_lock(), and srcu_read_unlock() that are all passed the same + * srcu_struct structure. */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, void (*func)(struct rcu_head *head)) @@ -375,7 +396,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; - schedule_delayed_work(&sp->work, 0); + queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); } @@ -459,7 +480,30 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) * Note that it is illegal to call synchronize_srcu() from the corresponding * SRCU read-side critical section; doing so will result in deadlock. * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section. + * srcu_struct from some other srcu_struct's read-side critical section, + * as long as the resulting graph of srcu_structs is acyclic. + * + * There are memory-ordering constraints implied by synchronize_srcu(). + * On systems with more than one CPU, when synchronize_srcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last corresponding SRCU-sched read-side critical section + * whose beginning preceded the call to synchronize_srcu(). In addition, + * each CPU having an SRCU read-side critical section that extends beyond + * the return from synchronize_srcu() is guaranteed to have executed a + * full memory barrier after the beginning of synchronize_srcu() and before + * the beginning of that SRCU read-side critical section. Note that these + * guarantees include CPUs that are offline, idle, or executing in user mode, + * as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_srcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_srcu(). This guarantee applies even if CPU A and CPU B + * are the same CPU, but again only if the system has more than one CPU. + * + * Of course, these memory-ordering guarantees apply only when + * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are + * passed the same srcu_struct structure. */ void synchronize_srcu(struct srcu_struct *sp) { @@ -476,12 +520,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); * Wait for an SRCU grace period to elapse, but be more aggressive about * spinning rather than blocking when waiting. * - * Note that it is also illegal to call synchronize_srcu_expedited() - * from the corresponding SRCU read-side critical section; - * doing so will result in deadlock. However, it is perfectly legal - * to call synchronize_srcu_expedited() on one srcu_struct from some - * other srcu_struct's read-side critical section, as long as - * the resulting graph of srcu_structs is acyclic. + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). */ void synchronize_srcu_expedited(struct srcu_struct *sp) { @@ -491,6 +531,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + * @sp: srcu_struct on which to wait for in-flight callbacks. */ void srcu_barrier(struct srcu_struct *sp) { @@ -631,7 +672,8 @@ static void srcu_reschedule(struct srcu_struct *sp) } if (pending) - schedule_delayed_work(&sp->work, SRCU_INTERVAL); + queue_delayed_work(system_power_efficient_wq, + &sp->work, SRCU_INTERVAL); } /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 1254f312d024..d9efcc13008c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * @@ -37,10 +37,6 @@ #include <linux/prefetch.h> #include <linux/ftrace_event.h> -#ifdef CONFIG_RCU_TRACE -#include <trace/events/rcu.h> -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - #include "rcu.h" /* Forward declarations for tiny_plugin.h. */ diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 280d06cae352..431528520562 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (c) 2010 Linaro * diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd081987a8ec..0c47e300210a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * @@ -58,8 +58,6 @@ #include <linux/suspend.h> #include "tree.h" -#include <trace/events/rcu.h> - #include "rcu.h" MODULE_ALIAS("rcutree"); @@ -369,6 +367,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { + struct rcu_state *rsp; + struct rcu_data *rdp; + trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = @@ -380,6 +381,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, current->pid, current->comm, idle->pid, idle->comm); /* must be idle task! */ } + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + do_nocb_deferred_wakeup(rdp); + } rcu_prepare_for_idle(smp_processor_id()); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. */ @@ -411,11 +416,12 @@ static void rcu_eqs_enter(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); - if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { rdtp->dynticks_nesting = 0; - else + rcu_eqs_enter_common(rdtp, oldval, user); + } else { rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_eqs_enter_common(rdtp, oldval, user); + } } /** @@ -533,11 +539,12 @@ static void rcu_eqs_exit(bool user) rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) + if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else + } else { rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(rdtp, oldval, user); + rcu_eqs_exit_common(rdtp, oldval, user); + } } /** @@ -716,7 +723,7 @@ bool rcu_lockdep_current_cpu_online(void) bool ret; if (in_nmi()) - return 1; + return true; preempt_disable(); rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; @@ -755,6 +762,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, } /* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); + +/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -812,16 +825,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, */ rcu_kick_nohz_cpu(rdp->cpu); + /* + * Alternatively, the CPU might be running in the kernel + * for an extended period of time without a quiescent state. + * Attempt to force the CPU through the scheduler to gain the + * needed quiescent state, but only if the grace period has gone + * on for an uncommonly long time. If there are many stuck CPUs, + * we will beat on the first one until it gets unstuck, then move + * to the next. Only do this for the primary flavor of RCU. + */ + if (rdp->rsp == rcu_state && + ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { + rdp->rsp->jiffies_resched += 5; + resched_cpu(rdp->cpu); + } + return 0; } static void record_gp_stall_check_time(struct rcu_state *rsp) { - unsigned long j = ACCESS_ONCE(jiffies); + unsigned long j = jiffies; + unsigned long j1; rsp->gp_start = j; smp_wmb(); /* Record start time before stall time. */ - rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); + j1 = rcu_jiffies_till_stall_check(); + rsp->jiffies_stall = j + j1; + rsp->jiffies_resched = j + j1 / 2; } /* @@ -972,7 +1003,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) return; - j = ACCESS_ONCE(jiffies); + j = jiffies; /* * Lots of memory barriers to reject false positives. @@ -1133,8 +1164,10 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) + if (rnp != rnp_root) { raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); + } /* * Get a new grace-period number. If there really is no grace @@ -1354,6 +1387,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); return; } + smp_mb__after_unlock_lock(); __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1368,6 +1402,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); if (rsp->gp_flags == 0) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); @@ -1386,13 +1421,14 @@ static int rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); - smp_wmb(); /* Record GP times before starting GP. */ - rsp->gpnum++; + /* Record GP times before starting GP, hence smp_store_release(). */ + smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ mutex_lock(&rsp->onoff_mutex); + smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1409,6 +1445,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1463,6 +1500,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); rsp->gp_flags &= ~RCU_GP_FLAG_FQS; raw_spin_unlock_irq(&rnp->lock); } @@ -1480,6 +1518,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -1505,19 +1544,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) __note_gp_changes(rsp, rnp, rdp); + /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); - rsp->completed = rsp->gpnum; /* Declare grace period done. */ + /* Declare grace period done. */ + ACCESS_ONCE(rsp->completed) = rsp->gpnum; trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); @@ -1553,6 +1596,7 @@ static int __noreturn rcu_gp_kthread(void *arg) wait_event_interruptible(rsp->gp_wq, ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); + /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; cond_resched(); @@ -1582,6 +1626,7 @@ static int __noreturn rcu_gp_kthread(void *arg) (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), j); + /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) @@ -1749,6 +1794,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); WARN_ON_ONCE(rnp_c->qsmask); } @@ -1778,6 +1824,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum) { @@ -1901,13 +1948,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * Adopt the RCU callbacks from the specified rcu_state structure's * orphanage. The caller must hold the ->orphan_lock. */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. */ - if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) + if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) return; /* Do the accounting first. */ @@ -1986,12 +2033,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp); + rcu_adopt_orphan_cbs(rsp, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) @@ -2202,6 +2250,7 @@ static void force_qs_rnp(struct rcu_state *rsp, cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (!rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2231,6 +2280,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rnp = rcu_get_root(rsp); if (rnp->qsmask == 0) { raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ } } @@ -2254,7 +2304,7 @@ static void force_quiescent_state(struct rcu_state *rsp) if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) { - rsp->n_force_qs_lh++; + ACCESS_ONCE(rsp->n_force_qs_lh)++; return; } rnp_old = rnp; @@ -2263,9 +2313,10 @@ static void force_quiescent_state(struct rcu_state *rsp) /* Reached the root of the rcu_node tree, acquire lock. */ raw_spin_lock_irqsave(&rnp_old->lock, flags); + smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; + ACCESS_ONCE(rsp->n_force_qs_lh)++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } @@ -2303,6 +2354,9 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* If there are callbacks ready, invoke them. */ if (cpu_has_callbacks_ready_to_invoke(rdp)) invoke_rcu_callbacks(rsp, rdp); + + /* Do any needed deferred wakeups of rcuo kthreads. */ + do_nocb_deferred_wakeup(rdp); } /* @@ -2378,6 +2432,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_node *rnp_root = rcu_get_root(rsp); raw_spin_lock(&rnp_root->lock); + smp_mb__after_unlock_lock(); rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); } else { @@ -2437,7 +2492,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (cpu != -1) rdp = per_cpu_ptr(rsp->rda, cpu); - offline = !__call_rcu_nocb(rdp, head, lazy); + offline = !__call_rcu_nocb(rdp, head, lazy, flags); WARN_ON_ONCE(offline); /* _call_rcu() is illegal on offline CPU; leak the callback. */ local_irq_restore(flags); @@ -2584,6 +2639,58 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +/** + * get_state_synchronize_rcu - Snapshot current RCU state + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_rcu(void) +{ + /* + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. + */ + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_rcu() + * and cond_synchronize_rcu(). + */ + return smp_load_acquire(&rcu_state->gpnum); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); + +/** + * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * + * @oldstate: return value from earlier call to get_state_synchronize_rcu() + * + * If a full RCU grace period has elapsed since the earlier call to + * get_state_synchronize_rcu(), just return. Otherwise, invoke + * synchronize_rcu() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. + */ +void cond_synchronize_rcu(unsigned long oldstate) +{ + unsigned long newstate; + + /* + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. + */ + newstate = smp_load_acquire(&rcu_state->completed); + if (ULONG_CMP_GE(oldstate, newstate)) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu); + static int synchronize_sched_expedited_cpu_stop(void *data) { /* @@ -2757,6 +2864,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Check for CPU stalls, if enabled. */ check_cpu_stall(rsp, rdp); + /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ + if (rcu_nohz_full_cpu(rsp)) + return 0; + /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { @@ -2790,6 +2901,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; } + /* Does this CPU need a deferred NOCB wakeup? */ + if (rcu_nocb_need_deferred_wakeup(rdp)) { + rdp->n_rp_nocb_defer_wakeup++; + return 1; + } + /* nothing to do */ rdp->n_rp_need_nothing++; return 0; @@ -2815,7 +2932,7 @@ static int rcu_pending(int cpu) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { bool al = true; bool hc = false; @@ -3214,9 +3331,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = rcu_num_lvls - 1; i > 0; i--) + rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = rcu_fanout_leaf; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) @@ -3346,6 +3463,8 @@ static void __init rcu_init_geometry(void) if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && nr_cpu_ids == NR_CPUS) return; + pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", + rcu_fanout_leaf, nr_cpu_ids); /* * Compute number of nodes that can be handled an rcu_node tree diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 52be957c9fe2..75dc3c39a02a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -13,8 +13,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * @@ -317,6 +317,7 @@ struct rcu_data { unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; unsigned long n_rp_gp_started; + unsigned long n_rp_nocb_defer_wakeup; unsigned long n_rp_need_nothing; /* 6) _rcu_barrier() and OOM callbacks. */ @@ -335,6 +336,7 @@ struct rcu_data { int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; + bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ @@ -453,6 +455,8 @@ struct rcu_state { /* but in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + unsigned long jiffies_resched; /* Time at which to resched */ + /* a reluctant CPU. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ const char *name; /* Name of structure. */ @@ -548,9 +552,12 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy); + bool lazy, unsigned long flags); static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp); + struct rcu_data *rdp, + unsigned long flags); +static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); +static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void rcu_kick_nohz_cpu(int cpu); @@ -564,6 +571,7 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj); static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); +static bool rcu_nohz_full_cpu(struct rcu_state *rsp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 08a765232432..962d1d589929 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 @@ -204,6 +204,7 @@ static void rcu_preempt_note_context_switch(int cpu) rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -312,6 +313,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); } @@ -361,10 +363,14 @@ void rcu_read_unlock_special(struct task_struct *t) special = t->rcu_read_unlock_special; if (special & RCU_READ_UNLOCK_NEED_QS) { rcu_preempt_qs(smp_processor_id()); + if (!t->rcu_read_unlock_special) { + local_irq_restore(flags); + return; + } } - /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { + /* Hardware IRQ handlers cannot block, complain if they get here. */ + if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { local_irq_restore(flags); return; } @@ -381,6 +387,7 @@ void rcu_read_unlock_special(struct task_struct *t) for (;;) { rnp = t->rcu_blocked_node; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); if (rnp == t->rcu_blocked_node) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -605,6 +612,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, while (!list_empty(lp)) { t = list_entry(lp->next, typeof(*t), rcu_node_entry); raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); list_del(&t->rcu_node_entry); t->rcu_blocked_node = rnp_root; list_add(&t->rcu_node_entry, lp_root); @@ -629,6 +637,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * in this case. */ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); if (rnp_root->boost_tasks != NULL && rnp_root->boost_tasks != rnp_root->gp_tasks && rnp_root->boost_tasks != rnp_root->exp_tasks) @@ -772,6 +781,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -779,14 +789,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ wake_up(&sync_rcu_preempt_exp_wq); + } break; } mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock(&rnp->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); rnp->expmask &= ~mask; } } @@ -806,6 +819,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); if (list_empty(&rnp->blkd_tasks)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { @@ -886,6 +900,7 @@ void synchronize_rcu_expedited(void) /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rnp->expmask = rnp->qsmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1191,6 +1206,7 @@ static int rcu_boost(struct rcu_node *rnp) return 0; /* Nothing left to boost. */ raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* * Recheck under the lock: all tasks in need of boosting @@ -1377,6 +1393,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = RCU_BOOST_PRIO; @@ -1569,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu, NULL); } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up @@ -1639,7 +1658,7 @@ extern int tick_nohz_active; * only if it has been awhile since the last time we did so. Afterwards, * if there are any callbacks ready for immediate invocation, return true. */ -static bool rcu_try_advance_all_cbs(void) +static bool __maybe_unused rcu_try_advance_all_cbs(void) { bool cbs_ready = false; struct rcu_data *rdp; @@ -1679,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void) * * The caller must have disabled interrupts. */ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -1709,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) } return 0; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Prepare a CPU for idle from an RCU perspective. The first major task @@ -1722,6 +1743,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) */ static void rcu_prepare_for_idle(int cpu) { +#ifndef CONFIG_RCU_NOCB_CPU_ALL struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); struct rcu_node *rnp; @@ -1769,9 +1791,11 @@ static void rcu_prepare_for_idle(int cpu) continue; rnp = rdp->mynode; raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1781,11 +1805,12 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - +#ifndef CONFIG_RCU_NOCB_CPU_ALL if (rcu_is_nocb_cpu(cpu)) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1852,6 +1877,7 @@ static int rcu_oom_notify(struct notifier_block *self, /* Wait for callbacks from earlier instance to complete. */ wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); + smp_mb(); /* Ensure callback reuse happens after callback invocation. */ /* * Prevent premature wakeup: ensure that all increments happen @@ -2082,6 +2108,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_waitqueue_head(&rnp->nocb_gp_wq[1]); } +#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CPUs CPU? */ bool rcu_is_nocb_cpu(int cpu) { @@ -2089,6 +2116,7 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Enqueue the specified string of rcu_head structures onto the specified @@ -2101,7 +2129,8 @@ bool rcu_is_nocb_cpu(int cpu) static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, struct rcu_head *rhp, struct rcu_head **rhtp, - int rhcount, int rhcount_lazy) + int rhcount, int rhcount_lazy, + unsigned long flags) { int len; struct rcu_head **old_rhpp; @@ -2122,9 +2151,16 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, } len = atomic_long_read(&rdp->nocb_q_count); if (old_rhpp == &rdp->nocb_head) { - wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ + if (!irqs_disabled_flags(flags)) { + wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeEmpty")); + } else { + rdp->nocb_defer_wakeup = true; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeEmptyIsDeferred")); + } rdp->qlen_last_fqs_check = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else if (len > rdp->qlen_last_fqs_check + qhimark) { wake_up_process(t); /* ... or if many callbacks queued. */ rdp->qlen_last_fqs_check = LONG_MAX / 2; @@ -2145,12 +2181,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, * "rcuo" kthread can find it. */ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy) + bool lazy, unsigned long flags) { if (!rcu_is_nocb_cpu(rdp->cpu)) return 0; - __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); if (__is_kfree_rcu_offset((unsigned long)rhp->func)) trace_rcu_kfree_callback(rdp->rsp->name, rhp, (unsigned long)rhp->func, @@ -2168,7 +2204,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, * not a no-CBs CPU. */ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp) + struct rcu_data *rdp, + unsigned long flags) { long ql = rsp->qlen; long qll = rsp->qlen_lazy; @@ -2182,14 +2219,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, /* First, enqueue the donelist, if any. This preserves CB ordering. */ if (rsp->orphan_donelist != NULL) { __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, - rsp->orphan_donetail, ql, qll); + rsp->orphan_donetail, ql, qll, flags); ql = qll = 0; rsp->orphan_donelist = NULL; rsp->orphan_donetail = &rsp->orphan_donelist; } if (rsp->orphan_nxtlist != NULL) { __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, - rsp->orphan_nxttail, ql, qll); + rsp->orphan_nxttail, ql, qll, flags); ql = qll = 0; rsp->orphan_nxtlist = NULL; rsp->orphan_nxttail = &rsp->orphan_nxtlist; @@ -2209,6 +2246,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); c = rcu_start_future_gp(rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -2250,6 +2288,7 @@ static int rcu_nocb_kthread(void *arg) trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("Sleep")); wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); + /* Memory barrier provide by xchg() below. */ } else if (firsttime) { firsttime = 0; trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, @@ -2310,6 +2349,22 @@ static int rcu_nocb_kthread(void *arg) return 0; } +/* Is a deferred wakeup of rcu_nocb_kthread() required? */ +static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +{ + return ACCESS_ONCE(rdp->nocb_defer_wakeup); +} + +/* Do a deferred wakeup of rcu_nocb_kthread(). */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + if (!rcu_nocb_need_deferred_wakeup(rdp)) + return; + ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; + wake_up(&rdp->nocb_wq); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); +} + /* Initialize per-rcu_data variables for no-CBs CPUs. */ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { @@ -2365,13 +2420,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) } static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy) + bool lazy, unsigned long flags) { return 0; } static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp) + struct rcu_data *rdp, + unsigned long flags) { return 0; } @@ -2380,6 +2436,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } +static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) +{ + return false; +} + +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ +} + static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { } @@ -2829,3 +2894,23 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) } #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +/* + * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the + * grace-period kthread will do force_quiescent_state() processing? + * The idea is to avoid waking up RCU core processing on such a + * CPU unless the grace period has extended for too long. + * + * This code relies on the fact that all NO_HZ_FULL CPUs are also + * CONFIG_RCU_NOCB_CPU CPUs. + */ +static bool rcu_nohz_full_cpu(struct rcu_state *rsp) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_cpu(smp_processor_id()) && + (!rcu_gp_in_progress(rsp) || + ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ))) + return 1; +#endif /* #ifdef CONFIG_NO_HZ_FULL */ + return 0; +} diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 3596797b7e46..5cdc62e1beeb 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * @@ -273,7 +273,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); + ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); @@ -364,9 +364,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", + seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n", rdp->n_rp_gp_completed, rdp->n_rp_gp_started, + rdp->n_rp_nocb_defer_wakeup, rdp->n_rp_need_nothing); } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 6cb3dff89e2b..4c0a9b0af469 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2001 * @@ -49,7 +49,6 @@ #include <linux/module.h> #define CREATE_TRACE_POINTS -#include <trace/events/rcu.h> #include "rcu.h" @@ -128,6 +127,11 @@ struct lockdep_map rcu_sched_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +static struct lock_class_key rcu_callback_key; +struct lockdep_map rcu_callback_map = + STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key); +EXPORT_SYMBOL_GPL(rcu_callback_map); + int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active && debug_locks && @@ -195,17 +199,6 @@ void wait_rcu_gp(call_rcu_func_t crf) } EXPORT_SYMBOL_GPL(wait_rcu_gp); -#ifdef CONFIG_PROVE_RCU -/* - * wrapper function to avoid #include problems. - */ -int rcu_my_thread_group_empty(void) -{ - return thread_group_empty(current); -} -EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); -#endif /* #ifdef CONFIG_PROVE_RCU */ - #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD static inline void debug_init_rcu_head(struct rcu_head *head) { diff --git a/kernel/relay.c b/kernel/relay.c index 5001c9887db1..52d6a6f56261 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -227,7 +227,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) * relay_remove_buf - remove a channel buffer * @kref: target kernel reference that contains the relay buffer * - * Removes the file from the fileystem, which also frees the + * Removes the file from the filesystem, which also frees the * rchan_buf_struct and the channel buffer. Should only be called from * kref_put(). */ diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 4aa8a305aede..51dbac6a3633 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) counter->parent = parent; } -int res_counter_charge_locked(struct res_counter *counter, unsigned long val, - bool force) +static u64 res_counter_uncharge_locked(struct res_counter *counter, + unsigned long val) +{ + if (WARN_ON(counter->usage < val)) + val = counter->usage; + + counter->usage -= val; + return counter->usage; +} + +static int res_counter_charge_locked(struct res_counter *counter, + unsigned long val, bool force) { int ret = 0; @@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *top, unsigned long val) diff --git a/kernel/resource.c b/kernel/resource.c index 3f285dce9347..8957d686e29b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -432,11 +432,6 @@ static void resource_clip(struct resource *res, resource_size_t min, res->end = max; } -static bool resource_contains(struct resource *res1, struct resource *res2) -{ - return res1->start <= res2->start && res1->end >= res2->end; -} - /* * Find empty slot in the resource tree with the given range and * alignment constraints @@ -471,10 +466,11 @@ static int __find_resource(struct resource *root, struct resource *old, arch_remove_reservations(&tmp); /* Check for overflow after ALIGN() */ - avail = *new; avail.start = ALIGN(tmp.start, constraint->align); avail.end = tmp.end; + avail.flags = new->flags & ~IORESOURCE_UNSET; if (avail.start >= tmp.start) { + alloc.flags = avail.flags; alloc.start = constraint->alignf(constraint->alignf_data, &avail, size, constraint->align); alloc.end = alloc.start + size - 1; @@ -515,7 +511,7 @@ static int find_resource(struct resource *root, struct resource *new, * @newsize: new size of the resource descriptor * @constraint: the size and alignment constraints to be met. */ -int reallocate_resource(struct resource *root, struct resource *old, +static int reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, struct resource_constraint *constraint) { @@ -949,8 +945,8 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; - res->flags |= flags; + res->flags = resource_type(parent); + res->flags |= IORESOURCE_BUSY | flags; write_lock(&resource_lock); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7b621409cf15..ab32b7b0db5c 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o -obj-y += wait.o completion.o -obj-$(CONFIG_SMP) += cpupri.o +obj-y += core.o proc.o clock.o cputime.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += wait.o completion.o idle.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 4a073539c58e..e73efba98301 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) struct autogroup *ag; int err; - if (nice < -20 || nice > 19) + if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; err = security_task_setnice(current, nice); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c3ae1446461c..3ef6451e972e 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -26,9 +26,10 @@ * at 0 on boot (but people really shouldn't rely on that). * * cpu_clock(i) -- can be used from any context, including NMI. - * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) * local_clock() -- is cpu_clock() on the current cpu. * + * sched_clock_cpu(i) + * * How: * * The implementation either uses sched_clock() when @@ -50,15 +51,6 @@ * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * - * Notes: - * - * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things - * like cpufreq interrupts that can change the base clock (TSC) multiplier - * and cause funny jumps in time -- although the filtering provided by - * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it - * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on - * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -66,13 +58,16 @@ #include <linux/percpu.h> #include <linux/ktime.h> #include <linux/sched.h> +#include <linux/static_key.h> +#include <linux/workqueue.h> +#include <linux/compiler.h> /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __attribute__((weak)) sched_clock(void) +unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); @@ -82,7 +77,52 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -__read_mostly int sched_clock_stable; +static struct static_key __sched_clock_stable = STATIC_KEY_INIT; +static int __sched_clock_stable_early; + +int sched_clock_stable(void) +{ + return static_key_false(&__sched_clock_stable); +} + +static void __set_sched_clock_stable(void) +{ + if (!sched_clock_stable()) + static_key_slow_inc(&__sched_clock_stable); +} + +void set_sched_clock_stable(void) +{ + __sched_clock_stable_early = 1; + + smp_mb(); /* matches sched_clock_init() */ + + if (!sched_clock_running) + return; + + __set_sched_clock_stable(); +} + +static void __clear_sched_clock_stable(struct work_struct *work) +{ + /* XXX worry about clock continuity */ + if (sched_clock_stable()) + static_key_slow_dec(&__sched_clock_stable); +} + +static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); + +void clear_sched_clock_stable(void) +{ + __sched_clock_stable_early = 0; + + smp_mb(); /* matches sched_clock_init() */ + + if (!sched_clock_running) + return; + + schedule_work(&sched_clock_work); +} struct sched_clock_data { u64 tick_raw; @@ -116,6 +156,20 @@ void sched_clock_init(void) } sched_clock_running = 1; + + /* + * Ensure that it is impossible to not do a static_key update. + * + * Either {set,clear}_sched_clock_stable() must see sched_clock_running + * and do the update, or we must see their __sched_clock_stable_early + * and do the update, or both. + */ + smp_mb(); /* matches {set,clear}_sched_clock_stable() */ + + if (__sched_clock_stable_early) + __set_sched_clock_stable(); + else + __clear_sched_clock_stable(NULL); } /* @@ -242,20 +296,20 @@ u64 sched_clock_cpu(int cpu) struct sched_clock_data *scd; u64 clock; - WARN_ON_ONCE(!irqs_disabled()); - - if (sched_clock_stable) + if (sched_clock_stable()) return sched_clock(); if (unlikely(!sched_clock_running)) return 0ull; + preempt_disable_notrace(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); + preempt_enable_notrace(); return clock; } @@ -265,7 +319,7 @@ void sched_clock_tick(void) struct sched_clock_data *scd; u64 now, now_gtod; - if (sched_clock_stable) + if (sched_clock_stable()) return; if (unlikely(!sched_clock_running)) @@ -316,14 +370,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); */ u64 cpu_clock(int cpu) { - u64 clock; - unsigned long flags; - - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); + if (!sched_clock_stable()) + return sched_clock_cpu(cpu); - return clock; + return sched_clock(); } /* @@ -335,14 +385,10 @@ u64 cpu_clock(int cpu) */ u64 local_clock(void) { - u64 clock; - unsigned long flags; + if (!sched_clock_stable()) + return sched_clock_cpu(raw_smp_processor_id()); - local_irq_save(flags); - clock = sched_clock_cpu(smp_processor_id()); - local_irq_restore(flags); - - return clock; + return sched_clock(); } #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ @@ -362,12 +408,12 @@ u64 sched_clock_cpu(int cpu) u64 cpu_clock(int cpu) { - return sched_clock_cpu(cpu); + return sched_clock(); } u64 local_clock(void) { - return sched_clock_cpu(0); + return sched_clock(); } #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a88f4a485c5e..268a45ea238c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include <linux/init_task.h> #include <linux/binfmts.h> #include <linux/context_tracking.h> +#include <linux/compiler.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -296,8 +297,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; - - /* * __task_rq_lock - lock the rq @p resides on. */ @@ -434,7 +433,7 @@ void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); rq->hrtick_csd_pending = 1; } } @@ -557,12 +556,15 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned) { int cpu = smp_processor_id(); int i; struct sched_domain *sd; + if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + return cpu; + rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { @@ -825,19 +827,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { - u64 st; - steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - st = steal_ticks(steal); - steal = st * TICK_NSEC; - rq->prev_steal_time_rq += steal; - delta -= steal; } #endif @@ -899,7 +895,9 @@ static inline int normal_prio(struct task_struct *p) { int prio; - if (task_has_rt_policy(p)) + if (task_has_dl_policy(p)) + prio = MAX_DL_PRIO-1; + else if (task_has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -945,7 +943,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class->switched_from) prev_class->switched_from(rq, p); p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio) + } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); } @@ -1108,6 +1106,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) goto out; + trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); out: @@ -1499,8 +1498,7 @@ void scheduler_ipi(void) * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ - if (tif_need_resched()) - set_preempt_need_resched(); + preempt_fold_need_resched(); if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) @@ -1717,6 +1715,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif + RB_CLEAR_NODE(&p->dl.rb_node); + hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + p->dl.dl_runtime = p->dl.runtime = 0; + p->dl.dl_deadline = p->dl.deadline = 0; + p->dl.dl_period = 0; + p->dl.flags = 0; + INIT_LIST_HEAD(&p->rt.run_list); #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1738,8 +1743,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; @@ -1763,12 +1770,34 @@ void set_numabalancing_state(bool enabled) numabalancing_enabled = enabled; } #endif /* CONFIG_SCHED_DEBUG */ -#endif /* CONFIG_NUMA_BALANCING */ + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_numa_balancing(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = numabalancing_enabled; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_numabalancing_state(state); + return err; +} +#endif +#endif /* * fork()/clone()-time setup: */ -void sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); @@ -1790,7 +1819,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; @@ -1807,8 +1836,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - if (!rt_prio(p->prio)) + if (dl_prio(p->prio)) { + put_cpu(); + return -EAGAIN; + } else if (rt_prio(p->prio)) { + p->sched_class = &rt_sched_class; + } else { p->sched_class = &fair_sched_class; + } if (p->sched_class->task_fork) p->sched_class->task_fork(p); @@ -1834,12 +1869,125 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p) init_task_preempt_count(p); #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif put_cpu(); + return 0; +} + +unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + /* + * Doing this here saves a lot of checks in all + * the calling paths, and returning zero seems + * safe for them anyway. + */ + if (period == 0) + return 0; + + return div64_u64(runtime << 20, period); +} + +#ifdef CONFIG_SMP +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->rd->dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + int cpus = 0; + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cpus++; + + return cpus; +} +#else +inline struct dl_bw *dl_bw_of(int i) +{ + return &cpu_rq(i)->dl.dl_bw; +} + +static inline int dl_bw_cpus(int i) +{ + return 1; +} +#endif + +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; } /* + * We must be sure that accepting a new task (or allowing changing the + * parameters of an existing one) is consistent with the bandwidth + * constraints. If yes, this function also accordingly updates the currently + * allocated bandwidth to reflect the new situation. + * + * This function is called while holding p's rq->lock. + */ +static int dl_overflow(struct task_struct *p, int policy, + const struct sched_attr *attr) +{ + + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + u64 period = attr->sched_period ?: attr->sched_deadline; + u64 runtime = attr->sched_runtime; + u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; + int cpus, err = -1; + + if (new_bw == p->dl.dl_bw) + return 0; + + /* + * Either if a task, enters, leave, or stays -deadline but changes + * its parameters, we may need to update accordingly the total + * allocated bandwidth of the container. + */ + raw_spin_lock(&dl_b->lock); + cpus = dl_bw_cpus(task_cpu(p)); + if (dl_policy(policy) && !task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, 0, new_bw)) { + __dl_add(dl_b, new_bw); + err = 0; + } else if (dl_policy(policy) && task_has_dl_policy(p) && + !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + __dl_clear(dl_b, p->dl.dl_bw); + __dl_add(dl_b, new_bw); + err = 0; + } else if (!dl_policy(policy) && task_has_dl_policy(p)) { + __dl_clear(dl_b, p->dl.dl_bw); + err = 0; + } + raw_spin_unlock(&dl_b->lock); + + return err; +} + +extern void init_dl_bw(struct dl_bw *dl_b); + +/* * wake_up_new_task - wake up a newly created task for the first time. * * This function will do some initial scheduler statistics housekeeping @@ -2001,7 +2149,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { - task_numa_free(prev); + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); /* * Remove function-return probe instances associated with this @@ -2016,13 +2165,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef CONFIG_SMP -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -} - /* rq->lock is NOT held, but preemption is disabled */ static inline void post_schedule(struct rq *rq) { @@ -2040,10 +2182,6 @@ static inline void post_schedule(struct rq *rq) #else -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} - static inline void post_schedule(struct rq *rq) { } @@ -2296,7 +2434,7 @@ void scheduler_tick(void) #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + trigger_load_balance(rq); #endif rq_last_tick_reset(rq); } @@ -2325,7 +2463,7 @@ u64 scheduler_tick_max_deferment(void) if (time_before_eq(next, now)) return 0; - return jiffies_to_usecs(next - now) * NSEC_PER_USEC; + return jiffies_to_nsecs(next - now); } #endif @@ -2359,8 +2497,13 @@ void __kprobes preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } EXPORT_SYMBOL(preempt_count_add); @@ -2403,6 +2546,13 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -2414,10 +2564,10 @@ static inline void schedule_debug(struct task_struct *prev) { /* * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. + * schedule() atomically, we ignore that path. Otherwise whine + * if we are scheduling when we should not. */ - if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) __schedule_bug(prev); rcu_sleep_check(); @@ -2426,36 +2576,34 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } -static void put_prev_task(struct rq *rq, struct task_struct *prev) -{ - if (prev->on_rq || rq->skip_clock_update < 0) - update_rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev); -} - /* * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq) +pick_next_task(struct rq *rq, struct task_struct *prev) { - const struct sched_class *class; + const struct sched_class *class = &fair_sched_class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq); - if (likely(p)) + if (likely(prev->sched_class == class && + rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq, prev); + if (likely(p && p != RETRY_TASK)) return p; } +again: for_each_class(class) { - p = class->pick_next_task(rq); - if (p) + p = class->pick_next_task(rq, prev); + if (p) { + if (unlikely(p == RETRY_TASK)) + goto again; return p; + } } BUG(); /* the idle class will always have a runnable task */ @@ -2549,13 +2697,10 @@ need_resched: switch_count = &prev->nvcsw; } - pre_schedule(rq, prev); - - if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); + if (prev->on_rq || rq->skip_clock_update < 0) + update_rq_clock(rq); - put_prev_task(rq, prev); - next = pick_next_task(rq); + next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->skip_clock_update = 0; @@ -2701,52 +2846,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - #ifdef CONFIG_RT_MUTEXES /* @@ -2757,15 +2856,16 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, on_rq, running; + int oldprio, on_rq, running, enqueue_flag = 0; struct rq *rq; const struct sched_class *prev_class; - BUG_ON(prio < 0 || prio > MAX_PRIO); + BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p); @@ -2788,6 +2888,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } trace_sched_pi_setprio(p, prio); + p->pi_top_task = rt_mutex_get_top_task(p); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->on_rq; @@ -2797,30 +2898,56 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->put_prev_task(rq, p); - if (rt_prio(prio)) + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || (p->pi_top_task && + dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { + p->dl.dl_boosted = 1; + p->dl.dl_throttled = 0; + enqueue_flag = ENQUEUE_REPLENISH; + } else + p->dl.dl_boosted = 0; + p->sched_class = &dl_sched_class; + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; + if (oldprio < prio) + enqueue_flag = ENQUEUE_HEAD; p->sched_class = &rt_sched_class; - else + } else { + if (dl_prio(oldprio)) + p->dl.dl_boosted = 0; p->sched_class = &fair_sched_class; + } p->prio = prio; if (running) p->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); + enqueue_task(rq, p, enqueue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: __task_rq_unlock(rq); } #endif + void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; unsigned long flags; struct rq *rq; - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; /* * We have to be careful, if called from sys_setpriority(), @@ -2831,9 +2958,9 @@ void set_user_nice(struct task_struct *p, long nice) * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * SCHED_FIFO/SCHED_RR: + * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ - if (task_has_rt_policy(p)) { + if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -2898,11 +3025,11 @@ SYSCALL_DEFINE1(nice, int, increment) if (increment > 40) increment = 40; - nice = TASK_NICE(current) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; + nice = task_nice(current) + increment; + if (nice < MIN_NICE) + nice = MIN_NICE; + if (nice > MAX_NICE) + nice = MAX_NICE; if (increment < 0 && !can_nice(current, nice)) return -EPERM; @@ -2931,18 +3058,6 @@ int task_prio(const struct task_struct *p) } /** - * task_nice - return the nice value of a given task. - * @p: the task in question. - * - * Return: The nice value [ -20 ... 0 ... 19 ]. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - -/** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. * @@ -2988,20 +3103,102 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold rq lock. */ +/* + * This function initializes the sched_dl_entity of a newly becoming + * SCHED_DEADLINE task. + * + * Only the static values are considered here, the actual runtime and the + * absolute deadline will be properly calculated when the task is enqueued + * for the first time with its new policy. + */ static void -__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +__setparam_dl(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + init_dl_task_timer(dl_se); + dl_se->dl_runtime = attr->sched_runtime; + dl_se->dl_deadline = attr->sched_deadline; + dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; + dl_se->flags = attr->sched_flags; + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_throttled = 0; + dl_se->dl_new = 1; +} + +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) { + int policy = attr->sched_policy; + + if (policy == -1) /* setparam */ + policy = p->policy; + p->policy = policy; - p->rt_priority = prio; + + if (dl_policy(policy)) + __setparam_dl(p, attr); + else if (fair_policy(policy)) + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when + * !rt_policy. Always setting this ensures that things like + * getparam()/getattr() don't report silly values for !rt tasks. + */ + p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - /* we are holding p->pi_lock already */ - p->prio = rt_mutex_getprio(p); - if (rt_prio(p->prio)) + set_load_weight(p); +} + +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) +{ + __setscheduler_params(p, attr); + + /* + * If we get here, there was no pi waiters boosting the + * task. It is safe to use the normal prio. + */ + p->prio = normal_prio(p); + + if (dl_prio(p->prio)) + p->sched_class = &dl_sched_class; + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - set_load_weight(p); +} + +static void +__getparam_dl(struct task_struct *p, struct sched_attr *attr) +{ + struct sched_dl_entity *dl_se = &p->dl; + + attr->sched_priority = p->rt_priority; + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + attr->sched_period = dl_se->dl_period; + attr->sched_flags = dl_se->flags; +} + +/* + * This function validates the new parameters of a -deadline task. + * We ask for the deadline not being zero, and greater or equal + * than the runtime, as well as the period of being zero or + * greater than deadline. Furthermore, we have to be sure that + * user parameters are above the internal resolution (1us); we + * check sched_runtime only since it is always the smaller one. + */ +static bool +__checkparam_dl(const struct sched_attr *attr) +{ + return attr && attr->sched_deadline != 0 && + (attr->sched_period == 0 || + (s64)(attr->sched_period - attr->sched_deadline) >= 0) && + (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && + attr->sched_runtime >= (2 << (DL_SCALE - 1)); } /* @@ -3020,10 +3217,14 @@ static bool check_same_owner(struct task_struct *p) return match; } -static int __sched_setscheduler(struct task_struct *p, int policy, - const struct sched_param *param, bool user) +static int __sched_setscheduler(struct task_struct *p, + const struct sched_attr *attr, + bool user) { + int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : + MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; + int policy = attr->sched_policy; unsigned long flags; const struct sched_class *prev_class; struct rq *rq; @@ -3037,31 +3238,40 @@ recheck: reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; } else { - reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); - policy &= ~SCHED_RESET_ON_FORK; + reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); - if (policy != SCHED_FIFO && policy != SCHED_RR && + if (policy != SCHED_DEADLINE && + policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL && policy != SCHED_BATCH && policy != SCHED_IDLE) return -EINVAL; } + if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) + return -EINVAL; + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. */ - if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (rt_policy(policy) != (param->sched_priority != 0)) + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_policy(policy) != (attr->sched_priority != 0))) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (user && !capable(CAP_SYS_NICE)) { + if (fair_policy(policy)) { + if (attr->sched_nice < task_nice(p) && + !can_nice(p, attr->sched_nice)) + return -EPERM; + } + if (rt_policy(policy)) { unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); @@ -3071,17 +3281,26 @@ recheck: return -EPERM; /* can't increase priority */ - if (param->sched_priority > p->rt_priority && - param->sched_priority > rlim_rtprio) + if (attr->sched_priority > p->rt_priority && + attr->sched_priority > rlim_rtprio) return -EPERM; } + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + return -EPERM; + /* * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { - if (!can_nice(p, TASK_NICE(p))) + if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -3118,16 +3337,25 @@ recheck: } /* - * If not changing anything there's no need to proceed further: + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork. */ - if (unlikely(policy == p->policy && (!rt_policy(policy) || - param->sched_priority == p->rt_priority))) { + if (unlikely(policy == p->policy)) { + if (fair_policy(policy) && attr->sched_nice != task_nice(p)) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; + if (dl_policy(policy)) + goto change; + + p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &flags); return 0; } +change: -#ifdef CONFIG_RT_GROUP_SCHED if (user) { +#ifdef CONFIG_RT_GROUP_SCHED /* * Do not allow realtime tasks into groups that have no runtime * assigned. @@ -3138,8 +3366,24 @@ recheck: task_rq_unlock(rq, p, &flags); return -EPERM; } - } #endif +#ifdef CONFIG_SMP + if (dl_bandwidth_enabled() && dl_policy(policy)) { + cpumask_t *span = rq->rd->span; + + /* + * Don't allow tasks with an affinity mask smaller than + * the entire root_domain to become SCHED_DEADLINE. We + * will also fail if there's no bandwidth available. + */ + if (!cpumask_subset(span, &p->cpus_allowed) || + rq->rd->dl_bw.bw == 0) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } + } +#endif + } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { @@ -3147,6 +3391,35 @@ recheck: task_rq_unlock(rq, p, &flags); goto recheck; } + + /* + * If setscheduling to SCHED_DEADLINE (or changing the parameters + * of a SCHED_DEADLINE task) we need to check if enough bandwidth + * is available. + */ + if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { + task_rq_unlock(rq, p, &flags); + return -EBUSY; + } + + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3154,16 +3427,18 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; - __setscheduler(rq, p, policy, param->sched_priority); + __setscheduler(rq, p, attr); if (running) p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); + if (on_rq) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + } check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -3173,6 +3448,26 @@ recheck: return 0; } +static int _sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool check) +{ + struct sched_attr attr = { + .sched_policy = policy, + .sched_priority = param->sched_priority, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + + /* + * Fixup the legacy SCHED_RESET_ON_FORK hack + */ + if (policy & SCHED_RESET_ON_FORK) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + policy &= ~SCHED_RESET_ON_FORK; + attr.sched_policy = policy; + } + + return __sched_setscheduler(p, &attr, check); +} /** * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. * @p: the task in question. @@ -3186,10 +3481,16 @@ recheck: int sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, true); + return _sched_setscheduler(p, policy, param, true); } EXPORT_SYMBOL_GPL(sched_setscheduler); +int sched_setattr(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, true); +} +EXPORT_SYMBOL_GPL(sched_setattr); + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -3206,7 +3507,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); int sched_setscheduler_nocheck(struct task_struct *p, int policy, const struct sched_param *param) { - return __sched_setscheduler(p, policy, param, false); + return _sched_setscheduler(p, policy, param, false); } static int @@ -3231,6 +3532,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) return retval; } +/* + * Mimics kernel/events/core.c perf_copy_attr(). + */ +static int sched_copy_attr(struct sched_attr __user *uattr, + struct sched_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = SCHED_ATTR_SIZE_VER0; + + if (size < SCHED_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * XXX: do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? + */ + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. @@ -3262,6 +3636,34 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) } /** + * sys_sched_setattr - same as above, but with extended sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + */ +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) +{ + struct sched_attr attr; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || flags) + return -EINVAL; + + if (sched_copy_attr(uattr, &attr)) + return -EFAULT; + + rcu_read_lock(); + retval = -ESRCH; + p = find_process_by_pid(pid); + if (p != NULL) + retval = sched_setattr(p, &attr); + rcu_read_unlock(); + + return retval; +} + +/** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. * @@ -3316,6 +3718,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; + if (task_has_dl_policy(p)) { + retval = -EINVAL; + goto out_unlock; + } lp.sched_priority = p->rt_priority; rcu_read_unlock(); @@ -3331,6 +3737,96 @@ out_unlock: return retval; } +static int sched_read_attr(struct sched_attr __user *uattr, + struct sched_attr *attr, + unsigned int usize) +{ + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, usize)) + return -EFAULT; + + /* + * If we're handed a smaller struct than we know of, + * ensure all the unknown bits are 0 - i.e. old + * user-space does not get uncomplete information. + */ + if (usize < sizeof(*attr)) { + unsigned char *addr; + unsigned char *end; + + addr = (void *)attr + usize; + end = (void *)attr + sizeof(*attr); + + for (; addr < end; addr++) { + if (*addr) + goto err_size; + } + + attr->size = usize; + } + + ret = copy_to_user(uattr, attr, attr->size); + if (ret) + return -EFAULT; + +out: + return ret; + +err_size: + ret = -E2BIG; + goto out; +} + +/** + * sys_sched_getattr - similar to sched_getparam, but with sched_attr + * @pid: the pid in question. + * @uattr: structure containing the extended parameters. + * @size: sizeof(attr) for fwd/bwd comp. + */ +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + }; + struct task_struct *p; + int retval; + + if (!uattr || pid < 0 || size > PAGE_SIZE || + size < SCHED_ATTR_SIZE_VER0 || flags) + return -EINVAL; + + rcu_read_lock(); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + attr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + if (task_has_dl_policy(p)) + __getparam_dl(p, &attr); + else if (task_has_rt_policy(p)) + attr.sched_priority = p->rt_priority; + else + attr.sched_nice = task_nice(p); + + rcu_read_unlock(); + + retval = sched_read_attr(uattr, &attr, size); + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { cpumask_var_t cpus_allowed, new_mask; @@ -3375,8 +3871,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_unlock; + cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); + + /* + * Since bandwidth control happens on root_domain basis, + * if admission test is enabled, we only admit -deadline + * tasks allowed to run on all the CPUs in the task's + * root_domain. + */ +#ifdef CONFIG_SMP + if (task_has_dl_policy(p)) { + const struct cpumask *span = task_rq(p)->rd->span; + + if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { + retval = -EBUSY; + goto out_unlock; + } + } +#endif again: retval = set_cpus_allowed_ptr(p, new_mask); @@ -3653,7 +4167,7 @@ again: } double_rq_lock(rq, p_rq); - while (task_rq(p) != p_rq) { + if (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); goto again; } @@ -3742,6 +4256,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) case SCHED_RR: ret = MAX_USER_RT_PRIO-1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -3768,6 +4283,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) case SCHED_RR: ret = 1; break; + case SCHED_DEADLINE: case SCHED_NORMAL: case SCHED_BATCH: case SCHED_IDLE: @@ -3811,7 +4327,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, goto out_unlock; rq = task_rq_lock(p, &flags); - time_slice = p->sched_class->get_rr_interval(rq, p); + time_slice = 0; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); task_rq_unlock(rq, p, &flags); rcu_read_unlock(); @@ -3935,6 +4453,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; + idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif @@ -4090,6 +4609,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) /* TODO: This is not properly updating schedstats */ + trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } @@ -4153,8 +4673,10 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); - if (mm != &init_mm) + if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + finish_arch_post_lock_switch(); + } mmdrop(mm); } @@ -4172,6 +4694,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +{ +} + +static const struct sched_class fake_sched_class = { + .put_prev_task = put_prev_task_fake, +}; + +static struct task_struct fake_task = { + /* + * Avoid pull_{rt,dl}_task() + */ + .prio = MAX_PRIO + 1, + .sched_class = &fake_sched_class, +}; + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -4212,7 +4750,7 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; - next = pick_next_task(rq); + next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -4302,7 +4840,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(13); + struct ctl_table *table = sd_alloc_ctl_entry(14); if (table == NULL) return NULL; @@ -4330,9 +4868,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "name", sd->name, + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[12] is terminator */ + /* &table[13] is terminator */ return table; } @@ -4514,13 +5055,31 @@ static int sched_cpu_active(struct notifier_block *nfb, static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { + unsigned long flags; + long cpu = (long)hcpu; + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active((long)hcpu, false); + set_cpu_active(cpu, false); + + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + struct dl_bw *dl_b = dl_bw_of(cpu); + bool overflow; + int cpus; + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + if (overflow) + return notifier_from_errno(-EBUSY); + } return NOTIFY_OK; - default: - return NOTIFY_DONE; } + + return NOTIFY_DONE; } static int __init migration_init(void) @@ -4739,6 +5298,8 @@ static void free_rootdomain(struct rcu_head *rcu) struct root_domain *rd = container_of(rcu, struct root_domain, rcu); cpupri_cleanup(&rd->cpupri); + cpudl_cleanup(&rd->cpudl); + free_cpumask_var(rd->dlo_mask); free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); @@ -4790,8 +5351,14 @@ static int init_rootdomain(struct root_domain *rd) goto out; if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_dlo_mask; + + init_dl_bw(&rd->dl_bw); + if (cpudl_init(&rd->cpudl) != 0) + goto free_dlo_mask; if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; @@ -4799,6 +5366,8 @@ static int init_rootdomain(struct root_domain *rd) free_rto_mask: free_cpumask_var(rd->rto_mask); +free_dlo_mask: + free_cpumask_var(rd->dlo_mask); free_online: free_cpumask_var(rd->online); free_span: @@ -5884,7 +6453,7 @@ static cpumask_var_t fallback_doms; * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. */ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void) { return 0; } @@ -6150,6 +6719,7 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); + init_sched_dl_class(); } #else void __init sched_init_smp(void) @@ -6219,13 +6789,15 @@ void __init sched_init(void) #endif /* CONFIG_CPUMASK_OFFSTACK */ } + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + init_dl_bandwidth(&def_dl_bandwidth, + global_rt_period(), global_rt_runtime()); + #ifdef CONFIG_SMP init_defrootdomain(); #endif - init_rt_bandwidth(&def_rt_bandwidth, - global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -6249,6 +6821,7 @@ void __init sched_init(void) rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); + init_dl_rq(&rq->dl, rq); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); @@ -6277,7 +6850,6 @@ void __init sched_init(void) rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif @@ -6320,10 +6892,6 @@ void __init sched_init(void) INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif -#ifdef CONFIG_RT_MUTEXES - plist_head_init(&init_task.pi_waiters); -#endif - /* * The boot idle thread does lazy MMU switching as well: */ @@ -6370,7 +6938,8 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) @@ -6388,6 +6957,13 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); @@ -6397,13 +6973,16 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { const struct sched_class *prev_class = p->sched_class; + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + }; int old_prio = p->prio; int on_rq; on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); - __setscheduler(rq, p, SCHED_NORMAL, 0); + __setscheduler(rq, p, &attr); if (on_rq) { enqueue_task(rq, p, 0); resched_task(rq->curr); @@ -6433,12 +7012,12 @@ void normalize_rt_tasks(void) p->se.statistics.block_start = 0; #endif - if (!rt_task(p)) { + if (!dl_task(p) && !rt_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: */ - if (TASK_NICE(p) < 0 && p->mm) + if (task_nice(p) < 0 && p->mm) set_user_nice(p, 0); continue; } @@ -6606,7 +7185,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -6628,16 +7207,6 @@ void sched_move_task(struct task_struct *tsk) } #endif /* CONFIG_CGROUP_SCHED */ -#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) -static unsigned long to_ratio(u64 period, u64 runtime) -{ - if (runtime == RUNTIME_INF) - return 1ULL << 20; - - return div64_u64(runtime << 20, period); -} -#endif - #ifdef CONFIG_RT_GROUP_SCHED /* * Ensure that the real time constraints are schedulable. @@ -6811,24 +7380,13 @@ static long sched_group_rt_period(struct task_group *tg) do_div(rt_period_us, NSEC_PER_USEC); return rt_period_us; } +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED static int sched_rt_global_constraints(void) { - u64 runtime, period; int ret = 0; - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - runtime = global_rt_runtime(); - period = global_rt_period(); - - /* - * Sanity check on the sysctl variables. - */ - if (runtime > period && runtime != RUNTIME_INF) - return -EINVAL; - mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); ret = __rt_schedulable(NULL, 0, 0); @@ -6851,17 +7409,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) static int sched_rt_global_constraints(void) { unsigned long flags; - int i; - - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - - /* - * There's always some RT tasks in the root group - * -- migration, kstopmachine etc.. - */ - if (sysctl_sched_rt_runtime == 0) - return -EBUSY; + int i, ret = 0; raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { @@ -6873,36 +7421,91 @@ static int sched_rt_global_constraints(void) } raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return 0; + return ret; } #endif /* CONFIG_RT_GROUP_SCHED */ -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int sched_dl_global_constraints(void) { - int ret; - static DEFINE_MUTEX(mutex); + u64 runtime = global_rt_runtime(); + u64 period = global_rt_period(); + u64 new_bw = to_ratio(period, runtime); + int cpu, ret = 0; + unsigned long flags; - mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); - /* make sure that internally we keep jiffies */ - /* also, writing zero resets timeslice to default */ - if (!ret && write) { - sched_rr_timeslice = sched_rr_timeslice <= 0 ? - RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + /* + * Here we want to check the bandwidth not being set to some + * value smaller than the currently allocated bandwidth in + * any of the root_domains. + * + * FIXME: Cycling on all the CPUs is overdoing, but simpler than + * cycling on root_domains... Discussion on different/better + * solutions is welcome! + */ + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + if (new_bw < dl_b->total_bw) + ret = -EBUSY; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + if (ret) + break; } - mutex_unlock(&mutex); + return ret; } +static void sched_dl_do_global(void) +{ + u64 new_bw = -1; + int cpu; + unsigned long flags; + + def_dl_bandwidth.dl_period = global_rt_period(); + def_dl_bandwidth.dl_runtime = global_rt_runtime(); + + if (global_rt_runtime() != RUNTIME_INF) + new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + + /* + * FIXME: As above... + */ + for_each_possible_cpu(cpu) { + struct dl_bw *dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + dl_b->bw = new_bw; + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + } +} + +static int sched_rt_global_validate(void) +{ + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) + return -EINVAL; + + return 0; +} + +static void sched_rt_do_global(void) +{ + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +} + int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret; int old_period, old_runtime; static DEFINE_MUTEX(mutex); + int ret; mutex_lock(&mutex); old_period = sysctl_sched_rt_period; @@ -6911,21 +7514,50 @@ int sched_rt_handler(struct ctl_table *table, int write, ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { + ret = sched_rt_global_validate(); + if (ret) + goto undo; + ret = sched_rt_global_constraints(); - if (ret) { - sysctl_sched_rt_period = old_period; - sysctl_sched_rt_runtime = old_runtime; - } else { - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = - ns_to_ktime(global_rt_period()); - } + if (ret) + goto undo; + + ret = sched_dl_global_constraints(); + if (ret) + goto undo; + + sched_rt_do_global(); + sched_dl_do_global(); + } + if (0) { +undo: + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; } mutex_unlock(&mutex); return ret; } +int sched_rr_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + /* make sure that internally we keep jiffies */ + /* also, writing zero resets timeslice to default */ + if (!ret && write) { + sched_rr_timeslice = sched_rr_timeslice <= 0 ? + RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + } + mutex_unlock(&mutex); + return ret; +} + #ifdef CONFIG_CGROUP_SCHED static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -6980,7 +7612,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -6998,7 +7630,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) sched_move_task(task); } @@ -7258,15 +7890,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpu_stats_show(struct seq_file *sf, void *v) { - struct task_group *tg = css_tg(css); + struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - cb->fill(cb, "nr_periods", cfs_b->nr_periods); - cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); - cb->fill(cb, "throttled_time", cfs_b->throttled_time); + seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); + seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); + seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); return 0; } @@ -7320,7 +7951,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .read_map = cpu_stats_show, + .seq_show = cpu_stats_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -7338,8 +7969,7 @@ static struct cftype cpu_files[] = { { } /* terminate */ }; -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", +struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, @@ -7347,7 +7977,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .subsys_id = cpu_cgroup_subsys_id, .base_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f64722ff0299..c143ee380e3a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return css_ca(task_css(tsk, cpuacct_subsys_id)); + return css_ca(task_css(tsk, cpuacct_cgrp_id)); } static inline struct cpuacct *parent_ca(struct cpuacct *ca) @@ -163,10 +163,9 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - struct cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; @@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -static int cpuacct_stats_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct seq_file *sf, void *v) { - struct cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(sf)); int cpu; s64 val = 0; @@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, val += kcpustat->cpustat[CPUTIME_NICE]; } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; for_each_online_cpu(cpu) { @@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); return 0; } @@ -220,11 +218,11 @@ static struct cftype files[] = { }, { .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, + .seq_show = cpuacct_percpu_seq_show, }, { .name = "stat", - .read_map = cpuacct_stats_show, + .seq_show = cpuacct_stats_show, }, { } /* terminate */ }; @@ -277,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", +struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, .base_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c new file mode 100644 index 000000000000..5b9bb42b2d47 --- /dev/null +++ b/kernel/sched/cpudeadline.c @@ -0,0 +1,216 @@ +/* + * kernel/sched/cpudl.c + * + * Global CPU deadline management + * + * Author: Juri Lelli <j.lelli@sssup.it> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include <linux/gfp.h> +#include <linux/kernel.h> +#include "cpudeadline.h" + +static inline int parent(int i) +{ + return (i - 1) >> 1; +} + +static inline int left_child(int i) +{ + return (i << 1) + 1; +} + +static inline int right_child(int i) +{ + return (i << 1) + 2; +} + +static inline int dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +static void cpudl_exchange(struct cpudl *cp, int a, int b) +{ + int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; + + swap(cp->elements[a], cp->elements[b]); + swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); +} + +static void cpudl_heapify(struct cpudl *cp, int idx) +{ + int l, r, largest; + + /* adapted from lib/prio_heap.c */ + while(1) { + l = left_child(idx); + r = right_child(idx); + largest = idx; + + if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, + cp->elements[l].dl)) + largest = l; + if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, + cp->elements[r].dl)) + largest = r; + if (largest == idx) + break; + + /* Push idx down the heap one level and bump one up */ + cpudl_exchange(cp, largest, idx); + idx = largest; + } +} + +static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) +{ + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); + + if (dl_time_before(new_dl, cp->elements[idx].dl)) { + cp->elements[idx].dl = new_dl; + cpudl_heapify(cp, idx); + } else { + cp->elements[idx].dl = new_dl; + while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, + cp->elements[idx].dl)) { + cpudl_exchange(cp, idx, parent(idx)); + idx = parent(idx); + } + } +} + +static inline int cpudl_maximum(struct cpudl *cp) +{ + return cp->elements[0].cpu; +} + +/* + * cpudl_find - find the best (later-dl) CPU in the system + * @cp: the cpudl max-heap context + * @p: the task + * @later_mask: a mask to fill in with the selected CPUs (or NULL) + * + * Returns: int - best CPU (heap maximum if suitable) + */ +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask) +{ + int best_cpu = -1; + const struct sched_dl_entity *dl_se = &p->dl; + + if (later_mask && cpumask_and(later_mask, cp->free_cpus, + &p->cpus_allowed) && cpumask_and(later_mask, + later_mask, cpu_active_mask)) { + best_cpu = cpumask_any(later_mask); + goto out; + } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + best_cpu = cpudl_maximum(cp); + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); + } + +out: + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); + + return best_cpu; +} + +/* + * cpudl_set - update the cpudl max-heap + * @cp: the cpudl max-heap context + * @cpu: the target cpu + * @dl: the new earliest deadline for this cpu + * + * Notes: assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) +{ + int old_idx, new_cpu; + unsigned long flags; + + WARN_ON(!cpu_present(cpu)); + + raw_spin_lock_irqsave(&cp->lock, flags); + old_idx = cp->cpu_to_idx[cpu]; + if (!is_valid) { + /* remove item */ + if (old_idx == IDX_INVALID) { + /* + * Nothing to remove if old_idx was invalid. + * This could happen if a rq_offline_dl is + * called for a CPU without -dl tasks running. + */ + goto out; + } + new_cpu = cp->elements[cp->size - 1].cpu; + cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; + cp->elements[old_idx].cpu = new_cpu; + cp->size--; + cp->cpu_to_idx[new_cpu] = old_idx; + cp->cpu_to_idx[cpu] = IDX_INVALID; + while (old_idx > 0 && dl_time_before( + cp->elements[parent(old_idx)].dl, + cp->elements[old_idx].dl)) { + cpudl_exchange(cp, old_idx, parent(old_idx)); + old_idx = parent(old_idx); + } + cpumask_set_cpu(cpu, cp->free_cpus); + cpudl_heapify(cp, old_idx); + + goto out; + } + + if (old_idx == IDX_INVALID) { + cp->size++; + cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].cpu = cpu; + cp->cpu_to_idx[cpu] = cp->size - 1; + cpudl_change_key(cp, cp->size - 1, dl); + cpumask_clear_cpu(cpu, cp->free_cpus); + } else { + cpudl_change_key(cp, old_idx, dl); + } + +out: + raw_spin_unlock_irqrestore(&cp->lock, flags); +} + +/* + * cpudl_init - initialize the cpudl structure + * @cp: the cpudl max-heap context + */ +int cpudl_init(struct cpudl *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + raw_spin_lock_init(&cp->lock); + cp->size = 0; + for (i = 0; i < NR_CPUS; i++) + cp->cpu_to_idx[i] = IDX_INVALID; + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + return -ENOMEM; + cpumask_setall(cp->free_cpus); + + return 0; +} + +/* + * cpudl_cleanup - clean up the cpudl structure + * @cp: the cpudl max-heap context + */ +void cpudl_cleanup(struct cpudl *cp) +{ + /* + * nothing to do for the moment + */ +} diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h new file mode 100644 index 000000000000..a202789a412c --- /dev/null +++ b/kernel/sched/cpudeadline.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_CPUDL_H +#define _LINUX_CPUDL_H + +#include <linux/sched.h> + +#define IDX_INVALID -1 + +struct array_item { + u64 dl; + int cpu; +}; + +struct cpudl { + raw_spinlock_t lock; + int size; + int cpu_to_idx[NR_CPUS]; + struct array_item elements[NR_CPUS]; + cpumask_var_t free_cpus; +}; + + +#ifdef CONFIG_SMP +int cpudl_find(struct cpudl *cp, struct task_struct *p, + struct cpumask *later_mask); +void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); +int cpudl_init(struct cpudl *cp); +void cpudl_cleanup(struct cpudl *cp); +#else +#define cpudl_set(cp, cpu, dl) do { } while (0) +#define cpudl_init() do { } while (0) +#endif /* CONFIG_SMP */ + +#endif /* _LINUX_CPUDL_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 99947919e30b..a95097cb4591 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ task_group_account_field(p, index, (__force u64) cputime); @@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime += cputime; /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { + if (task_nice(p) > 0) { cpustat[CPUTIME_NICE] += (__force u64) cputime; cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; } else { @@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { - u64 steal, st = 0; + u64 steal; + cputime_t steal_ct; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; + /* + * cputime_t may be less precise than nsecs (eg: if it's + * based on jiffies). Lets cast the result to cputime + * granularity and account the rest on the next rounds. + */ + steal_ct = nsecs_to_cputime(steal); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); - account_steal_time(st); - return st; + account_steal_time(steal_ct); + return steal_ct; } #endif return false; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c new file mode 100644 index 000000000000..27ef40925525 --- /dev/null +++ b/kernel/sched/deadline.c @@ -0,0 +1,1665 @@ +/* + * Deadline Scheduling Class (SCHED_DEADLINE) + * + * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS). + * + * Tasks that periodically executes their instances for less than their + * runtime won't miss any of their deadlines. + * Tasks that are not periodic or sporadic or that tries to execute more + * than their reserved bandwidth will be slowed down (and may potentially + * miss some of their deadlines), and won't affect any other task. + * + * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>, + * Juri Lelli <juri.lelli@gmail.com>, + * Michael Trimarchi <michael@amarulasolutions.com>, + * Fabio Checconi <fchecconi@gmail.com> + */ +#include "sched.h" + +#include <linux/slab.h> + +struct dl_bandwidth def_dl_bandwidth; + +static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) +{ + return container_of(dl_se, struct task_struct, dl); +} + +static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) +{ + return container_of(dl_rq, struct rq, dl); +} + +static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se) +{ + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = task_rq(p); + + return &rq->dl; +} + +static inline int on_dl_rq(struct sched_dl_entity *dl_se) +{ + return !RB_EMPTY_NODE(&dl_se->rb_node); +} + +static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) +{ + struct sched_dl_entity *dl_se = &p->dl; + + return dl_rq->rb_leftmost == &dl_se->rb_node; +} + +void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime) +{ + raw_spin_lock_init(&dl_b->dl_runtime_lock); + dl_b->dl_period = period; + dl_b->dl_runtime = runtime; +} + +extern unsigned long to_ratio(u64 period, u64 runtime); + +void init_dl_bw(struct dl_bw *dl_b) +{ + raw_spin_lock_init(&dl_b->lock); + raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock); + if (global_rt_runtime() == RUNTIME_INF) + dl_b->bw = -1; + else + dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime()); + raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock); + dl_b->total_bw = 0; +} + +void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +{ + dl_rq->rb_root = RB_ROOT; + +#ifdef CONFIG_SMP + /* zero means no -deadline tasks */ + dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; + + dl_rq->dl_nr_migratory = 0; + dl_rq->overloaded = 0; + dl_rq->pushable_dl_tasks_root = RB_ROOT; +#else + init_dl_bw(&dl_rq->dl_bw); +#endif +} + +#ifdef CONFIG_SMP + +static inline int dl_overloaded(struct rq *rq) +{ + return atomic_read(&rq->rd->dlo_count); +} + +static inline void dl_set_overload(struct rq *rq) +{ + if (!rq->online) + return; + + cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask); + /* + * Must be visible before the overload count is + * set (as in sched_rt.c). + * + * Matched by the barrier in pull_dl_task(). + */ + smp_wmb(); + atomic_inc(&rq->rd->dlo_count); +} + +static inline void dl_clear_overload(struct rq *rq) +{ + if (!rq->online) + return; + + atomic_dec(&rq->rd->dlo_count); + cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); +} + +static void update_dl_migration(struct dl_rq *dl_rq) +{ + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { + if (!dl_rq->overloaded) { + dl_set_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 1; + } + } else if (dl_rq->overloaded) { + dl_clear_overload(rq_of_dl_rq(dl_rq)); + dl_rq->overloaded = 0; + } +} + +static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory++; + + update_dl_migration(dl_rq); +} + +static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + struct task_struct *p = dl_task_of(dl_se); + + if (p->nr_cpus_allowed > 1) + dl_rq->dl_nr_migratory--; + + update_dl_migration(dl_rq); +} + +/* + * The list of pushable -deadline task is not a plist, like in + * sched_rt.c, it is an rb-tree with tasks ordered by deadline. + */ +static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node; + struct rb_node *parent = NULL; + struct task_struct *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct task_struct, + pushable_dl_tasks); + if (dl_entity_preempt(&p->dl, &entry->dl)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + + rb_link_node(&p->pushable_dl_tasks, parent, link); + rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); +} + +static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ + struct dl_rq *dl_rq = &rq->dl; + + if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) + return; + + if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) { + struct rb_node *next_node; + + next_node = rb_next(&p->pushable_dl_tasks); + dl_rq->pushable_dl_tasks_leftmost = next_node; + } + + rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +} + +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root); +} + +static int push_dl_task(struct rq *rq); + +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return dl_task(prev); +} + +static inline void set_post_schedule(struct rq *rq) +{ + rq->post_schedule = has_pushable_dl_tasks(rq); +} + +#else + +static inline +void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +static inline +void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ +} + +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_dl_task(struct rq *rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} +#endif /* CONFIG_SMP */ + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags); + +/* + * We are being explicitly informed that a new instance is starting, + * and this means that: + * - the absolute deadline of the entity has to be placed at + * current time + relative deadline; + * - the runtime of the entity has to be set to the maximum value. + * + * The capability of specifying such event is useful whenever a -deadline + * entity wants to (try to!) synchronize its behaviour with the scheduler's + * one, and to (try to!) reconcile itself with its own scheduling + * parameters. + */ +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + + /* + * We use the regular wall clock time to set deadlines in the + * future; in fact, we must consider execution overheads (time + * spent on hardirq context, etc.). + */ + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + dl_se->dl_new = 0; +} + +/* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus + * exhausting its runtime. + * + * Here we are interested in making runtime overrun possible, but we do + * not want a entity which is misbehaving to affect the scheduling of all + * other entities. + * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS) + * is used, in order to confine each entity within its own bandwidth. + * + * This function deals exactly with that, and ensures that when the runtime + * of a entity is replenished, its deadline is also postponed. That ensures + * the overrunning entity can't interfere with other entity in the system and + * can't make them miss their deadlines. Reasons why this kind of overruns + * could happen are, typically, a entity voluntarily trying to overcome its + * runtime, or it just underestimated it during sched_setscheduler_ex(). + */ +static void replenish_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + BUG_ON(pi_se->dl_runtime <= 0); + + /* + * This could be the case for a !-dl task that is boosted. + * Just go with full inherited parameters. + */ + if (dl_se->dl_deadline == 0) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } + + /* + * We keep moving the deadline away until we get some + * available runtime for the entity. This ensures correct + * handling of situations where the runtime overrun is + * arbitrary large. + */ + while (dl_se->runtime <= 0) { + dl_se->deadline += pi_se->dl_period; + dl_se->runtime += pi_se->dl_runtime; + } + + /* + * At this point, the deadline really should be "in + * the future" with respect to rq->clock. If it's + * not, we are, for some reason, lagging too much! + * Anyway, after having warn userspace abut that, + * we still try to keep the things running by + * resetting the deadline and the budget of the + * entity. + */ + if (dl_time_before(dl_se->deadline, rq_clock(rq))) { + static bool lag_once = false; + + if (!lag_once) { + lag_once = true; + printk_sched("sched: DL replenish lagged to much\n"); + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } +} + +/* + * Here we check if --at time t-- an entity (which is probably being + * [re]activated or, in general, enqueued) can use its remaining runtime + * and its current deadline _without_ exceeding the bandwidth it is + * assigned (function returns true if it can't). We are in fact applying + * one of the CBS rules: when a task wakes up, if the residual runtime + * over residual deadline fits within the allocated bandwidth, then we + * can keep the current (absolute) deadline and residual budget without + * disrupting the schedulability of the system. Otherwise, we should + * refill the runtime and set the deadline a period in the future, + * because keeping the current (absolute) deadline of the task would + * result in breaking guarantees promised to other tasks (refer to + * Documentation/scheduler/sched-deadline.txt for more informations). + * + * This function returns true if: + * + * runtime / (deadline - t) > dl_runtime / dl_period , + * + * IOW we can't recycle current parameters. + * + * Notice that the bandwidth check is done against the period. For + * task with deadline equal to period this is the same of using + * dl_deadline instead of dl_period in the equation above. + */ +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, u64 t) +{ + u64 left, right; + + /* + * left and right are the two sides of the equation above, + * after a bit of shuffling to use multiplications instead + * of divisions. + * + * Note that none of the time values involved in the two + * multiplications are absolute: dl_deadline and dl_runtime + * are the relative deadline and the maximum runtime of each + * instance, runtime is the runtime left for the last instance + * and (deadline - t), since t is rq->clock, is the time left + * to the (absolute) deadline. Even if overflowing the u64 type + * is very unlikely to occur in both cases, here we scale down + * as we want to avoid that risk at all. Scaling down by 10 + * means that we reduce granularity to 1us. We are fine with it, + * since this is only a true/false check and, anyway, thinking + * of anything below microseconds resolution is actually fiction + * (but still we want to give the user that illusion >;). + */ + left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + right = ((dl_se->deadline - t) >> DL_SCALE) * + (pi_se->dl_runtime >> DL_SCALE); + + return dl_time_before(right, left); +} + +/* + * When a -deadline entity is queued back on the runqueue, its runtime and + * deadline might need updating. + * + * The policy here is that we update the deadline of the entity only if: + * - the current deadline is in the past, + * - using the remaining runtime with the current deadline would make + * the entity exceed its bandwidth. + */ +static void update_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * The arrival of a new instance needs special treatment, i.e., + * the actual scheduling parameters have to be "renewed". + */ + if (dl_se->dl_new) { + setup_new_dl_entity(dl_se, pi_se); + return; + } + + if (dl_time_before(dl_se->deadline, rq_clock(rq)) || + dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; + dl_se->runtime = pi_se->dl_runtime; + } +} + +/* + * If the entity depleted all its runtime, and if we want it to sleep + * while waiting for some new execution time to become available, we + * set the bandwidth enforcement timer to the replenishment instant + * and try to activate it. + * + * Notice that it is important for the caller to know if the timer + * actually started or not (i.e., the replenishment instant is in + * the future or in the past). + */ +static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); + ktime_t now, act; + ktime_t soft, hard; + unsigned long range; + s64 delta; + + if (boosted) + return 0; + /* + * We want the timer to fire at the deadline, but considering + * that it is actually coming from rq->clock and not from + * hrtimer's time base reading. + */ + act = ns_to_ktime(dl_se->deadline); + now = hrtimer_cb_get_time(&dl_se->dl_timer); + delta = ktime_to_ns(now) - rq_clock(rq); + act = ktime_add_ns(act, delta); + + /* + * If the expiry time already passed, e.g., because the value + * chosen as the deadline is too small, don't even try to + * start the timer in the past! + */ + if (ktime_us_delta(act, now) < 0) + return 0; + + hrtimer_set_expires(&dl_se->dl_timer, act); + + soft = hrtimer_get_softexpires(&dl_se->dl_timer); + hard = hrtimer_get_expires(&dl_se->dl_timer); + range = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&dl_se->dl_timer, soft, + range, HRTIMER_MODE_ABS, 0); + + return hrtimer_active(&dl_se->dl_timer); +} + +/* + * This is the bandwidth enforcement timer callback. If here, we know + * a task is not on its dl_rq, since the fact that the timer was running + * means the task is throttled and needs a runtime replenishment. + * + * However, what we actually do depends on the fact the task is active, + * (it is on its rq) or has been removed from there by a call to + * dequeue_task_dl(). In the former case we must issue the runtime + * replenishment and add the task back to the dl_rq; in the latter, we just + * do nothing but clearing dl_throttled, so that runtime and deadline + * updating (and the queueing back to dl_rq) will be done by the + * next call to enqueue_task_dl(). + */ +static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) +{ + struct sched_dl_entity *dl_se = container_of(timer, + struct sched_dl_entity, + dl_timer); + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = task_rq(p); + raw_spin_lock(&rq->lock); + + /* + * We need to take care of a possible races here. In fact, the + * task might have changed its scheduling policy to something + * different from SCHED_DEADLINE or changed its reservation + * parameters (through sched_setscheduler()). + */ + if (!dl_task(p) || dl_se->dl_new) + goto unlock; + + sched_clock_tick(); + update_rq_clock(rq); + dl_se->dl_throttled = 0; + if (p->on_rq) { + enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); + if (task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_task(rq->curr); +#ifdef CONFIG_SMP + /* + * Queueing this task back might have overloaded rq, + * check if we need to kick someone away. + */ + if (has_pushable_dl_tasks(rq)) + push_dl_task(rq); +#endif + } +unlock: + raw_spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +void init_dl_task_timer(struct sched_dl_entity *dl_se) +{ + struct hrtimer *timer = &dl_se->dl_timer; + + if (hrtimer_active(timer)) { + hrtimer_try_to_cancel(timer); + return; + } + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = dl_task_timer; +} + +static +int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) +{ + int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq)); + int rorun = dl_se->runtime <= 0; + + if (!rorun && !dmiss) + return 0; + + /* + * If we are beyond our current deadline and we are still + * executing, then we have already used some of the runtime of + * the next instance. Thus, if we do not account that, we are + * stealing bandwidth from the system at each deadline miss! + */ + if (dmiss) { + dl_se->runtime = rorun ? dl_se->runtime : 0; + dl_se->runtime -= rq_clock(rq) - dl_se->deadline; + } + + return 1; +} + +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + +/* + * Update the current task's runtime statistics (provided it is still + * a -deadline task and has not been removed from the dl_rq). + */ +static void update_curr_dl(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_dl_entity *dl_se = &curr->dl; + u64 delta_exec; + + if (!dl_task(curr) || !on_dl_rq(dl_se)) + return; + + /* + * Consumed budget is computed considering the time as + * observed by schedulable tasks (excluding time spent + * in hardirq context, etc.). Deadlines are instead + * computed using hard walltime. This seems to be the more + * natural solution, but the full ramifications of this + * approach need further study. + */ + delta_exec = rq_clock_task(rq) - curr->se.exec_start; + if (unlikely((s64)delta_exec <= 0)) + return; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq_clock_task(rq); + cpuacct_charge(curr, delta_exec); + + sched_rt_avg_update(rq, delta_exec); + + dl_se->runtime -= delta_exec; + if (dl_runtime_exceeded(rq, dl_se)) { + __dequeue_task_dl(rq, curr, 0); + if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) + dl_se->dl_throttled = 1; + else + enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); + + if (!is_leftmost(curr, &rq->dl)) + resched_task(curr); + } + + /* + * Because -- for now -- we share the rt bandwidth, we need to + * account our runtime there too, otherwise actual rt tasks + * would be able to exceed the shared quota. + * + * Account to the root rt group for now. + * + * The solution we're working towards is having the RT groups scheduled + * using deadline servers -- however there's a few nasties to figure + * out before that can happen. + */ + if (rt_bandwidth_enabled()) { + struct rt_rq *rt_rq = &rq->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + /* + * We'll let actual RT tasks worry about the overflow here, we + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant. + */ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } +} + +#ifdef CONFIG_SMP + +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); + +static inline u64 next_deadline(struct rq *rq) +{ + struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); + + if (next && dl_prio(next->prio)) + return next->dl.deadline; + else + return 0; +} + +static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + if (dl_rq->earliest_dl.curr == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + /* + * If the dl_rq had no -deadline tasks, or if the new task + * has shorter deadline than the current one on dl_rq, we + * know that the previous earliest becomes our next earliest, + * as the new task becomes the earliest itself. + */ + dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; + dl_rq->earliest_dl.curr = deadline; + cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); + } else if (dl_rq->earliest_dl.next == 0 || + dl_time_before(deadline, dl_rq->earliest_dl.next)) { + /* + * On the other hand, if the new -deadline task has a + * a later deadline than the earliest one on dl_rq, but + * it is earlier than the next (if any), we must + * recompute the next-earliest. + */ + dl_rq->earliest_dl.next = next_deadline(rq); + } +} + +static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) +{ + struct rq *rq = rq_of_dl_rq(dl_rq); + + /* + * Since we may have removed our earliest (and/or next earliest) + * task we must recompute them. + */ + if (!dl_rq->dl_nr_running) { + dl_rq->earliest_dl.curr = 0; + dl_rq->earliest_dl.next = 0; + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); + } else { + struct rb_node *leftmost = dl_rq->rb_leftmost; + struct sched_dl_entity *entry; + + entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); + dl_rq->earliest_dl.curr = entry->deadline; + dl_rq->earliest_dl.next = next_deadline(rq); + cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); + } +} + +#else + +static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} +static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} + +#endif /* CONFIG_SMP */ + +static inline +void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + u64 deadline = dl_se->deadline; + + WARN_ON(!dl_prio(prio)); + dl_rq->dl_nr_running++; + inc_nr_running(rq_of_dl_rq(dl_rq)); + + inc_dl_deadline(dl_rq, deadline); + inc_dl_migration(dl_se, dl_rq); +} + +static inline +void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + int prio = dl_task_of(dl_se)->prio; + + WARN_ON(!dl_prio(prio)); + WARN_ON(!dl_rq->dl_nr_running); + dl_rq->dl_nr_running--; + dec_nr_running(rq_of_dl_rq(dl_rq)); + + dec_dl_deadline(dl_rq, dl_se->deadline); + dec_dl_migration(dl_se, dl_rq); +} + +static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + struct rb_node **link = &dl_rq->rb_root.rb_node; + struct rb_node *parent = NULL; + struct sched_dl_entity *entry; + int leftmost = 1; + + BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_dl_entity, rb_node); + if (dl_time_before(dl_se->deadline, entry->deadline)) + link = &parent->rb_left; + else { + link = &parent->rb_right; + leftmost = 0; + } + } + + if (leftmost) + dl_rq->rb_leftmost = &dl_se->rb_node; + + rb_link_node(&dl_se->rb_node, parent, link); + rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root); + + inc_dl_tasks(dl_se, dl_rq); +} + +static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + struct dl_rq *dl_rq = dl_rq_of_se(dl_se); + + if (RB_EMPTY_NODE(&dl_se->rb_node)) + return; + + if (dl_rq->rb_leftmost == &dl_se->rb_node) { + struct rb_node *next_node; + + next_node = rb_next(&dl_se->rb_node); + dl_rq->rb_leftmost = next_node; + } + + rb_erase(&dl_se->rb_node, &dl_rq->rb_root); + RB_CLEAR_NODE(&dl_se->rb_node); + + dec_dl_tasks(dl_se, dl_rq); +} + +static void +enqueue_dl_entity(struct sched_dl_entity *dl_se, + struct sched_dl_entity *pi_se, int flags) +{ + BUG_ON(on_dl_rq(dl_se)); + + /* + * If this is a wakeup or a new instance, the scheduling + * parameters of the task might need updating. Otherwise, + * we want a replenishment of its runtime. + */ + if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH) + replenish_dl_entity(dl_se, pi_se); + else + update_dl_entity(dl_se, pi_se); + + __enqueue_dl_entity(dl_se); +} + +static void dequeue_dl_entity(struct sched_dl_entity *dl_se) +{ + __dequeue_dl_entity(dl_se); +} + +static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + struct sched_dl_entity *pi_se = &p->dl; + + /* + * Use the scheduling parameters of the top pi-waiter + * task if we have one and its (relative) deadline is + * smaller than our one... OTW we keep our runtime and + * deadline. + */ + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + pi_se = &pi_task->dl; + + /* + * If p is throttled, we do nothing. In fact, if it exhausted + * its budget it needs a replenishment and, since it now is on + * its rq, the bandwidth timer callback (which clearly has not + * run yet) will take care of this. + */ + if (p->dl.dl_throttled) + return; + + enqueue_dl_entity(&p->dl, pi_se, flags); + + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); +} + +static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + dequeue_dl_entity(&p->dl); + dequeue_pushable_dl_task(rq, p); +} + +static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +{ + update_curr_dl(rq); + __dequeue_task_dl(rq, p, flags); +} + +/* + * Yield task semantic for -deadline tasks is: + * + * get off from the CPU until our next instance, with + * a new runtime. This is of little use now, since we + * don't have a bandwidth reclaiming mechanism. Anyway, + * bandwidth reclaiming is planned for the future, and + * yield_task_dl will indicate that some spare budget + * is available for other task instances to use it. + */ +static void yield_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + /* + * We make the task go to sleep until its current deadline by + * forcing its runtime to zero. This way, update_curr_dl() stops + * it and the bandwidth timer will wake it up and will give it + * new scheduling parameters (thanks to dl_new=1). + */ + if (p->dl.runtime > 0) { + rq->curr->dl.dl_new = 1; + p->dl.runtime = 0; + } + update_curr_dl(rq); +} + +#ifdef CONFIG_SMP + +static int find_later_rq(struct task_struct *task); + +static int +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + struct task_struct *curr; + struct rq *rq; + + if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + goto out; + + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + + /* + * If we are dealing with a -deadline task, we must + * decide where to wake it up. + * If it has a later deadline and the current task + * on this rq can't move (provided the waking task + * can!) we prefer to send it somewhere else. On the + * other hand, if it has a shorter deadline, we + * try to make it stay here, it might be important. + */ + if (unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + (p->nr_cpus_allowed > 1)) { + int target = find_later_rq(p); + + if (target != -1) + cpu = target; + } + rcu_read_unlock(); + +out: + return cpu; +} + +static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) +{ + /* + * Current can't be migrated, useless to reschedule, + * let's hope p can move out. + */ + if (rq->curr->nr_cpus_allowed == 1 || + cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1) + return; + + /* + * p is migratable, so let's not schedule it and + * see if it is pushed or pulled somewhere else. + */ + if (p->nr_cpus_allowed != 1 && + cpudl_find(&rq->rd->cpudl, p, NULL) != -1) + return; + + resched_task(rq->curr); +} + +static int pull_dl_task(struct rq *this_rq); + +#endif /* CONFIG_SMP */ + +/* + * Only called when both the current and waking task are -deadline + * tasks. + */ +static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, + int flags) +{ + if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { + resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * In the unlikely case current and p have the same deadline + * let us try to decide what's the best thing to do... + */ + if ((p->dl.deadline == rq->curr->dl.deadline) && + !test_tsk_need_resched(rq->curr)) + check_preempt_equal_dl(rq, p); +#endif /* CONFIG_SMP */ +} + +#ifdef CONFIG_SCHED_HRTICK +static void start_hrtick_dl(struct rq *rq, struct task_struct *p) +{ + s64 delta = p->dl.dl_runtime - p->dl.runtime; + + if (delta > 10000) + hrtick_start(rq, p->dl.runtime); +} +#endif + +static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, + struct dl_rq *dl_rq) +{ + struct rb_node *left = dl_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_dl_entity, rb_node); +} + +struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) +{ + struct sched_dl_entity *dl_se; + struct task_struct *p; + struct dl_rq *dl_rq; + + dl_rq = &rq->dl; + + if (need_pull_dl_task(rq, prev)) + pull_dl_task(rq); + /* + * When prev is DL, we may throttle it in put_prev_task(). + * So, we update time before we check for dl_nr_running. + */ + if (prev->sched_class == &dl_sched_class) + update_curr_dl(rq); + + if (unlikely(!dl_rq->dl_nr_running)) + return NULL; + + put_prev_task(rq, prev); + + dl_se = pick_next_dl_entity(rq, dl_rq); + BUG_ON(!dl_se); + + p = dl_task_of(dl_se); + p->se.exec_start = rq_clock_task(rq); + + /* Running task will never be pushed. */ + dequeue_pushable_dl_task(rq, p); + +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); +#endif + + set_post_schedule(rq); + + return p; +} + +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +{ + update_curr_dl(rq); + + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) + enqueue_pushable_dl_task(rq, p); +} + +static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) +{ + update_curr_dl(rq); + +#ifdef CONFIG_SCHED_HRTICK + if (hrtick_enabled(rq) && queued && p->dl.runtime > 0) + start_hrtick_dl(rq, p); +#endif +} + +static void task_fork_dl(struct task_struct *p) +{ + /* + * SCHED_DEADLINE tasks cannot fork and this is achieved through + * sched_fork() + */ +} + +static void task_dead_dl(struct task_struct *p) +{ + struct hrtimer *timer = &p->dl.dl_timer; + struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + + /* + * Since we are TASK_DEAD we won't slip out of the domain! + */ + raw_spin_lock_irq(&dl_b->lock); + dl_b->total_bw -= p->dl.dl_bw; + raw_spin_unlock_irq(&dl_b->lock); + + hrtimer_cancel(timer); +} + +static void set_curr_task_dl(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq_clock_task(rq); + + /* You can't push away the running task */ + dequeue_pushable_dl_task(rq, p); +} + +#ifdef CONFIG_SMP + +/* Only try algorithms three times */ +#define DL_MAX_TRIES 3 + +static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if (!task_running(rq, p) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && + (p->nr_cpus_allowed > 1)) + return 1; + + return 0; +} + +/* Returns the second earliest -deadline task, NULL otherwise */ +static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) +{ + struct rb_node *next_node = rq->dl.rb_leftmost; + struct sched_dl_entity *dl_se; + struct task_struct *p = NULL; + +next_node: + next_node = rb_next(next_node); + if (next_node) { + dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); + p = dl_task_of(dl_se); + + if (pick_dl_task(rq, p, cpu)) + return p; + + goto next_node; + } + + return NULL; +} + +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl); + +static int find_later_rq(struct task_struct *task) +{ + struct sched_domain *sd; + struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); + int this_cpu = smp_processor_id(); + int best_cpu, cpu = task_cpu(task); + + /* Make sure the mask is initialized first */ + if (unlikely(!later_mask)) + return -1; + + if (task->nr_cpus_allowed == 1) + return -1; + + best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, + task, later_mask); + if (best_cpu == -1) + return -1; + + /* + * If we are here, some target has been found, + * the most suitable of which is cached in best_cpu. + * This is, among the runqueues where the current tasks + * have later deadlines than the task's one, the rq + * with the latest possible one. + * + * Now we check how well this matches with task's + * affinity and system topology. + * + * The last cpu where the task run is our first + * guess, since it is most likely cache-hot there. + */ + if (cpumask_test_cpu(cpu, later_mask)) + return cpu; + /* + * Check if this_cpu is to be skipped (i.e., it is + * not in the mask) or not. + */ + if (!cpumask_test_cpu(this_cpu, later_mask)) + this_cpu = -1; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + + /* + * If possible, preempting this_cpu is + * cheaper than migrating. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return this_cpu; + } + + /* + * Last chance: if best_cpu is valid and is + * in the mask, that becomes our choice. + */ + if (best_cpu < nr_cpu_ids && + cpumask_test_cpu(best_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return best_cpu; + } + } + } + rcu_read_unlock(); + + /* + * At this point, all our guesses failed, we just return + * 'something', and let the caller sort the things out. + */ + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(later_mask); + if (cpu < nr_cpu_ids) + return cpu; + + return -1; +} + +/* Locks the rq it finds */ +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *later_rq = NULL; + int tries; + int cpu; + + for (tries = 0; tries < DL_MAX_TRIES; tries++) { + cpu = find_later_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + break; + + later_rq = cpu_rq(cpu); + + /* Retry if something changed. */ + if (double_lock_balance(rq, later_rq)) { + if (unlikely(task_rq(task) != rq || + !cpumask_test_cpu(later_rq->cpu, + &task->cpus_allowed) || + task_running(rq, task) || !task->on_rq)) { + double_unlock_balance(rq, later_rq); + later_rq = NULL; + break; + } + } + + /* + * If the rq we found has no -deadline task, or + * its earliest one has a later deadline than our + * task, the rq is a good one. + */ + if (!later_rq->dl.dl_nr_running || + dl_time_before(task->dl.deadline, + later_rq->dl.earliest_dl.curr)) + break; + + /* Otherwise we try again. */ + double_unlock_balance(rq, later_rq); + later_rq = NULL; + } + + return later_rq; +} + +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = rb_entry(rq->dl.pushable_dl_tasks_leftmost, + struct task_struct, pushable_dl_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!p->on_rq); + BUG_ON(!dl_task(p)); + + return p; +} + +/* + * See if the non running -deadline tasks on this rq + * can be sent to some other CPU where they can preempt + * and start executing. + */ +static int push_dl_task(struct rq *rq) +{ + struct task_struct *next_task; + struct rq *later_rq; + + if (!rq->dl.overloaded) + return 0; + + next_task = pick_next_pushable_dl_task(rq); + if (!next_task) + return 0; + +retry: + if (unlikely(next_task == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* + * If next_task preempts rq->curr, and rq->curr + * can move away, it makes sense to just reschedule + * without going further in pushing next_task. + */ + if (dl_task(rq->curr) && + dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && + rq->curr->nr_cpus_allowed > 1) { + resched_task(rq->curr); + return 0; + } + + /* We might release rq lock */ + get_task_struct(next_task); + + /* Will lock the rq it'll find */ + later_rq = find_lock_later_rq(next_task, rq); + if (!later_rq) { + struct task_struct *task; + + /* + * We must check all this again, since + * find_lock_later_rq releases rq->lock and it is + * then possible that next_task has migrated. + */ + task = pick_next_pushable_dl_task(rq); + if (task_cpu(next_task) == rq->cpu && task == next_task) { + /* + * The task is still there. We don't try + * again, some other cpu will pull it when ready. + */ + dequeue_pushable_dl_task(rq, next_task); + goto out; + } + + if (!task) + /* No more tasks */ + goto out; + + put_task_struct(next_task); + next_task = task; + goto retry; + } + + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, later_rq->cpu); + activate_task(later_rq, next_task, 0); + + resched_task(later_rq->curr); + + double_unlock_balance(rq, later_rq); + +out: + put_task_struct(next_task); + + return 1; +} + +static void push_dl_tasks(struct rq *rq) +{ + /* Terminates as it moves a -deadline task */ + while (push_dl_task(rq)) + ; +} + +static int pull_dl_task(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p; + struct rq *src_rq; + u64 dmin = LONG_MAX; + + if (likely(!dl_overloaded(this_rq))) + return 0; + + /* + * Match the barrier from dl_set_overloaded; this guarantees that if we + * see overloaded we must also see the dlo_mask bit. + */ + smp_rmb(); + + for_each_cpu(cpu, this_rq->rd->dlo_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + + /* + * It looks racy, abd it is! However, as in sched_rt.c, + * we are fine with this. + */ + if (this_rq->dl.dl_nr_running && + dl_time_before(this_rq->dl.earliest_dl.curr, + src_rq->dl.earliest_dl.next)) + continue; + + /* Might drop this_rq->lock */ + double_lock_balance(this_rq, src_rq); + + /* + * If there are no more pullable tasks on the + * rq, we're done with it. + */ + if (src_rq->dl.dl_nr_running <= 1) + goto skip; + + p = pick_next_earliest_dl_task(src_rq, this_cpu); + + /* + * We found a task to be pulled if: + * - it preempts our current (if there's one), + * - it will preempt the last one we pulled (if any). + */ + if (p && dl_time_before(p->dl.deadline, dmin) && + (!this_rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + this_rq->dl.earliest_dl.curr))) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->on_rq); + + /* + * Then we pull iff p has actually an earlier + * deadline than the current task of its runqueue. + */ + if (dl_time_before(p->dl.deadline, + src_rq->curr->dl.deadline)) + goto skip; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + + /* Is there any other task even earlier? */ + } +skip: + double_unlock_balance(this_rq, src_rq); + } + + return ret; +} + +static void post_schedule_dl(struct rq *rq) +{ + push_dl_tasks(rq); +} + +/* + * Since the task is not running and a reschedule is not going to happen + * anytime soon on its runqueue, we try pushing it away now. + */ +static void task_woken_dl(struct rq *rq, struct task_struct *p) +{ + if (!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + has_pushable_dl_tasks(rq) && + p->nr_cpus_allowed > 1 && + dl_task(rq->curr) && + (rq->curr->nr_cpus_allowed < 2 || + dl_entity_preempt(&rq->curr->dl, &p->dl))) { + push_dl_tasks(rq); + } +} + +static void set_cpus_allowed_dl(struct task_struct *p, + const struct cpumask *new_mask) +{ + struct rq *rq; + int weight; + + BUG_ON(!dl_task(p)); + + /* + * Update only if the task is actually running (i.e., + * it is on the rq AND it is not throttled). + */ + if (!on_dl_rq(&p->dl)) + return; + + weight = cpumask_weight(new_mask); + + /* + * Only update if the process changes its state from whether it + * can migrate or not. + */ + if ((p->nr_cpus_allowed > 1) == (weight > 1)) + return; + + rq = task_rq(p); + + /* + * The process used to be able to migrate OR it can now migrate + */ + if (weight <= 1) { + if (!task_current(rq, p)) + dequeue_pushable_dl_task(rq, p); + BUG_ON(!rq->dl.dl_nr_migratory); + rq->dl.dl_nr_migratory--; + } else { + if (!task_current(rq, p)) + enqueue_pushable_dl_task(rq, p); + rq->dl.dl_nr_migratory++; + } + + update_dl_migration(&rq->dl); +} + +/* Assumes rq->lock is held */ +static void rq_online_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_set_overload(rq); + + if (rq->dl.dl_nr_running > 0) + cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); +} + +/* Assumes rq->lock is held */ +static void rq_offline_dl(struct rq *rq) +{ + if (rq->dl.overloaded) + dl_clear_overload(rq); + + cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); +} + +void init_sched_dl_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i), + GFP_KERNEL, cpu_to_node(i)); +} + +#endif /* CONFIG_SMP */ + +static void switched_from_dl(struct rq *rq, struct task_struct *p) +{ + if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) + hrtimer_try_to_cancel(&p->dl.dl_timer); + +#ifdef CONFIG_SMP + /* + * Since this might be the only -deadline task on the rq, + * this is the right place to try to pull some other one + * from an overloaded cpu, if any. + */ + if (!rq->dl.dl_nr_running) + pull_dl_task(rq); +#endif +} + +/* + * When switching to -deadline, we may overload the rq, then + * we try to push someone off, if possible. + */ +static void switched_to_dl(struct rq *rq, struct task_struct *p) +{ + int check_resched = 1; + + /* + * If p is throttled, don't consider the possibility + * of preempting rq->curr, the check will be done right + * after its runtime will get replenished. + */ + if (unlikely(p->dl.dl_throttled)) + return; + + if (p->on_rq && rq->curr != p) { +#ifdef CONFIG_SMP + if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + /* Only reschedule if pushing failed */ + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && task_has_dl_policy(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + } +} + +/* + * If the scheduling parameters of a -deadline task changed, + * a push or pull operation might be needed. + */ +static void prio_changed_dl(struct rq *rq, struct task_struct *p, + int oldprio) +{ + if (p->on_rq || rq->curr == p) { +#ifdef CONFIG_SMP + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + pull_dl_task(rq); + + /* + * If we now have a earlier deadline task than p, + * then reschedule, provided p is still on this + * runqueue. + */ + if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && + rq->curr == p) + resched_task(p); +#else + /* + * Again, we don't know if p has a earlier + * or later deadline, so let's blindly set a + * (maybe not needed) rescheduling point. + */ + resched_task(p); +#endif /* CONFIG_SMP */ + } else + switched_to_dl(rq, p); +} + +const struct sched_class dl_sched_class = { + .next = &rt_sched_class, + .enqueue_task = enqueue_task_dl, + .dequeue_task = dequeue_task_dl, + .yield_task = yield_task_dl, + + .check_preempt_curr = check_preempt_curr_dl, + + .pick_next_task = pick_next_task_dl, + .put_prev_task = put_prev_task_dl, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_dl, + .set_cpus_allowed = set_cpus_allowed_dl, + .rq_online = rq_online_dl, + .rq_offline = rq_offline_dl, + .post_schedule = post_schedule_dl, + .task_woken = task_woken_dl, +#endif + + .set_curr_task = set_curr_task_dl, + .task_tick = task_tick_dl, + .task_fork = task_fork_dl, + .task_dead = task_dead_dl, + + .prio_changed = prio_changed_dl, + .switched_from = switched_from_dl, + .switched_to = switched_to_dl, +}; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 5c34d1817e8f..695f9773bb60 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); - return group_path; + return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); } #endif @@ -139,7 +138,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif #ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); + SEQ_printf(m, " %d", task_node(p)); #endif #ifdef CONFIG_CGROUP_SCHED SEQ_printf(m, " %s", task_group_path(task_group(p))); @@ -321,6 +320,7 @@ do { \ P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); + P64(max_idle_balance_cost); #endif P(ttwu_count); @@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m) PN(cpu_clk); P(jiffies); #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - P(sched_clock_stable); + P(sched_clock_stable()); #endif #undef PN #undef P @@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; + if (p->numa_faults_memory) + nr_faults = p->numa_faults_memory[2*node + i]; cpu_current = !i ? (task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); home_node = (p->numa_preferred_nid == node); - SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", + SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", i, node, cpu_current, home_node, nr_faults); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e64b0794060e..7e9bd0b1fa9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { if (se->cfs_rq == pse->cfs_rq) - return 1; + return se->cfs_rq; - return 0; + return NULL; } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - static void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { @@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) */ /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); + se_depth = (*se)->depth; + pse_depth = (*pse)->depth; while (se_depth > pse_depth) { se_depth--; @@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - static inline struct sched_entity *parent_entity(struct sched_entity *se) { return NULL; @@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -/* - * After skipping a page migration on a shared page, skip N more numa page - * migrations unconditionally. This reduces the number of NUMA migrations - * in shared memory workloads, and has the effect of pulling tasks towards - * where their memory lives, over pulling the memory towards the task. - */ -unsigned int sysctl_numa_balancing_migrate_deferred = 16; - static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; @@ -872,15 +847,6 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } -/* - * Once a preferred node is selected the scheduler balancer will prefer moving - * a task to that node for sysctl_numa_balancing_settle_count number of PTE - * scans. This will give the process the chance to accumulate more faults on - * the preferred node but still allow the scheduler to move the task again if - * the nodes CPUs are overloaded. - */ -unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; - static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != -1); @@ -902,10 +868,26 @@ struct numa_group { struct list_head task_list; struct rcu_head rcu; + nodemask_t active_nodes; unsigned long total_faults; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long *faults_cpu; unsigned long faults[0]; }; +/* Shared or private faults. */ +#define NR_NUMA_HINT_FAULT_TYPES 2 + +/* Memory and CPU locality */ +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) + +/* Averaged statistics, and temporary buffers. */ +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) + pid_t task_numa_group_id(struct task_struct *p) { return p->numa_group ? p->numa_group->gid : 0; @@ -913,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p) static inline int task_faults_idx(int nid, int priv) { - return 2 * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * nid + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; - return p->numa_faults[task_faults_idx(nid, 0)] + - p->numa_faults[task_faults_idx(nid, 1)]; + return p->numa_faults_memory[task_faults_idx(nid, 0)] + + p->numa_faults_memory[task_faults_idx(nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -930,7 +912,14 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) if (!p->numa_group) return 0; - return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; + return p->numa_group->faults[task_faults_idx(nid, 0)] + + p->numa_group->faults[task_faults_idx(nid, 1)]; +} + +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) +{ + return group->faults_cpu[task_faults_idx(nid, 0)] + + group->faults_cpu[task_faults_idx(nid, 1)]; } /* @@ -943,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) { unsigned long total_faults; - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; total_faults = p->total_numa_faults; @@ -962,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid) return 1000 * group_faults(p, nid) / p->numa_group->total_faults; } +bool should_numa_migrate_memory(struct task_struct *p, struct page * page, + int src_nid, int dst_cpu) +{ + struct numa_group *ng = p->numa_group; + int dst_nid = cpu_to_node(dst_cpu); + int last_cpupid, this_cpupid; + + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + + /* + * Multi-stage node selection is used in conjunction with a periodic + * migration fault to build a temporal task<->page relation. By using + * a two-stage filter we remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate + * a task's usage of a particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and getting the + * same result twice in a row, given these samples are fully + * independent, is then given by P(n)^2, provided our sample period + * is sufficiently short compared to the usage pattern. + * + * This quadric squishes small probabilities, making it less likely we + * act on an unlikely task<->page relation. + */ + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && + cpupid_to_nid(last_cpupid) != dst_nid) + return false; + + /* Always allow migrate on private faults */ + if (cpupid_match_pid(p, last_cpupid)) + return true; + + /* A shared fault, but p->numa_group has not been set up yet. */ + if (!ng) + return true; + + /* + * Do not migrate if the destination is not a node that + * is actively used by this numa group. + */ + if (!node_isset(dst_nid, ng->active_nodes)) + return false; + + /* + * Source is a node that is not actively used by this + * numa group, while the destination is. Migrate. + */ + if (!node_isset(src_nid, ng->active_nodes)) + return true; + + /* + * Both source and destination are nodes in active + * use by this numa group. Maximize memory bandwidth + * by migrating from more heavily used groups, to less + * heavily used ones, spreading the load around. + * Use a 1/4 hysteresis to avoid spurious page movement. + */ + return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +} + static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); @@ -1023,7 +1075,7 @@ struct task_numa_env { struct numa_stats src_stats, dst_stats; - int imbalance_pct, idx; + int imbalance_pct; struct task_struct *best_task; long best_imp; @@ -1211,7 +1263,7 @@ static int task_numa_migrate(struct task_struct *p) * elsewhere, so there is no point in (re)trying. */ if (unlikely(!sd)) { - p->numa_preferred_nid = cpu_to_node(task_cpu(p)); + p->numa_preferred_nid = task_node(p); return -EINVAL; } @@ -1258,11 +1310,15 @@ static int task_numa_migrate(struct task_struct *p) p->numa_scan_period = task_scan_min(p); if (env.best_task == NULL) { - int ret = migrate_task_to(p, env.best_cpu); + ret = migrate_task_to(p, env.best_cpu); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, env.best_cpu); return ret; } ret = migrate_swap(p, env.best_task); + if (ret != 0) + trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); put_task_struct(env.best_task); return ret; } @@ -1271,14 +1327,14 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ p->numa_migrate_retry = jiffies + HZ; /* Success if task is already running on preferred CPU */ - if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) + if (task_node(p) == p->numa_preferred_nid) return; /* Otherwise, try migrate to a CPU on the preferred node */ @@ -1286,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p) } /* + * Find the nodes on which the workload is actively running. We do this by + * tracking the nodes from which NUMA hinting faults are triggered. This can + * be different from the set of nodes where the workload's memory is currently + * located. + * + * The bitmask is used to make smarter decisions on when to do NUMA page + * migrations, To prevent flip-flopping, and excessive page migrations, nodes + * are added when they cause over 6/16 of the maximum number of faults, but + * only removed when they drop below 3/16. + */ +static void update_numa_active_node_mask(struct numa_group *numa_group) +{ + unsigned long faults, max_faults = 0; + int nid; + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (faults > max_faults) + max_faults = faults; + } + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (!node_isset(nid, numa_group->active_nodes)) { + if (faults > max_faults * 6 / 16) + node_set(nid, numa_group->active_nodes); + } else if (faults < max_faults * 3 / 16) + node_clear(nid, numa_group->active_nodes); + } +} + +/* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan * period will be for the next scan window. If local/remote ratio is below @@ -1350,7 +1438,6 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } @@ -1360,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p, memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. + */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ + u64 runtime, delta, now; + /* Use the start of this time slice to avoid calculations. */ + now = p->se.exec_start; + runtime = p->se.sum_exec_runtime; + + if (p->last_task_numa_placement) { + delta = runtime - p->last_sum_exec_runtime; + *period = now - p->last_task_numa_placement; + } else { + delta = p->se.avg.runnable_avg_sum; + *period = p->se.avg.runnable_avg_period; + } + + p->last_sum_exec_runtime = runtime; + p->last_task_numa_placement = now; + + return delta; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; unsigned long fault_types[2] = { 0, 0 }; + unsigned long total_faults; + u64 runtime, period; spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -1373,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p) p->numa_scan_seq = seq; p->numa_scan_period_max = task_scan_max(p); + total_faults = p->numa_faults_locality[0] + + p->numa_faults_locality[1]; + runtime = numa_get_avg_runtime(p, &period); + /* If the task is part of a group prevent parallel updates to group stats */ if (p->numa_group) { group_lock = &p->numa_group->lock; @@ -1384,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p) unsigned long faults = 0, group_faults = 0; int priv, i; - for (priv = 0; priv < 2; priv++) { - long diff; + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { + long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); - diff = -p->numa_faults[i]; /* Decay existing window, copy faults since last scan */ - p->numa_faults[i] >>= 1; - p->numa_faults[i] += p->numa_faults_buffer[i]; - fault_types[priv] += p->numa_faults_buffer[i]; - p->numa_faults_buffer[i] = 0; + diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; + fault_types[priv] += p->numa_faults_buffer_memory[i]; + p->numa_faults_buffer_memory[i] = 0; - faults += p->numa_faults[i]; - diff += p->numa_faults[i]; + /* + * Normalize the faults_from, so all tasks in a group + * count according to CPU use, instead of by the raw + * number of faults. Tasks with little runtime have + * little over-all impact on throughput, and thus their + * faults are less important. + */ + f_weight = div64_u64(runtime << 16, period + 1); + f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + (total_faults + 1); + f_diff = f_weight - p->numa_faults_cpu[i] / 2; + p->numa_faults_buffer_cpu[i] = 0; + + p->numa_faults_memory[i] += diff; + p->numa_faults_cpu[i] += f_diff; + faults += p->numa_faults_memory[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ p->numa_group->faults[i] += diff; + p->numa_group->faults_cpu[i] += f_diff; p->numa_group->total_faults += diff; group_faults += p->numa_group->faults[i]; } @@ -1421,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { + update_numa_active_node_mask(p->numa_group); /* * If the preferred task and group nids are different, * iterate over the nodes again to find the best place. @@ -1470,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!p->numa_group)) { unsigned int size = sizeof(struct numa_group) + - 2*nr_node_ids*sizeof(unsigned long); + 4*nr_node_ids*sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -1480,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, spin_lock_init(&grp->lock); INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; + /* Second half of the array tracks nids where faults happen */ + grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * + nr_node_ids; - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] = p->numa_faults[i]; + node_set(task_node(current), grp->active_nodes); + + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1539,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults[i]; - grp->faults[i] += p->numa_faults[i]; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { + my_grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] += p->numa_faults_memory[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; @@ -1567,12 +1707,12 @@ void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; int i; - void *numa_faults = p->numa_faults; + void *numa_faults = p->numa_faults_memory; if (grp) { spin_lock(&grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] -= p->numa_faults[i]; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); @@ -1582,18 +1722,21 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; + p->numa_faults_cpu= NULL; + p->numa_faults_buffer_cpu = NULL; kfree(numa_faults); } /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int last_cpupid, int node, int pages, int flags) +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) { struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; + int cpu_node = task_node(current); int priv; if (!numabalancing_enabled) @@ -1608,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) return; /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; + if (unlikely(!p->numa_faults_memory)) { + int size = sizeof(*p->numa_faults_memory) * + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults) + p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults_memory) return; - BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); + BUG_ON(p->numa_faults_buffer_memory); + /* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ + p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); + p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1646,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; + p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } @@ -1762,6 +1914,8 @@ void task_numa_work(struct callback_head *work) start = end; if (pages <= 0) goto out; + + cond_resched(); } while (end != vma->vm_end); } @@ -2222,13 +2376,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) se->avg.load_avg_contrib >>= NICE_0_SHIFT; } } -#else + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} -#endif +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +#endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) { @@ -2326,12 +2487,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); } -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} - /* Add the load generated by se into cfs_rq's child load-average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -2365,13 +2520,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } wakeup = 0; } else { - /* - * Task re-woke on same cpu (or else migrate_task_rq_fair() - * would have made count negative); we must be careful to avoid - * double-accounting blocked time after synchronizing decays. - */ - se->avg.last_runnable_update += __synchronize_entity_decay(se) - << 20; + __synchronize_entity_decay(se); } /* migrated tasks did not contribute to our blocked load */ @@ -2425,7 +2574,10 @@ void idle_exit_fair(struct rq *this_rq) update_rq_runnable_avg(this_rq, 0); } -#else +static int idle_balance(struct rq *this_rq); + +#else /* CONFIG_SMP */ + static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} @@ -2437,7 +2589,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) {} static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) {} -#endif + +static inline int idle_balance(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2587,10 +2745,10 @@ static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last == se) - cfs_rq->last = NULL; - else + if (cfs_rq->last != se) break; + + cfs_rq->last = NULL; } } @@ -2598,10 +2756,10 @@ static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next == se) - cfs_rq->next = NULL; - else + if (cfs_rq->next != se) break; + + cfs_rq->next = NULL; } } @@ -2609,10 +2767,10 @@ static void __clear_buddies_skip(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - cfs_rq->skip = NULL; - else + if (cfs_rq->skip != se) break; + + cfs_rq->skip = NULL; } } @@ -2755,17 +2913,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *left = se; + struct sched_entity *left = __pick_first_entity(cfs_rq); + struct sched_entity *se; + + /* + * If curr is set we have to see if its left of the leftmost entity + * still in the tree, provided there was anything in the tree at all. + */ + if (!left || (curr && entity_before(curr, left))) + left = curr; + + se = left; /* ideally we run the leftmost entity */ /* * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ if (cfs_rq->skip == se) { - struct sched_entity *second = __pick_next_entity(se); + struct sched_entity *second; + + if (se == curr) { + second = __pick_first_entity(cfs_rq); + } else { + second = __pick_next_entity(se); + if (!second || (curr && entity_before(curr, second))) + second = curr; + } + if (second && wakeup_preempt_entity(second, left) < 1) se = second; } @@ -2787,7 +2964,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { @@ -3442,22 +3619,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) } /* conditionally throttle active cfs_rq's from put_prev_entity() */ -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { if (!cfs_bandwidth_used()) - return; + return false; if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return; + return false; /* * it's possible for a throttled entity to be forced into a running * state (e.g. set_curr_task), in this case we're finished. */ if (cfs_rq_throttled(cfs_rq)) - return; + return true; throttle_cfs_rq(cfs_rq); + return true; } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -3567,7 +3745,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -4101,12 +4279,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int load_idx) + int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; + do { unsigned long load, avg_load; int local_group; @@ -4218,13 +4400,14 @@ done: } /* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * - * Balance, ie. select the least loaded group. + * Balances load by selecting the idlest cpu in the idlest group, or under + * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. * - * Returns the target CPU number, or the same CPU if no balancing is needed. + * Returns the target cpu number. * * preempt must be disabled. */ @@ -4274,7 +4457,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } while (sd) { - int load_idx = sd->forkexec_idx; struct sched_group *group; int weight; @@ -4283,10 +4465,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f continue; } - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - - group = find_idlest_group(sd, p, cpu, load_idx); + group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; @@ -4503,26 +4682,124 @@ preempt: set_last_buddy(se); } -static struct task_struct *pick_next_task_fair(struct rq *rq) +static struct task_struct * +pick_next_task_fair(struct rq *rq, struct task_struct *prev) { - struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; + struct task_struct *p; + int new_tasks; +again: +#ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) - return NULL; + goto idle; + + if (prev->sched_class != &fair_sched_class) + goto simple; + + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. + */ do { - se = pick_next_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + + /* + * Since we got here without doing put_prev_entity() we also + * have to consider cfs_rq->curr. If it is still a runnable + * entity, update_curr() will update its vruntime, otherwise + * forget we've ever seen it. + */ + if (curr && curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + /* + * This call to check_cfs_rq_runtime() will do the throttle and + * dequeue its entity in the parent(s). Therefore the 'simple' + * nr_running test will indeed be correct. + */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + + /* + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. + */ + if (prev != p) { + struct sched_entity *pse = &prev->se; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; + int pse_depth = pse->depth; + + if (se_depth <= pse_depth) { + put_prev_entity(cfs_rq_of(pse), pse); + pse = parent_entity(pse); + } + if (se_depth >= pse_depth) { + set_next_entity(cfs_rq_of(se), se); + se = parent_entity(se); + } + } + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); + } + + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); + + return p; +simple: + cfs_rq = &rq->cfs; +#endif + + if (!cfs_rq->nr_running) + goto idle; + + put_prev_task(rq, prev); + + do { + se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); p = task_of(se); + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); return p; + +idle: + new_tasks = idle_balance(rq); + /* + * Because idle_balance() releases (and re-acquires) rq->lock, it is + * possible for any higher priority task to appear. In that case we + * must re-start the pick_next_entity() loop. + */ + if (new_tasks < 0) + return RETRY_TASK; + + if (new_tasks > 0) + goto again; + + return NULL; } /* @@ -4760,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) * Is this task likely cache-hot: */ static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now) { s64 delta; @@ -4794,7 +5071,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) { return false; } @@ -4825,7 +5102,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); @@ -4921,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); @@ -5512,7 +5789,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - unsigned long nr_running; unsigned long load; int i; @@ -5521,8 +5797,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); - nr_running = rq->nr_running; - /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -5530,7 +5804,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->sum_nr_running += nr_running; + sgs->sum_nr_running += rq->nr_running; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -5787,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; - if (busiest->avg_load > tmp) { + if (busiest->avg_load > scaled_busy_load_per_task) { pwr_move += busiest->group_power * min(busiest->load_per_task, - busiest->avg_load - tmp); + busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ @@ -6371,17 +6643,23 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -void idle_balance(int this_cpu, struct rq *this_rq) +static int idle_balance(struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; + int this_cpu = this_rq->cpu; + idle_enter_fair(this_rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ this_rq->idle_stamp = rq_clock(this_rq); if (this_rq->avg_idle < sysctl_sched_migration_cost) - return; + goto out; /* * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6419,15 +6697,22 @@ void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + if (pulled_task) break; - } } rcu_read_unlock(); raw_spin_lock(&this_rq->lock); + /* + * While browsing the domains, we released the rq lock. + * A task could have be enqueued in the meantime + */ + if (this_rq->cfs.h_nr_running && !pulled_task) { + pulled_task = 1; + goto out; + } + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* * We are going idle. next_balance may be set based on @@ -6438,6 +6723,20 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; + +out: + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running && + (this_rq->dl.dl_nr_running || + (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) + pulled_task = -1; + + if (pulled_task) { + idle_exit_fair(this_rq); + this_rq->idle_stamp = 0; + } + + return pulled_task; } /* @@ -6508,6 +6807,11 @@ out_unlock: return 0; } +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -6521,7 +6825,7 @@ static struct { unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -static inline int find_new_ilb(int call_cpu) +static inline int find_new_ilb(void) { int ilb = cpumask_first(nohz.idle_cpus_mask); @@ -6536,13 +6840,13 @@ static inline int find_new_ilb(int call_cpu) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). */ -static void nohz_balancer_kick(int cpu) +static void nohz_balancer_kick(void) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(cpu); + ilb_cpu = find_new_ilb(); if (ilb_cpu >= nr_cpu_ids) return; @@ -6562,8 +6866,13 @@ static void nohz_balancer_kick(int cpu) static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); + /* + * Completely isolated CPUs don't ever set, so we must test. + */ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -6617,6 +6926,12 @@ void nohz_balance_enter_idle(int cpu) if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; + /* + * If we're a completely isolated CPU, we don't play. + */ + if (on_null_domain(cpu_rq(cpu))) + return; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); @@ -6652,10 +6967,10 @@ void update_max_interval(void) * * Balancing parameters are set up in init_sched_domains. */ -static void rebalance_domains(int cpu, enum cpu_idle_type idle) +static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; - struct rq *rq = cpu_rq(cpu); + int cpu = rq->cpu; unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -6752,9 +7067,9 @@ out: * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { - struct rq *this_rq = cpu_rq(this_cpu); + int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; @@ -6781,7 +7096,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) update_idle_cpu_load(rq); raw_spin_unlock_irq(&rq->lock); - rebalance_domains(balance_cpu, CPU_IDLE); + rebalance_domains(rq, CPU_IDLE); if (time_after(this_rq->next_balance, rq->next_balance)) this_rq->next_balance = rq->next_balance; @@ -6800,14 +7115,14 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq, int cpu) +static inline int nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_power *sgp; - int nr_busy; + int nr_busy, cpu = rq->cpu; - if (unlikely(idle_cpu(cpu))) + if (unlikely(rq->idle_balance)) return 0; /* @@ -6856,7 +7171,7 @@ need_kick: return 1; } #else -static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } #endif /* @@ -6865,38 +7180,34 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } */ static void run_rebalance_domains(struct softirq_action *h) { - int this_cpu = smp_processor_id(); - struct rq *this_rq = cpu_rq(this_cpu); + struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_cpu, idle); + rebalance_domains(this_rq, idle); /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - nohz_idle_balance(this_cpu, idle); -} - -static inline int on_null_domain(int cpu) -{ - return !rcu_dereference_sched(cpu_rq(cpu)->sd); + nohz_idle_balance(this_rq, idle); } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -void trigger_load_balance(struct rq *rq, int cpu) +void trigger_load_balance(struct rq *rq) { /* Don't need to rebalance while attached to NULL domain */ - if (time_after_eq(jiffies, rq->next_balance) && - likely(!on_null_domain(cpu))) + if (unlikely(on_null_domain(rq))) + return; + + if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) - nohz_balancer_kick(cpu); + if (nohz_kick_needed(rq)) + nohz_balancer_kick(); #endif } @@ -7012,15 +7323,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7047,7 +7358,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - if (!p->se.on_rq) + struct sched_entity *se = &p->se; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif + if (!se->on_rq) return; /* @@ -7095,7 +7414,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; + /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -7121,23 +7442,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) on_rq = 1; if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; if (!on_rq) { - cfs_rq = cfs_rq_of(&p->se); - p->se.vruntime += cfs_rq->min_vruntime; + cfs_rq = cfs_rq_of(se); + se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP /* * migrate_task_rq_fair() will have removed our previous * contribution, but we must synchronize for ongoing future * decay. */ - p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; #endif } } @@ -7233,10 +7555,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, if (!se) return; - if (!parent) + if (!parent) { se->cfs_rq = &rq->cfs; - else + se->depth = 0; + } else { se->cfs_rq = parent->my_q; + se->depth = parent->depth + 1; + } se->my_q = cfs_rq; /* guarantee group entities always have weight */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c new file mode 100644 index 000000000000..8f4390a079c7 --- /dev/null +++ b/kernel/sched/idle.c @@ -0,0 +1,265 @@ +/* + * Generic entry point for the idle threads + */ +#include <linux/sched.h> +#include <linux/cpu.h> +#include <linux/cpuidle.h> +#include <linux/tick.h> +#include <linux/mm.h> +#include <linux/stackprotector.h> + +#include <asm/tlb.h> + +#include <trace/events/power.h> + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + +#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + rcu_idle_enter(); + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!tif_need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); + return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; + local_irq_enable(); +} + +/** + * cpuidle_idle_call - the main idle function + * + * NOTE: no locks or semaphores should be used here + * return non-zero on failure + */ +static int cpuidle_idle_call(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int next_state, entered_state, ret; + bool broadcast; + + /* + * Check if the idle task must be rescheduled. If it is the + * case, exit the function after re-enabling the local irq and + * set again the polling flag + */ + if (current_clr_polling_and_test()) { + local_irq_enable(); + __current_set_polling(); + return 0; + } + + /* + * During the idle period, stop measuring the disabled irqs + * critical sections latencies + */ + stop_critical_timings(); + + /* + * Tell the RCU framework we are entering an idle section, + * so no more rcu read side critical sections and one more + * step to the grace period + */ + rcu_idle_enter(); + + /* + * Check if the cpuidle framework is ready, otherwise fallback + * to the default arch specific idle method + */ + ret = cpuidle_enabled(drv, dev); + + if (!ret) { + /* + * Ask the governor to choose an idle state it thinks + * it is convenient to go to. There is *always* a + * convenient idle state + */ + next_state = cpuidle_select(drv, dev); + + /* + * The idle task must be scheduled, it is pointless to + * go to idle, just update no idle residency and get + * out of this function + */ + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + entered_state = next_state; + local_irq_enable(); + } else { + broadcast = !!(drv->states[next_state].flags & + CPUIDLE_FLAG_TIMER_STOP); + + if (broadcast) + /* + * Tell the time framework to switch + * to a broadcast timer because our + * local timer will be shutdown. If a + * local timer is used from another + * cpu as a broadcast timer, this call + * may fail if it is not available + */ + ret = clockevents_notify( + CLOCK_EVT_NOTIFY_BROADCAST_ENTER, + &dev->cpu); + + if (!ret) { + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + /* + * Enter the idle state previously + * returned by the governor + * decision. This function will block + * until an interrupt occurs and will + * take care of re-enabling the local + * interrupts + */ + entered_state = cpuidle_enter(drv, dev, + next_state); + + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, + dev->cpu); + + if (broadcast) + clockevents_notify( + CLOCK_EVT_NOTIFY_BROADCAST_EXIT, + &dev->cpu); + + /* + * Give the governor an opportunity to reflect on the + * outcome + */ + cpuidle_reflect(dev, entered_state); + } + } + } + + /* + * We can't use the cpuidle framework, let's use the default + * idle routine + */ + if (ret) + arch_cpu_idle(); + + __current_set_polling(); + + /* + * It is up to the idle functions to enable back the local + * interrupt + */ + if (WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); + + rcu_idle_exit(); + start_critical_timings(); + + return 0; +} + +/* + * Generic idle loop implementation + */ +static void cpu_idle_loop(void) +{ + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + + arch_cpu_idle_exit(); + } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif + __current_set_polling(); + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 516c3d9ceea1..879f2b75266a 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } - -static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) -{ - idle_exit_fair(rq); - rq_last_tick_reset(rq); -} - -static void post_schedule_idle(struct rq *rq) -{ - idle_enter_fair(rq); -} #endif /* CONFIG_SMP */ + /* * Idle tasks are unconditionally rescheduled: */ @@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_task(rq->idle); } -static struct task_struct *pick_next_task_idle(struct rq *rq) +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev) { + put_prev_task(rq, prev); + schedstat_inc(rq, sched_goidle); -#ifdef CONFIG_SMP - /* Trigger the post schedule to do an idle_enter for CFS */ - rq->post_schedule = 1; -#endif return rq->idle; } @@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + idle_exit_fair(rq); + rq_last_tick_reset(rq); } static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) @@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, - .pre_schedule = pre_schedule_idle, - .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1c4065575fa2..d8cdf1618551 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -229,6 +229,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP +static int pull_rt_task(struct rq *this_rq); + +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + return rq->rt.highest_prio.curr > prev->prio; +} + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -315,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } +static inline void set_post_schedule(struct rq *rq) +{ + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); @@ -359,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_rt_task(struct rq *this_rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static inline int on_rt_rq(struct sched_rt_entity *rt_se) @@ -440,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} - static int rt_se_boosted(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -515,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_online_mask; @@ -538,6 +558,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. @@ -1310,15 +1338,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; + struct rt_rq *rt_rq = &rq->rt; do { rt_se = pick_next_rt_entity(rq, rt_rq); @@ -1332,21 +1352,45 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct * +pick_next_task_rt(struct rq *rq, struct task_struct *prev) { - struct task_struct *p = _pick_next_task_rt(rq); + struct task_struct *p; + struct rt_rq *rt_rq = &rq->rt; + + if (need_pull_rt_task(rq, prev)) { + pull_rt_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a dl task can slip in, in which case we need to + * re-start task selection. + */ + if (unlikely(rq->dl.dl_nr_running)) + return RETRY_TASK; + } + + /* + * We may dequeue prev's rt_rq in put_prev_task(). + * So, we update time before rt_nr_running check. + */ + if (prev->sched_class == &rt_sched_class) + update_curr_rt(rq); + + if (!rt_rq->rt_nr_running) + return NULL; + + if (rt_rq_throttled(rt_rq)) + return NULL; + + put_prev_task(rq, prev); + + p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ if (p) dequeue_pushable_task(rq, p); -#ifdef CONFIG_SMP - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); -#endif + set_post_schedule(rq); return p; } @@ -1716,13 +1760,6 @@ skip: return ret; } -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) - pull_rt_task(rq); -} - static void post_schedule_rt(struct rq *rq) { push_rt_tasks(rq); @@ -1738,7 +1775,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && p->nr_cpus_allowed > 1 && - rt_task(rq->curr) && + (dl_task(rq->curr) || rt_task(rq->curr)) && (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); @@ -1825,7 +1862,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -void init_sched_rt_class(void) +void __init init_sched_rt_class(void) { unsigned int i; @@ -1999,7 +2036,6 @@ const struct sched_class rt_sched_class = { .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 88c85b21d633..c9007f28d3a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> @@ -9,6 +10,7 @@ #include <linux/slab.h> #include "cpupri.h" +#include "cpudeadline.h" #include "cpuacct.h" struct rq; @@ -22,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq); extern void update_cpu_load_active(struct rq *this_rq); /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* * Helpers for converting nanosecond timing to jiffy resolution */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) @@ -73,6 +57,13 @@ extern void update_cpu_load_active(struct rq *this_rq); #define NICE_0_SHIFT SCHED_LOAD_SHIFT /* + * Single value that decides SCHED_DEADLINE internal math precision. + * 10 -> just above 1us + * 9 -> just above 0.5us + */ +#define DL_SCALE (10) + +/* * These are the 'tuning knobs' of the scheduler: */ @@ -81,11 +72,19 @@ extern void update_cpu_load_active(struct rq *this_rq); */ #define RUNTIME_INF ((u64)~0ULL) +static inline int fair_policy(int policy) +{ + return policy == SCHED_NORMAL || policy == SCHED_BATCH; +} + static inline int rt_policy(int policy) { - if (policy == SCHED_FIFO || policy == SCHED_RR) - return 1; - return 0; + return policy == SCHED_FIFO || policy == SCHED_RR; +} + +static inline int dl_policy(int policy) +{ + return policy == SCHED_DEADLINE; } static inline int task_has_rt_policy(struct task_struct *p) @@ -93,6 +92,25 @@ static inline int task_has_rt_policy(struct task_struct *p) return rt_policy(p->policy); } +static inline int task_has_dl_policy(struct task_struct *p) +{ + return dl_policy(p->policy); +} + +static inline bool dl_time_before(u64 a, u64 b) +{ + return (s64)(a - b) < 0; +} + +/* + * Tells if entity @a should preempt entity @b. + */ +static inline bool +dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +{ + return dl_time_before(a->deadline, b->deadline); +} + /* * This is the priority-queue data structure of the RT scheduling class: */ @@ -108,6 +126,47 @@ struct rt_bandwidth { u64 rt_runtime; struct hrtimer rt_period_timer; }; +/* + * To keep the bandwidth of -deadline tasks and groups under control + * we need some place where: + * - store the maximum -deadline bandwidth of the system (the group); + * - cache the fraction of that bandwidth that is currently allocated. + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, the bandwidth is given on a per-CPU basis, + * meaning that: + * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; + * - dl_total_bw array contains, in the i-eth element, the currently + * allocated bandwidth on the i-eth CPU. + * Moreover, groups consume bandwidth on each CPU, while tasks only + * consume bandwidth on the CPU they're running on. + * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw + * that will be shown the next time the proc or cgroup controls will + * be red. It on its turn can be changed by writing on its own + * control. + */ +struct dl_bandwidth { + raw_spinlock_t dl_runtime_lock; + u64 dl_runtime; + u64 dl_period; +}; + +static inline int dl_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +extern struct dl_bw *dl_bw_of(int i); + +struct dl_bw { + raw_spinlock_t lock; + u64 bw, total_bw; +}; extern struct mutex sched_domains_mutex; @@ -364,6 +423,53 @@ struct rt_rq { #endif }; +#ifdef CONFIG_RT_GROUP_SCHED +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} +#else +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} +#endif + +/* Deadline class' related fields in a runqueue */ +struct dl_rq { + /* runqueue is an rbtree, ordered by deadline */ + struct rb_root rb_root; + struct rb_node *rb_leftmost; + + unsigned long dl_nr_running; + +#ifdef CONFIG_SMP + /* + * Deadline values of the currently executing and the + * earliest ready task on this rq. Caching these facilitates + * the decision wether or not a ready but not running task + * should migrate somewhere else. + */ + struct { + u64 curr; + u64 next; + } earliest_dl; + + unsigned long dl_nr_migratory; + int overloaded; + + /* + * Tasks on this rq that can be pushed away. They are kept in + * an rb-tree, ordered by tasks' deadlines, with caching + * of the leftmost (earliest deadline) element. + */ + struct rb_root pushable_dl_tasks_root; + struct rb_node *pushable_dl_tasks_leftmost; +#else + struct dl_bw dl_bw; +#endif +}; + #ifdef CONFIG_SMP /* @@ -382,6 +488,15 @@ struct root_domain { cpumask_var_t online; /* + * The bit corresponding to a CPU gets set here if such CPU has more + * than one runnable -deadline task (as it is below for RT tasks). + */ + cpumask_var_t dlo_mask; + atomic_t dlo_count; + struct dl_bw dl_bw; + struct cpudl cpudl; + + /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. */ @@ -432,15 +547,14 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; + struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif + struct sched_avg avg; +#endif /* CONFIG_FAIR_GROUP_SCHED */ /* * This is part of a global counter where only the total sum @@ -529,8 +643,6 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif - - struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) @@ -827,8 +939,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } - - static inline int task_current(struct rq *rq, struct task_struct *p) { return rq->curr == p; @@ -988,9 +1098,12 @@ static const u32 prio_to_wmult[40] = { #else #define ENQUEUE_WAKING 0 #endif +#define ENQUEUE_REPLENISH 8 #define DEQUEUE_SLEEP 1 +#define RETRY_TASK ((void *)-1UL) + struct sched_class { const struct sched_class *next; @@ -1001,14 +1114,22 @@ struct sched_class { void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); - struct task_struct * (*pick_next_task) (struct rq *rq); + /* + * It is the responsibility of the pick_next_task() method that will + * return the next task to call put_prev_task() on the @prev task or + * something equivalent. + * + * May return RETRY_TASK when it finds a higher prio class has runnable + * tasks. + */ + struct task_struct * (*pick_next_task) (struct rq *rq, + struct task_struct *prev); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); - void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); @@ -1023,6 +1144,7 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_fork) (struct task_struct *p); + void (*task_dead) (struct task_struct *p); void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); @@ -1037,11 +1159,17 @@ struct sched_class { #endif }; +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + prev->sched_class->put_prev_task(rq, prev); +} + #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) extern const struct sched_class stop_sched_class; +extern const struct sched_class dl_sched_class; extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; @@ -1051,25 +1179,26 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); -extern void trigger_load_balance(struct rq *rq, int cpu); -extern void idle_balance(int this_cpu, struct rq *this_rq); +extern void trigger_load_balance(struct rq *rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -#else /* CONFIG_SMP */ +#else -static inline void idle_balance(int cpu, struct rq *rq) -{ -} +static inline void idle_enter_fair(struct rq *rq) { } +static inline void idle_exit_fair(struct rq *rq) { } #endif extern void sysrq_sched_debug_show(void); extern void sched_init_granularity(void); extern void update_max_interval(void); + +extern void init_sched_dl_class(void); extern void init_sched_rt_class(void); extern void init_sched_fair_class(void); +extern void init_sched_dl_class(void); extern void resched_task(struct task_struct *p); extern void resched_cpu(int cpu); @@ -1077,19 +1206,15 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); -extern void update_idle_cpu_load(struct rq *this_rq); +extern struct dl_bandwidth def_dl_bandwidth; +extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); +extern void init_dl_task_timer(struct sched_dl_entity *dl_se); -extern void init_task_runnable_average(struct task_struct *p); +unsigned long to_ratio(u64 period, u64 runtime); -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); +extern void update_idle_cpu_load(struct rq *this_rq); - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif +extern void init_task_runnable_average(struct task_struct *p); static inline void inc_nr_running(struct rq *rq) { @@ -1353,6 +1478,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); +extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index da98af347e8b..a476bea17fbc 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void) proc_create("schedstat", 0, NULL, &proc_schedstat_operations); return 0; } -module_init(proc_schedstat_init); +subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 47197de8abd9..d6ce65dde541 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,16 +23,19 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct * +pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) { - stop->se.exec_start = rq_clock_task(rq); - return stop; - } + if (!stop || !stop->on_rq) + return NULL; - return NULL; + put_prev_task(rq, prev); + + stop->se.exec_start = rq_clock_task(rq); + + return stop; } static void @@ -103,7 +106,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) * Simple, special scheduling class for the per-CPU stop tasks: */ const struct sched_class stop_sched_class = { - .next = &rt_sched_class, + .next = &dl_sched_class, .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, diff --git a/kernel/seccomp.c b/kernel/seccomp.c index eda2da3df822..d8d046c0726a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -55,60 +55,33 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; unsigned short len; /* Instruction count */ - struct sock_filter insns[]; + struct sock_filter_int insnsi[]; }; /* Limit any path through the tree to 256KB worth of instructions. */ #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) -/** - * get_u32 - returns a u32 offset into data - * @data: a unsigned 64 bit value - * @index: 0 or 1 to return the first or second 32-bits - * - * This inline exists to hide the length of unsigned long. If a 32-bit - * unsigned long is passed in, it will be extended and the top 32-bits will be - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be - * properly returned. - * +/* * Endianness is explicitly ignored and left for BPF program authors to manage * as per the specific architecture. */ -static inline u32 get_u32(u64 data, int index) +static void populate_seccomp_data(struct seccomp_data *sd) { - return ((u32 *)&data)[index]; -} + struct task_struct *task = current; + struct pt_regs *regs = task_pt_regs(task); -/* Helper for bpf_load below. */ -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) -/** - * bpf_load: checks and returns a pointer to the requested offset - * @off: offset into struct seccomp_data to load from - * - * Returns the requested 32-bits of data. - * seccomp_check_filter() should assure that @off is 32-bit aligned - * and not out of bounds. Failure to do so is a BUG. - */ -u32 seccomp_bpf_load(int off) -{ - struct pt_regs *regs = task_pt_regs(current); - if (off == BPF_DATA(nr)) - return syscall_get_nr(current, regs); - if (off == BPF_DATA(arch)) - return syscall_get_arch(); - if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { - unsigned long value; - int arg = (off - BPF_DATA(args[0])) / sizeof(u64); - int index = !!(off % sizeof(u64)); - syscall_get_arguments(current, regs, arg, 1, &value); - return get_u32(value, index); - } - if (off == BPF_DATA(instruction_pointer)) - return get_u32(KSTK_EIP(current), 0); - if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) - return get_u32(KSTK_EIP(current), 1); - /* seccomp_check_filter should make this impossible. */ - BUG(); + sd->nr = syscall_get_nr(task, regs); + sd->arch = syscall_get_arch(); + + /* Unroll syscall_get_args to help gcc on arm. */ + syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); + syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]); + syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]); + syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]); + syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]); + syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]); + + sd->instruction_pointer = KSTK_EIP(task); } /** @@ -133,17 +106,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) switch (code) { case BPF_S_LD_W_ABS: - ftest->code = BPF_S_ANC_SECCOMP_LD_W; + ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; case BPF_S_LD_W_LEN: - ftest->code = BPF_S_LD_IMM; + ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; case BPF_S_LDX_W_LEN: - ftest->code = BPF_S_LDX_IMM; + ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. */ @@ -185,6 +158,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) case BPF_S_JMP_JGT_X: case BPF_S_JMP_JSET_K: case BPF_S_JMP_JSET_X: + sk_decode_filter(ftest, ftest); continue; default: return -EINVAL; @@ -202,18 +176,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) static u32 seccomp_run_filters(int syscall) { struct seccomp_filter *f; + struct seccomp_data sd; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ if (WARN_ON(current->seccomp.filter == NULL)) return SECCOMP_RET_KILL; + populate_seccomp_data(&sd); + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = sk_run_filter(NULL, f->insns); + u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } @@ -231,6 +208,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) struct seccomp_filter *filter; unsigned long fp_size = fprog->len * sizeof(struct sock_filter); unsigned long total_insns = fprog->len; + struct sock_filter *fp; + int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) @@ -252,28 +231,43 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return -EACCES; - /* Allocate a new seccomp_filter */ - filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, - GFP_KERNEL|__GFP_NOWARN); - if (!filter) + fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); + if (!fp) return -ENOMEM; - atomic_set(&filter->usage, 1); - filter->len = fprog->len; /* Copy the instructions from fprog. */ ret = -EFAULT; - if (copy_from_user(filter->insns, fprog->filter, fp_size)) - goto fail; + if (copy_from_user(fp, fprog->filter, fp_size)) + goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(filter->insns, filter->len); + ret = sk_chk_filter(fp, fprog->len); if (ret) - goto fail; + goto free_prog; /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(filter->insns, filter->len); + ret = seccomp_check_filter(fp, fprog->len); + if (ret) + goto free_prog; + + /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ + ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); + if (ret) + goto free_prog; + + /* Allocate a new seccomp_filter */ + filter = kzalloc(sizeof(struct seccomp_filter) + + sizeof(struct sock_filter_int) * new_len, + GFP_KERNEL|__GFP_NOWARN); + if (!filter) + goto free_prog; + + ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); if (ret) - goto fail; + goto free_filter; + + atomic_set(&filter->usage, 1); + filter->len = new_len; /* * If there is an existing filter, make it the prev and don't drop its @@ -282,8 +276,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) filter->prev = current->seccomp.filter; current->seccomp.filter = filter; return 0; -fail: + +free_filter: kfree(filter); +free_prog: + kfree(fp); return ret; } @@ -293,7 +290,7 @@ fail: * * Returns 0 on success and non-zero otherwise. */ -long seccomp_attach_user_filter(char __user *user_filter) +static long seccomp_attach_user_filter(char __user *user_filter) { struct sock_fprog fprog; long ret = -EFAULT; diff --git a/kernel/signal.c b/kernel/signal.c index 940b30ee9a30..6ea13c09ae56 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -33,6 +33,8 @@ #include <linux/uprobes.h> #include <linux/compat.h> #include <linux/cn_proc.h> +#include <linux/compiler.h> + #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -2047,8 +2049,8 @@ static bool do_signal_stop(int signr) if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; - for (t = next_thread(current); t != current; - t = next_thread(t)) { + t = current; + while_each_thread(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, @@ -2382,7 +2384,7 @@ relock: * @regs: user register state * @stepping: nonzero if debugger single-step or block-step in use * - * This function should be called when a signal has succesfully been + * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER * is set in @ka->sa.sa_flags. Tracing is notified. @@ -3125,8 +3127,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) rm_from_queue_full(&mask, &t->signal->shared_pending); do { rm_from_queue_full(&mask, &t->pending); - t = next_thread(t); - } while (t != current); + } while_each_thread(current, t); } } @@ -3619,7 +3620,7 @@ SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) } #endif -__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +__weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } diff --git a/kernel/smp.c b/kernel/smp.c index bd9f94028838..06d574e42c72 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -23,17 +23,11 @@ enum { struct call_function_data { struct call_single_data __percpu *csd; cpumask_var_t cpumask; - cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); -struct call_single_queue { - struct list_head list; - raw_spinlock_t lock; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -47,14 +41,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); - if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, - cpu_to_node(cpu))) { - free_cpumask_var(cfd->cpumask); - return notifier_from_errno(-ENOMEM); - } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { - free_cpumask_var(cfd->cpumask_ipi); free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); } @@ -67,7 +55,6 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD: case CPU_DEAD_FROZEN: free_cpumask_var(cfd->cpumask); - free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); break; #endif @@ -85,12 +72,8 @@ void __init call_function_init(void) void *cpu = (void *)(long)smp_processor_id(); int i; - for_each_possible_cpu(i) { - struct call_single_queue *q = &per_cpu(call_single_queue, i); - - raw_spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->list); - } + for_each_possible_cpu(i) + init_llist_head(&per_cpu(call_single_queue, i)); hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); register_cpu_notifier(&hotplug_cfd_notifier); @@ -134,25 +117,46 @@ static void csd_unlock(struct call_single_data *csd) csd->flags &= ~CSD_FLAG_LOCK; } +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); + /* * Insert a previously allocated call_single_data element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static int generic_exec_single(int cpu, struct call_single_data *csd, + smp_call_func_t func, void *info, int wait) { - struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); + struct call_single_data csd_stack = { .flags = 0 }; unsigned long flags; - int ipi; + + + if (cpu == smp_processor_id()) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; + } + + + if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; + + + if (!csd) { + csd = &csd_stack; + if (!wait) + csd = &__get_cpu_var(csd_data); + } + + csd_lock(csd); + + csd->func = func; + csd->info = info; if (wait) csd->flags |= CSD_FLAG_WAIT; - raw_spin_lock_irqsave(&dst->lock, flags); - ipi = list_empty(&dst->list); - list_add_tail(&csd->list, &dst->list); - raw_spin_unlock_irqrestore(&dst->lock, flags); - /* * The list addition should be visible before sending the IPI * handler locks the list to pull the entry off it because of @@ -164,11 +168,13 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ - if (ipi) + if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) arch_send_call_function_single_ipi(cpu); if (wait) csd_lock_wait(csd); + + return 0; } /* @@ -177,32 +183,23 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) */ void generic_smp_call_function_single_interrupt(void) { - struct call_single_queue *q = &__get_cpu_var(call_single_queue); - LIST_HEAD(list); + struct llist_node *entry; + struct call_single_data *csd, *csd_next; /* * Shouldn't receive this interrupt on a cpu that is not yet online. */ WARN_ON_ONCE(!cpu_online(smp_processor_id())); - raw_spin_lock(&q->lock); - list_replace_init(&q->list, &list); - raw_spin_unlock(&q->lock); - - while (!list_empty(&list)) { - struct call_single_data *csd; - - csd = list_entry(list.next, struct call_single_data, list); - list_del(&csd->list); + entry = llist_del_all(&__get_cpu_var(call_single_queue)); + entry = llist_reverse_order(entry); + llist_for_each_entry_safe(csd, csd_next, entry, llist) { csd->func(csd->info); - csd_unlock(csd); } } -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); - /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. @@ -214,12 +211,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { - struct call_single_data d = { - .flags = 0, - }; - unsigned long flags; int this_cpu; - int err = 0; + int err; /* * prevent preemption and reschedule on another processor, @@ -236,32 +229,41 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); - if (cpu == this_cpu) { - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } else { - if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *csd = &d; + err = generic_exec_single(cpu, NULL, func, info, wait); - if (!wait) - csd = &__get_cpu_var(csd_data); + put_cpu(); - csd_lock(csd); + return err; +} +EXPORT_SYMBOL(smp_call_function_single); - csd->func = func; - csd->info = info; - generic_exec_single(cpu, csd, wait); - } else { - err = -ENXIO; /* CPU not online */ - } - } +/** + * smp_call_function_single_async(): Run an asynchronous function on a + * specific CPU. + * @cpu: The CPU to run on. + * @csd: Pre-allocated and setup data structure + * + * Like smp_call_function_single(), but the call is asynchonous and + * can thus be done from contexts with disabled interrupts. + * + * The caller passes his own pre-allocated data structure + * (ie: embedded in an object) and is responsible for synchronizing it + * such that the IPIs performed on the @csd are strictly serialized. + * + * NOTE: Be careful, there is unfortunately no current debugging facility to + * validate the correctness of this serialization. + */ +int smp_call_function_single_async(int cpu, struct call_single_data *csd) +{ + int err = 0; - put_cpu(); + preempt_disable(); + err = generic_exec_single(cpu, csd, csd->func, csd->info, 0); + preempt_enable(); return err; } -EXPORT_SYMBOL(smp_call_function_single); +EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus @@ -307,44 +309,6 @@ call: EXPORT_SYMBOL_GPL(smp_call_function_any); /** - * __smp_call_function_single(): Run a function on a specific CPU - * @cpu: The CPU to run on. - * @data: Pre-allocated and setup data structure - * @wait: If true, wait until function has completed on specified CPU. - * - * Like smp_call_function_single(), but allow caller to pass in a - * pre-allocated data structure. Useful for embedding @data inside - * other structures, for instance. - */ -void __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) -{ - unsigned int this_cpu; - unsigned long flags; - - this_cpu = get_cpu(); - /* - * Can deadlock when called with interrupts disabled. - * We allow cpu's that are not yet online though, as no one else can - * send smp call function interrupt to this cpu and as such deadlocks - * can't happen. - */ - WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() - && !oops_in_progress); - - if (cpu == this_cpu) { - local_irq_save(flags); - csd->func(csd->info); - local_irq_restore(flags); - } else { - csd_lock(csd); - generic_exec_single(cpu, csd, wait); - } - put_cpu(); -} -EXPORT_SYMBOL_GPL(__smp_call_function_single); - -/** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. @@ -402,30 +366,17 @@ void smp_call_function_many(const struct cpumask *mask, if (unlikely(!cpumask_weight(cfd->cpumask))) return; - /* - * After we put an entry into the list, cfd->cpumask may be cleared - * again when another CPU sends another IPI for a SMP function call, so - * cfd->cpumask will be zero. - */ - cpumask_copy(cfd->cpumask_ipi, cfd->cpumask); - for_each_cpu(cpu, cfd->cpumask) { struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu); - struct call_single_queue *dst = - &per_cpu(call_single_queue, cpu); - unsigned long flags; csd_lock(csd); csd->func = func; csd->info = info; - - raw_spin_lock_irqsave(&dst->lock, flags); - list_add_tail(&csd->list, &dst->list); - raw_spin_unlock_irqrestore(&dst->lock, flags); + llist_add(&csd->llist, &per_cpu(call_single_queue, cpu)); } /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(cfd->cpumask_ipi); + arch_send_call_function_ipi_mask(cfd->cpumask); if (wait) { for_each_cpu(cpu, cfd->cpumask) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 11025ccc06dd..b50990a5bea0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -8,6 +8,8 @@ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/export.h> #include <linux/kernel_stat.h> #include <linux/interrupt.h> @@ -23,6 +25,7 @@ #include <linux/smp.h> #include <linux/smpboot.h> #include <linux/tick.h> +#include <linux/irq.h> #define CREATE_TRACE_POINTS #include <trace/events/irq.h> @@ -54,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); -char *softirq_to_name[NR_SOFTIRQS] = { +const char * const softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" }; @@ -89,7 +92,7 @@ static void wakeup_softirqd(void) * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip, unsigned int cnt) +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -107,33 +110,21 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt) /* * Were softirqs turned off above: */ - if (softirq_count() == cnt) + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_off(ip); raw_local_irq_restore(flags); if (preempt_count() == cnt) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } -#else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) -{ - preempt_count_add(cnt); - barrier(); -} +EXPORT_SYMBOL(__local_bh_disable_ip); #endif /* CONFIG_TRACE_IRQFLAGS */ -void local_bh_disable(void) -{ - __local_bh_disable(_RET_IP_, SOFTIRQ_DISABLE_OFFSET); -} - -EXPORT_SYMBOL(local_bh_disable); - static void __local_bh_enable(unsigned int cnt) { WARN_ON_ONCE(!irqs_disabled()); - if (softirq_count() == cnt) + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_on(_RET_IP_); preempt_count_sub(cnt); } @@ -148,10 +139,9 @@ void _local_bh_enable(void) WARN_ON_ONCE(in_irq()); __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } - EXPORT_SYMBOL(_local_bh_enable); -static inline void _local_bh_enable_ip(unsigned long ip) +void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) { WARN_ON_ONCE(in_irq() || irqs_disabled()); #ifdef CONFIG_TRACE_IRQFLAGS @@ -165,8 +155,8 @@ static inline void _local_bh_enable_ip(unsigned long ip) /* * Keep preemption disabled until we are done with * softirq processing: - */ - preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); + */ + preempt_count_sub(cnt - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) { /* @@ -182,18 +172,7 @@ static inline void _local_bh_enable_ip(unsigned long ip) #endif preempt_check_resched(); } - -void local_bh_enable(void) -{ - _local_bh_enable_ip(_RET_IP_); -} -EXPORT_SYMBOL(local_bh_enable); - -void local_bh_enable_ip(unsigned long ip) -{ - _local_bh_enable_ip(ip); -} -EXPORT_SYMBOL(local_bh_enable_ip); +EXPORT_SYMBOL(__local_bh_enable_ip); /* * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, @@ -211,14 +190,49 @@ EXPORT_SYMBOL(local_bh_enable_ip); #define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) #define MAX_SOFTIRQ_RESTART 10 +#ifdef CONFIG_TRACE_IRQFLAGS +/* + * When we run softirqs from irq_exit() and thus on the hardirq stack we need + * to keep the lockdep irq context tracking as tight as possible in order to + * not miss-qualify lock contexts and miss possible deadlocks. + */ + +static inline bool lockdep_softirq_start(void) +{ + bool in_hardirq = false; + + if (trace_hardirq_context(current)) { + in_hardirq = true; + trace_hardirq_exit(); + } + + lockdep_softirq_enter(); + + return in_hardirq; +} + +static inline void lockdep_softirq_end(bool in_hardirq) +{ + lockdep_softirq_exit(); + + if (in_hardirq) + trace_hardirq_enter(); +} +#else +static inline bool lockdep_softirq_start(void) { return false; } +static inline void lockdep_softirq_end(bool in_hardirq) { } +#endif + asmlinkage void __do_softirq(void) { - struct softirq_action *h; - __u32 pending; unsigned long end = jiffies + MAX_SOFTIRQ_TIME; - int cpu; unsigned long old_flags = current->flags; int max_restart = MAX_SOFTIRQ_RESTART; + struct softirq_action *h; + bool in_hardirq; + __u32 pending; + int softirq_bit; + int cpu; /* * Mask out PF_MEMALLOC s current task context is borrowed for the @@ -230,8 +244,8 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_irq_enter_time(current); - __local_bh_disable(_RET_IP_, SOFTIRQ_OFFSET); - lockdep_softirq_enter(); + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); + in_hardirq = lockdep_softirq_start(); cpu = smp_processor_id(); restart: @@ -242,30 +256,30 @@ restart: h = softirq_vec; - do { - if (pending & 1) { - unsigned int vec_nr = h - softirq_vec; - int prev_count = preempt_count(); - - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); - h->action(h); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %u %s %p" - "with preempt_count %08x," - " exited with %08x?\n", vec_nr, - softirq_to_name[vec_nr], h->action, - prev_count, preempt_count()); - preempt_count_set(prev_count); - } + while ((softirq_bit = ffs(pending))) { + unsigned int vec_nr; + int prev_count; + + h += softirq_bit - 1; + + vec_nr = h - softirq_vec; + prev_count = preempt_count(); + + kstat_incr_softirqs_this_cpu(vec_nr); - rcu_bh_qs(cpu); + trace_softirq_entry(vec_nr); + h->action(h); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", + vec_nr, softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); + preempt_count_set(prev_count); } + rcu_bh_qs(cpu); h++; - pending >>= 1; - } while (pending); + pending >>= softirq_bit; + } local_irq_disable(); @@ -278,16 +292,13 @@ restart: wakeup_softirqd(); } - lockdep_softirq_exit(); - + lockdep_softirq_end(in_hardirq); account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } - - asmlinkage void do_softirq(void) { __u32 pending; @@ -311,8 +322,6 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { - int cpu = smp_processor_id(); - rcu_irq_enter(); if (is_idle_task(current) && !in_interrupt()) { /* @@ -320,7 +329,7 @@ void irq_enter(void) * here, as softirq will be serviced on return from interrupt. */ local_bh_disable(); - tick_check_idle(cpu); + tick_irq_enter(); _local_bh_enable(); } @@ -375,13 +384,13 @@ void irq_exit(void) #endif account_irq_exit_time(current); - trace_hardirq_exit(); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); tick_irq_exit(); rcu_irq_exit(); + trace_hardirq_exit(); /* must be last! */ } /* @@ -427,8 +436,7 @@ void open_softirq(int nr, void (*action)(struct softirq_action *)) /* * Tasklets */ -struct tasklet_head -{ +struct tasklet_head { struct tasklet_struct *head; struct tasklet_struct **tail; }; @@ -447,7 +455,6 @@ void __tasklet_schedule(struct tasklet_struct *t) raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_restore(flags); } - EXPORT_SYMBOL(__tasklet_schedule); void __tasklet_hi_schedule(struct tasklet_struct *t) @@ -461,7 +468,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) raise_softirq_irqoff(HI_SOFTIRQ); local_irq_restore(flags); } - EXPORT_SYMBOL(__tasklet_hi_schedule); void __tasklet_hi_schedule_first(struct tasklet_struct *t) @@ -472,7 +478,6 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t) __this_cpu_write(tasklet_hi_vec.head, t); __raise_softirq_irqoff(HI_SOFTIRQ); } - EXPORT_SYMBOL(__tasklet_hi_schedule_first); static void tasklet_action(struct softirq_action *a) @@ -492,7 +497,8 @@ static void tasklet_action(struct softirq_action *a) if (tasklet_trylock(t)) { if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + if (!test_and_clear_bit(TASKLET_STATE_SCHED, + &t->state)) BUG(); t->func(t->data); tasklet_unlock(t); @@ -527,7 +533,8 @@ static void tasklet_hi_action(struct softirq_action *a) if (tasklet_trylock(t)) { if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + if (!test_and_clear_bit(TASKLET_STATE_SCHED, + &t->state)) BUG(); t->func(t->data); tasklet_unlock(t); @@ -545,7 +552,6 @@ static void tasklet_hi_action(struct softirq_action *a) } } - void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data) { @@ -555,13 +561,12 @@ void tasklet_init(struct tasklet_struct *t, t->func = func; t->data = data; } - EXPORT_SYMBOL(tasklet_init); void tasklet_kill(struct tasklet_struct *t) { if (in_interrupt()) - printk("Attempt to kill tasklet from interrupt\n"); + pr_notice("Attempt to kill tasklet from interrupt\n"); while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { do { @@ -571,7 +576,6 @@ void tasklet_kill(struct tasklet_struct *t) tasklet_unlock_wait(t); clear_bit(TASKLET_STATE_SCHED, &t->state); } - EXPORT_SYMBOL(tasklet_kill); /* @@ -721,9 +725,8 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) { switch (action) { #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84571e09c907..01fbae5b97b7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * */ smp_call_function_single(min(cpu1, cpu2), &irq_cpu_stop_queue_work, - &call_args, 0); + &call_args, 1); lg_local_unlock(&stop_cpus_lock); preempt_enable(); diff --git a/kernel/sys.c b/kernel/sys.c index c72311324ea7..fba0f29401ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; - if (niceval < -20) - niceval = -20; - if (niceval > 19) - niceval = 19; + if (niceval < MIN_NICE) + niceval = MIN_NICE; + if (niceval > MAX_NICE) + niceval = MAX_NICE; rcu_read_lock(); read_lock(&tasklist_lock); @@ -895,8 +895,7 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf) * only important on a multi-user system anyway, to make sure one user * can't send a signal to a process owned by another. -TYT, 12/12/91 * - * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. - * LBT 04.03.94 + * !PF_FORKNOEXEC check to conform completely to POSIX. */ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) { @@ -932,7 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; - if (p->did_exec) + if (!(p->flags & PF_FORKNOEXEC)) goto out; } else { err = -ESRCH; @@ -1572,8 +1571,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) t = p; do { accumulate_thread_rusage(t, r); - t = next_thread(t); - } while (t != p); + } while_each_thread(p, t); break; default: @@ -1998,6 +1996,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return current->no_new_privs ? 1 : 0; + case PR_GET_THP_DISABLE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + break; + case PR_SET_THP_DISABLE: + if (arg3 || arg4 || arg5) + return -EINVAL; + down_write(&me->mm->mmap_sem); + if (arg2) + me->mm->def_flags |= VM_NOHUGEPAGE; + else + me->mm->def_flags &= ~VM_NOHUGEPAGE; + up_write(&me->mm->mmap_sem); + break; default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7078052284fd..bc8d1b74a6b9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -146,11 +146,13 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(sys_sysfs); cond_syscall(sys_syslog); cond_syscall(sys_process_vm_readv); cond_syscall(sys_process_vm_writev); cond_syscall(compat_sys_process_vm_readv); cond_syscall(compat_sys_process_vm_writev); +cond_syscall(sys_uselib); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34a604726d0b..74f5b580fe34 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -62,6 +62,7 @@ #include <linux/capability.h> #include <linux/binfmts.h> #include <linux/sched/sysctl.h> +#include <linux/kexec.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -95,8 +96,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int sysctl_overcommit_memory; -extern int sysctl_overcommit_ratio; extern int max_threads; extern int suid_dumpable; #ifdef CONFIG_COREDUMP @@ -113,19 +112,18 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif -#ifdef CONFIG_BLOCK -extern int blk_iopoll_enabled; -#endif /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; #endif +static int __maybe_unused neg_one = -1; + static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; -static int __maybe_unused three = 3; +static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -143,6 +141,11 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; +/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +#ifdef CONFIG_DETECT_HUNG_TASK +static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); +#endif + #ifdef CONFIG_INOTIFY_USER #include <linux/inotify.h> #endif @@ -385,18 +388,13 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "numa_balancing_settle_count", - .data = &sysctl_numa_balancing_settle_count, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "numa_balancing_migrate_deferred", - .data = &sysctl_numa_balancing_migrate_deferred, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, + .procname = "numa_balancing", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numa_balancing, + .extra1 = &zero, + .extra2 = &one, }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ @@ -614,6 +612,18 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_KEXEC + { + .procname = "kexec_load_disabled", + .data = &kexec_load_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, +#endif #ifdef CONFIG_MODULES { .procname = "modprobe", @@ -980,13 +990,15 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, }, { .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, }, #endif #ifdef CONFIG_COMPAT @@ -1078,15 +1090,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_BLOCK - { - .procname = "blk_iopoll", - .data = &blk_iopoll_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { } }; @@ -1128,7 +1131,14 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = overcommit_ratio_handler, + }, + { + .procname = "overcommit_kbytes", + .data = &sysctl_overcommit_kbytes, + .maxlen = sizeof(sysctl_overcommit_kbytes), + .mode = 0644, + .proc_handler = overcommit_kbytes_handler, }, { .procname = "page-cluster", @@ -1260,7 +1270,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, .extra1 = &one, - .extra2 = &three, + .extra2 = &four, }, #ifdef CONFIG_COMPACTION { diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 3ce6e8c5f3fc..f448513a45ed 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -124,7 +124,7 @@ config NO_HZ_FULL endchoice config NO_HZ_FULL_ALL - bool "Full dynticks system on all CPUs by default" + bool "Full dynticks system on all CPUs by default (except CPU 0)" depends on NO_HZ_FULL help If the user doesn't pass the nohz_full boot option to diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 9250130646f5..57a413fd0ebf 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -3,7 +3,10 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y += tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o +endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 086ad6043bcb..ad362c260ef4 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -439,6 +439,19 @@ void clockevents_config_and_register(struct clock_event_device *dev, } EXPORT_SYMBOL_GPL(clockevents_config_and_register); +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (dev->mode == CLOCK_EVT_MODE_ONESHOT) + return clockevents_program_event(dev, dev->next_event, false); + + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); + + return 0; +} + /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify @@ -446,17 +459,22 @@ EXPORT_SYMBOL_GPL(clockevents_config_and_register); * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per - * cpu timer events with interrupts disabled! Returns 0 on success, - * -ETIME when the event is in the past. + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { - clockevents_config(dev, freq); - - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return 0; + unsigned long flags; + int ret; - return clockevents_program_event(dev, dev->next_event, false); + local_irq_save(flags); + ret = tick_broadcast_update_freq(dev, freq); + if (ret == -ENODEV) + ret = __clockevents_update_freq(dev, freq); + local_irq_restore(flags); + return ret; } /* @@ -524,12 +542,13 @@ void clockevents_resume(void) #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events + * Returns 0 on success, any other value on error */ -void clockevents_notify(unsigned long reason, void *arg) +int clockevents_notify(unsigned long reason, void *arg) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu; + int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); @@ -542,7 +561,7 @@ void clockevents_notify(unsigned long reason, void *arg) case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); + ret = tick_broadcast_oneshot_control(reason); break; case CLOCK_EVT_NOTIFY_CPU_DYING: @@ -585,6 +604,7 @@ void clockevents_notify(unsigned long reason, void *arg) break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); + return ret; } EXPORT_SYMBOL_GPL(clockevents_notify); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7a925ba456fb..a6a5bf53e86d 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -51,7 +51,13 @@ * HZ shrinks, so values greater than 8 overflow 32bits when * HZ=100. */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else #define JIFFIES_SHIFT 8 +#endif static cycle_t jiffies_read(struct clocksource *cs) { diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index af8d1d4f3d55..419a52cecd20 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -514,12 +514,13 @@ static void sync_cmos_clock(struct work_struct *work) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); + queue_delayed_work(system_power_efficient_wq, + &sync_cmos_work, timespec_to_jiffies(&next)); } void ntp_notify_cmos_timer(void) { - schedule_delayed_work(&sync_cmos_work, 0); + queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); } #else diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0abb36464281..4d23dc4d8139 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) void __init sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + ktime_t new_wrap_kt; unsigned long r; - u64 res, wrap; char r_unit; if (cd.rate > rate) return; WARN_ON(!irqs_disabled()); - read_sched_clock = read; - sched_clock_mask = CLOCKSOURCE_MASK(bits); - cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns. */ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + + /* calculate how many ns until we wrap */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + + /* update epoch for new counter and update epoch_ns from old counter*/ + new_epoch = read(); + cyc = read_sched_clock(); + ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + + raw_write_seqcount_begin(&cd.seq); + read_sched_clock = read; + sched_clock_mask = new_mask; + cd.rate = rate; + cd.wrap_kt = new_wrap_kt; + cd.mult = new_mult; + cd.shift = new_shift; + cd.epoch_cyc = new_epoch; + cd.epoch_ns = ns; + raw_write_seqcount_end(&cd.seq); r = rate; if (r >= 4000000) { @@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits, } else r_unit = ' '; - /* calculate how many ns until we wrap */ - wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); - cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); - /* calculate the ns resolution of this counter */ - res = cyc_to_ns(1ULL, cd.mult, cd.shift); + res = cyc_to_ns(1ULL, new_mult, new_shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - update_sched_clock(); - - /* - * Ensure that sched_clock() starts off at 0ns - */ - cd.epoch_ns = 0; - /* Enable IRQ time accounting if we have a fast enough sched_clock */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 000000000000..eb682d5c697c --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,106 @@ +/* + * linux/kernel/time/tick-broadcast-hrtimer.c + * This file emulates a local clock event device + * via a pseudo clock device. + */ +#include <linux/cpu.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/profile.h> +#include <linux/clockchips.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static void bc_set_mode(enum clock_event_mode mode, + struct clock_event_device *bc) +{ + switch (mode) { + case CLOCK_EVT_MODE_SHUTDOWN: + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + break; + default: + break; + } +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ + /* + * We try to cancel the timer first. If the callback is on + * flight on some other cpu then we let it handle it. If we + * were able to cancel the timer nothing can rearm it as we + * own broadcast_lock. + * + * However we can also be called from the event handler of + * ce_broadcast_hrtimer itself when it expires. We cannot + * restart the timer because we are in the callback, but we + * can set the expiry time and let the callback return + * HRTIMER_RESTART. + */ + if (hrtimer_try_to_cancel(&bctimer) >= 0) { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + /* Bind the "device" to the cpu */ + bc->bound_on = smp_processor_id(); + } else if (bc->bound_on == smp_processor_id()) { + hrtimer_set_expires(&bctimer, expires); + } + return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { + .set_mode = bc_set_mode, + .set_next_ktime = bc_set_next, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME | + CLOCK_EVT_FEAT_HRTIMER, + .rating = 0, + .bound_on = -1, + .min_delta_ns = 1, + .max_delta_ns = KTIME_MAX, + .min_delta_ticks = 1, + .max_delta_ticks = ULONG_MAX, + .mult = 1, + .shift = 0, + .cpumask = cpu_all_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ + ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + + if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + return HRTIMER_NORESTART; + + return HRTIMER_RESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bctimer.function = bc_handler; + clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 9532690daaa9..64c5990fd500 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -120,6 +120,19 @@ int tick_is_broadcast_device(struct clock_event_device *dev) return (dev && tick_broadcast_device.evtdev == dev); } +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) +{ + int ret = -ENODEV; + + if (tick_is_broadcast_device(dev)) { + raw_spin_lock(&tick_broadcast_lock); + ret = __clockevents_update_freq(dev, freq); + raw_spin_unlock(&tick_broadcast_lock); + } + return ret; +} + + static void err_broadcast(const struct cpumask *mask) { pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); @@ -272,12 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask) */ static void tick_do_periodic_broadcast(void) { - raw_spin_lock(&tick_broadcast_lock); - cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); tick_do_broadcast(tmpmask); - - raw_spin_unlock(&tick_broadcast_lock); } /* @@ -287,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { ktime_t next; + raw_spin_lock(&tick_broadcast_lock); + tick_do_periodic_broadcast(); /* * The device is in periodic mode. No reprogramming necessary: */ if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - return; + goto unlock; /* * Setup the next period for devices, which do not have @@ -306,9 +317,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) next = ktime_add(next, tick_period); if (!clockevents_program_event(dev, next, false)) - return; + goto unlock; tick_do_periodic_broadcast(); } +unlock: + raw_spin_unlock(&tick_broadcast_lock); } /* @@ -538,10 +551,10 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) * Called from irq_enter() when idle was interrupted to reenable the * per cpu device. */ -void tick_check_oneshot_broadcast(int cpu) +void tick_check_oneshot_broadcast_this_cpu(void) { - if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { + struct tick_device *td = &__get_cpu_var(tick_cpu_device); /* * We might be in the middle of switching over from @@ -630,24 +643,61 @@ again: raw_spin_unlock(&tick_broadcast_lock); } +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ + if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return 0; + if (bc->next_event.tv64 == KTIME_MAX) + return 0; + return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, + struct clock_event_device *dev) +{ + /* + * For hrtimer based broadcasting we cannot shutdown the cpu + * local device if our own event is the first one to expire or + * if we own the broadcast timer. + */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { + if (broadcast_needs_cpu(bc, smp_processor_id())) + return; + if (dev->next_event.tv64 < bc->next_event.tv64) + return; + } + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); +} + +static void broadcast_move_bc(int deadcpu) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || !broadcast_needs_cpu(bc, deadcpu)) + return; + /* This moves the broadcast assignment to this cpu */ + clockevents_program_event(bc, bc->next_event, 1); +} + /* * Powerstate information: The system enters/leaves a state, where * affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. */ -void tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(unsigned long reason) { struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; ktime_t now; - int cpu; + int cpu, ret = 0; /* * Periodic mode does not care about the enter/exit of power * states */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return; + return 0; /* * We are called with preemtion disabled from the depth of the @@ -658,7 +708,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return; + return 0; bc = tick_broadcast_device.evtdev; @@ -666,7 +716,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + broadcast_shutdown_local(bc, dev); /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and @@ -679,6 +729,16 @@ void tick_broadcast_oneshot_control(unsigned long reason) dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we remove the + * CPU from the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); @@ -746,6 +806,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) } out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; } /* @@ -756,6 +817,7 @@ out: static void tick_broadcast_clear_oneshot(int cpu) { cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -851,6 +913,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); + broadcast_move_bc(cpu); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 162b03ab0ad2..015661279b68 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -85,6 +85,7 @@ static void tick_periodic(int cpu) do_timer(1); write_sequnlock(&jiffies_lock); + update_wall_time(); } update_process_times(user_mode(get_irq_regs())); @@ -97,18 +98,19 @@ static void tick_periodic(int cpu) void tick_handle_periodic(struct clock_event_device *dev) { int cpu = smp_processor_id(); - ktime_t next; + ktime_t next = dev->next_event; tick_periodic(cpu); if (dev->mode != CLOCK_EVT_MODE_ONESHOT) return; - /* - * Setup the next period for devices, which do not have - * periodic mode: - */ - next = ktime_add(dev->next_event, tick_period); for (;;) { + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add(next, tick_period); + if (!clockevents_program_event(dev, next, false)) return; /* @@ -117,12 +119,11 @@ void tick_handle_periodic(struct clock_event_device *dev) * to be sure we're using a real hardware clocksource. * Otherwise we could get trapped in an infinite * loop, as the tick_periodic() increments jiffies, - * when then will increment time, posibly causing + * which then will increment time, possibly causing * the loop to trigger again and again. */ if (timekeeping_valid_for_hres()) tick_periodic(cpu); - next = ktime_add(next, tick_period); } } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 18e71f7fbc2a..7ab92b19965a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -46,23 +46,23 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern void tick_broadcast_oneshot_control(unsigned long reason); +extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); -extern void tick_check_oneshot_broadcast(int cpu); +extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } -static inline void tick_check_oneshot_broadcast(int cpu) { } +static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return true; } # endif /* !BROADCAST */ @@ -87,7 +87,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { @@ -111,6 +111,7 @@ extern int tick_resume_broadcast(void); extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); #else /* !BROADCAST */ @@ -133,6 +134,8 @@ static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, + u32 freq) { return -ENODEV; } /* * Set the periodic handler in non broadcast mode @@ -152,6 +155,9 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); + #endif extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ea20f7d1ac2c..9f8af69c67ec 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -86,6 +86,7 @@ static void tick_do_update_jiffies64(ktime_t now) tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&jiffies_lock); + update_wall_time(); } /* @@ -177,7 +178,7 @@ static bool can_stop_full_tick(void) * TODO: kick full dynticks CPUs when * sched_clock_stable is set. */ - if (!sched_clock_stable) { + if (!sched_clock_stable()) { trace_tick_stop(0, "unstable sched clock\n"); /* * Don't allow the user to think they can get @@ -391,11 +392,9 @@ __setup("nohz=", setup_tick_nohz); */ static void tick_nohz_update_jiffies(ktime_t now) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - ts->idle_waketime = now; + __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); @@ -426,17 +425,15 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda } -static void tick_nohz_stop_idle(int cpu, ktime_t now) +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - - update_ts_time_stats(cpu, ts, now, NULL); + update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(struct tick_sched *ts) { ktime_t now = ktime_get(); @@ -536,12 +533,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; + time_delta = timekeeping_max_deferment(); + /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&jiffies_lock); last_update = last_jiffies_update; last_jiffies = jiffies; - time_delta = timekeeping_max_deferment(); } while (read_seqretry(&jiffies_lock, seq)); if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || @@ -681,18 +679,18 @@ out: static void tick_nohz_full_stop_tick(struct tick_sched *ts) { #ifdef CONFIG_NO_HZ_FULL - int cpu = smp_processor_id(); + int cpu = smp_processor_id(); - if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) - return; + if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) + return; - if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) - return; + if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) + return; - if (!can_stop_full_tick()) - return; + if (!can_stop_full_tick()) + return; - tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); + tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); #endif } @@ -754,7 +752,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(cpu, ts); + now = tick_nohz_start_idle(ts); if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; @@ -911,8 +909,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) */ void tick_nohz_idle_exit(void) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now; local_irq_disable(); @@ -925,7 +922,7 @@ void tick_nohz_idle_exit(void) now = ktime_get(); if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); + tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { tick_nohz_restart_sched_tick(ts, now); @@ -1009,12 +1006,10 @@ static void tick_nohz_switch_to_nohz(void) * timer and do not touch the other magic bits which need to be done * when idle is left. */ -static void tick_nohz_kick_tick(int cpu, ktime_t now) +static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) { #if 0 /* Switch back to 2.6.27 behaviour */ - - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t delta; /* @@ -1029,36 +1024,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now) #endif } -static inline void tick_check_nohz(int cpu) +static inline void tick_nohz_irq_enter(void) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) return; now = ktime_get(); if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); + tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { tick_nohz_update_jiffies(now); - tick_nohz_kick_tick(cpu, now); + tick_nohz_kick_tick(ts, now); } } #else static inline void tick_nohz_switch_to_nohz(void) { } -static inline void tick_check_nohz(int cpu) { } +static inline void tick_nohz_irq_enter(void) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() */ -void tick_check_idle(int cpu) +void tick_irq_enter(void) { - tick_check_oneshot_broadcast(cpu); - tick_check_nohz(cpu); + tick_check_oneshot_broadcast_this_cpu(); + tick_nohz_irq_enter(); } /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 87b4f00284c9..f7df8ea21707 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,6 +22,7 @@ #include <linux/tick.h> #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> +#include <linux/compiler.h> #include "tick-internal.h" #include "ntp_internal.h" @@ -77,7 +78,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) tk->wall_to_monotonic = wtm; set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec_to_ktime(tmp); - tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -90,8 +91,9 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) } /** - * timekeeper_setup_internals - Set up internals to use clocksource clock. + * tk_setup_internals - Set up internals to use clocksource clock. * + * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment @@ -595,7 +597,7 @@ s32 timekeeping_get_tai_offset(void) static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; - tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /** @@ -610,6 +612,7 @@ void timekeeping_set_tai_offset(s32 tai_offset) raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); __timekeeping_set_tai_offset(tk, tai_offset); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clock_was_set(); @@ -758,7 +761,7 @@ u64 timekeeping_max_deferment(void) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_persistent_clock(struct timespec *ts) +void __weak read_persistent_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -773,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_boot_clock(struct timespec *ts) +void __weak read_boot_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -1023,6 +1026,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } + + timekeeping_update(tk, TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1130,16 +1135,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * we can adjust by 1. */ error >>= 2; - /* - * XXX - In update_wall_time, we round up to the next - * nanosecond, and store the amount rounded up into - * the error. This causes the likely below to be unlikely. - * - * The proper fix is to avoid rounding up by using - * the high precision tk->xtime_nsec instead of - * xtime.tv_nsec everywhere. Fixing this will take some - * time. - */ if (likely(error <= interval)) adj = 1; else @@ -1255,7 +1250,7 @@ out_adjust: static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; - unsigned int action = 0; + unsigned int clock_set = 0; while (tk->xtime_nsec >= nsecps) { int leap; @@ -1277,11 +1272,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); - clock_was_set_delayed(); - action = TK_CLOCK_WAS_SET; + clock_set = TK_CLOCK_WAS_SET; } } - return action; + return clock_set; } /** @@ -1294,7 +1288,8 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * Returns the unconsumed cycles. */ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, - u32 shift) + u32 shift, + unsigned int *clock_set) { cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; @@ -1308,7 +1303,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; - accumulate_nsecs_to_secs(tk); + *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ raw_nsecs = (u64)tk->raw_interval << shift; @@ -1359,14 +1354,14 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * update_wall_time - Uses the current clocksource to increment the wall time * */ -static void update_wall_time(void) +void update_wall_time(void) { struct clocksource *clock; struct timekeeper *real_tk = &timekeeper; struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; - unsigned int action; + unsigned int clock_set = 0; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1401,7 +1396,8 @@ static void update_wall_time(void) maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift); + offset = logarithmic_accumulation(tk, offset, shift, + &clock_set); if (offset < tk->cycle_interval<<shift) shift--; } @@ -1419,7 +1415,7 @@ static void update_wall_time(void) * Finally, make sure that after the rounding * xtime_nsec isn't larger than NSEC_PER_SEC */ - action = accumulate_nsecs_to_secs(tk); + clock_set |= accumulate_nsecs_to_secs(tk); write_seqcount_begin(&timekeeper_seq); /* Update clock->cycle_last with the new value */ @@ -1435,10 +1431,13 @@ static void update_wall_time(void) * updating. */ memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, action); + timekeeping_update(real_tk, clock_set); write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (clock_set) + /* Have to call _delayed version, since in irq context*/ + clock_was_set_delayed(); } /** @@ -1583,7 +1582,6 @@ struct timespec get_monotonic_coarse(void) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_wall_time(); calc_global_load(ticks); } @@ -1698,12 +1696,14 @@ int do_adjtimex(struct timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); - update_pvclock_gtod(tk, true); - clock_was_set_delayed(); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (tai != orig_tai) + clock_was_set(); + ntp_notify_cmos_timer(); return ret; @@ -1739,4 +1739,5 @@ void xtime_update(unsigned long ticks) write_seqlock(&jiffies_lock); do_timer(ticks); write_sequnlock(&jiffies_lock); + update_wall_time(); } diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 802433a4f5eb..4d54f97558df 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -21,6 +21,8 @@ #include <linux/seq_file.h> #include <linux/time.h> +#include "timekeeping_internal.h" + static unsigned int sleep_time_bin[32] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e5..87bd529879c2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -52,7 +52,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/timer.h> -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; +__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -81,6 +81,7 @@ struct tvec_base { unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; + unsigned long all_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -337,6 +338,20 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); +/* + * If the list is empty, catch up ->timer_jiffies to the current time. + * The caller must hold the tvec_base lock. Returns true if the list + * was empty and therefore ->timer_jiffies was updated. + */ +static bool catchup_timer_jiffies(struct tvec_base *base) +{ + if (!base->all_timers) { + base->timer_jiffies = jiffies; + return true; + } + return false; +} + static void __internal_add_timer(struct tvec_base *base, struct timer_list *timer) { @@ -383,15 +398,17 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { + (void)catchup_timer_jiffies(base); __internal_add_timer(base, timer); /* * Update base->active_timers and base->next_timer */ if (!tbase_get_deferrable(timer->base)) { - if (time_before(timer->expires, base->next_timer)) + if (!base->active_timers++ || + time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; - base->active_timers++; } + base->all_timers++; } #ifdef CONFIG_TIMER_STATS @@ -671,6 +688,8 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) detach_timer(timer, true); if (!tbase_get_deferrable(timer->base)) base->active_timers--; + base->all_timers--; + (void)catchup_timer_jiffies(base); } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -685,6 +704,8 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } + base->all_timers--; + (void)catchup_timer_jiffies(base); return 1; } @@ -739,12 +760,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = smp_processor_id(); - -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) - cpu = get_nohz_timer_target(); -#endif + cpu = get_nohz_timer_target(pinned); new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { @@ -939,8 +955,15 @@ void add_timer_on(struct timer_list *timer, int cpu) * with the timer by holding the timer base lock. This also * makes sure that a CPU on the way to stop its tick can not * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. */ - wake_up_nohz_cpu(cpu); + if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) + wake_up_nohz_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); @@ -1146,6 +1169,10 @@ static inline void __run_timers(struct tvec_base *base) struct timer_list *timer; spin_lock_irq(&base->lock); + if (catchup_timer_jiffies(base)) { + spin_unlock_irq(&base->lock); + return; + } while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; @@ -1160,7 +1187,7 @@ static inline void __run_timers(struct tvec_base *base) !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, &work_list); + list_replace_init(base->tv1.vec + index, head); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; @@ -1523,9 +1550,8 @@ static int init_timers_cpu(int cpu) if (!base) return -ENOMEM; - /* Make sure that tvec_base is 2 byte aligned */ - if (tbase_get_deferrable(base)) { - WARN_ON(1); + /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ + if (WARN_ON(base != tbase_get_base(base))) { kfree(base); return -ENOMEM; } @@ -1559,6 +1585,7 @@ static int init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; base->active_timers = 0; + base->all_timers = 0; return 0; } @@ -1648,9 +1675,9 @@ void __init init_timers(void) err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); - init_timer_stats(); - BUG_ON(err != NOTIFY_OK); + + init_timer_stats(); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/torture.c b/kernel/torture.c new file mode 100644 index 000000000000..acc9afc2f26e --- /dev/null +++ b/kernel/torture.c @@ -0,0 +1,719 @@ +/* + * Common functions for in-kernel torture tests. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Based on kernel/rcu/torture.c. + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/trace_clock.h> +#include <asm/byteorder.h> +#include <linux/torture.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); + +static char *torture_type; +static bool verbose; + +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ +static int fullstop = FULLSTOP_RMMOD; +static DEFINE_MUTEX(fullstop_mutex); +static int *torture_runnable; + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Variables for online-offline handling. Only present if CPU hotplug + * is enabled, otherwise does nothing. + */ + +static struct task_struct *onoff_task; +static long onoff_holdoff; +static long onoff_interval; +static long n_offline_attempts; +static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; +static long n_online_attempts; +static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. + */ +static int +torture_onoff(void *arg) +{ + int cpu; + unsigned long delta; + int maxcpu = -1; + DEFINE_TORTURE_RANDOM(rand); + int ret; + unsigned long starttime; + + VERBOSE_TOROUT_STRING("torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff); + VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); + } + while (!torture_must_stop()) { + cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_offline_attempts++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; + } + } else if (cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_online_attempts++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if (max_online < delta) + max_online = delta; + } + } + schedule_timeout_interruptible(onoff_interval); + } + torture_kthread_stopping("torture_onoff"); + return 0; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Initiate online-offline handling. + */ +int torture_onoff_init(long ooholdoff, long oointerval) +{ + int ret = 0; + +#ifdef CONFIG_HOTPLUG_CPU + onoff_holdoff = ooholdoff; + onoff_interval = oointerval; + if (onoff_interval <= 0) + return 0; + ret = torture_create_kthread(torture_onoff, NULL, onoff_task); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return ret; +} +EXPORT_SYMBOL_GPL(torture_onoff_init); + +/* + * Clean up after online/offline testing. + */ +static void torture_onoff_cleanup(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task == NULL) + return; + VERBOSE_TOROUT_STRING("Stopping torture_onoff task"); + kthread_stop(onoff_task); + onoff_task = NULL; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_cleanup); + +/* + * Print online/offline testing statistics. + */ +char *torture_onoff_stats(char *page) +{ +#ifdef CONFIG_HOTPLUG_CPU + page += sprintf(page, + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return page; +} +EXPORT_SYMBOL_GPL(torture_onoff_stats); + +/* + * Were all the online/offline operations successful? + */ +bool torture_onoff_failures(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + return n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts; +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + return false; +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_failures); + +#define TORTURE_RANDOM_MULT 39916801 /* prime */ +#define TORTURE_RANDOM_ADD 479001701 /* prime */ +#define TORTURE_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from cpu_clock(). + */ +unsigned long +torture_random(struct torture_random_state *trsp) +{ + if (--trsp->trs_count < 0) { + trsp->trs_state += (unsigned long)local_clock(); + trsp->trs_count = TORTURE_RANDOM_REFRESH; + } + trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT + + TORTURE_RANDOM_ADD; + return swahw32(trsp->trs_state); +} +EXPORT_SYMBOL_GPL(torture_random); + +/* + * Variables for shuffling. The idea is to ensure that each CPU stays + * idle for an extended period to test interactions with dyntick idle, + * as well as interactions with any per-CPU varibles. + */ +struct shuffle_task { + struct list_head st_l; + struct task_struct *st_t; +}; + +static long shuffle_interval; /* In jiffies. */ +static struct task_struct *shuffler_task; +static cpumask_var_t shuffle_tmp_mask; +static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */ +static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list); +static DEFINE_MUTEX(shuffle_task_mutex); + +/* + * Register a task to be shuffled. If there is no memory, just splat + * and don't bother registering. + */ +void torture_shuffle_task_register(struct task_struct *tp) +{ + struct shuffle_task *stp; + + if (WARN_ON_ONCE(tp == NULL)) + return; + stp = kmalloc(sizeof(*stp), GFP_KERNEL); + if (WARN_ON_ONCE(stp == NULL)) + return; + stp->st_t = tp; + mutex_lock(&shuffle_task_mutex); + list_add(&stp->st_l, &shuffle_task_list); + mutex_unlock(&shuffle_task_mutex); +} +EXPORT_SYMBOL_GPL(torture_shuffle_task_register); + +/* + * Unregister all tasks, for example, at the end of the torture run. + */ +static void torture_shuffle_task_unregister_all(void) +{ + struct shuffle_task *stp; + struct shuffle_task *p; + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) { + list_del(&stp->st_l); + kfree(stp); + } + mutex_unlock(&shuffle_task_mutex); +} + +/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle. + * A special case is when shuffle_idle_cpu = -1, in which case we allow + * the tasks to run on all CPUs. + */ +static void torture_shuffle_tasks(void) +{ + struct shuffle_task *stp; + + cpumask_setall(shuffle_tmp_mask); + get_online_cpus(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } + + /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */ + shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); + if (shuffle_idle_cpu >= nr_cpu_ids) + shuffle_idle_cpu = -1; + if (shuffle_idle_cpu != -1) { + cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); + if (cpumask_empty(shuffle_tmp_mask)) { + put_online_cpus(); + return; + } + } + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry(stp, &shuffle_task_list, st_l) + set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); + mutex_unlock(&shuffle_task_mutex); + + put_online_cpus(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int torture_shuffle(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval); + torture_shuffle_tasks(); + torture_shutdown_absorb("torture_shuffle"); + } while (!torture_must_stop()); + torture_kthread_stopping("torture_shuffle"); + return 0; +} + +/* + * Start the shuffler, with shuffint in jiffies. + */ +int torture_shuffle_init(long shuffint) +{ + shuffle_interval = shuffint; + + shuffle_idle_cpu = -1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + return -ENOMEM; + } + + /* Create the shuffler thread */ + return torture_create_kthread(torture_shuffle, NULL, shuffler_task); +} +EXPORT_SYMBOL_GPL(torture_shuffle_init); + +/* + * Stop the shuffling. + */ +static void torture_shuffle_cleanup(void) +{ + torture_shuffle_task_unregister_all(); + if (shuffler_task) { + VERBOSE_TOROUT_STRING("Stopping torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); + +/* + * Variables for auto-shutdown. This allows "lights out" torture runs + * to be fully scripted. + */ +static int shutdown_secs; /* desired test duration in seconds. */ +static struct task_struct *shutdown_task; +static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static void (*torture_shutdown_hook)(void); + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +void torture_shutdown_absorb(const char *title) +{ + while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_notice("torture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} +EXPORT_SYMBOL_GPL(torture_shutdown_absorb); + +/* + * Cause the torture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs parameter. + */ +static int torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_TOROUT_STRING("torture_shutdown task started"); + jiffies_snap = jiffies; + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !torture_must_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = jiffies; + } + if (torture_must_stop()) { + torture_kthread_stopping("torture_shutdown"); + return 0; + } + + /* OK, shut down the system. */ + + VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + if (torture_shutdown_hook) + torture_shutdown_hook(); + else + VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); + kernel_power_off(); /* Shut down the system. */ + return 0; +} + +/* + * Start up the shutdown task. + */ +int torture_shutdown_init(int ssecs, void (*cleanup)(void)) +{ + int ret = 0; + + shutdown_secs = ssecs; + torture_shutdown_hook = cleanup; + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + ret = torture_create_kthread(torture_shutdown, NULL, + shutdown_task); + } + return ret; +} +EXPORT_SYMBOL_GPL(torture_shutdown_init); + +/* + * Detect and respond to a system shutdown. + */ +static int torture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) { + VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected"); + ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN; + } else { + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + } + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block torture_shutdown_nb = { + .notifier_call = torture_shutdown_notify, +}; + +/* + * Shut down the shutdown task. Say what??? Heh! This can happen if + * the torture module gets an rmmod before the shutdown time arrives. ;-) + */ +static void torture_shutdown_cleanup(void) +{ + unregister_reboot_notifier(&torture_shutdown_nb); + if (shutdown_task != NULL) { + VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; +} + +/* + * Variables for stuttering, which means to periodically pause and + * restart testing in order to catch bugs that appear when load is + * suddenly applied to or removed from the system. + */ +static struct task_struct *stutter_task; +static int stutter_pause_test; +static int stutter; + +/* + * Block until the stutter interval ends. This must be called periodically + * by all running kthreads that need to be subject to stuttering. + */ +void stutter_wait(const char *title) +{ + while (ACCESS_ONCE(stutter_pause_test) || + (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { + if (stutter_pause_test) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + torture_shutdown_absorb(title); + } +} +EXPORT_SYMBOL_GPL(stutter_wait); + +/* + * Cause the torture test to "stutter", starting and stopping all + * threads periodically. + */ +static int torture_stutter(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_stutter task started"); + do { + if (!torture_must_stop()) { + schedule_timeout_interruptible(stutter); + ACCESS_ONCE(stutter_pause_test) = 1; + } + if (!torture_must_stop()) + schedule_timeout_interruptible(stutter); + ACCESS_ONCE(stutter_pause_test) = 0; + torture_shutdown_absorb("torture_stutter"); + } while (!torture_must_stop()); + torture_kthread_stopping("torture_stutter"); + return 0; +} + +/* + * Initialize and kick off the torture_stutter kthread. + */ +int torture_stutter_init(int s) +{ + int ret; + + stutter = s; + ret = torture_create_kthread(torture_stutter, NULL, stutter_task); + return ret; +} +EXPORT_SYMBOL_GPL(torture_stutter_init); + +/* + * Cleanup after the torture_stutter kthread. + */ +static void torture_stutter_cleanup(void) +{ + if (!stutter_task) + return; + VERBOSE_TOROUT_STRING("Stopping torture_stutter task"); + kthread_stop(stutter_task); + stutter_task = NULL; +} + +/* + * Initialize torture module. Please note that this is -not- invoked via + * the usual module_init() mechanism, but rather by an explicit call from + * the client torture module. This call must be paired with a later + * torture_init_end(). + * + * The runnable parameter points to a flag that controls whether or not + * the test is currently runnable. If there is no such flag, pass in NULL. + */ +void __init torture_init_begin(char *ttype, bool v, int *runnable) +{ + mutex_lock(&fullstop_mutex); + torture_type = ttype; + verbose = v; + torture_runnable = runnable; + fullstop = FULLSTOP_DONTSTOP; + +} +EXPORT_SYMBOL_GPL(torture_init_begin); + +/* + * Tell the torture module that initialization is complete. + */ +void __init torture_init_end(void) +{ + mutex_unlock(&fullstop_mutex); + register_reboot_notifier(&torture_shutdown_nb); +} +EXPORT_SYMBOL_GPL(torture_init_end); + +/* + * Clean up torture module. Please note that this is -not- invoked via + * the usual module_exit() mechanism, but rather by an explicit call from + * the client torture module. Returns true if a race with system shutdown + * is detected, otherwise, all kthreads started by functions in this file + * will be shut down. + * + * This must be called before the caller starts shutting down its own + * kthreads. + */ +bool torture_cleanup(void) +{ + mutex_lock(&fullstop_mutex); + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + return true; + } + ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + torture_shutdown_cleanup(); + torture_shuffle_cleanup(); + torture_stutter_cleanup(); + torture_onoff_cleanup(); + return false; +} +EXPORT_SYMBOL_GPL(torture_cleanup); + +/* + * Is it time for the current torture test to stop? + */ +bool torture_must_stop(void) +{ + return torture_must_stop_irq() || kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(torture_must_stop); + +/* + * Is it time for the current torture test to stop? This is the irq-safe + * version, hence no check for kthread_should_stop(). + */ +bool torture_must_stop_irq(void) +{ + return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP; +} +EXPORT_SYMBOL_GPL(torture_must_stop_irq); + +/* + * Each kthread must wait for kthread_should_stop() before returning from + * its top-level function, otherwise segfaults ensue. This function + * prints a "stopping" message and waits for kthread_should_stop(), and + * should be called from all torture kthreads immediately prior to + * returning. + */ +void torture_kthread_stopping(char *title) +{ + if (verbose) + VERBOSE_TOROUT_STRING(title); + while (!kthread_should_stop()) { + torture_shutdown_absorb(title); + schedule_timeout_uninterruptible(1); + } +} +EXPORT_SYMBOL_GPL(torture_kthread_stopping); + +/* + * Create a generic torture kthread that is immediately runnable. If you + * need the kthread to be stopped so that you can do something to it before + * it starts, you will need to open-code your own. + */ +int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, + char *f, struct task_struct **tp) +{ + int ret = 0; + + VERBOSE_TOROUT_STRING(m); + *tp = kthread_run(fn, arg, s); + if (IS_ERR(*tp)) { + ret = PTR_ERR(*tp); + VERBOSE_TOROUT_ERRSTRING(f); + *tp = NULL; + } + torture_shuffle_task_register(*tp); + return ret; +} +EXPORT_SYMBOL_GPL(_torture_create_kthread); + +/* + * Stop a generic kthread, emitting a message. + */ +void _torture_stop_kthread(char *m, struct task_struct **tp) +{ + if (*tp == NULL) + return; + VERBOSE_TOROUT_STRING(m); + kthread_stop(*tp); + *tp = NULL; +} +EXPORT_SYMBOL_GPL(_torture_stop_kthread); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 015f85aaca08..8639819f6cef 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -424,6 +424,7 @@ config UPROBE_EVENT bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES depends on MMU + depends on PERF_EVENTS select UPROBES select PROBE_EVENTS select TRACING diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d7e2068e4b71..1378e84fbe39 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -50,6 +50,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM_RUNTIME),y) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f785aef65799..c1bd4ada2a04 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -702,6 +702,7 @@ void blk_trace_shutdown(struct request_queue *q) * blk_add_trace_rq - Add a trace for a request oriented action * @q: queue the io is for * @rq: the source request + * @nr_bytes: number of completed bytes * @what: the action * * Description: @@ -709,7 +710,7 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + unsigned int nr_bytes, u32 what) { struct blk_trace *bt = q->blk_trace; @@ -718,11 +719,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, + __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -730,33 +731,34 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, static void blk_add_trace_rq_abort(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ABORT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_INSERT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); } static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, - struct request *rq) + struct request *rq, + unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); + blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); } /** @@ -781,8 +783,8 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, if (!error && !bio_flagged(bio, BIO_UPTODATE)) error = EIO; - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - error, 0, NULL); + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, what, error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, @@ -885,8 +887,9 @@ static void blk_add_trace_split(void *ignore, if (bt) { __be64 rpdu = cpu_to_be64(pdu); - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + __blk_add_trace(bt, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, + !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); } } @@ -918,9 +921,9 @@ static void blk_add_trace_bio_remap(void *ignore, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), - sizeof(r), &r); + __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, + bio->bi_rw, BLK_TA_REMAP, + !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); } /** @@ -1426,7 +1429,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } -static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +static int +blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 72a0f81dc5a8..1fd4b9479210 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -85,6 +85,8 @@ int function_trace_stop __read_mostly; /* Current function tracing op */ struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; +/* What to set function_trace_op to */ +static struct ftrace_ops *set_function_trace_op; /* List for set_ftrace_pid's pids. */ LIST_HEAD(ftrace_pids); @@ -235,14 +237,13 @@ static int control_ops_alloc(struct ftrace_ops *ops) return 0; } -static void control_ops_free(struct ftrace_ops *ops) -{ - free_percpu(ops->disabled); -} - static void update_global_ops(void) { - ftrace_func_t func; + ftrace_func_t func = ftrace_global_list_func; + void *private = NULL; + + /* The list has its own recursion protection. */ + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; /* * If there's only one function registered, then call that @@ -252,23 +253,17 @@ static void update_global_ops(void) if (ftrace_global_list == &ftrace_list_end || ftrace_global_list->next == &ftrace_list_end) { func = ftrace_global_list->func; + private = ftrace_global_list->private; /* * As we are calling the function directly. * If it does not have recursion protection, * the function_trace_op needs to be updated * accordingly. */ - if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - else + if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; - } else { - func = ftrace_global_list_func; - /* The list has its own recursion protection. */ - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; } - /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); @@ -276,8 +271,32 @@ static void update_global_ops(void) } global_ops.func = func; + global_ops.private = private; +} + +static void ftrace_sync(struct work_struct *work) +{ + /* + * This function is just a stub to implement a hard force + * of synchronize_sched(). This requires synchronizing + * tasks even in userspace and idle. + * + * Yes, function tracing is rude. + */ } +static void ftrace_sync_ipi(void *data) +{ + /* Probably not needed, but do it anyway */ + smp_rmb(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void update_function_graph_func(void); +#else +static inline void update_function_graph_func(void) { } +#endif + static void update_ftrace_function(void) { ftrace_func_t func; @@ -296,16 +315,61 @@ static void update_ftrace_function(void) !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ if (ftrace_ops_list == &global_ops) - function_trace_op = ftrace_global_list; + set_function_trace_op = ftrace_global_list; else - function_trace_op = ftrace_ops_list; + set_function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; } else { /* Just use the default ftrace_ops */ - function_trace_op = &ftrace_list_end; + set_function_trace_op = &ftrace_list_end; func = ftrace_ops_list_func; } + /* If there's no change, then do nothing more here */ + if (ftrace_trace_function == func) + return; + + update_function_graph_func(); + + /* + * If we are using the list function, it doesn't care + * about the function_trace_ops. + */ + if (func == ftrace_ops_list_func) { + ftrace_trace_function = func; + /* + * Don't even bother setting function_trace_ops, + * it would be racy to do so anyway. + */ + return; + } + +#ifndef CONFIG_DYNAMIC_FTRACE + /* + * For static tracing, we need to be a bit more careful. + * The function change takes affect immediately. Thus, + * we need to coorditate the setting of the function_trace_ops + * with the setting of the ftrace_trace_function. + * + * Set the function to the list ops, which will call the + * function we want, albeit indirectly, but it handles the + * ftrace_ops and doesn't depend on function_trace_op. + */ + ftrace_trace_function = ftrace_ops_list_func; + /* + * Make sure all CPUs see this. Yes this is slow, but static + * tracing is slow and nasty to have enabled. + */ + schedule_on_each_cpu(ftrace_sync); + /* Now all cpus are using the list ops. */ + function_trace_op = set_function_trace_op; + /* Make sure the function_trace_op is visible on all CPUs */ + smp_wmb(); + /* Nasty way to force a rmb on all cpus */ + smp_call_function(ftrace_sync_ipi, NULL, 1); + /* OK, we are all set to update the ftrace_trace_function now! */ +#endif /* !CONFIG_DYNAMIC_FTRACE */ + ftrace_trace_function = func; } @@ -367,6 +431,9 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { + if (ops->flags & FTRACE_OPS_FL_DELETED) + return -EINVAL; + if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; @@ -410,17 +477,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops) return 0; } -static void ftrace_sync(struct work_struct *work) -{ - /* - * This function is just a stub to implement a hard force - * of synchronize_sched(). This requires synchronizing - * tasks even in userspace and idle. - * - * Yes, function tracing is rude. - */ -} - static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; @@ -439,20 +495,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { ret = remove_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); - if (!ret) { - /* - * The ftrace_ops is now removed from the list, - * so there'll be no new users. We must ensure - * all current users are done before we free - * the control data. - * Note synchronize_sched() is not enough, as we - * use preempt_disable() to do RCU, but the function - * tracer can be called where RCU is not active - * (before user_exit()). - */ - schedule_on_each_cpu(ftrace_sync); - control_ops_free(ops); - } } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -462,17 +504,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) update_ftrace_function(); - /* - * Dynamic ops may be freed, we must make sure that all - * callers are done before leaving this function. - * - * Again, normal synchronize_sched() is not good enough. - * We need to do a hard force of sched synchronization. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - schedule_on_each_cpu(ftrace_sync); - - return 0; } @@ -1082,19 +1113,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; -loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, whence); - else - file->f_pos = ret = 1; - - return ret; -} - #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -1151,8 +1169,6 @@ struct ftrace_page { int size; }; -static struct ftrace_page *ftrace_new_pgs; - #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) @@ -1539,7 +1555,7 @@ unsigned long ftrace_location(unsigned long ip) * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. */ -int ftrace_text_reserved(void *start, void *end) +int ftrace_text_reserved(const void *start, const void *end) { unsigned long ret; @@ -1973,6 +1989,7 @@ int __weak ftrace_arch_code_modify_post_process(void) void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; + int err = 0; /* * If the ftrace_caller calls a ftrace_ops func directly, @@ -1984,21 +2001,33 @@ void ftrace_modify_all_code(int command) * to make sure the ops are having the right functions * traced. */ - if (update) - ftrace_update_ftrace_func(ftrace_ops_list_func); + if (update) { + err = ftrace_update_ftrace_func(ftrace_ops_list_func); + if (FTRACE_WARN_ON(err)) + return; + } if (command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - if (update && ftrace_trace_function != ftrace_ops_list_func) - ftrace_update_ftrace_func(ftrace_trace_function); + if (update && ftrace_trace_function != ftrace_ops_list_func) { + function_trace_op = set_function_trace_op; + smp_wmb(); + /* If irqs are disabled, we are in stop machine */ + if (!irqs_disabled()) + smp_call_function(ftrace_sync_ipi, NULL, 1); + err = ftrace_update_ftrace_func(ftrace_trace_function); + if (FTRACE_WARN_ON(err)) + return; + } if (command & FTRACE_START_FUNC_RET) - ftrace_enable_ftrace_graph_caller(); + err = ftrace_enable_ftrace_graph_caller(); else if (command & FTRACE_STOP_FUNC_RET) - ftrace_disable_ftrace_graph_caller(); + err = ftrace_disable_ftrace_graph_caller(); + FTRACE_WARN_ON(err); } static int __ftrace_modify_code(void *data) @@ -2066,6 +2095,11 @@ static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; static int global_start_up; +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2156,10 +2190,41 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) command |= FTRACE_UPDATE_TRACE_FUNC; } - if (!command || !ftrace_enabled) + if (!command || !ftrace_enabled) { + /* + * If these are control ops, they still need their + * per_cpu field freed. Since, function tracing is + * not currently active, we can just free them + * without synchronizing all CPUs. + */ + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); return 0; + } ftrace_run_update_code(command); + + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + * The same goes for freeing the per_cpu data of the control + * ops. + * + * Again, normal synchronize_sched() is not good enough. + * We need to do a hard force of sched synchronization. + * This is because we use preempt_disable() to do RCU, but + * the function tracers can be called where RCU is not watching + * (like before user_exit()). We can not rely on the RCU + * infrastructure to do the synchronization, thus we must do it + * ourselves. + */ + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + schedule_on_each_cpu(ftrace_sync); + + if (ops->flags & FTRACE_OPS_FL_CONTROL) + control_ops_free(ops); + } + return 0; } @@ -2186,7 +2251,6 @@ static void ftrace_shutdown_sysctl(void) } static cycle_t ftrace_update_time; -static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; static inline int ops_traces_mod(struct ftrace_ops *ops) @@ -2242,11 +2306,12 @@ static int referenced_filters(struct dyn_ftrace *rec) return cnt; } -static int ftrace_update_code(struct module *mod) +static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; + unsigned long update_cnt = 0; unsigned long ref = 0; bool test = false; int i; @@ -2272,9 +2337,8 @@ static int ftrace_update_code(struct module *mod) } start = ftrace_now(raw_smp_processor_id()); - ftrace_update_cnt = 0; - for (pg = ftrace_new_pgs; pg; pg = pg->next) { + for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { int cnt = ref; @@ -2295,7 +2359,7 @@ static int ftrace_update_code(struct module *mod) if (!ftrace_code_disable(mod, p)) break; - ftrace_update_cnt++; + update_cnt++; /* * If the tracing is enabled, go ahead and enable the record. @@ -2314,11 +2378,9 @@ static int ftrace_update_code(struct module *mod) } } - ftrace_new_pgs = NULL; - stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; - ftrace_update_tot_cnt += ftrace_update_cnt; + ftrace_update_tot_cnt += update_cnt; return 0; } @@ -2410,22 +2472,6 @@ ftrace_allocate_pages(unsigned long num_to_init) return NULL; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) -{ - int cnt; - - if (!num_to_init) { - pr_info("ftrace: No functions to be traced?\n"); - return -1; - } - - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); - - return 0; -} - #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { @@ -2739,7 +2785,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) * routine, you can use ftrace_filter_write() for the write * routine if @flag has FTRACE_ITER_FILTER set, or * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_filter_lseek() should be used as the lseek routine, and + * tracing_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). */ int @@ -2813,7 +2859,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, inode, file); } @@ -2821,7 +2869,9 @@ ftrace_filter_open(struct inode *inode, struct file *file) static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, inode, file); } @@ -3767,7 +3817,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; @@ -3775,7 +3825,7 @@ static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = seq_read, .write = ftrace_notrace_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; @@ -4038,7 +4088,7 @@ static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, .read = seq_read, .write = ftrace_graph_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_graph_release, }; @@ -4046,11 +4096,41 @@ static const struct file_operations ftrace_graph_notrace_fops = { .open = ftrace_graph_notrace_open, .read = seq_read, .write = ftrace_graph_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_graph_release, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent) +{ + + trace_create_file("set_ftrace_filter", 0644, parent, + ops, &ftrace_filter_fops); + + trace_create_file("set_ftrace_notrace", 0644, parent, + ops, &ftrace_notrace_fops); +} + +/* + * The name "destroy_filter_files" is really a misnomer. Although + * in the future, it may actualy delete the files, but this is + * really intended to make sure the ops passed in are disabled + * and that when this function returns, the caller is free to + * free the ops. + * + * The "destroy" name is only to match the "create" name that this + * should be paired with. + */ +void ftrace_destroy_filter_files(struct ftrace_ops *ops) +{ + mutex_lock(&ftrace_lock); + if (ops->flags & FTRACE_OPS_FL_ENABLED) + ftrace_shutdown(ops, 0); + ops->flags |= FTRACE_OPS_FL_DELETED; + mutex_unlock(&ftrace_lock); +} + static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { @@ -4060,11 +4140,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("enabled_functions", 0444, d_tracer, NULL, &ftrace_enabled_fops); - trace_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - - trace_create_file("set_ftrace_notrace", 0644, d_tracer, - NULL, &ftrace_notrace_fops); + ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER trace_create_file("set_graph_function", 0444, d_tracer, @@ -4180,9 +4256,6 @@ static int ftrace_process_locs(struct module *mod, /* Assign the last page to ftrace_pages */ ftrace_pages = pg; - /* These new locations need to be initialized */ - ftrace_new_pgs = start_pg; - /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt @@ -4193,7 +4266,7 @@ static int ftrace_process_locs(struct module *mod, */ if (!mod) local_irq_save(flags); - ftrace_update_code(mod); + ftrace_update_code(mod, start_pg); if (!mod) local_irq_restore(flags); ret = 0; @@ -4302,30 +4375,27 @@ struct notifier_block ftrace_module_exit_nb = { .priority = INT_MIN, /* Run after anything that can remove kprobes */ }; -extern unsigned long __start_mcount_loc[]; -extern unsigned long __stop_mcount_loc[]; - void __init ftrace_init(void) { - unsigned long count, addr, flags; + extern unsigned long __start_mcount_loc[]; + extern unsigned long __stop_mcount_loc[]; + unsigned long count, flags; int ret; - /* Keep the ftrace pointer to the stub */ - addr = (unsigned long)ftrace_stub; - local_irq_save(flags); - ftrace_dyn_arch_init(&addr); + ret = ftrace_dyn_arch_init(); local_irq_restore(flags); - - /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) + if (ret) goto failed; count = __stop_mcount_loc - __start_mcount_loc; - - ret = ftrace_dyn_table_alloc(count); - if (ret) + if (!count) { + pr_info("ftrace: No functions to be traced?\n"); goto failed; + } + + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, count / ENTRIES_PER_PAGE + 1); last_ftrace_enabled = ftrace_enabled = 1; @@ -4373,7 +4443,13 @@ static inline void ftrace_startup_enable(int command) { } (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ ___ret; \ }) -# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) +# define ftrace_shutdown(ops, command) \ + ({ \ + int ___ret = __unregister_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) @@ -4719,7 +4795,7 @@ static const struct file_operations ftrace_pid_fops = { .open = ftrace_pid_open, .write = ftrace_pid_write, .read = seq_read, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_pid_release, }; @@ -4862,6 +4938,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) trace_func_graph_ret_t ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; +static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) @@ -5003,6 +5080,30 @@ static struct ftrace_ops fgraph_ops __read_mostly = { FTRACE_OPS_FL_RECURSION_SAFE, }; +static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) +{ + if (!ftrace_ops_test(&global_ops, trace->func, NULL)) + return 0; + return __ftrace_graph_entry(trace); +} + +/* + * The function graph tracer should only trace the functions defined + * by set_ftrace_filter and set_ftrace_notrace. If another function + * tracer ops is registered, the graph tracer requires testing the + * function against the global ops, and not just trace any function + * that any ftrace_ops registered. + */ +static void update_function_graph_func(void) +{ + if (ftrace_ops_list == &ftrace_list_end || + (ftrace_ops_list == &global_ops && + global_ops.next == &ftrace_list_end)) + ftrace_graph_entry = __ftrace_graph_entry; + else + ftrace_graph_entry = ftrace_graph_entry_test; +} + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -5027,7 +5128,16 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, } ftrace_graph_return = retfunc; - ftrace_graph_entry = entryfunc; + + /* + * Update the indirect function to the entryfunc, and the + * function that gets called to the entry_test first. Then + * call the update fgraph entry function to determine if + * the entryfunc should be called directly or not. + */ + __ftrace_graph_entry = entryfunc; + ftrace_graph_entry = ftrace_graph_entry_test; + update_function_graph_func(); ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); @@ -5046,6 +5156,7 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; + __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cc2f66f68dc5..c634868c2921 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1301,7 +1301,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, * In that off case, we need to allocate for all possible cpus. */ #ifdef CONFIG_HOTPLUG_CPU - get_online_cpus(); + cpu_notifier_register_begin(); cpumask_copy(buffer->cpumask, cpu_online_mask); #else cpumask_copy(buffer->cpumask, cpu_possible_mask); @@ -1324,10 +1324,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, #ifdef CONFIG_HOTPLUG_CPU buffer->cpu_notify.notifier_call = rb_cpu_notify; buffer->cpu_notify.priority = 0; - register_cpu_notifier(&buffer->cpu_notify); + __register_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_done(); #endif - put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -1341,7 +1341,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif fail_free_buffer: kfree(buffer); @@ -1358,16 +1360,17 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; - get_online_cpus(); - #ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&buffer->cpu_notify); #endif for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); @@ -2397,6 +2400,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - length; + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (!tail) + delta = 0; + /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, @@ -2558,7 +2568,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (unlikely(test_time_stamp(delta))) { int local_clock_stable = 1; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - local_clock_stable = sched_clock_stable; + local_clock_stable = sched_clock_stable(); #endif WARN_ONCE(delta > (1ULL << 59), KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a5457d577b98..0434ff1b808e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -40,8 +40,8 @@ static int write_iteration = 50; module_param(write_iteration, uint, 0644); MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); -static int producer_nice = 19; -static int consumer_nice = 19; +static int producer_nice = MAX_NICE; +static int consumer_nice = MAX_NICE; static int producer_fifo = -1; static int consumer_fifo = -1; @@ -308,7 +308,7 @@ static void ring_buffer_producer(void) /* Let the user know that the test is running at low priority */ if (producer_fifo < 0 && consumer_fifo < 0 && - producer_nice == 19 && consumer_nice == 19) + producer_nice == MAX_NICE && consumer_nice == MAX_NICE) trace_printk("WARNING!!! This test is running at lowest priority.\n"); trace_printk("Time: %lld (usecs)\n", time); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9d20cd9743ef..9be67c5e5b0f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -73,7 +73,8 @@ static struct tracer_flags dummy_tracer_flags = { .opts = dummy_tracer_opt }; -static int dummy_set_flag(u32 old_flags, u32 bit, int set) +static int +dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return 0; } @@ -118,7 +119,7 @@ enum ftrace_dump_mode ftrace_dump_on_oops; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; -static int tracing_set_tracer(const char *buf); +static int tracing_set_tracer(struct trace_array *tr, const char *buf); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -180,6 +181,17 @@ static int __init set_trace_boot_options(char *str) } __setup("trace_options=", set_trace_boot_options); +static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_clock __initdata; + +static int __init set_trace_boot_clock(char *str) +{ + strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + trace_boot_clock = trace_boot_clock_buf; + return 0; +} +__setup("trace_clock=", set_trace_boot_clock); + unsigned long long ns2usecs(cycle_t nsec) { @@ -455,6 +467,9 @@ int __trace_puts(unsigned long ip, const char *str, int size) unsigned long irq_flags; int alloc; + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + alloc = sizeof(*entry) + size + 2; /* possible \n added */ local_save_flags(irq_flags); @@ -495,6 +510,9 @@ int __trace_bputs(unsigned long ip, const char *str) unsigned long irq_flags; int size = sizeof(struct bputs_entry); + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, @@ -595,6 +613,28 @@ void free_snapshot(struct trace_array *tr) } /** + * tracing_alloc_snapshot - allocate snapshot buffer. + * + * This only allocates the snapshot buffer if it isn't already + * allocated - it doesn't also take a snapshot. + * + * This is meant to be used in cases where the snapshot buffer needs + * to be set up for events that can't sleep but need to be able to + * trigger a snapshot. + */ +int tracing_alloc_snapshot(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + ret = alloc_snapshot(tr); + WARN_ON(ret < 0); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); + +/** * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. * * This is similar to trace_snapshot(), but it will allocate the @@ -607,11 +647,10 @@ void free_snapshot(struct trace_array *tr) */ void tracing_snapshot_alloc(void) { - struct trace_array *tr = &global_trace; int ret; - ret = alloc_snapshot(tr); - if (WARN_ON(ret < 0)) + ret = tracing_alloc_snapshot(); + if (ret < 0) return; tracing_snapshot(); @@ -623,6 +662,12 @@ void tracing_snapshot(void) WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); +int tracing_alloc_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ @@ -1197,7 +1242,7 @@ int register_tracer(struct tracer *type) printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ - tracing_set_tracer(type->name); + tracing_set_tracer(&global_trace, type->name); default_bootup_tracer = NULL; /* disable other selftests, since this will break it. */ tracing_selftest_disabled = true; @@ -1567,15 +1612,31 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); +static struct ring_buffer *temp_buffer; + struct ring_buffer_event * trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, struct ftrace_event_file *ftrace_file, int type, unsigned long len, unsigned long flags, int pc) { + struct ring_buffer_event *entry; + *current_rb = ftrace_file->tr->trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, + entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. Use the temp_buffer + * to store the trace event for the tigger to use. It's recusive + * safe and will not be recorded anywhere. + */ + if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) { + *current_rb = temp_buffer; + entry = trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); + } + return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); @@ -3088,27 +3149,52 @@ static int tracing_open(struct inode *inode, struct file *file) return ret; } +/* + * Some tracers are not suitable for instance buffers. + * A tracer is always available for the global array (toplevel) + * or if it explicitly states that it is. + */ +static bool +trace_ok_for_array(struct tracer *t, struct trace_array *tr) +{ + return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; +} + +/* Find the next tracer that this trace array may use */ +static struct tracer * +get_tracer_for_array(struct trace_array *tr, struct tracer *t) +{ + while (t && !trace_ok_for_array(t, tr)) + t = t->next; + + return t; +} + static void * t_next(struct seq_file *m, void *v, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t = v; (*pos)++; if (t) - t = t->next; + t = get_tracer_for_array(tr, t->next); return t; } static void *t_start(struct seq_file *m, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) - ; + + t = get_tracer_for_array(tr, trace_types); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; return t; } @@ -3143,10 +3229,21 @@ static const struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + if (tracing_disabled) return -ENODEV; - return seq_open(file, &show_traces_seq_ops); + ret = seq_open(file, &show_traces_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = tr; + + return 0; } static ssize_t @@ -3156,19 +3253,23 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, return count; } -static loff_t tracing_seek(struct file *file, loff_t offset, int origin) +loff_t tracing_lseek(struct file *file, loff_t offset, int whence) { + int ret; + if (file->f_mode & FMODE_READ) - return seq_lseek(file, offset, origin); + ret = seq_lseek(file, offset, whence); else - return 0; + file->f_pos = ret = 0; + + return ret; } static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .write = tracing_write_stub, - .llseek = tracing_seek, + .llseek = tracing_lseek, .release = tracing_release, }; @@ -3302,13 +3403,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) return 0; } -static int __set_tracer_option(struct tracer *trace, +static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { + struct tracer *trace = tr->current_trace; int ret; - ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; @@ -3320,8 +3422,9 @@ static int __set_tracer_option(struct tracer *trace, } /* Try to assign a tracer specific option */ -static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { + struct tracer *trace = tr->current_trace; struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int i; @@ -3330,8 +3433,7 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(trace, trace->flags, - opts, neg); + return __set_tracer_option(tr, trace->flags, opts, neg); } return -EINVAL; @@ -3354,7 +3456,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) /* Give the tracer a chance to approve the change */ if (tr->current_trace->flag_changed) - if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) + if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; if (enabled) @@ -3403,7 +3505,7 @@ static int trace_set_options(struct trace_array *tr, char *option) /* If no option could be set, test the specific tracer options */ if (!trace_options[i]) - ret = set_tracer_option(tr->current_trace, cmp, neg); + ret = set_tracer_option(tr, cmp, neg); mutex_unlock(&trace_types_lock); @@ -3488,60 +3590,103 @@ static const char readme_msg[] = " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" - "\t\t\t Disable an option by adding a suffix 'no' to the option name\n" + "\t\t\t Disable an option by adding a suffix 'no' to the\n" + "\t\t\t option name\n" #ifdef CONFIG_DYNAMIC_FTRACE "\n available_filter_functions - list of functions that can be filtered on\n" - " set_ftrace_filter\t- echo function name in here to only trace these functions\n" - " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" - " modules: Can select a group via module\n" - " Format: :mod:<module-name>\n" - " example: echo :mod:ext3 > set_ftrace_filter\n" - " triggers: a command to perform when function is hit\n" - " Format: <function>:<trigger>[:count]\n" - " trigger: traceon, traceoff\n" - " enable_event:<system>:<event>\n" - " disable_event:<system>:<event>\n" + " set_ftrace_filter\t- echo function name in here to only trace these\n" + "\t\t\t functions\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module\n" + "\t Format: :mod:<module-name>\n" + "\t example: echo :mod:ext3 > set_ftrace_filter\n" + "\t triggers: a command to perform when function is hit\n" + "\t Format: <function>:<trigger>[:count]\n" + "\t trigger: traceon, traceoff\n" + "\t\t enable_event:<system>:<event>\n" + "\t\t disable_event:<system>:<event>\n" #ifdef CONFIG_STACKTRACE - " stacktrace\n" + "\t\t stacktrace\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT - " snapshot\n" + "\t\t snapshot\n" #endif - " example: echo do_fault:traceoff > set_ftrace_filter\n" - " echo do_trap:traceoff:3 > set_ftrace_filter\n" - " The first one will disable tracing every time do_fault is hit\n" - " The second will disable tracing at most 3 times when do_trap is hit\n" - " The first time do trap is hit and it disables tracing, the counter\n" - " will decrement to 2. If tracing is already disabled, the counter\n" - " will not decrement. It only decrements when the trigger did work\n" - " To remove trigger without count:\n" - " echo '!<function>:<trigger> > set_ftrace_filter\n" - " To remove trigger with a count:\n" - " echo '!<function>:<trigger>:0 > set_ftrace_filter\n" + "\t example: echo do_fault:traceoff > set_ftrace_filter\n" + "\t echo do_trap:traceoff:3 > set_ftrace_filter\n" + "\t The first one will disable tracing every time do_fault is hit\n" + "\t The second will disable tracing at most 3 times when do_trap is hit\n" + "\t The first time do trap is hit and it disables tracing, the\n" + "\t counter will decrement to 2. If tracing is already disabled,\n" + "\t the counter will not decrement. It only decrements when the\n" + "\t trigger did work\n" + "\t To remove trigger without count:\n" + "\t echo '!<function>:<trigger> > set_ftrace_filter\n" + "\t To remove trigger with a count:\n" + "\t echo '!<function>:<trigger>:0 > set_ftrace_filter\n" " set_ftrace_notrace\t- echo function name in here to never trace.\n" - " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" - " modules: Can select a group via module command :mod:\n" - " Does not accept triggers\n" + "\t accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + "\t modules: Can select a group via module command :mod:\n" + "\t Does not accept triggers\n" #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_TRACER - " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" + " set_ftrace_pid\t- Write pid(s) to only function trace those pids\n" + "\t\t (function)\n" #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" #endif #ifdef CONFIG_TRACER_SNAPSHOT - "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" - "\t\t\t Read the contents for more information\n" + "\n snapshot\t\t- Like 'trace' but shows the content of the static\n" + "\t\t\t snapshot buffer. Read the contents for more\n" + "\t\t\t information\n" #endif #ifdef CONFIG_STACK_TRACER " stack_trace\t\t- Shows the max stack trace when active\n" " stack_max_size\t- Shows current max stack size that was traced\n" - "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" + "\t\t\t Write into this file to reset the max size (trigger a\n" + "\t\t\t new trace)\n" #ifdef CONFIG_DYNAMIC_FTRACE - " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" + " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace\n" + "\t\t\t traces\n" #endif #endif /* CONFIG_STACK_TRACER */ + " events/\t\t- Directory containing all trace event subsystems:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all events\n" + " events/<system>/\t- Directory containing all trace events for <system>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of all <system>\n" + "\t\t\t events\n" + " filter\t\t- If set, only events passing filter are traced\n" + " events/<system>/<event>/\t- Directory containing control files for\n" + "\t\t\t <event>:\n" + " enable\t\t- Write 0/1 to enable/disable tracing of <event>\n" + " filter\t\t- If set, only events passing filter are traced\n" + " trigger\t\t- If set, a command to perform when event is hit\n" + "\t Format: <trigger>[:count][if <filter>]\n" + "\t trigger: traceon, traceoff\n" + "\t enable_event:<system>:<event>\n" + "\t disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + "\t\t stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\t\t snapshot\n" +#endif + "\t example: echo traceoff > events/block/block_unplug/trigger\n" + "\t echo traceoff:3 > events/block/block_unplug/trigger\n" + "\t echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' > \\\n" + "\t events/block/block_unplug/trigger\n" + "\t The first disables tracing every time block_unplug is hit.\n" + "\t The second disables tracing the first 3 times block_unplug is hit.\n" + "\t The third enables the kmalloc event the first 3 times block_unplug\n" + "\t is hit and has value of greater than 1 for the 'nr_rq' event field.\n" + "\t Like function triggers, the counter is only decremented if it\n" + "\t enabled or disabled tracing.\n" + "\t To remove a trigger without a count:\n" + "\t echo '!<trigger> > <system>/<event>/trigger\n" + "\t To remove a trigger with a count:\n" + "\t echo '!<trigger>:0 > <system>/<event>/trigger\n" + "\t Filters can be ignored when removing a trigger.\n" ; static ssize_t @@ -3789,10 +3934,26 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void destroy_trace_option_files(struct trace_option_dentry *topts); -static int tracing_set_tracer(const char *buf) +/* + * Used to clear out the tracer before deletion of an instance. + * Must have trace_types_lock held. + */ +static void tracing_set_nop(struct trace_array *tr) +{ + if (tr->current_trace == &nop_trace) + return; + + tr->current_trace->enabled--; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + tr->current_trace = &nop_trace; +} + +static int tracing_set_tracer(struct trace_array *tr, const char *buf) { static struct trace_option_dentry *topts; - struct trace_array *tr = &global_trace; struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; @@ -3820,9 +3981,15 @@ static int tracing_set_tracer(const char *buf) if (t == tr->current_trace) goto out; + /* Some tracers are only allowed for the top level buffer */ + if (!trace_ok_for_array(t, tr)) { + ret = -EINVAL; + goto out; + } + trace_branch_disable(); - tr->current_trace->enabled = false; + tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); @@ -3845,9 +4012,11 @@ static int tracing_set_tracer(const char *buf) free_snapshot(tr); } #endif - destroy_trace_option_files(topts); - - topts = create_trace_option_files(tr, t); + /* Currently, only the top instance has options */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); + } #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { @@ -3864,7 +4033,7 @@ static int tracing_set_tracer(const char *buf) } tr->current_trace = t; - tr->current_trace->enabled = true; + tr->current_trace->enabled++; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -3876,6 +4045,7 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; @@ -3895,7 +4065,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) buf[i] = 0; - err = tracing_set_tracer(buf); + err = tracing_set_tracer(tr, buf); if (err) return err; @@ -4212,12 +4382,6 @@ out: return sret; } -static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - __free_page(buf->page); -} - static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, unsigned int idx) { @@ -4229,7 +4393,7 @@ static const struct pipe_buf_operations tracing_pipe_buf_ops = { .map = generic_pipe_buf_map, .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, - .release = tracing_pipe_buf_release, + .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -4609,25 +4773,10 @@ static int tracing_clock_show(struct seq_file *m, void *v) return 0; } -static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static int tracing_set_clock(struct trace_array *tr, const char *clockstr) { - struct seq_file *m = filp->private_data; - struct trace_array *tr = m->private; - char buf[64]; - const char *clockstr; int i; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - clockstr = strstrip(buf); - for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { if (strcmp(trace_clocks[i].name, clockstr) == 0) break; @@ -4655,6 +4804,32 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, mutex_unlock(&trace_types_lock); + return 0; +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + const char *clockstr; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + ret = tracing_set_clock(tr, clockstr); + if (ret) + return ret; + *fpos += cnt; return cnt; @@ -4913,7 +5088,7 @@ static const struct file_operations snapshot_fops = { .open = tracing_snapshot_open, .read = seq_read, .write = tracing_snapshot_write, - .llseek = tracing_seek, + .llseek = tracing_lseek, .release = tracing_snapshot_release, }; @@ -5615,7 +5790,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - ret = __set_tracer_option(topt->tr->current_trace, topt->flags, + ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) @@ -5883,6 +6058,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + buf->tr = tr; + buf->buffer = ring_buffer_alloc(size, rb_flags); if (!buf->buffer) return -ENOMEM; @@ -6020,7 +6197,9 @@ static int instance_delete(const char *name) list_del(&tr->list); + tracing_set_nop(tr); event_trace_del_tracer(tr); + ftrace_destroy_function_files(tr); debugfs_remove_recursive(tr->dir); free_percpu(tr->trace_buffer.data); ring_buffer_free(tr->trace_buffer.buffer); @@ -6115,6 +6294,12 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; + trace_create_file("available_tracers", 0444, d_tracer, + tr, &show_traces_fops); + + trace_create_file("current_tracer", 0644, d_tracer, + tr, &set_tracer_fops); + trace_create_file("tracing_cpumask", 0644, d_tracer, tr, &tracing_cpumask_fops); @@ -6145,6 +6330,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + if (ftrace_create_function_files(tr, d_tracer)) + WARN(1, "Could not allocate function filter files"); + #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, tr, &snapshot_fops); @@ -6167,12 +6355,6 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); - trace_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - - trace_create_file("current_tracer", 0644, d_tracer, - &global_trace, &set_tracer_fops); - #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); @@ -6418,11 +6600,16 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); + /* Used for event triggers */ + temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); + if (!temp_buffer) + goto out_free_cpumask; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - goto out_free_cpumask; + goto out_free_temp_buffer; } if (global_trace.buffer_disabled) @@ -6430,6 +6617,13 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); + if (trace_boot_clock) { + ret = tracing_set_clock(&global_trace, trace_boot_clock); + if (ret < 0) + pr_warning("Trace clock %s not defined, going back to default\n", + trace_boot_clock); + } + /* * register_tracer() might reference current_trace, so it * needs to be set before we register anything. This is @@ -6464,6 +6658,8 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_temp_buffer: + ring_buffer_free(temp_buffer); out_free_cpumask: free_percpu(global_trace.trace_buffer.data); #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ea189e027b80..2e29d7ba5a52 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1,3 +1,4 @@ + #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H @@ -12,6 +13,7 @@ #include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/ftrace_event.h> +#include <linux/compiler.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -209,6 +211,11 @@ struct trace_array { struct list_head events; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops *ops; + /* function tracing enabled */ + int function_enabled; +#endif }; enum { @@ -354,14 +361,16 @@ struct tracer { void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ - int (*set_flag)(u32 old_flags, u32 bit, int set); + int (*set_flag)(struct trace_array *tr, + u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ - int (*flag_changed)(struct tracer *tracer, + int (*flag_changed)(struct trace_array *tr, u32 mask, int set); struct tracer *next; struct tracer_flags *flags; + int enabled; bool print_max; - bool enabled; + bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif @@ -587,6 +596,8 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); int is_tracing_stopped(void); +loff_t tracing_lseek(struct file *file, loff_t offset, int whence); + extern cpumask_var_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ @@ -809,13 +820,36 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } extern int ftrace_is_dead(void); +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent); +void ftrace_destroy_function_files(struct trace_array *tr); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } static inline int ftrace_is_dead(void) { return 0; } -#endif +static inline int +ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + return 0; +} +static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +#endif /* CONFIG_FUNCTION_TRACER */ + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent); +void ftrace_destroy_filter_files(struct ftrace_ops *ops); +#else +/* + * The ops parameter passed in is usually undefined. + * This must be a macro. + */ +#define ftrace_create_filter_files(ops, parent) do { } while (0) +#define ftrace_destroy_filter_files(ops) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ int ftrace_event_is_function(struct ftrace_event_call *call); @@ -1020,6 +1054,10 @@ extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); +extern int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp); +extern void free_event_filter(struct event_filter *filter); struct ftrace_event_field * trace_find_event_field(struct ftrace_event_call *call, char *name); @@ -1028,9 +1066,195 @@ extern void trace_event_enable_cmd_record(bool enable); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); +extern struct ftrace_event_file *find_event_file(struct trace_array *tr, + const char *system, + const char *event); + +static inline void *event_file_data(struct file *filp) +{ + return ACCESS_ONCE(file_inode(filp)->i_private); +} + extern struct mutex event_mutex; extern struct list_head ftrace_events; +extern const struct file_operations event_trigger_fops; + +extern int register_trigger_cmds(void); +extern void clear_event_triggers(struct trace_array *tr); + +struct event_trigger_data { + unsigned long count; + int ref; + struct event_trigger_ops *ops; + struct event_command *cmd_ops; + struct event_filter __rcu *filter; + char *filter_str; + void *private_data; + struct list_head list; +}; + +/** + * struct event_trigger_ops - callbacks for trace event triggers + * + * The methods in this structure provide per-event trigger hooks for + * various trigger operations. + * + * All the methods below, except for @init() and @free(), must be + * implemented. + * + * @func: The trigger 'probe' function called when the triggering + * event occurs. The data passed into this callback is the data + * that was supplied to the event_command @reg() function that + * registered the trigger (see struct event_command). + * + * @init: An optional initialization function called for the trigger + * when the trigger is registered (via the event_command reg() + * function). This can be used to perform per-trigger + * initialization such as incrementing a per-trigger reference + * count, for instance. This is usually implemented by the + * generic utility function @event_trigger_init() (see + * trace_event_triggers.c). + * + * @free: An optional de-initialization function called for the + * trigger when the trigger is unregistered (via the + * event_command @reg() function). This can be used to perform + * per-trigger de-initialization such as decrementing a + * per-trigger reference count and freeing corresponding trigger + * data, for instance. This is usually implemented by the + * generic utility function @event_trigger_free() (see + * trace_event_triggers.c). + * + * @print: The callback function invoked to have the trigger print + * itself. This is usually implemented by a wrapper function + * that calls the generic utility function @event_trigger_print() + * (see trace_event_triggers.c). + */ +struct event_trigger_ops { + void (*func)(struct event_trigger_data *data); + int (*init)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + void (*free)(struct event_trigger_ops *ops, + struct event_trigger_data *data); + int (*print)(struct seq_file *m, + struct event_trigger_ops *ops, + struct event_trigger_data *data); +}; + +/** + * struct event_command - callbacks and data members for event commands + * + * Event commands are invoked by users by writing the command name + * into the 'trigger' file associated with a trace event. The + * parameters associated with a specific invocation of an event + * command are used to create an event trigger instance, which is + * added to the list of trigger instances associated with that trace + * event. When the event is hit, the set of triggers associated with + * that event is invoked. + * + * The data members in this structure provide per-event command data + * for various event commands. + * + * All the data members below, except for @post_trigger, must be set + * for each event command. + * + * @name: The unique name that identifies the event command. This is + * the name used when setting triggers via trigger files. + * + * @trigger_type: A unique id that identifies the event command + * 'type'. This value has two purposes, the first to ensure that + * only one trigger of the same type can be set at a given time + * for a particular event e.g. it doesn't make sense to have both + * a traceon and traceoff trigger attached to a single event at + * the same time, so traceon and traceoff have the same type + * though they have different names. The @trigger_type value is + * also used as a bit value for deferring the actual trigger + * action until after the current event is finished. Some + * commands need to do this if they themselves log to the trace + * buffer (see the @post_trigger() member below). @trigger_type + * values are defined by adding new values to the trigger_type + * enum in include/linux/ftrace_event.h. + * + * @post_trigger: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * All the methods below, except for @set_filter(), must be + * implemented. + * + * @func: The callback function responsible for parsing and + * registering the trigger written to the 'trigger' file by the + * user. It allocates the trigger instance and registers it with + * the appropriate trace event. It makes use of the other + * event_command callback functions to orchestrate this, and is + * usually implemented by the generic utility function + * @event_trigger_callback() (see trace_event_triggers.c). + * + * @reg: Adds the trigger to the list of triggers associated with the + * event, and enables the event trigger itself, after + * initializing it (via the event_trigger_ops @init() function). + * This is also where commands can use the @trigger_type value to + * make the decision as to whether or not multiple instances of + * the trigger should be allowed. This is usually implemented by + * the generic utility function @register_trigger() (see + * trace_event_triggers.c). + * + * @unreg: Removes the trigger from the list of triggers associated + * with the event, and disables the event trigger itself, after + * initializing it (via the event_trigger_ops @free() function). + * This is usually implemented by the generic utility function + * @unregister_trigger() (see trace_event_triggers.c). + * + * @set_filter: An optional function called to parse and set a filter + * for the trigger. If no @set_filter() method is set for the + * event command, filters set by the user for the command will be + * ignored. This is usually implemented by the generic utility + * function @set_trigger_filter() (see trace_event_triggers.c). + * + * @get_trigger_ops: The callback function invoked to retrieve the + * event_trigger_ops implementation associated with the command. + */ +struct event_command { + struct list_head list; + char *name; + enum event_trigger_type trigger_type; + bool post_trigger; + int (*func)(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *params); + int (*reg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + void (*unreg)(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file); + int (*set_filter)(char *filter_str, + struct event_trigger_data *data, + struct ftrace_event_file *file); + struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); +}; + +extern int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable); +extern int tracing_alloc_snapshot(void); + extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; @@ -1056,7 +1280,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ - __attribute__((__aligned__(4))) event_##call; + __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e854f420e033..c894614de14d 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -31,9 +31,25 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, } /* The ftrace function trace is allowed only for root. */ - if (ftrace_event_is_function(tp_event) && - perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if (ftrace_event_is_function(tp_event)) { + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * We don't allow user space callchains for function trace + * event, due to issues with page faults while tracing page + * fault handler and its overall trickiness nature. + */ + if (!p_event->attr.exclude_callchain_user) + return -EINVAL; + + /* + * Same reason to disable user stack dump as for user space + * callchains above. + */ + if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) + return -EINVAL; + } /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a11800ae96de..83a4378dc5e0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); @@ -194,6 +188,36 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, + struct ftrace_event_file *ftrace_file, + unsigned long len) +{ + struct ftrace_event_call *event_call = ftrace_file->event_call; + + local_save_flags(fbuffer->flags); + fbuffer->pc = preempt_count(); + fbuffer->ftrace_file = ftrace_file; + + fbuffer->event = + trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file, + event_call->event.type, len, + fbuffer->flags, fbuffer->pc); + if (!fbuffer->event) + return NULL; + + fbuffer->entry = ring_buffer_event_data(fbuffer->event); + return fbuffer->entry; +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) +{ + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit); + int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type, void *data) { @@ -342,6 +366,12 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file, return ret; } +int trace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) +{ + return __ftrace_event_enable_disable(file, enable, soft_disable); +} + static int ftrace_event_enable_disable(struct ftrace_event_file *file, int enable) { @@ -421,11 +451,6 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) } } -static void *event_file_data(struct file *filp) -{ - return ACCESS_ONCE(file_inode(filp)->i_private); -} - static void remove_event_file_dir(struct ftrace_event_file *file) { struct dentry *dir = file->dir; @@ -1549,6 +1574,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); + trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); @@ -1645,6 +1673,8 @@ trace_create_new_event(struct ftrace_event_call *call, file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); + atomic_set(&file->tm_ref, 0); + INIT_LIST_HEAD(&file->triggers); list_add(&file->list, &tr->events); return file; @@ -1771,6 +1801,16 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_event_call **call, **start, **end; + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1849,20 +1889,7 @@ __trace_add_event_dirs(struct trace_array *tr) } } -#ifdef CONFIG_DYNAMIC_FTRACE - -/* Avoid typos */ -#define ENABLE_EVENT_STR "enable_event" -#define DISABLE_EVENT_STR "disable_event" - -struct event_probe_data { - struct ftrace_event_file *file; - unsigned long count; - int ref; - bool enable; -}; - -static struct ftrace_event_file * +struct ftrace_event_file * find_event_file(struct trace_array *tr, const char *system, const char *event) { struct ftrace_event_file *file; @@ -1885,6 +1912,19 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) return NULL; } +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct event_probe_data { + struct ftrace_event_file *file; + unsigned long count; + int ref; + bool enable; +}; + static void event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) { @@ -2311,6 +2351,9 @@ int event_trace_del_tracer(struct trace_array *tr) { mutex_lock(&event_mutex); + /* Disable any event triggers and associated soft-disabled events */ + clear_event_triggers(tr); + /* Disable any running events */ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); @@ -2377,6 +2420,8 @@ static __init int event_trace_enable(void) register_event_cmds(); + register_trigger_cmds(); + return 0; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2468f56dc5db..8a8631926a07 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -799,6 +799,11 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +void free_event_filter(struct event_filter *filter) +{ + __free_filter(filter); +} + void destroy_call_preds(struct ftrace_event_call *call) { __free_filter(call->filter); @@ -1938,6 +1943,13 @@ static int create_filter(struct ftrace_event_call *call, return err; } +int create_event_filter(struct ftrace_event_call *call, + char *filter_str, bool set_str, + struct event_filter **filterp) +{ + return create_filter(call, filter_str, set_str, filterp); +} + /** * create_system_filter - create a filter for an event_subsystem * @system: event_subsystem to create a filter for diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c new file mode 100644 index 000000000000..8efbb69b04f0 --- /dev/null +++ b/kernel/trace/trace_events_trigger.c @@ -0,0 +1,1437 @@ +/* + * trace_events_trigger - trace event triggers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> + */ + +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/mutex.h> +#include <linux/slab.h> + +#include "trace.h" + +static LIST_HEAD(trigger_commands); +static DEFINE_MUTEX(trigger_cmd_mutex); + +static void +trigger_data_free(struct event_trigger_data *data) +{ + if (data->cmd_ops->set_filter) + data->cmd_ops->set_filter(NULL, data, NULL); + + synchronize_sched(); /* make sure current triggers exit before free */ + kfree(data); +} + +/** + * event_triggers_call - Call triggers associated with a trace event + * @file: The ftrace_event_file associated with the event + * @rec: The trace entry for the event, NULL for unconditional invocation + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command. If rec is + * non-NULL, it means that the trigger requires further processing and + * shouldn't be unconditionally invoked. If rec is non-NULL and the + * trigger has a filter associated with it, rec will checked against + * the filter and if the record matches the trigger will be invoked. + * If the trigger is a 'post_trigger', meaning it shouldn't be invoked + * in any case until the current event is written, the trigger + * function isn't invoked but the bit associated with the deferred + * trigger is set in the return value. + * + * Returns an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + * + * Return: an enum event_trigger_type value containing a set bit for + * any trigger that should be deferred, ETT_NONE if nothing to defer. + */ +enum event_trigger_type +event_triggers_call(struct ftrace_event_file *file, void *rec) +{ + struct event_trigger_data *data; + enum event_trigger_type tt = ETT_NONE; + struct event_filter *filter; + + if (list_empty(&file->triggers)) + return tt; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (!rec) { + data->ops->func(data); + continue; + } + filter = rcu_dereference(data->filter); + if (filter && !filter_match_preds(filter, rec)) + continue; + if (data->cmd_ops->post_trigger) { + tt |= data->cmd_ops->trigger_type; + continue; + } + data->ops->func(data); + } + return tt; +} +EXPORT_SYMBOL_GPL(event_triggers_call); + +/** + * event_triggers_post_call - Call 'post_triggers' for a trace event + * @file: The ftrace_event_file associated with the event + * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * + * For each trigger associated with an event, invoke the trigger + * function registered with the associated trigger command, if the + * corresponding bit is set in the tt enum passed into this function. + * See @event_triggers_call for details on how those bits are set. + * + * Called from tracepoint handlers (with rcu_read_lock_sched() held). + */ +void +event_triggers_post_call(struct ftrace_event_file *file, + enum event_trigger_type tt) +{ + struct event_trigger_data *data; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type & tt) + data->ops->func(data); + } +} +EXPORT_SYMBOL_GPL(event_triggers_post_call); + +#define SHOW_AVAILABLE_TRIGGERS (void *)(1UL) + +static void *trigger_next(struct seq_file *m, void *t, loff_t *pos) +{ + struct ftrace_event_file *event_file = event_file_data(m->private); + + if (t == SHOW_AVAILABLE_TRIGGERS) + return NULL; + + return seq_list_next(t, &event_file->triggers, pos); +} + +static void *trigger_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_event_file *event_file; + + /* ->stop() is called even if ->start() fails */ + mutex_lock(&event_mutex); + event_file = event_file_data(m->private); + if (unlikely(!event_file)) + return ERR_PTR(-ENODEV); + + if (list_empty(&event_file->triggers)) + return *pos == 0 ? SHOW_AVAILABLE_TRIGGERS : NULL; + + return seq_list_start(&event_file->triggers, *pos); +} + +static void trigger_stop(struct seq_file *m, void *t) +{ + mutex_unlock(&event_mutex); +} + +static int trigger_show(struct seq_file *m, void *v) +{ + struct event_trigger_data *data; + struct event_command *p; + + if (v == SHOW_AVAILABLE_TRIGGERS) { + seq_puts(m, "# Available triggers:\n"); + seq_putc(m, '#'); + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_reverse(p, &trigger_commands, list) + seq_printf(m, " %s", p->name); + seq_putc(m, '\n'); + mutex_unlock(&trigger_cmd_mutex); + return 0; + } + + data = list_entry(v, struct event_trigger_data, list); + data->ops->print(m, data->ops, data); + + return 0; +} + +static const struct seq_operations event_triggers_seq_ops = { + .start = trigger_start, + .next = trigger_next, + .stop = trigger_stop, + .show = trigger_show, +}; + +static int event_trigger_regex_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&event_mutex); + + if (unlikely(!event_file_data(file))) { + mutex_unlock(&event_mutex); + return -ENODEV; + } + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &event_triggers_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = file; + } + } + + mutex_unlock(&event_mutex); + + return ret; +} + +static int trigger_process_regex(struct ftrace_event_file *file, char *buff) +{ + char *command, *next = buff; + struct event_command *p; + int ret = -EINVAL; + + command = strsep(&next, ": \t"); + command = (command[0] != '!') ? command : command + 1; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(p, file, buff, command, next); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +static ssize_t event_trigger_regex_write(struct file *file, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ftrace_event_file *event_file; + ssize_t ret; + char *buf; + + if (!cnt) + return 0; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long)buf); + return -EFAULT; + } + buf[cnt] = '\0'; + strim(buf); + + mutex_lock(&event_mutex); + event_file = event_file_data(file); + if (unlikely(!event_file)) { + mutex_unlock(&event_mutex); + free_page((unsigned long)buf); + return -ENODEV; + } + ret = trigger_process_regex(event_file, buf); + mutex_unlock(&event_mutex); + + free_page((unsigned long)buf); + if (ret < 0) + goto out; + + *ppos += cnt; + ret = cnt; + out: + return ret; +} + +static int event_trigger_regex_release(struct inode *inode, struct file *file) +{ + mutex_lock(&event_mutex); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + mutex_unlock(&event_mutex); + + return 0; +} + +static ssize_t +event_trigger_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return event_trigger_regex_write(filp, ubuf, cnt, ppos); +} + +static int +event_trigger_open(struct inode *inode, struct file *filp) +{ + return event_trigger_regex_open(inode, filp); +} + +static int +event_trigger_release(struct inode *inode, struct file *file) +{ + return event_trigger_regex_release(inode, file); +} + +const struct file_operations event_trigger_fops = { + .open = event_trigger_open, + .read = seq_read, + .write = event_trigger_write, + .llseek = tracing_lseek, + .release = event_trigger_release, +}; + +/* + * Currently we only register event commands from __init, so mark this + * __init too. + */ +static __init int register_event_command(struct event_command *cmd) +{ + struct event_command *p; + int ret = 0; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry(p, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &trigger_commands); + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/* + * Currently we only unregister event commands from __init, so mark + * this __init too. + */ +static __init int unregister_event_command(struct event_command *cmd) +{ + struct event_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&trigger_cmd_mutex); + list_for_each_entry_safe(p, n, &trigger_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&trigger_cmd_mutex); + + return ret; +} + +/** + * event_trigger_print - Generic event_trigger_ops @print implementation + * @name: The name of the event trigger + * @m: The seq_file being printed to + * @data: Trigger-specific data + * @filter_str: filter_str to print, if present + * + * Common implementation for event triggers to print themselves. + * + * Usually wrapped by a function that simply sets the @name of the + * trigger command and then invokes this. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_print(const char *name, struct seq_file *m, + void *data, char *filter_str) +{ + long count = (long)data; + + seq_printf(m, "%s", name); + + if (count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", count); + + if (filter_str) + seq_printf(m, " if %s\n", filter_str); + else + seq_puts(m, "\n"); + + return 0; +} + +/** + * event_trigger_init - Generic event_trigger_ops @init implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger initialization. + * + * Usually used directly as the @init method in event trigger + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + data->ref++; + return 0; +} + +/** + * event_trigger_free - Generic event_trigger_ops @free implementation + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data + * + * Common implementation of event trigger de-initialization. + * + * Usually used directly as the @free method in event trigger + * implementations. + */ +static void +event_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) + trigger_data_free(data); +} + +static int trace_event_trigger_enable_disable(struct ftrace_event_file *file, + int trigger_enable) +{ + int ret = 0; + + if (trigger_enable) { + if (atomic_inc_return(&file->tm_ref) > 1) + return ret; + set_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 1, 1); + } else { + if (atomic_dec_return(&file->tm_ref) > 0) + return ret; + clear_bit(FTRACE_EVENT_FL_TRIGGER_MODE_BIT, &file->flags); + ret = trace_event_enable_disable(file, 0, 1); + } + + return ret; +} + +/** + * clear_event_triggers - Clear all triggers associated with a trace array + * @tr: The trace array to clear + * + * For each trigger, the triggering event has its tm_ref decremented + * via trace_event_trigger_enable_disable(), and any associated event + * (in the case of enable/disable_event triggers) will have its sm_ref + * decremented via free()->trace_event_enable_disable(). That + * combination effectively reverses the soft-mode/trigger state added + * by trigger registration. + * + * Must be called with event_mutex held. + */ +void +clear_event_triggers(struct trace_array *tr) +{ + struct ftrace_event_file *file; + + list_for_each_entry(file, &tr->events, list) { + struct event_trigger_data *data; + list_for_each_entry_rcu(data, &file->triggers, list) { + trace_event_trigger_enable_disable(file, 0); + if (data->ops->free) + data->ops->free(data->ops, data); + } + } +} + +/** + * update_cond_flag - Set or reset the TRIGGER_COND bit + * @file: The ftrace_event_file associated with the event + * + * If an event has triggers and any of those triggers has a filter or + * a post_trigger, trigger invocation needs to be deferred until after + * the current event has logged its data, and the event should have + * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be + * cleared. + */ +static void update_cond_flag(struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool set_cond = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->filter || data->cmd_ops->post_trigger) { + set_cond = true; + break; + } + } + + if (set_cond) + set_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); + else + clear_bit(FTRACE_EVENT_FL_TRIGGER_COND_BIT, &file->flags); +} + +/** + * register_trigger - Generic event_command @reg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @data: Trigger-specific data to associate with the trigger + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger registration. + * + * Usually used directly as the @reg method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int register_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @ops: The trigger ops associated with the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. + */ +static void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +/** + * event_trigger_callback - Generic event_command @func implementation + * @cmd_ops: The command ops, used for trigger registration + * @file: The ftrace_event_file associated with the event + * @glob: The raw string used to register the trigger + * @cmd: The cmd portion of the string used to register the trigger + * @param: The params portion of the string used to register the trigger + * + * Common implementation for event command parsing and trigger + * instantiation. + * + * Usually used directly as the @func method in event command + * implementations. + * + * Return: 0 on success, errno otherwise + */ +static int +event_trigger_callback(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + char *trigger = NULL; + char *number; + int ret; + + /* separate the trigger from the filter (t:n [if filter]) */ + if (param && isdigit(param[0])) + trigger = strsep(¶m, " \t"); + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_free; + } else if (ret < 0) + goto out_free; + ret = 0; + out: + return ret; + + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + goto out; +} + +/** + * set_trigger_filter - Generic event_command @set_filter implementation + * @filter_str: The filter string for the trigger, NULL to remove filter + * @trigger_data: Trigger-specific data + * @file: The ftrace_event_file associated with the event + * + * Common implementation for event command filter parsing and filter + * instantiation. + * + * Usually used directly as the @set_filter method in event command + * implementations. + * + * Also used to remove a filter (if filter_str = NULL). + * + * Return: 0 on success, errno otherwise + */ +static int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct ftrace_event_file *file) +{ + struct event_trigger_data *data = trigger_data; + struct event_filter *filter = NULL, *tmp; + int ret = -EINVAL; + char *s; + + if (!filter_str) /* clear the current filter */ + goto assign; + + s = strsep(&filter_str, " \t"); + + if (!strlen(s) || strcmp(s, "if") != 0) + goto out; + + if (!filter_str) + goto out; + + /* The filter is for the 'trigger' event, not the triggered event */ + ret = create_event_filter(file->event_call, filter_str, false, &filter); + if (ret) + goto out; + assign: + tmp = rcu_access_pointer(data->filter); + + rcu_assign_pointer(data->filter, filter); + + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + free_event_filter(tmp); + } + + kfree(data->filter_str); + data->filter_str = NULL; + + if (filter_str) { + data->filter_str = kstrdup(filter_str, GFP_KERNEL); + if (!data->filter_str) { + free_event_filter(rcu_access_pointer(data->filter)); + data->filter = NULL; + ret = -ENOMEM; + } + } + out: + return ret; +} + +static void +traceon_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + tracing_on(); +} + +static void +traceon_count_trigger(struct event_trigger_data *data) +{ + if (tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_on(); +} + +static void +traceoff_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + tracing_off(); +} + +static void +traceoff_count_trigger(struct event_trigger_data *data) +{ + if (!tracing_is_on()) + return; + + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + tracing_off(); +} + +static int +traceon_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceon", m, (void *)data->count, + data->filter_str); +} + +static int +traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("traceoff", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops traceon_trigger_ops = { + .func = traceon_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceon_count_trigger_ops = { + .func = traceon_count_trigger, + .print = traceon_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_trigger_ops = { + .func = traceoff_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops traceoff_count_trigger_ops = { + .func = traceoff_count_trigger, + .print = traceoff_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +onoff_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_trigger_ops : + &traceon_trigger_ops; + else + ops = param ? &traceoff_count_trigger_ops : + &traceoff_trigger_ops; + + return ops; +} + +static struct event_command trigger_traceon_cmd = { + .name = "traceon", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_traceoff_cmd = { + .name = "traceoff", + .trigger_type = ETT_TRACE_ONOFF, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = onoff_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +#ifdef CONFIG_TRACER_SNAPSHOT +static void +snapshot_trigger(struct event_trigger_data *data) +{ + tracing_snapshot(); +} + +static void +snapshot_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + snapshot_trigger(data); +} + +static int +register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + int ret = register_trigger(glob, ops, data, file); + + if (ret > 0 && tracing_alloc_snapshot() != 0) { + unregister_trigger(glob, ops, data, file); + ret = 0; + } + + return ret; +} + +static int +snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("snapshot", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops snapshot_trigger_ops = { + .func = snapshot_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops snapshot_count_trigger_ops = { + .func = snapshot_count_trigger, + .print = snapshot_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +snapshot_get_trigger_ops(char *cmd, char *param) +{ + return param ? &snapshot_count_trigger_ops : &snapshot_trigger_ops; +} + +static struct event_command trigger_snapshot_cmd = { + .name = "snapshot", + .trigger_type = ETT_SNAPSHOT, + .func = event_trigger_callback, + .reg = register_snapshot_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = snapshot_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_snapshot_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_snapshot_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_snapshot_cmd(void) { return 0; } +#endif /* CONFIG_TRACER_SNAPSHOT */ + +#ifdef CONFIG_STACKTRACE +/* + * Skip 3: + * stacktrace_trigger() + * event_triggers_post_call() + * ftrace_raw_event_xxx() + */ +#define STACK_SKIP 3 + +static void +stacktrace_trigger(struct event_trigger_data *data) +{ + trace_dump_stack(STACK_SKIP); +} + +static void +stacktrace_count_trigger(struct event_trigger_data *data) +{ + if (!data->count) + return; + + if (data->count != -1) + (data->count)--; + + stacktrace_trigger(data); +} + +static int +stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + return event_trigger_print("stacktrace", m, (void *)data->count, + data->filter_str); +} + +static struct event_trigger_ops stacktrace_trigger_ops = { + .func = stacktrace_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops stacktrace_count_trigger_ops = { + .func = stacktrace_count_trigger, + .print = stacktrace_trigger_print, + .init = event_trigger_init, + .free = event_trigger_free, +}; + +static struct event_trigger_ops * +stacktrace_get_trigger_ops(char *cmd, char *param) +{ + return param ? &stacktrace_count_trigger_ops : &stacktrace_trigger_ops; +} + +static struct event_command trigger_stacktrace_cmd = { + .name = "stacktrace", + .trigger_type = ETT_STACKTRACE, + .post_trigger = true, + .func = event_trigger_callback, + .reg = register_trigger, + .unreg = unregister_trigger, + .get_trigger_ops = stacktrace_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init int register_trigger_stacktrace_cmd(void) +{ + int ret; + + ret = register_event_command(&trigger_stacktrace_cmd); + WARN_ON(ret < 0); + + return ret; +} +#else +static __init int register_trigger_stacktrace_cmd(void) { return 0; } +#endif /* CONFIG_STACKTRACE */ + +static __init void unregister_trigger_traceon_traceoff_cmds(void) +{ + unregister_event_command(&trigger_traceon_cmd); + unregister_event_command(&trigger_traceoff_cmd); +} + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct enable_trigger_data { + struct ftrace_event_file *file; + bool enable; +}; + +static void +event_enable_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (enable_data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &enable_data->file->flags); +} + +static void +event_enable_count_trigger(struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (enable_data->enable == !(enable_data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_trigger(data); +} + +static int +event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + seq_printf(m, "%s:%s:%s", + enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + enable_data->file->event_call->class->system, + enable_data->file->event_call->name); + + if (data->count == -1) + seq_puts(m, ":unlimited"); + else + seq_printf(m, ":count=%ld", data->count); + + if (data->filter_str) + seq_printf(m, " if %s\n", data->filter_str); + else + seq_puts(m, "\n"); + + return 0; +} + +static void +event_enable_trigger_free(struct event_trigger_ops *ops, + struct event_trigger_data *data) +{ + struct enable_trigger_data *enable_data = data->private_data; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + trace_event_enable_disable(enable_data->file, 0, 1); + module_put(enable_data->file->event_call->mod); + trigger_data_free(data); + kfree(enable_data); + } +} + +static struct event_trigger_ops event_enable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_enable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_trigger_ops = { + .func = event_enable_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static struct event_trigger_ops event_disable_count_trigger_ops = { + .func = event_enable_count_trigger, + .print = event_enable_trigger_print, + .init = event_trigger_init, + .free = event_enable_trigger_free, +}; + +static int +event_enable_trigger_func(struct event_command *cmd_ops, + struct ftrace_event_file *file, + char *glob, char *cmd, char *param) +{ + struct ftrace_event_file *event_enable_file; + struct enable_trigger_data *enable_data; + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + struct trace_array *tr = file->tr; + const char *system; + const char *event; + char *trigger; + char *number; + bool enable; + int ret; + + if (!param) + return -EINVAL; + + /* separate the trigger from the filter (s:e:n [if filter]) */ + trigger = strsep(¶m, " \t"); + if (!trigger) + return -EINVAL; + + system = strsep(&trigger, ":"); + if (!trigger) + return -EINVAL; + + event = strsep(&trigger, ":"); + + ret = -EINVAL; + event_enable_file = find_event_file(tr, system, event); + if (!event_enable_file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); + + ret = -ENOMEM; + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + goto out; + + enable_data = kzalloc(sizeof(*enable_data), GFP_KERNEL); + if (!enable_data) { + kfree(trigger_data); + goto out; + } + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + INIT_LIST_HEAD(&trigger_data->list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + enable_data->enable = enable; + enable_data->file = event_enable_file; + trigger_data->private_data = enable_data; + + if (glob[0] == '!') { + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + kfree(trigger_data); + kfree(enable_data); + ret = 0; + goto out; + } + + if (trigger) { + number = strsep(&trigger, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + if (ret) + goto out_free; + } + + if (!param) /* if param is non-empty, it's supposed to be a filter */ + goto out_reg; + + if (!cmd_ops->set_filter) + goto out_reg; + + ret = cmd_ops->set_filter(param, trigger_data, file); + if (ret < 0) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(event_enable_file->event_call->mod); + if (!ret) { + ret = -EBUSY; + goto out_free; + } + + ret = trace_event_enable_disable(event_enable_file, 1, 1); + if (ret < 0) + goto out_put; + ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + ret = -ENOENT; + goto out_disable; + } else if (ret < 0) + goto out_disable; + /* Just return zero, not the number of enabled functions */ + ret = 0; + out: + return ret; + + out_disable: + trace_event_enable_disable(event_enable_file, 0, 1); + out_put: + module_put(event_enable_file->event_call->mod); + out_free: + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); + kfree(trigger_data); + kfree(enable_data); + goto out; +} + +static int event_enable_register_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *data, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *enable_data = data->private_data; + struct enable_trigger_data *test_enable_data; + struct event_trigger_data *test; + int ret = 0; + + list_for_each_entry_rcu(test, &file->triggers, list) { + test_enable_data = test->private_data; + if (test_enable_data && + (test_enable_data->file == enable_data->file)) { + ret = -EEXIST; + goto out; + } + } + + if (data->ops->init) { + ret = data->ops->init(data->ops, data); + if (ret < 0) + goto out; + } + + list_add_rcu(&data->list, &file->triggers); + ret++; + + if (trace_event_trigger_enable_disable(file, 1) < 0) { + list_del_rcu(&data->list); + ret--; + } + update_cond_flag(file); +out: + return ret; +} + +static void event_enable_unregister_trigger(char *glob, + struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct ftrace_event_file *file) +{ + struct enable_trigger_data *test_enable_data = test->private_data; + struct enable_trigger_data *enable_data; + struct event_trigger_data *data; + bool unregistered = false; + + list_for_each_entry_rcu(data, &file->triggers, list) { + enable_data = data->private_data; + if (enable_data && + (enable_data->file == test_enable_data->file)) { + unregistered = true; + list_del_rcu(&data->list); + update_cond_flag(file); + trace_event_trigger_enable_disable(file, 0); + break; + } + } + + if (unregistered && data->ops->free) + data->ops->free(data->ops, data); +} + +static struct event_trigger_ops * +event_enable_get_trigger_ops(char *cmd, char *param) +{ + struct event_trigger_ops *ops; + bool enable; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_trigger_ops : + &event_enable_trigger_ops; + else + ops = param ? &event_disable_count_trigger_ops : + &event_disable_trigger_ops; + + return ops; +} + +static struct event_command trigger_enable_cmd = { + .name = ENABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static struct event_command trigger_disable_cmd = { + .name = DISABLE_EVENT_STR, + .trigger_type = ETT_EVENT_ENABLE, + .func = event_enable_trigger_func, + .reg = event_enable_register_trigger, + .unreg = event_enable_unregister_trigger, + .get_trigger_ops = event_enable_get_trigger_ops, + .set_filter = set_trigger_filter, +}; + +static __init void unregister_trigger_enable_disable_cmds(void) +{ + unregister_event_command(&trigger_enable_cmd); + unregister_event_command(&trigger_disable_cmd); +} + +static __init int register_trigger_enable_disable_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_enable_disable_cmds(); + + return ret; +} + +static __init int register_trigger_traceon_traceoff_cmds(void) +{ + int ret; + + ret = register_event_command(&trigger_traceon_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_event_command(&trigger_traceoff_cmd); + if (WARN_ON(ret < 0)) + unregister_trigger_traceon_traceoff_cmds(); + + return ret; +} + +__init int register_trigger_cmds(void) +{ + register_trigger_traceon_traceoff_cmds(); + register_trigger_snapshot_cmd(); + register_trigger_stacktrace_cmd(); + register_trigger_enable_disable_cmds(); + + return 0; +} diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e72e2b6..ee0a5098ac43 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ do { \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 38fe1483c508..5b781d2be383 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,32 +13,106 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/slab.h> #include <linux/fs.h> #include "trace.h" -/* function tracing enabled */ -static int ftrace_function_enabled; +static void tracing_start_function_trace(struct trace_array *tr); +static void tracing_stop_function_trace(struct trace_array *tr); +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static struct ftrace_ops trace_ops; +static struct ftrace_ops trace_stack_ops; +static struct tracer_flags func_flags; + +/* Our option */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static int allocate_ftrace_ops(struct trace_array *tr) +{ + struct ftrace_ops *ops; + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; -static struct trace_array *func_trace; + /* Currently only the non stack verision is supported */ + ops->func = function_trace_call; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + + tr->ops = ops; + ops->private = tr; + return 0; +} + + +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + int ret; + + /* The top level array uses the "global_ops". */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { + ret = allocate_ftrace_ops(tr); + if (ret) + return ret; + } + + ftrace_create_filter_files(tr->ops, parent); + + return 0; +} -static void tracing_start_function_trace(void); -static void tracing_stop_function_trace(void); +void ftrace_destroy_function_files(struct trace_array *tr) +{ + ftrace_destroy_filter_files(tr->ops); + kfree(tr->ops); + tr->ops = NULL; +} static int function_trace_init(struct trace_array *tr) { - func_trace = tr; + struct ftrace_ops *ops; + + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + /* There's only one global tr */ + if (!trace_ops.private) { + trace_ops.private = tr; + trace_stack_ops.private = tr; + } + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + ops = &trace_stack_ops; + else + ops = &trace_ops; + tr->ops = ops; + } else if (!tr->ops) { + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. + */ + return -ENOMEM; + } + tr->trace_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); - tracing_start_function_trace(); + tracing_start_function_trace(tr); return 0; } static void function_trace_reset(struct trace_array *tr) { - tracing_stop_function_trace(); + tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); } @@ -47,25 +121,18 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->trace_buffer); } -/* Our option */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - -static struct tracer_flags func_flags; - static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; int bit; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if (unlikely(!tr->function_enabled)) return; pc = preempt_count(); @@ -91,14 +158,14 @@ static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; long disabled; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if (unlikely(!tr->function_enabled)) return; /* @@ -128,7 +195,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } - static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, @@ -153,29 +219,21 @@ static struct tracer_flags func_flags = { .opts = func_opts }; -static void tracing_start_function_trace(void) +static void tracing_start_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - register_ftrace_function(&trace_stack_ops); - else - register_ftrace_function(&trace_ops); - - ftrace_function_enabled = 1; + tr->function_enabled = 0; + register_ftrace_function(tr->ops); + tr->function_enabled = 1; } -static void tracing_stop_function_trace(void) +static void tracing_stop_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - unregister_ftrace_function(&trace_stack_ops); - else - unregister_ftrace_function(&trace_ops); + tr->function_enabled = 0; + unregister_ftrace_function(tr->ops); } -static int func_set_flag(u32 old_flags, u32 bit, int set) +static int +func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { switch (bit) { case TRACE_FUNC_OPT_STACK: @@ -183,12 +241,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) break; + unregister_ftrace_function(tr->ops); + if (set) { - unregister_ftrace_function(&trace_ops); - register_ftrace_function(&trace_stack_ops); + tr->ops = &trace_stack_ops; + register_ftrace_function(tr->ops); } else { - unregister_ftrace_function(&trace_stack_ops); - register_ftrace_function(&trace_ops); + tr->ops = &trace_ops; + register_ftrace_function(tr->ops); } break; @@ -208,6 +268,7 @@ static struct tracer function_trace __tracer_data = .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, + .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0b99120d395c..deff11200261 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1476,7 +1476,8 @@ void graph_trace_close(struct trace_iterator *iter) } } -static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +static int +func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (bit == TRACE_GRAPH_PRINT_IRQS) ftrace_graph_skip_irqs = !set; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2aefbee93a6d..8ff02cbb892f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -160,7 +160,8 @@ static struct ftrace_ops trace_ops __read_mostly = #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { int cpu; @@ -266,7 +267,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return -EINVAL; } @@ -498,14 +500,14 @@ void trace_hardirqs_off(void) } EXPORT_SYMBOL(trace_hardirqs_off); -void trace_hardirqs_on_caller(unsigned long caller_addr) +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void trace_hardirqs_off_caller(unsigned long caller_addr) +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); @@ -570,8 +572,10 @@ static void irqsoff_function_set(int set) unregister_irqsoff_function(is_graph()); } -static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) { + struct tracer *tracer = tr->current_trace; + if (mask & TRACE_ITER_FUNCTION) irqsoff_function_set(set); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index dae9541ada9e..d021d21dd150 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -27,75 +27,54 @@ /** * Kprobe event core functions */ -struct trace_probe { +struct trace_kprobe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ unsigned long nhit; - unsigned int flags; /* For TP_FLAG_* */ const char *symbol; /* symbol name */ - struct ftrace_event_class class; - struct ftrace_event_call call; - struct list_head files; - ssize_t size; /* trace entry size */ - unsigned int nr_args; - struct probe_arg args[]; + struct trace_probe tp; }; -struct event_file_link { - struct ftrace_event_file *file; - struct list_head list; -}; - -#define SIZEOF_TRACE_PROBE(n) \ - (offsetof(struct trace_probe, args) + \ +#define SIZEOF_TRACE_KPROBE(n) \ + (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -static __kprobes bool trace_probe_is_return(struct trace_probe *tp) -{ - return tp->rp.handler != NULL; -} - -static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) -{ - return tp->symbol ? tp->symbol : "unknown"; -} - -static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) +static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) { - return tp->rp.kp.offset; + return tk->rp.handler != NULL; } -static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) +static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) { - return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); + return tk->symbol ? tk->symbol : "unknown"; } -static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) +static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) { - return !!(tp->flags & TP_FLAG_REGISTERED); + return tk->rp.kp.offset; } -static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) +static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) { - return !!(kprobe_gone(&tp->rp.kp)); + return !!(kprobe_gone(&tk->rp.kp)); } -static __kprobes bool trace_probe_within_module(struct trace_probe *tp, - struct module *mod) +static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, + struct module *mod) { int len = strlen(mod->name); - const char *name = trace_probe_symbol(tp); + const char *name = trace_kprobe_symbol(tk); return strncmp(mod->name, name, len) == 0 && name[len] == ':'; } -static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) +static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) { - return !!strchr(trace_probe_symbol(tp), ':'); + return !!strchr(trace_kprobe_symbol(tk), ':'); } -static int register_probe_event(struct trace_probe *tp); -static int unregister_probe_event(struct trace_probe *tp); +static int register_kprobe_event(struct trace_kprobe *tk); +static int unregister_kprobe_event(struct trace_kprobe *tk); static DEFINE_MUTEX(probe_lock); static LIST_HEAD(probe_list); @@ -104,45 +83,224 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); +/* Memory fetching by symbol */ +struct symbol_cache { + char *symbol; + long offset; + unsigned long addr; +}; + +unsigned long update_symbol_cache(struct symbol_cache *sc) +{ + sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); + + if (sc->addr) + sc->addr += sc->offset; + + return sc->addr; +} + +void free_symbol_cache(struct symbol_cache *sc) +{ + kfree(sc->symbol); + kfree(sc); +} + +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) +{ + struct symbol_cache *sc; + + if (!sym || strlen(sym) == 0) + return NULL; + + sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); + if (!sc) + return NULL; + + sc->symbol = kstrdup(sym, GFP_KERNEL); + if (!sc->symbol) { + kfree(sc); + return NULL; + } + sc->offset = offset; + update_symbol_cache(sc); + + return sc; +} + +/* + * Kprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + + if (!maxlen) + return; + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else { + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); + } +} + +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + mm_segment_t old_fs; + int ret, len = 0; + u8 c; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +#define DEFINE_FETCH_symbol(type) \ +__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ +} +DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) + +/* kprobes don't support file_offset fetch methods */ +#define fetch_file_offset_u8 NULL +#define fetch_file_offset_u16 NULL +#define fetch_file_offset_u32 NULL +#define fetch_file_offset_u64 NULL +#define fetch_file_offset_string NULL +#define fetch_file_offset_string_size NULL + +/* Fetch type information table */ +const struct fetch_type kprobes_fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + + ASSIGN_FETCH_TYPE_END +}; + /* * Allocate new trace_probe and initialize it (including kprobes). */ -static struct trace_probe *alloc_trace_probe(const char *group, +static struct trace_kprobe *alloc_trace_kprobe(const char *group, const char *event, void *addr, const char *symbol, unsigned long offs, int nargs, bool is_return) { - struct trace_probe *tp; + struct trace_kprobe *tk; int ret = -ENOMEM; - tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); - if (!tp) + tk = kzalloc(SIZEOF_TRACE_KPROBE(nargs), GFP_KERNEL); + if (!tk) return ERR_PTR(ret); if (symbol) { - tp->symbol = kstrdup(symbol, GFP_KERNEL); - if (!tp->symbol) + tk->symbol = kstrdup(symbol, GFP_KERNEL); + if (!tk->symbol) goto error; - tp->rp.kp.symbol_name = tp->symbol; - tp->rp.kp.offset = offs; + tk->rp.kp.symbol_name = tk->symbol; + tk->rp.kp.offset = offs; } else - tp->rp.kp.addr = addr; + tk->rp.kp.addr = addr; if (is_return) - tp->rp.handler = kretprobe_dispatcher; + tk->rp.handler = kretprobe_dispatcher; else - tp->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.kp.pre_handler = kprobe_dispatcher; if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; } - tp->call.class = &tp->class; - tp->call.name = kstrdup(event, GFP_KERNEL); - if (!tp->call.name) + tk->tp.call.class = &tk->tp.class; + tk->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tk->tp.call.name) goto error; if (!group || !is_good_name(group)) { @@ -150,42 +308,42 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; } - tp->class.system = kstrdup(group, GFP_KERNEL); - if (!tp->class.system) + tk->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tk->tp.class.system) goto error; - INIT_LIST_HEAD(&tp->list); - INIT_LIST_HEAD(&tp->files); - return tp; + INIT_LIST_HEAD(&tk->list); + INIT_LIST_HEAD(&tk->tp.files); + return tk; error: - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); return ERR_PTR(ret); } -static void free_trace_probe(struct trace_probe *tp) +static void free_trace_kprobe(struct trace_kprobe *tk) { int i; - for (i = 0; i < tp->nr_args; i++) - traceprobe_free_probe_arg(&tp->args[i]); + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_free_probe_arg(&tk->tp.args[i]); - kfree(tp->call.class->system); - kfree(tp->call.name); - kfree(tp->symbol); - kfree(tp); + kfree(tk->tp.call.class->system); + kfree(tk->tp.call.name); + kfree(tk->symbol); + kfree(tk); } -static struct trace_probe *find_trace_probe(const char *event, - const char *group) +static struct trace_kprobe *find_trace_kprobe(const char *event, + const char *group) { - struct trace_probe *tp; + struct trace_kprobe *tk; - list_for_each_entry(tp, &probe_list, list) - if (strcmp(tp->call.name, event) == 0 && - strcmp(tp->call.class->system, group) == 0) - return tp; + list_for_each_entry(tk, &probe_list, list) + if (strcmp(tk->tp.call.name, event) == 0 && + strcmp(tk->tp.call.class->system, group) == 0) + return tk; return NULL; } @@ -194,7 +352,7 @@ static struct trace_probe *find_trace_probe(const char *event, * if the file is NULL, enable "perf" handler, or enable "trace" handler. */ static int -enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) +enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) { int ret = 0; @@ -208,47 +366,35 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } link->file = file; - list_add_tail_rcu(&link->list, &tp->files); + list_add_tail_rcu(&link->list, &tk->tp.files); - tp->flags |= TP_FLAG_TRACE; + tk->tp.flags |= TP_FLAG_TRACE; } else - tp->flags |= TP_FLAG_PROFILE; + tk->tp.flags |= TP_FLAG_PROFILE; - if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) { - if (trace_probe_is_return(tp)) - ret = enable_kretprobe(&tp->rp); + if (trace_probe_is_registered(&tk->tp) && !trace_kprobe_has_gone(tk)) { + if (trace_kprobe_is_return(tk)) + ret = enable_kretprobe(&tk->rp); else - ret = enable_kprobe(&tp->rp.kp); + ret = enable_kprobe(&tk->rp.kp); } out: return ret; } -static struct event_file_link * -find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) -{ - struct event_file_link *link; - - list_for_each_entry(link, &tp->files, list) - if (link->file == file) - return link; - - return NULL; -} - /* * Disable trace_probe * if the file is NULL, disable "perf" handler, or disable "trace" handler. */ static int -disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) +disable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) { struct event_file_link *link = NULL; int wait = 0; int ret = 0; if (file) { - link = find_event_file_link(tp, file); + link = find_event_file_link(&tk->tp, file); if (!link) { ret = -EINVAL; goto out; @@ -256,18 +402,18 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) list_del_rcu(&link->list); wait = 1; - if (!list_empty(&tp->files)) + if (!list_empty(&tk->tp.files)) goto out; - tp->flags &= ~TP_FLAG_TRACE; + tk->tp.flags &= ~TP_FLAG_TRACE; } else - tp->flags &= ~TP_FLAG_PROFILE; + tk->tp.flags &= ~TP_FLAG_PROFILE; - if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - disable_kretprobe(&tp->rp); + if (!trace_probe_is_enabled(&tk->tp) && trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + disable_kretprobe(&tk->rp); else - disable_kprobe(&tp->rp.kp); + disable_kprobe(&tk->rp.kp); wait = 1; } out: @@ -288,40 +434,40 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) } /* Internal register function - just handle k*probes and flags */ -static int __register_trace_probe(struct trace_probe *tp) +static int __register_trace_kprobe(struct trace_kprobe *tk) { int i, ret; - if (trace_probe_is_registered(tp)) + if (trace_probe_is_registered(&tk->tp)) return -EINVAL; - for (i = 0; i < tp->nr_args; i++) - traceprobe_update_arg(&tp->args[i]); + for (i = 0; i < tk->tp.nr_args; i++) + traceprobe_update_arg(&tk->tp.args[i]); /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(tp)) - tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; + if (trace_probe_is_enabled(&tk->tp)) + tk->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; else - tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; + tk->rp.kp.flags |= KPROBE_FLAG_DISABLED; - if (trace_probe_is_return(tp)) - ret = register_kretprobe(&tp->rp); + if (trace_kprobe_is_return(tk)) + ret = register_kretprobe(&tk->rp); else - ret = register_kprobe(&tp->rp.kp); + ret = register_kprobe(&tk->rp.kp); if (ret == 0) - tp->flags |= TP_FLAG_REGISTERED; + tk->tp.flags |= TP_FLAG_REGISTERED; else { pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_probe_symbol(tp), trace_probe_offset(tp), ret); - if (ret == -ENOENT && trace_probe_is_on_module(tp)) { + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { pr_warning("This probe might be able to register after" "target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { pr_warning("Probing address(0x%p) is not an " "instruction boundary.\n", - tp->rp.kp.addr); + tk->rp.kp.addr); ret = -EINVAL; } } @@ -330,67 +476,67 @@ static int __register_trace_probe(struct trace_probe *tp) } /* Internal unregister function - just handle k*probes and flags */ -static void __unregister_trace_probe(struct trace_probe *tp) +static void __unregister_trace_kprobe(struct trace_kprobe *tk) { - if (trace_probe_is_registered(tp)) { - if (trace_probe_is_return(tp)) - unregister_kretprobe(&tp->rp); + if (trace_probe_is_registered(&tk->tp)) { + if (trace_kprobe_is_return(tk)) + unregister_kretprobe(&tk->rp); else - unregister_kprobe(&tp->rp.kp); - tp->flags &= ~TP_FLAG_REGISTERED; + unregister_kprobe(&tk->rp.kp); + tk->tp.flags &= ~TP_FLAG_REGISTERED; /* Cleanup kprobe for reuse */ - if (tp->rp.kp.symbol_name) - tp->rp.kp.addr = NULL; + if (tk->rp.kp.symbol_name) + tk->rp.kp.addr = NULL; } } /* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static int unregister_trace_probe(struct trace_probe *tp) +static int unregister_trace_kprobe(struct trace_kprobe *tk) { /* Enabled event can not be unregistered */ - if (trace_probe_is_enabled(tp)) + if (trace_probe_is_enabled(&tk->tp)) return -EBUSY; /* Will fail if probe is being used by ftrace or perf */ - if (unregister_probe_event(tp)) + if (unregister_kprobe_event(tk)) return -EBUSY; - __unregister_trace_probe(tp); - list_del(&tp->list); + __unregister_trace_kprobe(tk); + list_del(&tk->list); return 0; } /* Register a trace_probe and probe_event */ -static int register_trace_probe(struct trace_probe *tp) +static int register_trace_kprobe(struct trace_kprobe *tk) { - struct trace_probe *old_tp; + struct trace_kprobe *old_tk; int ret; mutex_lock(&probe_lock); /* Delete old (same name) event if exist */ - old_tp = find_trace_probe(tp->call.name, tp->call.class->system); - if (old_tp) { - ret = unregister_trace_probe(old_tp); + old_tk = find_trace_kprobe(tk->tp.call.name, tk->tp.call.class->system); + if (old_tk) { + ret = unregister_trace_kprobe(old_tk); if (ret < 0) goto end; - free_trace_probe(old_tp); + free_trace_kprobe(old_tk); } /* Register new event */ - ret = register_probe_event(tp); + ret = register_kprobe_event(tk); if (ret) { pr_warning("Failed to register probe event(%d)\n", ret); goto end; } /* Register k*probe */ - ret = __register_trace_probe(tp); + ret = __register_trace_kprobe(tk); if (ret < 0) - unregister_probe_event(tp); + unregister_kprobe_event(tk); else - list_add_tail(&tp->list, &probe_list); + list_add_tail(&tk->list, &probe_list); end: mutex_unlock(&probe_lock); @@ -398,11 +544,11 @@ end: } /* Module notifier call back, checking event on the module */ -static int trace_probe_module_callback(struct notifier_block *nb, +static int trace_kprobe_module_callback(struct notifier_block *nb, unsigned long val, void *data) { struct module *mod = data; - struct trace_probe *tp; + struct trace_kprobe *tk; int ret; if (val != MODULE_STATE_COMING) @@ -410,15 +556,15 @@ static int trace_probe_module_callback(struct notifier_block *nb, /* Update probes on coming module */ mutex_lock(&probe_lock); - list_for_each_entry(tp, &probe_list, list) { - if (trace_probe_within_module(tp, mod)) { + list_for_each_entry(tk, &probe_list, list) { + if (trace_kprobe_within_module(tk, mod)) { /* Don't need to check busy - this should have gone. */ - __unregister_trace_probe(tp); - ret = __register_trace_probe(tp); + __unregister_trace_kprobe(tk); + ret = __register_trace_kprobe(tk); if (ret) pr_warning("Failed to re-register probe %s on" "%s: %d\n", - tp->call.name, mod->name, ret); + tk->tp.call.name, mod->name, ret); } } mutex_unlock(&probe_lock); @@ -426,12 +572,12 @@ static int trace_probe_module_callback(struct notifier_block *nb, return NOTIFY_DONE; } -static struct notifier_block trace_probe_module_nb = { - .notifier_call = trace_probe_module_callback, +static struct notifier_block trace_kprobe_module_nb = { + .notifier_call = trace_kprobe_module_callback, .priority = 1 /* Invoked after kprobe module callback */ }; -static int create_trace_probe(int argc, char **argv) +static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: @@ -451,7 +597,7 @@ static int create_trace_probe(int argc, char **argv) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_probe *tp; + struct trace_kprobe *tk; int i, ret = 0; bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; @@ -498,16 +644,16 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } mutex_lock(&probe_lock); - tp = find_trace_probe(event, group); - if (!tp) { + tk = find_trace_kprobe(event, group); + if (!tk) { mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); return -ENOENT; } /* delete an event */ - ret = unregister_trace_probe(tp); + ret = unregister_trace_kprobe(tk); if (ret == 0) - free_trace_probe(tp); + free_trace_kprobe(tk); mutex_unlock(&probe_lock); return ret; } @@ -554,47 +700,49 @@ static int create_trace_probe(int argc, char **argv) is_return ? 'r' : 'p', addr); event = buf; } - tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, + tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, is_return); - if (IS_ERR(tp)) { + if (IS_ERR(tk)) { pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tp)); - return PTR_ERR(tp); + (int)PTR_ERR(tk)); + return PTR_ERR(tk); } /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + /* Increment count for freeing args in error case */ - tp->nr_args++; + tk->tp.nr_args++; /* Parse argument name */ arg = strchr(argv[i], '='); if (arg) { *arg++ = '\0'; - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + parg->name = kstrdup(argv[i], GFP_KERNEL); } else { arg = argv[i]; /* If argument name is omitted, set "argN" */ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - tp->args[i].name = kstrdup(buf, GFP_KERNEL); + parg->name = kstrdup(buf, GFP_KERNEL); } - if (!tp->args[i].name) { + if (!parg->name) { pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } - if (!is_good_name(tp->args[i].name)) { + if (!is_good_name(parg->name)) { pr_info("Invalid argument[%d] name: %s\n", - i, tp->args[i].name); + i, parg->name); ret = -EINVAL; goto error; } - if (traceprobe_conflict_field_name(tp->args[i].name, - tp->args, i)) { + if (traceprobe_conflict_field_name(parg->name, + tk->tp.args, i)) { pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; @@ -602,7 +750,7 @@ static int create_trace_probe(int argc, char **argv) } /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tp->size, &tp->args[i], + ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, is_return, true); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); @@ -610,35 +758,35 @@ static int create_trace_probe(int argc, char **argv) } } - ret = register_trace_probe(tp); + ret = register_trace_kprobe(tk); if (ret) goto error; return 0; error: - free_trace_probe(tp); + free_trace_kprobe(tk); return ret; } -static int release_all_trace_probes(void) +static int release_all_trace_kprobes(void) { - struct trace_probe *tp; + struct trace_kprobe *tk; int ret = 0; mutex_lock(&probe_lock); /* Ensure no probe is in use. */ - list_for_each_entry(tp, &probe_list, list) - if (trace_probe_is_enabled(tp)) { + list_for_each_entry(tk, &probe_list, list) + if (trace_probe_is_enabled(&tk->tp)) { ret = -EBUSY; goto end; } /* TODO: Use batch unregistration */ while (!list_empty(&probe_list)) { - tp = list_entry(probe_list.next, struct trace_probe, list); - ret = unregister_trace_probe(tp); + tk = list_entry(probe_list.next, struct trace_kprobe, list); + ret = unregister_trace_kprobe(tk); if (ret) goto end; - free_trace_probe(tp); + free_trace_kprobe(tk); } end: @@ -666,22 +814,22 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { - struct trace_probe *tp = v; + struct trace_kprobe *tk = v; int i; - seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); + seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); + seq_printf(m, ":%s/%s", tk->tp.call.class->system, tk->tp.call.name); - if (!tp->symbol) - seq_printf(m, " 0x%p", tp->rp.kp.addr); - else if (tp->rp.kp.offset) - seq_printf(m, " %s+%u", trace_probe_symbol(tp), - tp->rp.kp.offset); + if (!tk->symbol) + seq_printf(m, " 0x%p", tk->rp.kp.addr); + else if (tk->rp.kp.offset) + seq_printf(m, " %s+%u", trace_kprobe_symbol(tk), + tk->rp.kp.offset); else - seq_printf(m, " %s", trace_probe_symbol(tp)); + seq_printf(m, " %s", trace_kprobe_symbol(tk)); - for (i = 0; i < tp->nr_args; i++) - seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); + for (i = 0; i < tk->tp.nr_args; i++) + seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); seq_printf(m, "\n"); return 0; @@ -699,7 +847,7 @@ static int probes_open(struct inode *inode, struct file *file) int ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ret = release_all_trace_probes(); + ret = release_all_trace_kprobes(); if (ret < 0) return ret; } @@ -711,7 +859,7 @@ static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { return traceprobe_probes_write(file, buffer, count, ppos, - create_trace_probe); + create_trace_kprobe); } static const struct file_operations kprobe_events_ops = { @@ -726,10 +874,10 @@ static const struct file_operations kprobe_events_ops = { /* Probes profiling interfaces */ static int probes_profile_seq_show(struct seq_file *m, void *v) { - struct trace_probe *tp = v; + struct trace_kprobe *tk = v; - seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, - tp->rp.kp.nmissed); + seq_printf(m, " %-44s %15lu %15lu\n", tk->tp.call.name, tk->nhit, + tk->rp.kp.nmissed); return 0; } @@ -754,57 +902,9 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; -/* Sum up total data length for dynamic arraies (strings) */ -static __kprobes int __get_data_size(struct trace_probe *tp, - struct pt_regs *regs) -{ - int i, ret = 0; - u32 len; - - for (i = 0; i < tp->nr_args; i++) - if (unlikely(tp->args[i].fetch_size.fn)) { - call_fetch(&tp->args[i].fetch_size, regs, &len); - ret += len; - } - - return ret; -} - -/* Store the value of each argument */ -static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, - struct pt_regs *regs, - u8 *data, int maxlen) -{ - int i; - u32 end = tp->size; - u32 *dl; /* Data (relative) location */ - - for (i = 0; i < tp->nr_args; i++) { - if (unlikely(tp->args[i].fetch_size.fn)) { - /* - * First, we set the relative location and - * maximum data length to *dl - */ - dl = (u32 *)(data + tp->args[i].offset); - *dl = make_data_rloc(maxlen, end - tp->args[i].offset); - /* Then try to fetch string or dynamic array data */ - call_fetch(&tp->args[i].fetch, regs, dl); - /* Reduce maximum length */ - end += get_rloc_len(*dl); - maxlen -= get_rloc_len(*dl); - /* Trick here, convert data_rloc to data_loc */ - *dl = convert_rloc_to_loc(*dl, - ent_size + tp->args[i].offset); - } else - /* Just fetching data normally */ - call_fetch(&tp->args[i].fetch, regs, - data + tp->args[i].offset); - } -} - /* Kprobe handler */ static __kprobes void -__kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, +__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) { struct kprobe_trace_entry_head *entry; @@ -812,18 +912,18 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, struct ring_buffer *buffer; int size, dsize, pc; unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; WARN_ON(call != ftrace_file->event_call); - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; local_save_flags(irq_flags); pc = preempt_count(); - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, call->event.type, @@ -832,26 +932,25 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, return; entry = ring_buffer_event_data(event); - entry->ip = (unsigned long)tp->rp.kp.addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + entry->ip = (unsigned long)tk->rp.kp.addr; + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) - trace_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); } static __kprobes void -kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) +kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct event_file_link *link; - list_for_each_entry_rcu(link, &tp->files, list) - __kprobe_trace_func(tp, regs, link->file); + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kprobe_trace_func(tk, regs, link->file); } /* Kretprobe handler */ static __kprobes void -__kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, +__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs, struct ftrace_event_file *ftrace_file) { @@ -860,18 +959,18 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, struct ring_buffer *buffer; int size, pc, dsize; unsigned long irq_flags; - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; WARN_ON(call != ftrace_file->event_call); - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; local_save_flags(irq_flags); pc = preempt_count(); - dsize = __get_data_size(tp, regs); - size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + size = sizeof(*entry) + tk->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, call->event.type, @@ -880,23 +979,22 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, return; entry = ring_buffer_event_data(event); - entry->func = (unsigned long)tp->rp.kp.addr; + entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) - trace_buffer_unlock_commit_regs(buffer, event, - irq_flags, pc, regs); + event_trigger_unlock_commit_regs(ftrace_file, buffer, event, + entry, irq_flags, pc, regs); } static __kprobes void -kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, +kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct event_file_link *link; - list_for_each_entry_rcu(link, &tp->files, list) - __kretprobe_trace_func(tp, ri, regs, link->file); + list_for_each_entry_rcu(link, &tk->tp.files, list) + __kretprobe_trace_func(tk, ri, regs, link->file); } /* Event entry printers */ @@ -983,16 +1081,18 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, FILTER_OTHER); if (ret) return ret; @@ -1004,17 +1104,19 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; struct kretprobe_trace_entry_head field; - struct trace_probe *tp = (struct trace_probe *)event_call->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->fmttype, - tp->args[i].name, - sizeof(field) + tp->args[i].offset, - tp->args[i].type->size, - tp->args[i].type->is_signed, + for (i = 0; i < tk->tp.nr_args; i++) { + struct probe_arg *parg = &tk->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, + sizeof(field) + parg->offset, + parg->type->size, + parg->type->is_signed, FILTER_OTHER); if (ret) return ret; @@ -1022,74 +1124,13 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) -{ - int i; - int pos = 0; - - const char *fmt, *arg; - - if (!trace_probe_is_return(tp)) { - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; - } else { - fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; - } - - /* When len=0, we just calculate the needed length */ -#define LEN_OR_ZERO (len ? len - pos : 0) - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - - for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tp->args[i].name, tp->args[i].type->fmt); - } - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - - for (i = 0; i < tp->nr_args; i++) { - if (strcmp(tp->args[i].type->name, "string") == 0) - pos += snprintf(buf + pos, LEN_OR_ZERO, - ", __get_str(%s)", - tp->args[i].name); - else - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); - } - -#undef LEN_OR_ZERO - - /* return the length of print_fmt */ - return pos; -} - -static int set_print_fmt(struct trace_probe *tp) -{ - int len; - char *print_fmt; - - /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tp, NULL, 0); - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_print_fmt(tp, print_fmt, len + 1); - tp->call.print_fmt = print_fmt; - - return 0; -} - #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ static __kprobes void -kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) +kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1099,8 +1140,8 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) if (hlist_empty(head)) return; - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1108,18 +1149,18 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs) if (!entry) return; - entry->ip = (unsigned long)tp->rp.kp.addr; + entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } /* Kretprobe profile handler */ static __kprobes void -kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, +kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; @@ -1129,8 +1170,8 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, if (hlist_empty(head)) return; - dsize = __get_data_size(tp, regs); - __size = sizeof(*entry) + tp->size + dsize; + dsize = __get_data_size(&tk->tp, regs); + __size = sizeof(*entry) + tk->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1138,9 +1179,9 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri, if (!entry) return; - entry->func = (unsigned long)tp->rp.kp.addr; + entry->func = (unsigned long)tk->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ @@ -1155,20 +1196,20 @@ static __kprobes int kprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { - struct trace_probe *tp = (struct trace_probe *)event->data; + struct trace_kprobe *tk = (struct trace_kprobe *)event->data; struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return enable_trace_probe(tp, file); + return enable_trace_kprobe(tk, file); case TRACE_REG_UNREGISTER: - return disable_trace_probe(tp, file); + return disable_trace_kprobe(tk, file); #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return enable_trace_probe(tp, NULL); + return enable_trace_kprobe(tk, NULL); case TRACE_REG_PERF_UNREGISTER: - return disable_trace_probe(tp, NULL); + return disable_trace_kprobe(tk, NULL); case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: case TRACE_REG_PERF_ADD: @@ -1182,15 +1223,15 @@ int kprobe_register(struct ftrace_event_call *event, static __kprobes int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { - struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - tp->nhit++; + tk->nhit++; - if (tp->flags & TP_FLAG_TRACE) - kprobe_trace_func(tp, regs); + if (tk->tp.flags & TP_FLAG_TRACE) + kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tp->flags & TP_FLAG_PROFILE) - kprobe_perf_func(tp, regs); + if (tk->tp.flags & TP_FLAG_PROFILE) + kprobe_perf_func(tk, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1198,15 +1239,15 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) static __kprobes int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { - struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); - tp->nhit++; + tk->nhit++; - if (tp->flags & TP_FLAG_TRACE) - kretprobe_trace_func(tp, ri, regs); + if (tk->tp.flags & TP_FLAG_TRACE) + kretprobe_trace_func(tk, ri, regs); #ifdef CONFIG_PERF_EVENTS - if (tp->flags & TP_FLAG_PROFILE) - kretprobe_perf_func(tp, ri, regs); + if (tk->tp.flags & TP_FLAG_PROFILE) + kretprobe_perf_func(tk, ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1219,21 +1260,21 @@ static struct trace_event_functions kprobe_funcs = { .trace = print_kprobe_event }; -static int register_probe_event(struct trace_probe *tp) +static int register_kprobe_event(struct trace_kprobe *tk) { - struct ftrace_event_call *call = &tp->call; + struct ftrace_event_call *call = &tk->tp.call; int ret; /* Initialize ftrace_event_call */ INIT_LIST_HEAD(&call->class->fields); - if (trace_probe_is_return(tp)) { + if (trace_kprobe_is_return(tk)) { call->event.funcs = &kretprobe_funcs; call->class->define_fields = kretprobe_event_define_fields; } else { call->event.funcs = &kprobe_funcs; call->class->define_fields = kprobe_event_define_fields; } - if (set_print_fmt(tp) < 0) + if (set_print_fmt(&tk->tp, trace_kprobe_is_return(tk)) < 0) return -ENOMEM; ret = register_ftrace_event(&call->event); if (!ret) { @@ -1242,7 +1283,7 @@ static int register_probe_event(struct trace_probe *tp) } call->flags = 0; call->class->reg = kprobe_register; - call->data = tp; + call->data = tk; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); @@ -1252,14 +1293,14 @@ static int register_probe_event(struct trace_probe *tp) return ret; } -static int unregister_probe_event(struct trace_probe *tp) +static int unregister_kprobe_event(struct trace_kprobe *tk) { int ret; /* tp->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tp->call); + ret = trace_remove_event_call(&tk->tp.call); if (!ret) - kfree(tp->call.print_fmt); + kfree(tk->tp.call.print_fmt); return ret; } @@ -1269,7 +1310,7 @@ static __init int init_kprobe_trace(void) struct dentry *d_tracer; struct dentry *entry; - if (register_module_notifier(&trace_probe_module_nb)) + if (register_module_notifier(&trace_kprobe_module_nb)) return -EINVAL; d_tracer = tracing_init_dentry(); @@ -1309,26 +1350,26 @@ static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, } static struct ftrace_event_file * -find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr) +find_trace_probe_file(struct trace_kprobe *tk, struct trace_array *tr) { struct ftrace_event_file *file; list_for_each_entry(file, &tr->events, list) - if (file->event_call == &tp->call) + if (file->event_call == &tk->tp.call) return file; return NULL; } /* - * Nobody but us can call enable_trace_probe/disable_trace_probe at this + * Nobody but us can call enable_trace_kprobe/disable_trace_kprobe at this * stage, we can do this lockless. */ static __init int kprobe_trace_self_tests_init(void) { int ret, warn = 0; int (*target)(int, int, int, int, int, int); - struct trace_probe *tp; + struct trace_kprobe *tk; struct ftrace_event_file *file; target = kprobe_trace_selftest_target; @@ -1337,44 +1378,44 @@ static __init int kprobe_trace_self_tests_init(void) ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " "$stack $stack0 +0($stack)", - create_trace_probe); + create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function entry.\n"); warn++; } else { /* Enable trace point */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting new probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_probe(tp, file); + enable_trace_kprobe(tk, file); } } ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " - "$retval", create_trace_probe); + "$retval", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function return.\n"); warn++; } else { /* Enable trace point */ - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting 2nd new probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - enable_trace_probe(tp, file); + enable_trace_kprobe(tk, file); } } @@ -1384,46 +1425,46 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); /* Disable trace points before removing it */ - tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting test probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_probe(tp, file); + disable_trace_kprobe(tk, file); } - tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); - if (WARN_ON_ONCE(tp == NULL)) { + tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tk == NULL)) { pr_warn("error on getting 2nd test probe.\n"); warn++; } else { - file = find_trace_probe_file(tp, top_trace_array()); + file = find_trace_probe_file(tk, top_trace_array()); if (WARN_ON_ONCE(file == NULL)) { pr_warn("error on getting probe file.\n"); warn++; } else - disable_trace_probe(tp, file); + disable_trace_kprobe(tk, file); } - ret = traceprobe_command("-:testprobe", create_trace_probe); + ret = traceprobe_command("-:testprobe", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } - ret = traceprobe_command("-:testprobe2", create_trace_probe); + ret = traceprobe_command("-:testprobe2", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } end: - release_all_trace_probes(); + release_all_trace_kprobes(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 394f94417e2f..69a5cc94c01a 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr) * If you don't implement it, then the flag setting will be * automatically accepted. */ -static int nop_set_flag(u32 old_flags, u32 bit, int set) +static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* * Note that you don't need to update nop_flags.val yourself. @@ -96,6 +96,7 @@ struct tracer nop_trace __read_mostly = .selftest = trace_selftest_startup_nop, #endif .flags = &nop_flags, - .set_flag = nop_set_flag + .set_flag = nop_set_flag, + .allow_instances = true, }; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed32284fbe32..ca0e79e2abaa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -439,6 +439,37 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } EXPORT_SYMBOL(ftrace_raw_output_prep); +static int ftrace_output_raw(struct trace_iterator *iter, char *name, + char *fmt, va_list ap) +{ + struct trace_seq *s = &iter->seq; + int ret; + + ret = trace_seq_printf(s, "%s: ", name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_vprintf(s, fmt, ap); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = ftrace_output_raw(iter, name, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_output_call); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 412e959709b4..8364a421b4df 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -35,46 +35,27 @@ const char *reserved_field_names[] = { FIELD_STRING_FUNC, }; -/* Printing function type */ -#define PRINT_TYPE_FUNC_NAME(type) print_type_##type -#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type - /* Printing in basic type function template */ -#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ -static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ +__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ const char *name, \ - void *data, void *ent)\ + void *data, void *ent) \ { \ - return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ + return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ } \ -static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; - -DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) -DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) -DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) -DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) -DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) - -static inline void *get_rloc_data(u32 *dl) -{ - return (u8 *)dl + get_rloc_offs(*dl); -} +const char PRINT_TYPE_FMT_NAME(type)[] = fmt; -/* For data_loc conversion */ -static inline void *get_loc_data(u32 *dl, void *ent) -{ - return (u8 *)ent + get_rloc_offs(*dl); -} - -/* For defining macros, define string/string_size types */ -typedef u32 string; -typedef u32 string_size; +DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "0x%x") +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d") +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") /* Print type function for string type */ -static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, +__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, void *data, void *ent) { @@ -87,18 +68,7 @@ static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, (const char *)get_loc_data(data, ent)); } -static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; - -#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type -/* - * Define macro for basic types - we don't need to define s* types, because - * we have to care only about bitwidth at recording time. - */ -#define DEFINE_BASIC_FETCH_FUNCS(method) \ -DEFINE_FETCH_##method(u8) \ -DEFINE_FETCH_##method(u16) \ -DEFINE_FETCH_##method(u32) \ -DEFINE_FETCH_##method(u64) +const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; #define CHECK_FETCH_FUNCS(method, fn) \ (((FETCH_FUNC_NAME(method, u8) == fn) || \ @@ -111,7 +81,7 @@ DEFINE_FETCH_##method(u64) /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ -static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ +__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ @@ -122,20 +92,8 @@ DEFINE_BASIC_FETCH_FUNCS(reg) #define fetch_reg_string NULL #define fetch_reg_string_size NULL -#define DEFINE_FETCH_stack(type) \ -static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ - void *offset, void *dest) \ -{ \ - *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ - (unsigned int)((unsigned long)offset)); \ -} -DEFINE_BASIC_FETCH_FUNCS(stack) -/* No string on the stack entry */ -#define fetch_stack_string NULL -#define fetch_stack_string_size NULL - #define DEFINE_FETCH_retval(type) \ -static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ +__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ void *dummy, void *dest) \ { \ *(type *)dest = (type)regs_return_value(regs); \ @@ -145,150 +103,16 @@ DEFINE_BASIC_FETCH_FUNCS(retval) #define fetch_retval_string NULL #define fetch_retval_string_size NULL -#define DEFINE_FETCH_memory(type) \ -static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ - void *addr, void *dest) \ -{ \ - type retval; \ - if (probe_kernel_address(addr, retval)) \ - *(type *)dest = 0; \ - else \ - *(type *)dest = retval; \ -} -DEFINE_BASIC_FETCH_FUNCS(memory) -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max - * length and relative data location. - */ -static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, - void *addr, void *dest) -{ - long ret; - int maxlen = get_rloc_len(*(u32 *)dest); - u8 *dst = get_rloc_data(dest); - u8 *src = addr; - mm_segment_t old_fs = get_fs(); - - if (!maxlen) - return; - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - set_fs(KERNEL_DS); - pagefault_disable(); - - do - ret = __copy_from_user_inatomic(dst++, src++, 1); - while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); - - dst[-1] = '\0'; - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) { /* Failed to fetch string */ - ((u8 *)get_rloc_data(dest))[0] = '\0'; - *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); - } else { - *(u32 *)dest = make_data_rloc(src - (u8 *)addr, - get_rloc_offs(*(u32 *)dest)); - } -} - -/* Return the length of string -- including null terminal byte */ -static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, - void *addr, void *dest) -{ - mm_segment_t old_fs; - int ret, len = 0; - u8 c; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - pagefault_enable(); - set_fs(old_fs); - - if (ret < 0) /* Failed to check the length */ - *(u32 *)dest = 0; - else - *(u32 *)dest = len; -} - -/* Memory fetching by symbol */ -struct symbol_cache { - char *symbol; - long offset; - unsigned long addr; -}; - -static unsigned long update_symbol_cache(struct symbol_cache *sc) -{ - sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); - - if (sc->addr) - sc->addr += sc->offset; - - return sc->addr; -} - -static void free_symbol_cache(struct symbol_cache *sc) -{ - kfree(sc->symbol); - kfree(sc); -} - -static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) -{ - struct symbol_cache *sc; - - if (!sym || strlen(sym) == 0) - return NULL; - - sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); - if (!sc) - return NULL; - - sc->symbol = kstrdup(sym, GFP_KERNEL); - if (!sc->symbol) { - kfree(sc); - return NULL; - } - sc->offset = offset; - update_symbol_cache(sc); - - return sc; -} - -#define DEFINE_FETCH_symbol(type) \ -static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ - void *data, void *dest) \ -{ \ - struct symbol_cache *sc = data; \ - if (sc->addr) \ - fetch_memory_##type(regs, (void *)sc->addr, dest); \ - else \ - *(type *)dest = 0; \ -} -DEFINE_BASIC_FETCH_FUNCS(symbol) -DEFINE_FETCH_symbol(string) -DEFINE_FETCH_symbol(string_size) - /* Dereference memory access function */ struct deref_fetch_param { struct fetch_param orig; long offset; + fetch_func_t fetch; + fetch_func_t fetch_size; }; #define DEFINE_FETCH_deref(type) \ -static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ +__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ void *data, void *dest) \ { \ struct deref_fetch_param *dprm = data; \ @@ -296,13 +120,26 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ call_fetch(&dprm->orig, regs, &addr); \ if (addr) { \ addr += dprm->offset; \ - fetch_memory_##type(regs, (void *)addr, dest); \ + dprm->fetch(regs, (void *)addr, dest); \ } else \ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(deref) DEFINE_FETCH_deref(string) -DEFINE_FETCH_deref(string_size) + +__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, + void *data, void *dest) +{ + struct deref_fetch_param *dprm = data; + unsigned long addr; + + call_fetch(&dprm->orig, regs, &addr); + if (addr && dprm->fetch_size) { + addr += dprm->offset; + dprm->fetch_size(regs, (void *)addr, dest); + } else + *(string_size *)dest = 0; +} static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) { @@ -329,7 +166,7 @@ struct bitfield_fetch_param { }; #define DEFINE_FETCH_bitfield(type) \ -static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ +__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ void *data, void *dest) \ { \ struct bitfield_fetch_param *bprm = data; \ @@ -374,58 +211,8 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) kfree(data); } -/* Default (unsigned long) fetch type */ -#define __DEFAULT_FETCH_TYPE(t) u##t -#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) -#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) -#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) - -#define ASSIGN_FETCH_FUNC(method, type) \ - [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) - -#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ - {.name = _name, \ - .size = _size, \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ - .fmttype = _fmttype, \ - .fetch = { \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ -ASSIGN_FETCH_FUNC(bitfield, ftype), \ - } \ - } - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) - -#define FETCH_TYPE_STRING 0 -#define FETCH_TYPE_STRSIZE 1 - -/* Fetch type information table */ -static const struct fetch_type fetch_type_table[] = { - /* Special types */ - [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, - sizeof(u32), 1, "__data_loc char[]"), - [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, - string_size, sizeof(u32), 0, "u32"), - /* Basic types */ - ASSIGN_FETCH_TYPE(u8, u8, 0), - ASSIGN_FETCH_TYPE(u16, u16, 0), - ASSIGN_FETCH_TYPE(u32, u32, 0), - ASSIGN_FETCH_TYPE(u64, u64, 0), - ASSIGN_FETCH_TYPE(s8, u8, 1), - ASSIGN_FETCH_TYPE(s16, u16, 1), - ASSIGN_FETCH_TYPE(s32, u32, 1), - ASSIGN_FETCH_TYPE(s64, u64, 1), -}; - -static const struct fetch_type *find_fetch_type(const char *type) +static const struct fetch_type *find_fetch_type(const char *type, + const struct fetch_type *ftbl) { int i; @@ -446,44 +233,52 @@ static const struct fetch_type *find_fetch_type(const char *type) switch (bs) { case 8: - return find_fetch_type("u8"); + return find_fetch_type("u8", ftbl); case 16: - return find_fetch_type("u16"); + return find_fetch_type("u16", ftbl); case 32: - return find_fetch_type("u32"); + return find_fetch_type("u32", ftbl); case 64: - return find_fetch_type("u64"); + return find_fetch_type("u64", ftbl); default: goto fail; } } - for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) - if (strcmp(type, fetch_type_table[i].name) == 0) - return &fetch_type_table[i]; + for (i = 0; ftbl[i].name; i++) { + if (strcmp(type, ftbl[i].name) == 0) + return &ftbl[i]; + } fail: return NULL; } /* Special function : only accept unsigned long */ -static __kprobes void fetch_stack_address(struct pt_regs *regs, - void *dummy, void *dest) +static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, + void *dummy, void *dest) { *(unsigned long *)dest = kernel_stack_pointer(regs); } +static __kprobes void fetch_user_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = user_stack_pointer(regs); +} + static fetch_func_t get_fetch_size_function(const struct fetch_type *type, - fetch_func_t orig_fn) + fetch_func_t orig_fn, + const struct fetch_type *ftbl) { int i; - if (type != &fetch_type_table[FETCH_TYPE_STRING]) + if (type != &ftbl[FETCH_TYPE_STRING]) return NULL; /* Only string type needs size function */ for (i = 0; i < FETCH_MTD_END; i++) if (type->fetch[i] == orig_fn) - return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + return ftbl[FETCH_TYPE_STRSIZE].fetch[i]; WARN_ON(1); /* This should not happen */ @@ -516,7 +311,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return) + struct fetch_param *f, bool is_return, + bool is_kprobe) { int ret = 0; unsigned long param; @@ -528,13 +324,16 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) - f->fn = fetch_stack_address; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR)) + return -EINVAL; + + if (is_kprobe) + f->fn = fetch_kernel_stack_address; else - ret = -EINVAL; + f->fn = fetch_user_stack_address; } else if (isdigit(arg[5])) { ret = kstrtoul(arg + 5, 10, ¶m); - if (ret || param > PARAM_MAX_STACK) + if (ret || (is_kprobe && param > PARAM_MAX_STACK)) ret = -EINVAL; else { f->fn = t->fetch[FETCH_MTD_stack]; @@ -552,20 +351,18 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, static int parse_probe_arg(char *arg, const struct fetch_type *t, struct fetch_param *f, bool is_return, bool is_kprobe) { + const struct fetch_type *ftbl; unsigned long param; long offset; char *tmp; - int ret; - - ret = 0; + int ret = 0; - /* Until uprobe_events supports only reg arguments */ - if (!is_kprobe && arg[0] != '%') - return -EINVAL; + ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; + BUG_ON(ftbl == NULL); switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, t, f, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); break; case '%': /* named register */ @@ -577,7 +374,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, } break; - case '@': /* memory or symbol */ + case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, ¶m); if (ret) @@ -585,7 +382,22 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, f->fn = t->fetch[FETCH_MTD_memory]; f->data = (void *)param; + } else if (arg[1] == '+') { + /* kprobes don't support file offsets */ + if (is_kprobe) + return -EINVAL; + + ret = kstrtol(arg + 2, 0, &offset); + if (ret) + break; + + f->fn = t->fetch[FETCH_MTD_file_offset]; + f->data = (void *)offset; } else { + /* uprobes don't support symbols */ + if (!is_kprobe) + return -EINVAL; + ret = traceprobe_split_symbol_offset(arg + 1, &offset); if (ret) break; @@ -616,7 +428,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, struct deref_fetch_param *dprm; const struct fetch_type *t2; - t2 = find_fetch_type(NULL); + t2 = find_fetch_type(NULL, ftbl); *tmp = '\0'; dprm = kzalloc(sizeof(struct deref_fetch_param), GFP_KERNEL); @@ -624,6 +436,9 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, return -ENOMEM; dprm->offset = offset; + dprm->fetch = t->fetch[FETCH_MTD_memory]; + dprm->fetch_size = get_fetch_size_function(t, + dprm->fetch, ftbl); ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, is_kprobe); if (ret) @@ -685,9 +500,13 @@ static int __parse_bitfield_probe_arg(const char *bf, int traceprobe_parse_probe_arg(char *arg, ssize_t *size, struct probe_arg *parg, bool is_return, bool is_kprobe) { + const struct fetch_type *ftbl; const char *t; int ret; + ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; + BUG_ON(ftbl == NULL); + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; @@ -702,7 +521,7 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, arg[t - parg->comm] = '\0'; t++; } - parg->type = find_fetch_type(t); + parg->type = find_fetch_type(t, ftbl); if (!parg->type) { pr_info("Unsupported type: %s\n", t); return -EINVAL; @@ -716,7 +535,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, - parg->fetch.fn); + parg->fetch.fn, + ftbl); parg->fetch_size.data = parg->fetch.data; } @@ -837,3 +657,65 @@ out: return ret; } + +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, + bool is_return) +{ + int i; + int pos = 0; + + const char *fmt, *arg; + + if (!is_return) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + + for (i = 0; i < tp->nr_args; i++) { + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +int set_print_fmt(struct trace_probe *tp, bool is_return) +{ + int len; + char *print_fmt; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0, is_return); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1, is_return); + tp->call.print_fmt = print_fmt; + + return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5c7e09d10d74..fb1ab5dfbd42 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -81,6 +81,17 @@ */ #define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) +static inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + /* Data fetch function type */ typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); /* Printing function type */ @@ -95,6 +106,7 @@ enum { FETCH_MTD_symbol, FETCH_MTD_deref, FETCH_MTD_bitfield, + FETCH_MTD_file_offset, FETCH_MTD_END, }; @@ -115,6 +127,148 @@ struct fetch_param { void *data; }; +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ +__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, \ + void *data, void *ent); \ +extern const char PRINT_TYPE_FMT_NAME(type)[] + +DECLARE_BASIC_PRINT_TYPE_FUNC(u8); +DECLARE_BASIC_PRINT_TYPE_FUNC(u16); +DECLARE_BASIC_PRINT_TYPE_FUNC(u32); +DECLARE_BASIC_PRINT_TYPE_FUNC(u64); +DECLARE_BASIC_PRINT_TYPE_FUNC(s8); +DECLARE_BASIC_PRINT_TYPE_FUNC(s16); +DECLARE_BASIC_PRINT_TYPE_FUNC(s32); +DECLARE_BASIC_PRINT_TYPE_FUNC(s64); +DECLARE_BASIC_PRINT_TYPE_FUNC(string); + +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type + +/* Declare macro for basic types */ +#define DECLARE_FETCH_FUNC(method, type) \ +extern void FETCH_FUNC_NAME(method, type)(struct pt_regs *regs, \ + void *data, void *dest) + +#define DECLARE_BASIC_FETCH_FUNCS(method) \ +DECLARE_FETCH_FUNC(method, u8); \ +DECLARE_FETCH_FUNC(method, u16); \ +DECLARE_FETCH_FUNC(method, u32); \ +DECLARE_FETCH_FUNC(method, u64) + +DECLARE_BASIC_FETCH_FUNCS(reg); +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(retval); +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL + +DECLARE_BASIC_FETCH_FUNCS(symbol); +DECLARE_FETCH_FUNC(symbol, string); +DECLARE_FETCH_FUNC(symbol, string_size); + +DECLARE_BASIC_FETCH_FUNCS(deref); +DECLARE_FETCH_FUNC(deref, string); +DECLARE_FETCH_FUNC(deref, string_size); + +DECLARE_BASIC_FETCH_FUNCS(bitfield); +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. + */ +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ +ASSIGN_FETCH_FUNC(file_offset, ftype), \ + } \ + } + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define ASSIGN_FETCH_TYPE_END {} + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + +/* + * Fetch type information table. + * It's declared as a weak symbol due to conditional compilation. + */ +extern __weak const struct fetch_type kprobes_fetch_type_table[]; +extern __weak const struct fetch_type uprobes_fetch_type_table[]; + +#ifdef CONFIG_KPROBE_EVENT +struct symbol_cache; +unsigned long update_symbol_cache(struct symbol_cache *sc); +void free_symbol_cache(struct symbol_cache *sc); +struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +#else +/* uprobes do not support symbol fetch methods */ +#define fetch_symbol_u8 NULL +#define fetch_symbol_u16 NULL +#define fetch_symbol_u32 NULL +#define fetch_symbol_u64 NULL +#define fetch_symbol_string NULL +#define fetch_symbol_string_size NULL + +struct symbol_cache { +}; +static inline unsigned long __used update_symbol_cache(struct symbol_cache *sc) +{ + return 0; +} + +static inline void __used free_symbol_cache(struct symbol_cache *sc) +{ +} + +static inline struct symbol_cache * __used +alloc_symbol_cache(const char *sym, long offset) +{ + return NULL; +} +#endif /* CONFIG_KPROBE_EVENT */ + struct probe_arg { struct fetch_param fetch; struct fetch_param fetch_size; @@ -124,6 +278,31 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; +struct trace_probe { + unsigned int flags; /* For TP_FLAG_* */ + struct ftrace_event_class class; + struct ftrace_event_call call; + struct list_head files; + ssize_t size; /* trace entry size */ + unsigned int nr_args; + struct probe_arg args[]; +}; + +struct event_file_link { + struct ftrace_event_file *file; + struct list_head list; +}; + +static inline bool trace_probe_is_enabled(struct trace_probe *tp) +{ + return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); +} + +static inline bool trace_probe_is_registered(struct trace_probe *tp) +{ + return !!(tp->flags & TP_FLAG_REGISTERED); +} + static inline __kprobes void call_fetch(struct fetch_param *fprm, struct pt_regs *regs, void *dest) { @@ -142,6 +321,18 @@ static inline int is_good_name(const char *name) return 1; } +static inline struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) +{ + struct event_file_link *link; + + list_for_each_entry(link, &tp->files, list) + if (link->file == file) + return link; + + return NULL; +} + extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, struct probe_arg *parg, bool is_return, bool is_kprobe); @@ -158,3 +349,53 @@ extern ssize_t traceprobe_probes_write(struct file *file, int (*createfn)(int, char**)); extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); + +/* Sum up total data length for dynamic arraies (strings) */ +static inline __kprobes int +__get_data_size(struct trace_probe *tp, struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static inline __kprobes void +store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + +extern int set_print_fmt(struct trace_probe *tp, bool is_return); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fee77e15d815..e14da5e97a69 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -16,6 +16,7 @@ #include <linux/uaccess.h> #include <linux/ftrace.h> #include <linux/sched/rt.h> +#include <linux/sched/deadline.h> #include <trace/events/sched.h> #include "trace.h" @@ -27,6 +28,8 @@ static int wakeup_cpu; static int wakeup_current_cpu; static unsigned wakeup_prio = -1; static int wakeup_rt; +static int wakeup_dl; +static int tracing_dl = 0; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -176,8 +179,10 @@ static void wakeup_function_set(int set) unregister_wakeup_function(is_graph()); } -static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) { + struct tracer *tracer = tr->current_trace; + if (mask & TRACE_ITER_FUNCTION) wakeup_function_set(set); @@ -206,7 +211,8 @@ static void stop_func_tracer(int graph) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (!(bit & TRACE_DISPLAY_GRAPH)) @@ -308,7 +314,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return -EINVAL; } @@ -437,6 +444,7 @@ static void __wakeup_reset(struct trace_array *tr) { wakeup_cpu = -1; wakeup_prio = -1; + tracing_dl = 0; if (wakeup_task) put_task_struct(wakeup_task); @@ -472,9 +480,17 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) tracing_record_cmdline(p); tracing_record_cmdline(current); - if ((wakeup_rt && !rt_task(p)) || - p->prio >= wakeup_prio || - p->prio >= current->prio) + /* + * Semantic is like this: + * - wakeup tracer handles all tasks in the system, independently + * from their scheduling class; + * - wakeup_rt tracer handles tasks belonging to sched_dl and + * sched_rt class; + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || + (wakeup_rt && !dl_task(p) && !rt_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; pc = preempt_count(); @@ -486,7 +502,8 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) arch_spin_lock(&wakeup_lock); /* check for races. */ - if (!tracer_enabled || p->prio >= wakeup_prio) + if (!tracer_enabled || tracing_dl || + (!dl_task(p) && p->prio >= wakeup_prio)) goto out_locked; /* reset the trace */ @@ -496,6 +513,15 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) wakeup_current_cpu = wakeup_cpu; wakeup_prio = p->prio; + /* + * Once you start tracing a -deadline task, don't bother tracing + * another task until the first one wakes up. + */ + if (dl_task(p)) + tracing_dl = 1; + else + tracing_dl = 0; + wakeup_task = p; get_task_struct(wakeup_task); @@ -597,16 +623,25 @@ static int __wakeup_tracer_init(struct trace_array *tr) static int wakeup_tracer_init(struct trace_array *tr) { + wakeup_dl = 0; wakeup_rt = 0; return __wakeup_tracer_init(tr); } static int wakeup_rt_tracer_init(struct trace_array *tr) { + wakeup_dl = 0; wakeup_rt = 1; return __wakeup_tracer_init(tr); } +static int wakeup_dl_tracer_init(struct trace_array *tr) +{ + wakeup_dl = 1; + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + static void wakeup_tracer_reset(struct trace_array *tr) { int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; @@ -674,6 +709,28 @@ static struct tracer wakeup_rt_tracer __read_mostly = .use_max_tr = true, }; +static struct tracer wakeup_dl_tracer __read_mostly = +{ + .name = "wakeup_dl", + .init = wakeup_dl_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .wait_pipe = poll_wait_pipe, + .print_max = true, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = true, +}; + __init static int init_wakeup_tracer(void) { int ret; @@ -686,6 +743,10 @@ __init static int init_wakeup_tracer(void) if (ret) return ret; + ret = register_tracer(&wakeup_dl_tracer); + if (ret) + return ret; + return 0; } core_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a7329b7902f8..e98fca60974f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1022,11 +1022,16 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { - /* Make this a RT thread, doesn't need to be too high */ - static const struct sched_param param = { .sched_priority = 5 }; + /* Make this a -deadline thread */ + static const struct sched_attr attr = { + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL + }; struct completion *x = data; - sched_setscheduler(current, SCHED_FIFO, ¶m); + sched_setattr(current, &attr); /* Make it know we have a new prio */ complete(x); @@ -1040,8 +1045,8 @@ static int trace_wakeup_test_thread(void *data) /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { /* - * This is an RT task, do short sleeps to let - * others run. + * This will likely be the system top priority + * task, do short sleeps to let others run. */ msleep(100); } @@ -1054,21 +1059,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) { unsigned long save_max = tracing_max_latency; struct task_struct *p; - struct completion isrt; + struct completion is_ready; unsigned long count; int ret; - init_completion(&isrt); + init_completion(&is_ready); - /* create a high prio thread */ - p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + /* create a -deadline thread */ + p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } - /* make sure the thread is running at an RT prio */ - wait_for_completion(&isrt); + /* make sure the thread is running at -deadline policy */ + wait_for_completion(&is_ready); /* start the tracing */ ret = tracer_init(trace, tr); @@ -1082,19 +1087,19 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) while (p->on_rq) { /* - * Sleep to make sure the RT thread is asleep too. + * Sleep to make sure the -deadline thread is asleep too. * On virtual machines we can't rely on timings, * but we want to make sure this test still works. */ msleep(100); } - init_completion(&isrt); + init_completion(&is_ready); wake_up_process(p); /* Wait for the task to wake up */ - wait_for_completion(&isrt); + wait_for_completion(&is_ready); /* stop the tracing. */ tracing_stop(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index b20428c5efe2..21b320e5d163 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,7 @@ #include <linux/sysctl.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/magic.h> #include <asm/setup.h> @@ -144,6 +145,8 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + BUG_ON(current != &init_task && + *(end_of_stack(current)) != STACK_END_MAGIC); out: arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); @@ -382,7 +385,7 @@ static const struct file_operations stack_trace_filter_fops = { .open = stack_trace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = tracing_lseek, .release = ftrace_regex_release, }; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ea90eb5f6f17..759d5e004517 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -321,7 +321,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (!ftrace_file) return; - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -343,9 +343,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) - trace_current_buffer_unlock_commit(buffer, event, - irq_flags, pc); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); } static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -369,7 +368,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!ftrace_file) return; - if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) + if (ftrace_trigger_soft_disabled(ftrace_file)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -390,9 +389,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - if (!filter_check_discard(ftrace_file, entry, buffer, event)) - trace_current_buffer_unlock_commit(buffer, event, - irq_flags, pc); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, + irq_flags, pc); } static int reg_event_syscall_enter(struct ftrace_event_file *file, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b6dcc42ef7f5..e4473367e7a4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -51,22 +51,17 @@ struct trace_uprobe_filter { */ struct trace_uprobe { struct list_head list; - struct ftrace_event_class class; - struct ftrace_event_call call; struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct inode *inode; char *filename; unsigned long offset; unsigned long nhit; - unsigned int flags; /* For TP_FLAG_* */ - ssize_t size; /* trace entry size */ - unsigned int nr_args; - struct probe_arg args[]; + struct trace_probe tp; }; -#define SIZEOF_TRACE_UPROBE(n) \ - (offsetof(struct trace_uprobe, args) + \ +#define SIZEOF_TRACE_UPROBE(n) \ + (offsetof(struct trace_uprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) static int register_uprobe_event(struct trace_uprobe *tu); @@ -75,10 +70,151 @@ static int unregister_uprobe_event(struct trace_uprobe *tu); static DEFINE_MUTEX(uprobe_lock); static LIST_HEAD(uprobe_list); +struct uprobe_dispatch_data { + struct trace_uprobe *tu; + unsigned long bp_addr; +}; + static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); static int uretprobe_dispatcher(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs); +#ifdef CONFIG_STACK_GROWSUP +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr - (n * sizeof(long)); +} +#else +static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) +{ + return addr + (n * sizeof(long)); +} +#endif + +static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long ret; + unsigned long addr = user_stack_pointer(regs); + + addr = adjust_stack_addr(addr, n); + + if (copy_from_user(&ret, (void __force __user *) addr, sizeof(ret))) + return 0; + + return ret; +} + +/* + * Uprobes-specific fetch functions + */ +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)get_user_stack_nth(regs, \ + ((unsigned long)offset)); \ +} +DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + void __user *vaddr = (void __force __user *) addr; \ + \ + if (copy_from_user(&retval, vaddr, sizeof(type))) \ + *(type *)dest = 0; \ + else \ + *(type *) dest = retval; \ +} +DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + u32 rloc = *(u32 *)dest; + int maxlen = get_rloc_len(rloc); + u8 *dst = get_rloc_data(dest); + void __user *src = (void __force __user *) addr; + + if (!maxlen) + return; + + ret = strncpy_from_user(dst, src, maxlen); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(rloc)); + } else { + *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(rloc)); + } +} + +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int len; + void __user *vaddr = (void __force __user *) addr; + + len = strnlen_user(vaddr, MAX_STRING_SIZE); + + if (len == 0 || len > MAX_STRING_SIZE) /* Failed to check length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} + +static unsigned long translate_user_vaddr(void *file_offset) +{ + unsigned long base_addr; + struct uprobe_dispatch_data *udd; + + udd = (void *) current->utask->vaddr; + + base_addr = udd->bp_addr - udd->tu->offset; + return base_addr + (unsigned long)file_offset; +} + +#define DEFINE_FETCH_file_offset(type) \ +static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + void *vaddr = (void *)translate_user_vaddr(offset); \ + \ + FETCH_FUNC_NAME(memory, type)(regs, vaddr, dest); \ +} +DEFINE_BASIC_FETCH_FUNCS(file_offset) +DEFINE_FETCH_file_offset(string) +DEFINE_FETCH_file_offset(string_size) + +/* Fetch type information table */ +const struct fetch_type uprobes_fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), + + ASSIGN_FETCH_TYPE_END +}; + static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { rwlock_init(&filter->rwlock); @@ -114,25 +250,26 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); - tu->call.class = &tu->class; - tu->call.name = kstrdup(event, GFP_KERNEL); - if (!tu->call.name) + tu->tp.call.class = &tu->tp.class; + tu->tp.call.name = kstrdup(event, GFP_KERNEL); + if (!tu->tp.call.name) goto error; - tu->class.system = kstrdup(group, GFP_KERNEL); - if (!tu->class.system) + tu->tp.class.system = kstrdup(group, GFP_KERNEL); + if (!tu->tp.class.system) goto error; INIT_LIST_HEAD(&tu->list); + INIT_LIST_HEAD(&tu->tp.files); tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); - tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; + tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; return tu; error: - kfree(tu->call.name); + kfree(tu->tp.call.name); kfree(tu); return ERR_PTR(-ENOMEM); @@ -142,12 +279,12 @@ static void free_trace_uprobe(struct trace_uprobe *tu) { int i; - for (i = 0; i < tu->nr_args; i++) - traceprobe_free_probe_arg(&tu->args[i]); + for (i = 0; i < tu->tp.nr_args; i++) + traceprobe_free_probe_arg(&tu->tp.args[i]); iput(tu->inode); - kfree(tu->call.class->system); - kfree(tu->call.name); + kfree(tu->tp.call.class->system); + kfree(tu->tp.call.name); kfree(tu->filename); kfree(tu); } @@ -157,8 +294,8 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou struct trace_uprobe *tu; list_for_each_entry(tu, &uprobe_list, list) - if (strcmp(tu->call.name, event) == 0 && - strcmp(tu->call.class->system, group) == 0) + if (strcmp(tu->tp.call.name, event) == 0 && + strcmp(tu->tp.call.class->system, group) == 0) return tu; return NULL; @@ -181,16 +318,16 @@ static int unregister_trace_uprobe(struct trace_uprobe *tu) /* Register a trace_uprobe and probe_event */ static int register_trace_uprobe(struct trace_uprobe *tu) { - struct trace_uprobe *old_tp; + struct trace_uprobe *old_tu; int ret; mutex_lock(&uprobe_lock); /* register as an event */ - old_tp = find_probe_event(tu->call.name, tu->call.class->system); - if (old_tp) { + old_tu = find_probe_event(tu->tp.call.name, tu->tp.call.class->system); + if (old_tu) { /* delete old event */ - ret = unregister_trace_uprobe(old_tp); + ret = unregister_trace_uprobe(old_tu); if (ret) goto end; } @@ -211,7 +348,7 @@ end: /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] * * - Remove uprobe: -:[GRP/]EVENT */ @@ -360,34 +497,36 @@ static int create_trace_uprobe(int argc, char **argv) /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + /* Increment count for freeing args in error case */ - tu->nr_args++; + tu->tp.nr_args++; /* Parse argument name */ arg = strchr(argv[i], '='); if (arg) { *arg++ = '\0'; - tu->args[i].name = kstrdup(argv[i], GFP_KERNEL); + parg->name = kstrdup(argv[i], GFP_KERNEL); } else { arg = argv[i]; /* If argument name is omitted, set "argN" */ snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); - tu->args[i].name = kstrdup(buf, GFP_KERNEL); + parg->name = kstrdup(buf, GFP_KERNEL); } - if (!tu->args[i].name) { + if (!parg->name) { pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } - if (!is_good_name(tu->args[i].name)) { - pr_info("Invalid argument[%d] name: %s\n", i, tu->args[i].name); + if (!is_good_name(parg->name)) { + pr_info("Invalid argument[%d] name: %s\n", i, parg->name); ret = -EINVAL; goto error; } - if (traceprobe_conflict_field_name(tu->args[i].name, tu->args, i)) { + if (traceprobe_conflict_field_name(parg->name, tu->tp.args, i)) { pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; @@ -395,7 +534,8 @@ static int create_trace_uprobe(int argc, char **argv) } /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg(arg, &tu->size, &tu->args[i], false, false); + ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, + is_return, false); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; @@ -459,11 +599,11 @@ static int probes_seq_show(struct seq_file *m, void *v) char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); + seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, tu->tp.call.name); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); - for (i = 0; i < tu->nr_args; i++) - seq_printf(m, " %s=%s", tu->args[i].name, tu->args[i].comm); + for (i = 0; i < tu->tp.nr_args; i++) + seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); seq_printf(m, "\n"); return 0; @@ -509,7 +649,7 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_uprobe *tu = v; - seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->call.name, tu->nhit); + seq_printf(m, " %s %-44s %15lu\n", tu->filename, tu->tp.call.name, tu->nhit); return 0; } @@ -533,19 +673,116 @@ static const struct file_operations uprobe_profile_ops = { .release = seq_release, }; -static void uprobe_trace_print(struct trace_uprobe *tu, - unsigned long func, struct pt_regs *regs) +struct uprobe_cpu_buffer { + struct mutex mutex; + void *buf; +}; +static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; +static int uprobe_buffer_refcnt; + +static int uprobe_buffer_init(void) +{ + int cpu, err_cpu; + + uprobe_cpu_buffer = alloc_percpu(struct uprobe_cpu_buffer); + if (uprobe_cpu_buffer == NULL) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct page *p = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL, 0); + if (p == NULL) { + err_cpu = cpu; + goto err; + } + per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf = page_address(p); + mutex_init(&per_cpu_ptr(uprobe_cpu_buffer, cpu)->mutex); + } + + return 0; + +err: + for_each_possible_cpu(cpu) { + if (cpu == err_cpu) + break; + free_page((unsigned long)per_cpu_ptr(uprobe_cpu_buffer, cpu)->buf); + } + + free_percpu(uprobe_cpu_buffer); + return -ENOMEM; +} + +static int uprobe_buffer_enable(void) +{ + int ret = 0; + + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (uprobe_buffer_refcnt++ == 0) { + ret = uprobe_buffer_init(); + if (ret < 0) + uprobe_buffer_refcnt--; + } + + return ret; +} + +static void uprobe_buffer_disable(void) +{ + BUG_ON(!mutex_is_locked(&event_mutex)); + + if (--uprobe_buffer_refcnt == 0) { + free_percpu(uprobe_cpu_buffer); + uprobe_cpu_buffer = NULL; + } +} + +static struct uprobe_cpu_buffer *uprobe_buffer_get(void) +{ + struct uprobe_cpu_buffer *ucb; + int cpu; + + cpu = raw_smp_processor_id(); + ucb = per_cpu_ptr(uprobe_cpu_buffer, cpu); + + /* + * Use per-cpu buffers for fastest access, but we might migrate + * so the mutex makes sure we have sole access to it. + */ + mutex_lock(&ucb->mutex); + + return ucb; +} + +static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) +{ + mutex_unlock(&ucb->mutex); +} + +static void __uprobe_trace_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize, + struct ftrace_event_file *ftrace_file) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; void *data; - int size, i; - struct ftrace_event_call *call = &tu->call; + int size, esize; + struct ftrace_event_call *call = &tu->tp.call; + + WARN_ON(call != ftrace_file->event_call); - size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size + tu->size, 0, 0); + if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) + return; + + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + size = esize + tu->tp.size + dsize; + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, size, 0, 0); if (!event) return; @@ -559,25 +796,38 @@ static void uprobe_trace_print(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->nr_args; i++) - call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); + memcpy(data, ucb->buf, tu->tp.size + dsize); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0); } /* uprobe handler */ -static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - if (!is_ret_probe(tu)) - uprobe_trace_print(tu, 0, regs); + struct event_file_link *link; + + if (is_ret_probe(tu)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + rcu_read_unlock(); + return 0; } static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - uprobe_trace_print(tu, func, regs); + struct event_file_link *link; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + rcu_read_unlock(); } /* Event entry printers */ @@ -591,23 +841,24 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e int i; entry = (struct uprobe_trace_entry_head *)iter->ent; - tu = container_of(event, struct trace_uprobe, call.event); + tu = container_of(event, struct trace_uprobe, tp.call.event); if (is_ret_probe(tu)) { - if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->tp.call.name, entry->vaddr[1], entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, true); } else { - if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, + if (!trace_seq_printf(s, "%s: (0x%lx)", tu->tp.call.name, entry->vaddr[0])) goto partial; data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->nr_args; i++) { - if (!tu->args[i].type->print(s, tu->args[i].name, - data + tu->args[i].offset, entry)) + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + if (!parg->type->print(s, parg->name, data + parg->offset, entry)) goto partial; } @@ -618,43 +869,81 @@ partial: return TRACE_TYPE_PARTIAL_LINE; } -static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu) -{ - return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE); -} - typedef bool (*filter_func_t)(struct uprobe_consumer *self, enum uprobe_filter_ctx ctx, struct mm_struct *mm); static int -probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) +probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, + filter_func_t filter) { - int ret = 0; + bool enabled = trace_probe_is_enabled(&tu->tp); + struct event_file_link *link = NULL; + int ret; + + if (file) { + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + link->file = file; + list_add_tail_rcu(&link->list, &tu->tp.files); - if (is_trace_uprobe_enabled(tu)) - return -EINTR; + tu->tp.flags |= TP_FLAG_TRACE; + } else + tu->tp.flags |= TP_FLAG_PROFILE; + + ret = uprobe_buffer_enable(); + if (ret < 0) + return ret; WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - tu->flags |= flag; + if (enabled) + return 0; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); - if (ret) - tu->flags &= ~flag; + if (ret) { + if (file) { + list_del(&link->list); + kfree(link); + tu->tp.flags &= ~TP_FLAG_TRACE; + } else + tu->tp.flags &= ~TP_FLAG_PROFILE; + } return ret; } -static void probe_event_disable(struct trace_uprobe *tu, int flag) +static void +probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) { - if (!is_trace_uprobe_enabled(tu)) + if (!trace_probe_is_enabled(&tu->tp)) return; + if (file) { + struct event_file_link *link; + + link = find_event_file_link(&tu->tp, file); + if (!link) + return; + + list_del_rcu(&link->list); + /* synchronize with u{,ret}probe_trace_func */ + synchronize_sched(); + kfree(link); + + if (!list_empty(&tu->tp.files)) + return; + } + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->flags &= ~flag; + tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; + + uprobe_buffer_disable(); } static int uprobe_event_define_fields(struct ftrace_event_call *event_call) @@ -672,12 +961,12 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call) size = SIZEOF_TRACE_ENTRY(false); } /* Set argument names as fields */ - for (i = 0; i < tu->nr_args; i++) { - ret = trace_define_field(event_call, tu->args[i].type->fmttype, - tu->args[i].name, - size + tu->args[i].offset, - tu->args[i].type->size, - tu->args[i].type->is_signed, + for (i = 0; i < tu->tp.nr_args; i++) { + struct probe_arg *parg = &tu->tp.args[i]; + + ret = trace_define_field(event_call, parg->type->fmttype, + parg->name, size + parg->offset, + parg->type->size, parg->type->is_signed, FILTER_OTHER); if (ret) @@ -686,59 +975,6 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -#define LEN_OR_ZERO (len ? len - pos : 0) -static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) -{ - const char *fmt, *arg; - int i; - int pos = 0; - - if (is_ret_probe(tu)) { - fmt = "(%lx <- %lx)"; - arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; - } else { - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; - } - - /* When len=0, we just calculate the needed length */ - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - - for (i = 0; i < tu->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", - tu->args[i].name, tu->args[i].type->fmt); - } - - pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); - - for (i = 0; i < tu->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tu->args[i].name); - } - - return pos; /* return the length of print_fmt */ -} -#undef LEN_OR_ZERO - -static int set_print_fmt(struct trace_uprobe *tu) -{ - char *print_fmt; - int len; - - /* First: called with 0 length to calculate the needed length */ - len = __set_print_fmt(tu, NULL, 0); - print_fmt = kmalloc(len + 1, GFP_KERNEL); - if (!print_fmt) - return -ENOMEM; - - /* Second: actually write the @print_fmt */ - __set_print_fmt(tu, print_fmt, len + 1); - tu->call.print_fmt = print_fmt; - - return 0; -} - #ifdef CONFIG_PERF_EVENTS static bool __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) @@ -828,17 +1064,23 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, return ret; } -static void uprobe_perf_print(struct trace_uprobe *tu, - unsigned long func, struct pt_regs *regs) +static void __uprobe_perf_func(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - struct ftrace_event_call *call = &tu->call; + struct ftrace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; void *data; - int size, rctx, i; + int size, esize; + int rctx; - size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + size = esize + tu->tp.size + dsize; + size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) + return; preempt_disable(); head = this_cpu_ptr(call->perf_events); @@ -858,8 +1100,13 @@ static void uprobe_perf_print(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - for (i = 0; i < tu->nr_args; i++) - call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); + memcpy(data, ucb->buf, tu->tp.size + dsize); + + if (size - esize > tu->tp.size + dsize) { + int len = tu->tp.size + dsize; + + memset(data + len, 0, size - esize - len); + } perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: @@ -867,42 +1114,46 @@ static void uprobe_perf_print(struct trace_uprobe *tu, } /* uprobe profile handler */ -static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - uprobe_perf_print(tu, 0, regs); + __uprobe_perf_func(tu, 0, regs, ucb, dsize); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - uprobe_perf_print(tu, func, regs); + __uprobe_perf_func(tu, func, regs, ucb, dsize); } #endif /* CONFIG_PERF_EVENTS */ -static -int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) +static int +trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, + void *data) { struct trace_uprobe *tu = event->data; + struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE, NULL); + return probe_event_enable(tu, file, NULL); case TRACE_REG_UNREGISTER: - probe_event_disable(tu, TP_FLAG_TRACE); + probe_event_disable(tu, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); + return probe_event_enable(tu, NULL, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: - probe_event_disable(tu, TP_FLAG_PROFILE); + probe_event_disable(tu, NULL); return 0; case TRACE_REG_PERF_OPEN: @@ -921,18 +1172,43 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; int ret = 0; + tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; - if (tu->flags & TP_FLAG_TRACE) - ret |= uprobe_trace_func(tu, regs); + udd.tu = tu; + udd.bp_addr = instruction_pointer(regs); + + current->utask->vaddr = (unsigned long) &udd; #ifdef CONFIG_PERF_EVENTS - if (tu->flags & TP_FLAG_PROFILE) - ret |= uprobe_perf_func(tu, regs); + if ((tu->tp.flags & TP_FLAG_TRACE) == 0 && + !uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; #endif + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + + if (tu->tp.flags & TP_FLAG_TRACE) + ret |= uprobe_trace_func(tu, regs, ucb, dsize); + +#ifdef CONFIG_PERF_EVENTS + if (tu->tp.flags & TP_FLAG_PROFILE) + ret |= uprobe_perf_func(tu, regs, ucb, dsize); +#endif + uprobe_buffer_put(ucb); return ret; } @@ -940,16 +1216,34 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) { struct trace_uprobe *tu; + struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; tu = container_of(con, struct trace_uprobe, consumer); - if (tu->flags & TP_FLAG_TRACE) - uretprobe_trace_func(tu, func, regs); + udd.tu = tu; + udd.bp_addr = func; + + current->utask->vaddr = (unsigned long) &udd; + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + + if (tu->tp.flags & TP_FLAG_TRACE) + uretprobe_trace_func(tu, func, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS - if (tu->flags & TP_FLAG_PROFILE) - uretprobe_perf_func(tu, func, regs); + if (tu->tp.flags & TP_FLAG_PROFILE) + uretprobe_perf_func(tu, func, regs, ucb, dsize); #endif + uprobe_buffer_put(ucb); return 0; } @@ -959,7 +1253,7 @@ static struct trace_event_functions uprobe_funcs = { static int register_uprobe_event(struct trace_uprobe *tu) { - struct ftrace_event_call *call = &tu->call; + struct ftrace_event_call *call = &tu->tp.call; int ret; /* Initialize ftrace_event_call */ @@ -967,7 +1261,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; - if (set_print_fmt(tu) < 0) + if (set_print_fmt(&tu->tp, is_ret_probe(tu)) < 0) return -ENOMEM; ret = register_ftrace_event(&call->event); @@ -994,11 +1288,11 @@ static int unregister_uprobe_event(struct trace_uprobe *tu) int ret; /* tu->event is unregistered in trace_remove_event_call() */ - ret = trace_remove_event_call(&tu->call); + ret = trace_remove_event_call(&tu->tp.call); if (ret) return ret; - kfree(tu->call.print_fmt); - tu->call.print_fmt = NULL; + kfree(tu->tp.call.print_fmt); + tu->tp.call.print_fmt = NULL; return 0; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c9..fb0a38a26555 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -62,14 +62,12 @@ struct tracepoint_entry { struct hlist_node hlist; struct tracepoint_func *funcs; int refcount; /* Number of times armed. 0 if disarmed. */ + int enabled; /* Tracepoint enabled */ char name[0]; }; struct tp_probes { - union { - struct rcu_head rcu; - struct list_head list; - } u; + struct rcu_head rcu; struct tracepoint_func probes[0]; }; @@ -82,7 +80,7 @@ static inline void *allocate_probes(int count) static void rcu_free_old_probes(struct rcu_head *head) { - kfree(container_of(head, struct tp_probes, u.rcu)); + kfree(container_of(head, struct tp_probes, rcu)); } static inline void release_probes(struct tracepoint_func *old) @@ -90,7 +88,7 @@ static inline void release_probes(struct tracepoint_func *old) if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); + call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); } } @@ -237,6 +235,7 @@ static struct tracepoint_entry *add_tracepoint(const char *name) memcpy(&e->name[0], name, name_len); e->funcs = NULL; e->refcount = 0; + e->enabled = 0; hlist_add_head(&e->hlist, head); return e; } @@ -316,6 +315,7 @@ static void tracepoint_update_probe_range(struct tracepoint * const *begin, if (mark_entry) { set_tracepoint(&mark_entry, *iter, !!mark_entry->refcount); + mark_entry->enabled = !!mark_entry->refcount; } else { disable_tracepoint(*iter); } @@ -373,13 +373,26 @@ tracepoint_add_probe(const char *name, void *probe, void *data) * tracepoint_probe_register - Connect a probe to a tracepoint * @name: tracepoint name * @probe: probe handler + * @data: probe private data + * + * Returns: + * - 0 if the probe was successfully registered, and tracepoint + * callsites are currently loaded for that probe, + * - -ENODEV if the probe was successfully registered, but no tracepoint + * callsite is currently loaded for that probe, + * - other negative error value on error. + * + * When tracepoint_probe_register() returns either 0 or -ENODEV, + * parameters @name, @probe, and @data may be used by the tracepoint + * infrastructure until the probe is unregistered. * - * Returns 0 if ok, error value on error. * The probe address must at least be aligned on the architecture pointer size. */ int tracepoint_probe_register(const char *name, void *probe, void *data) { struct tracepoint_func *old; + struct tracepoint_entry *entry; + int ret = 0; mutex_lock(&tracepoints_mutex); old = tracepoint_add_probe(name, probe, data); @@ -388,9 +401,13 @@ int tracepoint_probe_register(const char *name, void *probe, void *data) return PTR_ERR(old); } tracepoint_update_probes(); /* may update entry */ + entry = get_tracepoint(name); + /* Make sure the entry was enabled */ + if (!entry || !entry->enabled) + ret = -ENODEV; mutex_unlock(&tracepoints_mutex); release_probes(old); - return 0; + return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); @@ -415,6 +432,7 @@ tracepoint_remove_probe(const char *name, void *probe, void *data) * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @name: tracepoint name * @probe: probe function pointer + * @data: probe private data * * We do not need to call a synchronize_sched to make sure the probes have * finished running before doing a module unload, because the module unload @@ -438,210 +456,28 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); -static LIST_HEAD(old_probes); -static int need_update; - -static void tracepoint_add_old_probes(void *old) -{ - need_update = 1; - if (old) { - struct tp_probes *tp_probes = container_of(old, - struct tp_probes, probes[0]); - list_add(&tp_probes->u.list, &old_probes); - } -} - -/** - * tracepoint_probe_register_noupdate - register a probe but not connect - * @name: tracepoint name - * @probe: probe handler - * - * caller must call tracepoint_probe_update_all() - */ -int tracepoint_probe_register_noupdate(const char *name, void *probe, - void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - mutex_unlock(&tracepoints_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); - -/** - * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect - * @name: tracepoint name - * @probe: probe function pointer - * - * caller must call tracepoint_probe_update_all() - */ -int tracepoint_probe_unregister_noupdate(const char *name, void *probe, - void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - mutex_unlock(&tracepoints_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); - -/** - * tracepoint_probe_update_all - update tracepoints - */ -void tracepoint_probe_update_all(void) -{ - LIST_HEAD(release_probes); - struct tp_probes *pos, *next; - - mutex_lock(&tracepoints_mutex); - if (!need_update) { - mutex_unlock(&tracepoints_mutex); - return; - } - if (!list_empty(&old_probes)) - list_replace_init(&old_probes, &release_probes); - need_update = 0; - tracepoint_update_probes(); - mutex_unlock(&tracepoints_mutex); - list_for_each_entry_safe(pos, next, &release_probes, u.list) { - list_del(&pos->u.list); - call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); - } -} -EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); - -/** - * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. - * @tracepoint: current tracepoints (in), next tracepoint (out) - * @begin: beginning of the range - * @end: end of the range - * - * Returns whether a next tracepoint has been found (1) or not (0). - * Will return the first tracepoint in the range if the input tracepoint is - * NULL. - */ -static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, - struct tracepoint * const *begin, struct tracepoint * const *end) -{ - if (!*tracepoint && begin != end) { - *tracepoint = begin; - return 1; - } - if (*tracepoint >= begin && *tracepoint < end) - return 1; - return 0; -} #ifdef CONFIG_MODULES -static void tracepoint_get_iter(struct tracepoint_iter *iter) +bool trace_module_has_bad_taint(struct module *mod) { - int found = 0; - struct tp_module *iter_mod; - - /* Core kernel tracepoints */ - if (!iter->module) { - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (found) - goto end; - } - /* Tracepoints in modules */ - mutex_lock(&tracepoints_mutex); - list_for_each_entry(iter_mod, &tracepoint_module_list, list) { - /* - * Sorted module list - */ - if (iter_mod < iter->module) - continue; - else if (iter_mod > iter->module) - iter->tracepoint = NULL; - found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints_ptrs, - iter_mod->tracepoints_ptrs - + iter_mod->num_tracepoints); - if (found) { - iter->module = iter_mod; - break; - } - } - mutex_unlock(&tracepoints_mutex); -end: - if (!found) - tracepoint_iter_reset(iter); + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | + (1 << TAINT_UNSIGNED_MODULE)); } -#else /* CONFIG_MODULES */ -static void tracepoint_get_iter(struct tracepoint_iter *iter) -{ - int found = 0; - /* Core kernel tracepoints */ - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (!found) - tracepoint_iter_reset(iter); -} -#endif /* CONFIG_MODULES */ - -void tracepoint_iter_start(struct tracepoint_iter *iter) -{ - tracepoint_get_iter(iter); -} -EXPORT_SYMBOL_GPL(tracepoint_iter_start); - -void tracepoint_iter_next(struct tracepoint_iter *iter) -{ - iter->tracepoint++; - /* - * iter->tracepoint may be invalid because we blindly incremented it. - * Make sure it is valid by marshalling on the tracepoints, getting the - * tracepoints from following modules if necessary. - */ - tracepoint_get_iter(iter); -} -EXPORT_SYMBOL_GPL(tracepoint_iter_next); - -void tracepoint_iter_stop(struct tracepoint_iter *iter) -{ -} -EXPORT_SYMBOL_GPL(tracepoint_iter_stop); - -void tracepoint_iter_reset(struct tracepoint_iter *iter) -{ -#ifdef CONFIG_MODULES - iter->module = NULL; -#endif /* CONFIG_MODULES */ - iter->tracepoint = NULL; -} -EXPORT_SYMBOL_GPL(tracepoint_iter_reset); - -#ifdef CONFIG_MODULES static int tracepoint_module_coming(struct module *mod) { - struct tp_module *tp_mod, *iter; + struct tp_module *tp_mod; int ret = 0; + if (!mod->num_tracepoints) + return 0; + /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. - * Staging and out-of-tree GPL modules are fine. + * Staging, out-of-tree, and unsigned GPL modules are fine. */ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) + if (trace_module_has_bad_taint(mod)) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); @@ -651,23 +487,7 @@ static int tracepoint_module_coming(struct module *mod) } tp_mod->num_tracepoints = mod->num_tracepoints; tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; - - /* - * tracepoint_module_list is kept sorted by struct module pointer - * address for iteration on tracepoints from a seq_file that can release - * the mutex between calls. - */ - list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { - BUG_ON(iter == tp_mod); /* Should never be in the list twice */ - if (iter < tp_mod) { - /* We belong to the location right after iter. */ - list_add(&tp_mod->list, &iter->list); - goto module_added; - } - } - /* We belong to the beginning of the list */ - list_add(&tp_mod->list, &tracepoint_module_list); -module_added: + list_add_tail(&tp_mod->list, &tracepoint_module_list); tracepoint_update_probe_range(mod->tracepoints_ptrs, mod->tracepoints_ptrs + mod->num_tracepoints); end: @@ -679,6 +499,9 @@ static int tracepoint_module_going(struct module *mod) { struct tp_module *pos; + if (!mod->num_tracepoints) + return 0; + mutex_lock(&tracepoints_mutex); tracepoint_update_probe_range(mod->tracepoints_ptrs, mod->tracepoints_ptrs + mod->num_tracepoints); diff --git a/kernel/up.c b/kernel/up.c index 509403e3fbc6..1760bf3d1463 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,16 +22,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -void __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) +int smp_call_function_single_async(int cpu, struct call_single_data *csd) { unsigned long flags; local_irq_save(flags); csd->func(csd->info); local_irq_restore(flags); + return 0; } -EXPORT_SYMBOL(__smp_call_function_single); +EXPORT_SYMBOL(smp_call_function_single_async); int on_each_cpu(smp_call_func_t func, void *info, int wait) { diff --git a/kernel/user.c b/kernel/user.c index c006131beb77..294fc6a94168 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -222,5 +222,4 @@ static int __init uid_cache_init(void) return 0; } - -module_init(uid_cache_init); +subsys_initcall(uid_cache_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 240fb62cf394..0d8f6023fd8d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test - * for and handle handle INVALID_UID being returned. INVALID_UID + * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) @@ -902,4 +902,4 @@ static __init int user_namespaces_init(void) user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); return 0; } -module_init(user_namespaces_init); +subsys_initcall(user_namespaces_init); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4431610f049a..e90089fd78e0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -158,14 +158,14 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - if (watchdog_user_enabled) { - unsigned cpu; - - for_each_present_cpu(cpu) { - if (per_cpu(watchdog_nmi_touch, cpu) != true) - per_cpu(watchdog_nmi_touch, cpu) = true; - } - } + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + __raw_get_cpu_var(watchdog_nmi_touch) = true; touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); @@ -505,7 +505,6 @@ static void restart_watchdog_hrtimer(void *info) static void update_timers(int cpu) { - struct call_single_data data = {.func = restart_watchdog_hrtimer}; /* * Make sure that perf event counter will adopt to a new * sampling period. Updating the sampling period directly would @@ -515,7 +514,7 @@ static void update_timers(int cpu) * might be late already so we have to restart the timer as well. */ watchdog_nmi_disable(cpu); - __smp_call_function_single(cpu, &data, 1); + smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); watchdog_nmi_enable(cpu); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b010eac595d2..0ee63af30bd1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -516,6 +516,13 @@ void destroy_work_on_stack(struct work_struct *work) } EXPORT_SYMBOL_GPL(destroy_work_on_stack); +void destroy_delayed_work_on_stack(struct delayed_work *work) +{ + destroy_timer_on_stack(&work->timer); + debug_object_free(&work->work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); + #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } @@ -1851,6 +1858,12 @@ static void destroy_worker(struct worker *worker) if (worker->flags & WORKER_IDLE) pool->nr_idle--; + /* + * Once WORKER_DIE is set, the kworker may destroy itself at any + * point. Pin to ensure the task stays until we're done with it. + */ + get_task_struct(worker->task); + list_del_init(&worker->entry); worker->flags |= WORKER_DIE; @@ -1859,6 +1872,7 @@ static void destroy_worker(struct worker *worker) spin_unlock_irq(&pool->lock); kthread_stop(worker->task); + put_task_struct(worker->task); kfree(worker); spin_lock_irq(&pool->lock); @@ -3218,7 +3232,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, return -ENOMEM; if (sscanf(buf, "%d", &attrs->nice) == 1 && - attrs->nice >= -20 && attrs->nice <= 19) + attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) ret = apply_workqueue_attrs(wq, attrs); else ret = -EINVAL; @@ -4789,6 +4803,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb, /* wait for per-cpu unbinding to finish */ flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); break; } return NOTIFY_OK; @@ -4828,6 +4843,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); + destroy_work_on_stack(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); |