32 files changed, 746 insertions, 534 deletions
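The series below replaces kernfs' union kernfs_node_id (separate 32-bit ino and generation fields) with a single u64 id: where ino_t is 64 bits wide, the id simply is the inode number, while on 32-bit ino_t setups the low 32 bits carry the ino and the high 32 bits carry the generation. The following is a minimal userspace sketch of that encoding, mirroring the kernfs_id_ino()/kernfs_id_gen() helpers added to include/linux/kernfs.h below; the standalone helper names and types here are illustrative, and the "gen == 0 matches any generation" rule follows kernfs_find_and_get_node_by_id():

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* pack the low 32 bits of an ino with a 32-bit generation, as on 32-bit ino_t setups */
static uint64_t kernfs_pack_id(uint32_t ino, uint32_t gen)
{
	return (uint64_t)gen << 32 | ino;
}

static uint32_t id_ino(uint64_t id) { return (uint32_t)id; }	/* low 32 bits */
static uint32_t id_gen(uint64_t id) { return id >> 32; }	/* high 32 bits */

static bool id_matches(uint64_t lookup, uint64_t node)
{
	if (id_ino(lookup) != id_ino(node))
		return false;
	/* gen == 0 in the lookup id matches any generation */
	return !id_gen(lookup) || id_gen(lookup) == id_gen(node);
}

int main(void)
{
	uint64_t node = kernfs_pack_id(42, 7);

	printf("ino=%u gen=%u\n", id_ino(node), id_gen(node));
	printf("exact=%d any-gen=%d stale=%d\n",
	       id_matches(kernfs_pack_id(42, 7), node),	/* 1 */
	       id_matches(kernfs_pack_id(42, 0), node),	/* 1 */
	       id_matches(kernfs_pack_id(42, 6), node));	/* 0 */
	return 0;
}

The same bit layout is why blk_log_action() can expose the id as a "LOW32,HIGH32" pair and why kernfs_fh_to_dentry() below can rebuild the id from a generic FILEID_INO32_GEN file handle.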
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 5361ebec3361..007ba86aef78 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1334,7 +1334,7 @@ PAGE_SIZE multiple when read back. pgdeactivate - Amount of pages moved to the inactive LRU lis + Amount of pages moved to the inactive LRU list pglazyfree diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 6ebae6bbe6a5..b2d9f79c4a7c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -508,10 +508,6 @@ void kernfs_put(struct kernfs_node *kn) struct kernfs_node *parent; struct kernfs_root *root; - /* - * kernfs_node is freed with ->count 0, kernfs_find_and_get_node_by_ino - * depends on this to filter reused stale node - */ if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); @@ -536,7 +532,7 @@ void kernfs_put(struct kernfs_node *kn) kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -621,8 +617,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, unsigned flags) { struct kernfs_node *kn; - u32 gen; - int cursor; + u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -635,23 +630,19 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); - cursor = idr_get_cursor(&root->ino_idr); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); - if (ret >= 0 && ret < cursor) - root->next_generation++; - gen = root->next_generation; + if (ret >= 0 && ret < root->last_id_lowbits) + root->id_highbits++; + id_highbits = root->id_highbits; + root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; - kn->id.ino = ret; - kn->id.generation = gen; - /* - * set ino first. This RELEASE is paired with atomic_inc_not_zero in - * kernfs_find_and_get_node_by_ino - */ - atomic_set_release(&kn->count, 1); + kn->id = (u64)id_highbits << 32 | ret; + + atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); @@ -680,7 +671,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - idr_remove(&root->ino_idr, kn->id.ino); + idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -705,50 +696,52 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, } /* - * kernfs_find_and_get_node_by_ino - get kernfs_node from inode number + * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root - * @ino: inode number + * @id: the target node id + * + * @id's lower 32bits encode ino and upper gen. If the gen portion is + * zero, all generations are matched. * * RETURNS: * NULL on failure. 
Return a kernfs node with reference counter incremented */ -struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, - unsigned int ino) +struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, + u64 id) { struct kernfs_node *kn; + ino_t ino = kernfs_id_ino(id); + u32 gen = kernfs_id_gen(id); - rcu_read_lock(); - kn = idr_find(&root->ino_idr, ino); + spin_lock(&kernfs_idr_lock); + + kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) - goto out; + goto err_unlock; - /* - * Since kernfs_node is freed in RCU, it's possible an old node for ino - * is freed, but reused before RCU grace period. But a freed node (see - * kernfs_put) or an incompletedly initialized node (see - * __kernfs_new_node) should have 'count' 0. We can use this fact to - * filter out such node. - */ - if (!atomic_inc_not_zero(&kn->count)) { - kn = NULL; - goto out; + if (sizeof(ino_t) >= sizeof(u64)) { + /* we looked up with the low 32bits, compare the whole */ + if (kernfs_ino(kn) != ino) + goto err_unlock; + } else { + /* 0 matches all generations */ + if (unlikely(gen && kernfs_gen(kn) != gen)) + goto err_unlock; } /* - * The node could be a new node or a reused node. If it's a new node, - * we are ok. If it's reused because of RCU (because of - * SLAB_TYPESAFE_BY_RCU), the __kernfs_new_node always sets its 'ino' - * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, - * hence we can use 'ino' to filter stale node. + * ACTIVATED is protected with kernfs_mutex but it was clear when + * @kn was added to idr and we just want to see it set. No need to + * grab kernfs_mutex. */ - if (kn->id.ino != ino) - goto out; - rcu_read_unlock(); + if (unlikely(!(kn->flags & KERNFS_ACTIVATED) || + !atomic_inc_not_zero(&kn->count))) + goto err_unlock; + spin_unlock(&kernfs_idr_lock); return kn; -out: - rcu_read_unlock(); - kernfs_put(kn); +err_unlock: + spin_unlock(&kernfs_idr_lock); return NULL; } @@ -962,7 +955,17 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); - root->next_generation = 1; + + /* + * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. + * High bits generation. The starting value for both ino and + * generation is 1. Initialize upper 32bit allocation + * accordingly. + */ + if (sizeof(ino_t) >= sizeof(u64)) + root->id_highbits = 0; + else + root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, @@ -1678,7 +1681,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->id.ino; + ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e8c792b49616..34366db3620d 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -892,7 +892,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually.
*/ - inode = ilookup(info->sb, kn->id.ino); + inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; @@ -901,7 +901,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->id.ino); + p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, &name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index f3eaa8869f42..eac277c63d42 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -201,7 +201,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; - inode->i_generation = kn->id.generation; + inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -247,7 +247,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->id.ino); + inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 02ce570a9a3c..2f3c51d55261 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -109,8 +109,6 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); -struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, - unsigned int ino); /* * file.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 6c12fac2c287..4d31503abaee 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -53,63 +53,85 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; -/* - * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode - * number and generation - */ -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id) +static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, + struct inode *parent) { - struct kernfs_node *kn; + struct kernfs_node *kn = inode->i_private; - kn = kernfs_find_and_get_node_by_ino(root, id->ino); - if (!kn) - return NULL; - if (kn->id.generation != id->generation) { - kernfs_put(kn); - return NULL; + if (*max_len < 2) { + *max_len = 2; + return FILEID_INVALID; } - return kn; + + *max_len = 2; + *(u64 *)fh = kn->id; + return FILEID_KERNFS; } -static struct inode *kernfs_fh_get_inode(struct super_block *sb, - u64 ino, u32 generation) +static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type, bool get_parent) { struct kernfs_super_info *info = kernfs_info(sb); - struct inode *inode; struct kernfs_node *kn; + struct inode *inode; + u64 id; - if (ino == 0) - return ERR_PTR(-ESTALE); + if (fh_len < 2) + return NULL; + + switch (fh_type) { + case FILEID_KERNFS: + id = *(u64 *)fid; + break; + case FILEID_INO32_GEN: + case FILEID_INO32_GEN_PARENT: + /* + * blk_log_action() exposes "LOW32,HIGH32" pair without + * type and userland can call us with generic fid + * constructed from them. Combine it back to ID. See + * blk_log_action(). 
+ */ + id = ((u64)fid->i32.gen << 32) | fid->i32.ino; + break; + default: + return NULL; + } - kn = kernfs_find_and_get_node_by_ino(info->root, ino); + kn = kernfs_find_and_get_node_by_id(info->root, id); if (!kn) return ERR_PTR(-ESTALE); + + if (get_parent) { + struct kernfs_node *parent; + + parent = kernfs_get_parent(kn); + kernfs_put(kn); + kn = parent; + if (!kn) + return ERR_PTR(-ESTALE); + } + inode = kernfs_get_inode(sb, kn); kernfs_put(kn); if (!inode) return ERR_PTR(-ESTALE); - if (generation && inode->i_generation != generation) { - /* we didn't find the right inode.. */ - iput(inode); - return ERR_PTR(-ESTALE); - } - return inode; + return d_obtain_alias(inode); } -static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - kernfs_fh_get_inode); + return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, false); } -static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *kernfs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, + int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - kernfs_fh_get_inode); + return __kernfs_fh_to_dentry(sb, fid, fh_len, fh_type, true); } static struct dentry *kernfs_get_parent_dentry(struct dentry *child) @@ -120,6 +142,7 @@ static struct dentry *kernfs_get_parent_dentry(struct dentry *child) } static const struct export_operations kernfs_export_ops = { + .encode_fh = kernfs_encode_fh, .fh_to_dentry = kernfs_fh_to_dentry, .fh_to_parent = kernfs_fh_to_parent, .get_parent = kernfs_get_parent_dentry, @@ -363,18 +386,9 @@ void kernfs_kill_sb(struct super_block *sb) void __init kernfs_init(void) { - - /* - * the slab is freed in RCU context, so kernfs_find_and_get_node_by_ino - * can access the slab lock free. This could introduce stale nodes, - * please see how kernfs_find_and_get_node_by_ino filters out stale - * nodes. - */ kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), - 0, - SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, - NULL); + 0, SLAB_PANIC, NULL); /* Creates slab cache for kernfs inode attributes */ kernfs_iattrs_cache = kmem_cache_create("kernfs_iattrs_cache", diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 430e219e3aba..63097cb243cb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -355,16 +355,6 @@ struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ /* - * idr allocated in-hierarchy ID. - * - * ID 0 is not used, the ID of the root cgroup is always 1, and a - * new cgroup will be assigned with a smallest available ID. - * - * Allocating/Removing ID must be protected by cgroup_mutex. - */ - int id; - - /* * The depth this cgroup is at. The root is at depth zero and each * step down the hierarchy increments the level. 
This along with * ancestor_ids[] can determine whether a given cgroup is a @@ -458,7 +448,7 @@ struct cgroup { struct list_head rstat_css_list; /* cgroup basic resource statistics */ - struct cgroup_base_stat pending_bstat; /* pending from children */ + struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; struct prev_cputime prev_cputime; /* for printing out cputime */ @@ -488,7 +478,7 @@ struct cgroup { struct cgroup_freezer_state freezer; /* ids of the ancestors at each level including self */ - int ancestor_ids[]; + u64 ancestor_ids[]; }; /* @@ -509,7 +499,7 @@ struct cgroup_root { struct cgroup cgrp; /* for cgrp->ancestor_ids[0] */ - int cgrp_ancestor_id_storage; + u64 cgrp_ancestor_id_storage; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -520,9 +510,6 @@ struct cgroup_root { /* Hierarchy-specific flags */ unsigned int flags; - /* IDs for cgroups in this hierarchy */ - struct idr cgroup_idr; - /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3ba3e6da13a6..d7ddebd0cdec 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -150,7 +150,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); -void cgroup_enable_task_cg_lists(void); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); @@ -305,6 +304,11 @@ void css_task_iter_end(struct css_task_iter *it); * Inline functions. */ +static inline u64 cgroup_id(struct cgroup *cgrp) +{ + return cgrp->kn->id; +} + /** * css_get - obtain a reference on the specified css * @css: target css @@ -566,7 +570,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; - return cgrp->ancestor_ids[ancestor->level] == ancestor->id; + return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); } /** @@ -617,7 +621,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->id.ino; + return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ @@ -688,18 +692,13 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return &cgrp->kn->id; -} - -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen); +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; +static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, @@ -719,10 +718,6 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} -static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) -{ - return NULL; -} static inline struct cgroup *cgroup_parent(struct cgroup 
*cgrp) { @@ -740,8 +735,8 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, return true; } -static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) {} +static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) +{} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cf6571fc9c01..d896b8657085 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -105,6 +105,11 @@ enum fid_type { FILEID_LUSTRE = 0x97, /* + * 64 bit unique kernfs id + */ + FILEID_KERNFS = 0xfe, + + /* * Filesystems must not use 0xff file ID. */ FILEID_INVALID = 0xff, diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 936b61bd504e..dded2e5a9f42 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -104,21 +104,6 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; -/* represent a kernfs node */ -union kernfs_node_id { - struct { - /* - * blktrace will export this struct as a simplified 'struct - * fid' (which is a big data struction), so userspace can use - * it to find kernfs node. The layout must match the first two - * fields of 'struct fid' exactly. - */ - u32 ino; - u32 generation; - }; - u64 id; -}; - /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are @@ -155,7 +140,12 @@ struct kernfs_node { void *priv; - union kernfs_node_id id; + /* + * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, + * the low 32bits are ino and upper generation. + */ + u64 id; + unsigned short flags; umode_t mode; struct kernfs_iattrs *iattr; @@ -187,7 +177,8 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; - u32 next_generation; + u32 last_id_lowbits; + u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ @@ -291,6 +282,34 @@ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) return kn->flags & KERNFS_TYPE_MASK; } +static inline ino_t kernfs_id_ino(u64 id) +{ + /* id is ino if ino_t is 64bit; otherwise, low 32bits */ + if (sizeof(ino_t) >= sizeof(u64)) + return id; + else + return (u32)id; +} + +static inline u32 kernfs_id_gen(u64 id) +{ + /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ + if (sizeof(ino_t) >= sizeof(u64)) + return 1; + else + return id >> 32; +} + +static inline ino_t kernfs_ino(struct kernfs_node *kn) +{ + return kernfs_id_ino(kn->id); +} + +static inline u32 kernfs_gen(struct kernfs_node *kn) +{ + return kernfs_id_gen(kn->id); +} + /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty @@ -382,8 +401,8 @@ void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); -struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, - const union kernfs_node_id *id); +struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, + u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index cfc9441ef074..dec7522b6ce1 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -26,7 +26,7 @@ static inline u32 task_netprioidx(struct task_struct *p) rcu_read_lock(); css = task_css(p,
net_prio_cgrp_id); - idx = css->cgroup->id; + idx = css->id; rcu_read_unlock(); return idx; } diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index a566cc521476..7f42a3de59e6 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -66,7 +66,7 @@ DECLARE_EVENT_CLASS(cgroup, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); ), @@ -135,7 +135,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate, TP_fast_assign( __entry->dst_root = dst_cgrp->root->hierarchy_id; - __entry->dst_id = dst_cgrp->id; + __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; __assign_str(dst_path, path); __entry->pid = task->pid; @@ -179,7 +179,7 @@ DECLARE_EVENT_CLASS(cgroup_event, TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; - __entry->id = cgrp->id; + __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); __entry->val = val; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index c2ce6480b4b1..ef50be4e5e6c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -61,7 +61,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(pgoff_t, index) ), @@ -75,7 +75,7 @@ DECLARE_EVENT_CLASS(writeback_page_template, TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, __entry->index ) ); @@ -102,7 +102,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), @@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) @@ -150,28 +150,28 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK -static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) +static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return wb->memcg_css->cgroup->kn->id.ino; + return cgroup_ino(wb->memcg_css->cgroup); } -static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) +static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else - return -1U; + return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ -static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) +static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return -1U; + return 1; } -static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) +static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { - return -1U; + return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ @@ -187,8 +187,8 @@ TRACE_EVENT(inode_foreign_history, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) - __field(unsigned int, cgroup_ino) + __field(ino_t, ino) + __field(ino_t, cgroup_ino) __field(unsigned int, history) ), @@ -199,10 +199,10 @@ TRACE_EVENT(inode_foreign_history, __entry->history = history; ), - TP_printk("bdi %s: ino=%lu 
cgroup_ino=%u history=0x%x", + TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, - __entry->ino, - __entry->cgroup_ino, + (unsigned long)__entry->ino, + (unsigned long)__entry->cgroup_ino, __entry->history ) ); @@ -216,9 +216,9 @@ TRACE_EVENT(inode_switch_wbs, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) - __field(unsigned int, old_cgroup_ino) - __field(unsigned int, new_cgroup_ino) + __field(ino_t, ino) + __field(ino_t, old_cgroup_ino) + __field(ino_t, new_cgroup_ino) ), TP_fast_assign( @@ -228,11 +228,11 @@ TRACE_EVENT(inode_switch_wbs, __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), - TP_printk("bdi %s: ino=%lu old_cgroup_ino=%u new_cgroup_ino=%u", + TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, - __entry->ino, - __entry->old_cgroup_ino, - __entry->new_cgroup_ino + (unsigned long)__entry->ino, + (unsigned long)__entry->old_cgroup_ino, + (unsigned long)__entry->new_cgroup_ino ) ); @@ -245,10 +245,10 @@ TRACE_EVENT(track_foreign_dirty, TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned int, memcg_id) - __field(unsigned int, cgroup_ino) - __field(unsigned int, page_cgroup_ino) + __field(ino_t, cgroup_ino) + __field(ino_t, page_cgroup_ino) ), TP_fast_assign( @@ -260,16 +260,16 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; + __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); ), - TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%u page_cgroup_ino=%u", + TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, - __entry->ino, + (unsigned long)__entry->ino, __entry->memcg_id, - __entry->cgroup_ino, - __entry->page_cgroup_ino + (unsigned long)__entry->cgroup_ino, + (unsigned long)__entry->page_cgroup_ino ) ); @@ -282,7 +282,7 @@ TRACE_EVENT(flush_foreign, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), @@ -294,9 +294,9 @@ TRACE_EVENT(flush_foreign, __entry->frn_memcg_id = frn_memcg_id; ), - TP_printk("bdi %s: cgroup_ino=%u frn_bdi_id=%u frn_memcg_id=%u", + TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, - __entry->cgroup_ino, + (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) @@ -311,9 +311,9 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_STRUCT__entry ( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(int, sync_mode) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -324,11 +324,11 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), - TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%u", + TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, __entry->sync_mode, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -358,7 +358,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, range_cyclic) __field(int, for_background) __field(int, reason) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( 
strscpy_pad(__entry->name, @@ -374,7 +374,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%u", + "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, @@ -383,7 +383,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -413,15 +413,15 @@ DECLARE_EVENT_CLASS(writeback_class, TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, dev_name(wb->bdi->dev), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), - TP_printk("bdi %s: cgroup_ino=%u", + TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ @@ -459,7 +459,7 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -478,7 +478,7 @@ DECLARE_EVENT_CLASS(wbc_class, TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " - "start=0x%lx end=0x%lx cgroup_ino=%u", + "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, @@ -489,7 +489,7 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->range_cyclic, __entry->range_start, __entry->range_end, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ) @@ -510,7 +510,7 @@ TRACE_EVENT(writeback_queue_io, __field(long, age) __field(int, moved) __field(int, reason) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; @@ -522,13 +522,13 @@ TRACE_EVENT(writeback_queue_io, __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%u", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -596,7 +596,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -614,7 +614,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " - "balanced_dirty_ratelimit=%lu cgroup_ino=%u", + "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ @@ -622,7 +622,7 @@ TRACE_EVENT(bdi_dirty_ratelimit, __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ - __entry->cgroup_ino + 
(unsigned long)__entry->cgroup_ino ) ); @@ -660,7 +660,7 @@ TRACE_EVENT(balance_dirty_pages, __field( long, pause) __field(unsigned long, period) __field( long, think) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -692,7 +692,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%u", + "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, @@ -707,7 +707,7 @@ TRACE_EVENT(balance_dirty_pages, __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -718,10 +718,10 @@ TRACE_EVENT(writeback_sb_inodes_requeue, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -733,13 +733,13 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%u", + TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -789,13 +789,13 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_STRUCT__entry( __array(char, name, 32) - __field(unsigned long, ino) + __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) - __field(unsigned int, cgroup_ino) + __field(ino_t, cgroup_ino) ), TP_fast_assign( @@ -811,16 +811,16 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " - "index=%lu to_write=%ld wrote=%lu cgroup_ino=%u", + "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, - __entry->ino, + (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, - __entry->cgroup_ino + (unsigned long)__entry->cgroup_ino ) ); @@ -845,7 +845,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_STRUCT__entry( __field( dev_t, dev ) - __field(unsigned long, ino ) + __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) @@ -861,7 +861,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, __entry->dirtied_when, + (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5e28718928ca..cada974c9f4e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 +317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id.id; + return cgroup_id(cgrp); } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c 
b/kernel/bpf/local_storage.c index addd6fdceec8..2ba750725cb2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id.id; + storage->key.cgroup_inode_id = cgroup_id(cgroup); map = storage->map; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 809e34a3c017..90d1710fef6c 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -231,9 +231,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7f83f4121d8d..09f3a413f6f8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -495,12 +495,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; + bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup); + task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -522,7 +523,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index ef4242e5d4bc..735af8f15f95 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -899,8 +899,7 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_exit() changing the css_set to - * init_css_set and dropping the old one. + * against cgroup_exit()/cgroup_free() dropping the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -1309,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - if (root) { - idr_destroy(&root->cgroup_idr); - kfree(root); - } + kfree(root); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1374,6 +1370,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) cset = current->nsproxy->cgroup_ns->root_cset; if (cset == &init_css_set) { res = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; @@ -1430,9 +1428,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. + * No need to lock the task - since we hold css_set_lock the + * task can't change groups. 
*/ return cset_cgroup_from_root(task_css_set(task), root); } @@ -1883,65 +1880,6 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first mount. - */ -static bool use_task_css_set_links __read_mostly; - -void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - - do_each_thread(g, p) { - WARN_ON_ONCE(!list_empty(&p->cg_list) || - task_css_set(p) != &init_css_set); - - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - * Do it while holding siglock so that we don't end up - * racing against cgroup_exit(). - * - * Interrupts were already disabled while acquiring - * the css_set_lock, so we do not need to disable it - * again when acquiring the sighand->siglock here. - */ - spin_lock(&p->sighand->siglock); - if (!(p->flags & PF_EXITING)) { - struct css_set *cset = task_css_set(p); - - if (!css_set_populated(cset)) - css_set_update_populated(cset, true); - list_add_tail(&p->cg_list, &cset->tasks); - get_css_set(cset); - cset->nr_tasks++; - } - spin_unlock(&p->sighand->siglock); - } while_each_thread(g, p); -out_unlock: - spin_unlock_irq(&css_set_lock); - read_unlock(&tasklist_lock); -} - static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -1976,7 +1914,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); - idr_init(&root->cgroup_idr); root->flags = ctx->flags; if (ctx->release_agent) @@ -1997,12 +1934,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); - if (ret < 0) - goto out; - root_cgrp->id = ret; - root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -2035,6 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; + WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -2188,13 +2121,6 @@ static int cgroup_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. 
- */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; @@ -2372,9 +2298,8 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (task->flags & PF_EXITING) return; - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - return; + /* cgroup_threadgroup_rwsem protects racing against forks */ + WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) @@ -2825,7 +2750,8 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; @@ -2834,7 +2760,21 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - percpu_down_write(&cgroup_threadgroup_rwsem); + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write + * callers by cgroup_mutex. + * Therefore, we can skip the global lock. + */ + lockdep_assert_held(&cgroup_mutex); + if (pid || threadgroup) { + percpu_down_write(&cgroup_threadgroup_rwsem); + *locked = true; + } else { + *locked = false; + } rcu_read_lock(); if (pid) { @@ -2865,13 +2805,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) goto out_unlock_rcu; out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + if (*locked) { + percpu_up_write(&cgroup_threadgroup_rwsem); + *locked = false; + } out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; @@ -2880,7 +2823,8 @@ void cgroup_procs_write_finish(struct task_struct *task) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - percpu_up_write(&cgroup_threadgroup_rwsem); + if (locked) + percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -3601,22 +3545,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? 
&psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4568,9 +4512,6 @@ repeat: void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { - /* no one should try to iterate before mounting cgroups */ - WARN_ON_ONCE(!use_task_css_set_links); - memset(it, 0, sizeof(*it)); spin_lock_irq(&css_set_lock); @@ -4755,12 +4696,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, true); + task = cgroup_procs_write_start(buf, true, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4778,7 +4720,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, true); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -4796,6 +4738,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; buf = strstrip(buf); @@ -4803,7 +4746,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, false); + task = cgroup_procs_write_start(buf, false, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4827,7 +4770,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -5037,9 +4980,6 @@ static void css_release_work_fn(struct work_struct *work) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - @@ -5204,10 +5144,12 @@ err_free_css: * it isn't associated with its kernfs_node and doesn't have the control * mask applied. */ -static struct cgroup *cgroup_create(struct cgroup *parent) +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, + umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; + struct kernfs_node *kn; int level = parent->level + 1; int ret; @@ -5227,15 +5169,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_cancel_ref; } - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. 
- */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); - if (cgrp->id < 0) { - ret = -ENOMEM; + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); goto out_stat_exit; } + cgrp->kn = kn; init_cgroup_housekeeping(cgrp); @@ -5245,7 +5185,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ret = psi_cgroup_alloc(cgrp); if (ret) - goto out_idr_free; + goto out_kernfs_remove; ret = cgroup_bpf_inherit(cgrp); if (ret) @@ -5269,7 +5209,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5299,12 +5239,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgroup_get_live(parent); /* - * @cgrp is now fully operational. If something fails after this - * point, it'll be released via the normal destruction path. - */ - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - - /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. */ @@ -5317,8 +5251,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) out_psi_free: psi_cgroup_free(cgrp); -out_idr_free: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_kernfs_remove: + kernfs_remove(cgrp->kn); out_stat_exit: if (cgroup_on_dfl(parent)) cgroup_rstat_exit(cgrp); @@ -5355,7 +5289,6 @@ fail: int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; - struct kernfs_node *kn; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ @@ -5371,27 +5304,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) goto out_unlock; } - cgrp = cgroup_create(parent); + cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_destroy; - } - cgrp->kn = kn; - /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. 
*/ - kernfs_get(kn); + kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(kn); + ret = cgroup_kn_set_ugid(cgrp->kn); if (ret) goto out_destroy; @@ -5406,7 +5331,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ - kernfs_activate(kn); + kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; @@ -5836,12 +5761,11 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; - kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); @@ -6002,62 +5926,38 @@ void cgroup_cancel_fork(struct task_struct *child) void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; + struct css_set *cset; int i; + spin_lock_irq(&css_set_lock); + + WARN_ON_ONCE(!list_empty(&child->cg_list)); + cset = task_css_set(current); /* current is @child's parent */ + get_css_set(cset); + cset->nr_tasks++; + css_set_move_task(child, NULL, cset, false); + /* - * This may race against cgroup_enable_task_cg_lists(). As that - * function sets use_task_css_set_links before grabbing - * tasklist_lock and we just went through tasklist_lock to add - * @child, it's guaranteed that either we see the set - * use_task_css_set_links or cgroup_enable_task_cg_lists() sees - * @child during its iteration. - * - * If we won the race, @child is associated with %current's - * css_set. Grabbing css_set_lock guarantees both that the - * association is stable, and, on completion of the parent's - * migration, @child is visible in the source of migration or - * already in the destination cgroup. This guarantee is necessary - * when implementing operations which need to migrate all tasks of - * a cgroup to another. - * - * Note that if we lose to cgroup_enable_task_cg_lists(), @child - * will remain in init_css_set. This is safe because all tasks are - * in the init_css_set before cg_links is enabled and there's no - * operation which transfers all tasks out of init_css_set. + * If the cgroup has to be frozen, the new task has too. Let's set + * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the + * frozen state. */ - if (use_task_css_set_links) { - struct css_set *cset; - - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - if (list_empty(&child->cg_list)) { - get_css_set(cset); - cset->nr_tasks++; - css_set_move_task(child, NULL, cset, false); - } + if (unlikely(cgroup_task_freeze(child))) { + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); /* - * If the cgroup has to be frozen, the new task has too. - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get - * the task into the frozen state. + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later from + * do_freezer_trap(). So we avoid cgroup's transient switch + * from the frozen state and back. 
*/ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); - - /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later - * from do_freezer_trap(). So we avoid cgroup's - * transient switch from the frozen state and back. - */ - } - - spin_unlock_irq(&css_set_lock); } + spin_unlock_irq(&css_set_lock); + /* * Call ss->fork(). This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() @@ -6072,20 +5972,8 @@ void cgroup_post_fork(struct task_struct *child) * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * - * Description: Detach cgroup from @tsk and release it. + * Description: Detach cgroup from @tsk. * - * Note that cgroups marked notify_on_release force every task in - * them to take the global cgroup_mutex mutex when exiting. - * This could impact scaling on very large systems. Be reluctant to - * use notify_on_release cgroups where very high task exit scaling - * is required on large systems. - * - * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We - * call cgroup_exit() while the task is still competent to handle - * notify_on_release(), then leave the task attached to the root cgroup in - * each hierarchy for the remainder of its exit. No need to bother with - * init_css_set refcnting. init_css_set never goes away and we can't race - * with migration path - PF_EXITING is visible to migration path. */ void cgroup_exit(struct task_struct *tsk) { @@ -6093,26 +5981,19 @@ void cgroup_exit(struct task_struct *tsk) struct css_set *cset; int i; - /* - * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check css_set and cg_list without synchronization. 
- */ - cset = task_css_set(tsk); + spin_lock_irq(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - spin_lock_irq(&css_set_lock); - css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); - cset->nr_tasks--; + WARN_ON_ONCE(list_empty(&tsk->cg_list)); + cset = task_css_set(tsk); + css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); + cset->nr_tasks--; - WARN_ON_ONCE(cgroup_task_frozen(tsk)); - if (unlikely(cgroup_task_freeze(tsk))) - cgroup_update_frozen(task_dfl_cgroup(tsk)); + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); - } else { - get_css_set(cset); - } + spin_unlock_irq(&css_set_lock); /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { @@ -6129,12 +6010,10 @@ void cgroup_release(struct task_struct *task) ss->release(task); } while_each_subsys_mask(); - if (use_task_css_set_links) { - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); - } + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); } void cgroup_free(struct task_struct *task) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c87ee6412b36..58f5073acff7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -929,8 +929,6 @@ static void rebuild_root_domains(void) lockdep_assert_cpus_held(); lockdep_assert_held(&sched_domains_mutex); - cgroup_enable_task_cg_lists(); - rcu_read_lock(); /* diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 8cf010680678..3984dd6b8ddb 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -231,6 +231,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task, return; /* + * It's not necessary to do changes if both of the src and dst cgroups + * are not freezing and task is not frozen. + */ + if (!test_bit(CGRP_FREEZE, &src->flags) && + !test_bit(CGRP_FREEZE, &dst->flags) && + !task->frozen) + return; + + /* * Adjust counters of freezing and frozen tasks. * Note, that if the task is frozen, but the destination cgroup is not * frozen, we bump both counters to keep them balanced. diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 8e513a573fe9..138059eb730d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -45,7 +45,7 @@ struct pids_cgroup { * %PIDS_MAX = (%PID_MAX_LIMIT + 1). */ atomic64_t counter; - int64_t limit; + atomic64_t limit; /* Handle for "pids.events" */ struct cgroup_file events_file; @@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->limit, PIDS_MAX); atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); + int64_t limit = atomic64_read(&p->limit); /* * Since new is capped to the maximum number of pid_t, if * p->limit is %PIDS_MAX then we know that this test will never * fail. 
*/ - if (new > p->limit) + if (new > limit) goto revert; } @@ -277,7 +278,7 @@ set_limit: * Limit updates don't need to be mutex'd, since it isn't * critical that any racing fork()s follow the new limit. */ - pids->limit = limit; + atomic64_set(&pids->limit, limit); return nbytes; } @@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct pids_cgroup *pids = css_pids(css); - int64_t limit = pids->limit; + int64_t limit = atomic64_read(&pids->limit); if (limit >= PIDS_MAX) seq_printf(sf, "%s\n", PIDS_MAX_STR); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ca19b4c8acf5..b48b22d4deb6 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -304,44 +304,48 @@ void __init cgroup_rstat_boot(void) * Functions for cgroup basic resource statistics implemented on top of * rstat. */ -static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, - struct cgroup_base_stat *src_bstat) +static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; } +static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) +{ + dst_bstat->cputime.utime -= src_bstat->cputime.utime; + dst_bstat->cputime.stime -= src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; +} + static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; - struct task_cputime cputime; - struct cgroup_base_stat delta; + struct cgroup_base_stat cur, delta; unsigned seq; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cputime = rstatc->bstat.cputime; + cur.cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* calculate the delta to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; - - /* transfer the pending stat into delta */ - cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); - memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_base_stat_accumulate(&cgrp->bstat, &delta); - if (parent) - cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); + /* propagate percpu delta to global */ + delta = cur; + cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_add(&cgrp->bstat, &delta); + cgroup_base_stat_add(&rstatc->last_bstat, &delta); + + /* propagate global delta to parent */ + if (parent) { + delta = cgrp->bstat; + cgroup_base_stat_sub(&delta, &cgrp->last_bstat); + cgroup_base_stat_add(&parent->bstat, &delta); + cgroup_base_stat_add(&cgrp->last_bstat, &delta); + } } static struct cgroup_rstat_cpu * diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2d6e93ab0478..475e29498bca 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -64,8 +64,7 @@ static void blk_unregister_tracepoints(void); * Send 
out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len, - union kernfs_node_id *cgid) + const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -73,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; @@ -100,8 +99,8 @@ record_it: t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; - if (cgid) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) @@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm), NULL); + sizeof(tsk->comm), 0); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } @@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); + blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } @@ -212,7 +211,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, union kernfs_node_id *cgid) + void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -223,7 +222,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? 
sizeof(cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -294,7 +293,7 @@ record_it: t->pdu_len = pdu_len + cgid_len; if (cgid_len) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); @@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q) } #ifdef CONFIG_BLK_CGROUP -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct blk_trace *bt = q->blk_trace; if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - return NULL; + return 0; if (!bio->bi_blkg) - return NULL; - return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return 0; + return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - return NULL; + return 0; } #endif -static union kernfs_node_id * +static u64 blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) { if (!rq->bio) - return NULL; + return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(q, rq->bio); } @@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what, - union kernfs_node_id *cgid) + unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, NULL); + NULL, 0); } } @@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL, NULL); + 0, 0, NULL, 0); } } @@ -938,7 +934,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } } @@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return (void *)(te_blk_io_trace(ent) + 1) + - (has_cg ? sizeof(union kernfs_node_id) : 0); + return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } -static inline const void *cgid_start(const struct trace_entry *ent) +static inline u64 t_cgid(const struct trace_entry *ent) { - return (void *)(te_blk_io_trace(ent) + 1); + return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent)->pdu_len - - (has_cg ? sizeof(union kernfs_node_id) : 0); + return te_blk_io_trace(ent)->pdu_len - (has_cg ? 
sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, fill_rwbs(rwbs, t); if (has_cg) { - const union kernfs_node_id *id = cgid_start(iter->ent); + u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; @@ -1267,11 +1261,25 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", MAJOR(t->device), MINOR(t->device), blkcg_name_buf, act, rwbs); - } else + } else { + /* + * The cgid portion used to be "INO,GEN". Userland + * builds a FILEID_INO32_GEN fid out of them and + * opens the cgroup using open_by_handle_at(2). + * While 32bit ino setups are still the same, 64bit + * ones now use the 64bit ino as the whole ID and + * no longer use generation. + * + * Regardless of the content, always output + * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can + * be mapped back to @id on both 64 and 32bit ino + * setups. See __kernfs_fh_to_dentry(). + */ trace_seq_printf(&iter->seq, - "%3d,%-3d %x,%-x %2s %3s ", + "%3d,%-3d %llx,%-llx %2s %3s ", MAJOR(t->device), MINOR(t->device), - id->ino, id->generation, act, rwbs); + id & U32_MAX, id >> 32, act, rwbs); + } } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); diff --git a/net/core/filter.c b/net/core/filter.c index 3fed5755494b..9ad29f576f1a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4089,7 +4089,7 @@ BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgrp->kn->id.id; + return cgroup_id(cgrp); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4114,7 +4114,7 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, if (!ancestor) return 0; - return ancestor->kn->id.id; + return cgroup_id(ancestor); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { @@ -4114,7 +4114,7 @@ diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 256b7954b720..8881dd943dd0 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -93,7 +93,7 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx) static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev) { struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); - int id = css->cgroup->id; + int id = css->id; if (map && id < map->priomap_len) return map->priomap[id]; @@ -113,7 +113,7 @@ static int netprio_set_prio(struct cgroup_subsys_state *css, struct net_device *dev, u32 prio) { struct netprio_map *map; - int id = css->cgroup->id; + int id = css->id; int ret; /* avoid extending priomap for zero writes */ @@ -177,7 +177,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return css->cgroup->id; + return css->id; } static int read_priomap(struct seq_file *sf, void *v) @@ -237,7 +237,7 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; cgroup_taskset_for_each(p, css, tset) { - void *v = (void *)(unsigned long)css->cgroup->id; + void *v = (void *)(unsigned long)css->id; task_lock(p); iterate_fd(p->files, 0, update_netprio, v); diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 8d369b6a2069..66aafe1f5746 100644 --- 
a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -1,8 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -Wall +CFLAGS += -Wall -pthread all: +TEST_FILES := with_stress.sh +TEST_PROGS := test_stress.sh TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_core TEST_GEN_PROGS += test_freezer diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index bdb69599c4bd..8f7131dcf1ff 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -158,6 +158,22 @@ long cg_read_key_long(const char *cgroup, const char *control, const char *key) return atol(ptr + strlen(key)); } +long cg_read_lc(const char *cgroup, const char *control) +{ + char buf[PAGE_SIZE]; + const char delim[] = "\n"; + char *line; + long cnt = 0; + + if (cg_read(cgroup, control, buf, sizeof(buf))) + return -1; + + for (line = strtok(buf, delim); line; line = strtok(NULL, delim)) + cnt++; + + return cnt; +} + int cg_write(const char *cgroup, const char *control, char *buf) { char path[PATH_MAX]; @@ -282,10 +298,12 @@ int cg_enter(const char *cgroup, int pid) int cg_enter_current(const char *cgroup) { - char pidbuf[64]; + return cg_write(cgroup, "cgroup.procs", "0"); +} - snprintf(pidbuf, sizeof(pidbuf), "%d", getpid()); - return cg_write(cgroup, "cgroup.procs", pidbuf); +int cg_enter_current_thread(const char *cgroup) +{ + return cg_write(cgroup, "cgroup.threads", "0"); } int cg_run(const char *cgroup, @@ -410,11 +428,25 @@ int set_oom_adj_score(int pid, int score) return 0; } -char proc_read_text(int pid, const char *item, char *buf, size_t size) +ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) { char path[PATH_MAX]; - snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); + if (!pid) + snprintf(path, sizeof(path), "/proc/%s/%s", + thread ? "thread-self" : "self", item); + else + snprintf(path, sizeof(path), "/proc/%d/%s", pid, item); return read_text(path, buf, size); } + +int proc_read_strstr(int pid, bool thread, const char *item, const char *needle) +{ + char buf[PAGE_SIZE]; + + if (proc_read_text(pid, thread, item, buf, sizeof(buf)) < 0) + return -1; + + return strstr(buf, needle) ? 
0 : -1; +} diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index c72f28046bfa..49c54fbdb229 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <stdbool.h> #include <stdlib.h> #define PAGE_SIZE 4096 @@ -29,12 +30,14 @@ extern int cg_read_strstr(const char *cgroup, const char *control, const char *needle); extern long cg_read_long(const char *cgroup, const char *control); long cg_read_key_long(const char *cgroup, const char *control, const char *key); +extern long cg_read_lc(const char *cgroup, const char *control); extern int cg_write(const char *cgroup, const char *control, char *buf); extern int cg_run(const char *cgroup, int (*fn)(const char *cgroup, void *arg), void *arg); extern int cg_enter(const char *cgroup, int pid); extern int cg_enter_current(const char *cgroup); +extern int cg_enter_current_thread(const char *cgroup); extern int cg_run_nowait(const char *cgroup, int (*fn)(const char *cgroup, void *arg), void *arg); @@ -45,4 +48,5 @@ extern int is_swap_enabled(void); extern int set_oom_adj_score(int pid, int score); extern int cg_wait_for_proc_count(const char *cgroup, int count); extern int cg_killall(const char *cgroup); -extern char proc_read_text(int pid, const char *item, char *buf, size_t size); +extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); +extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 79053a4f4783..c5ca669feb2b 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -5,6 +5,9 @@ #include <unistd.h> #include <stdio.h> #include <errno.h> +#include <signal.h> +#include <string.h> +#include <pthread.h> #include "../kselftest.h" #include "cgroup_util.h" @@ -354,6 +357,147 @@ cleanup: return ret; } +static void *dummy_thread_fn(void *arg) +{ + return (void *)(size_t)pause(); +} + +/* + * Test threadgroup migration. + * All threads of a process are migrated together. 
+ */ +static int test_cgcore_proc_migration(const char *root) +{ + int ret = KSFT_FAIL; + int t, c_threads, n_threads = 13; + char *src = NULL, *dst = NULL; + pthread_t threads[n_threads]; + + src = cg_name(root, "cg_src"); + dst = cg_name(root, "cg_dst"); + if (!src || !dst) + goto cleanup; + + if (cg_create(src)) + goto cleanup; + if (cg_create(dst)) + goto cleanup; + + if (cg_enter_current(src)) + goto cleanup; + + for (c_threads = 0; c_threads < n_threads; ++c_threads) { + if (pthread_create(&threads[c_threads], NULL, dummy_thread_fn, NULL)) + goto cleanup; + } + + cg_enter_current(dst); + if (cg_read_lc(dst, "cgroup.threads") != n_threads + 1) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + for (t = 0; t < c_threads; ++t) { + pthread_cancel(threads[t]); + } + + for (t = 0; t < c_threads; ++t) { + pthread_join(threads[t], NULL); + } + + cg_enter_current(root); + + if (dst) + cg_destroy(dst); + if (src) + cg_destroy(src); + free(dst); + free(src); + return ret; +} + +static void *migrating_thread_fn(void *arg) +{ + int g, i, n_iterations = 1000; + char **grps = arg; + char lines[3][PATH_MAX]; + + for (g = 1; g < 3; ++g) + snprintf(lines[g], sizeof(lines[g]), "0::%s", grps[g] + strlen(grps[0])); + + for (i = 0; i < n_iterations; ++i) { + cg_enter_current_thread(grps[(i % 2) + 1]); + + if (proc_read_strstr(0, 1, "cgroup", lines[(i % 2) + 1])) + return (void *)-1; + } + return NULL; +} + +/* + * Test single thread migration. + * Threaded cgroups allow successful migration of a thread. + */ +static int test_cgcore_thread_migration(const char *root) +{ + int ret = KSFT_FAIL; + char *dom = NULL; + char line[PATH_MAX]; + char *grps[3] = { (char *)root, NULL, NULL }; + pthread_t thr; + void *retval; + + dom = cg_name(root, "cg_dom"); + grps[1] = cg_name(root, "cg_dom/cg_src"); + grps[2] = cg_name(root, "cg_dom/cg_dst"); + if (!grps[1] || !grps[2] || !dom) + goto cleanup; + + if (cg_create(dom)) + goto cleanup; + if (cg_create(grps[1])) + goto cleanup; + if (cg_create(grps[2])) + goto cleanup; + + if (cg_write(grps[1], "cgroup.type", "threaded")) + goto cleanup; + if (cg_write(grps[2], "cgroup.type", "threaded")) + goto cleanup; + + if (cg_enter_current(grps[1])) + goto cleanup; + + if (pthread_create(&thr, NULL, migrating_thread_fn, grps)) + goto cleanup; + + if (pthread_join(thr, &retval)) + goto cleanup; + + if (retval) + goto cleanup; + + snprintf(line, sizeof(line), "0::%s", grps[1] + strlen(grps[0])); + if (proc_read_strstr(0, 1, "cgroup", line)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + if (grps[2]) + cg_destroy(grps[2]); + if (grps[1]) + cg_destroy(grps[1]); + if (dom) + cg_destroy(dom); + free(grps[2]); + free(grps[1]); + free(dom); + return ret; +} + #define T(x) { x, #x } struct corecg_test { int (*fn)(const char *root); @@ -366,6 +510,8 @@ struct corecg_test { T(test_cgcore_parent_becomes_threaded), T(test_cgcore_invalid_domain), T(test_cgcore_populated), + T(test_cgcore_proc_migration), + T(test_cgcore_thread_migration), }; #undef T diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 0fc1b6d4b0f9..23d8fa4a3e4e 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -72,6 +72,7 @@ static int cg_prepare_for_wait(const char *cgroup) if (ret == -1) { debug("Error: inotify_add_watch() failed\n"); close(fd); + fd = -1; } return fd; @@ -701,7 +702,7 @@ static int proc_check_stopped(int pid) char buf[PAGE_SIZE]; int len; - len = 
proc_read_text(pid, "stat", buf, sizeof(buf)); + len = proc_read_text(pid, 0, "stat", buf, sizeof(buf)); if (len == -1) { debug("Can't get %d stat\n", pid); return -1; } diff --git a/tools/testing/selftests/cgroup/test_stress.sh b/tools/testing/selftests/cgroup/test_stress.sh new file mode 100755 index 000000000000..15d9d5896394 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_stress.sh @@ -0,0 +1,4 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +./with_stress.sh -s subsys -s fork ./test_core diff --git a/tools/testing/selftests/cgroup/with_stress.sh b/tools/testing/selftests/cgroup/with_stress.sh new file mode 100755 index 000000000000..e28c35008f5b --- /dev/null +++ b/tools/testing/selftests/cgroup/with_stress.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +stress_fork() +{ + while true ; do + /usr/bin/true + sleep 0.01 + done +} + +stress_subsys() +{ + local verb=+ + while true ; do + echo $verb$subsys_ctrl >$sysfs/cgroup.subtree_control + [ $verb = "+" ] && verb=- || verb=+ + # incommensurable period with other stresses + sleep 0.011 + done +} + +init_and_check() +{ + sysfs=`mount -t cgroup2 | head -1 | awk '{ print $3 }'` + if [ ! -d "$sysfs" ]; then + echo "Skipping: cgroup2 is not mounted" >&2 + exit $ksft_skip + fi + + if ! echo +$subsys_ctrl >$sysfs/cgroup.subtree_control ; then + echo "Skipping: cannot enable $subsys_ctrl in $sysfs" >&2 + exit $ksft_skip + fi + + if ! echo -$subsys_ctrl >$sysfs/cgroup.subtree_control ; then + echo "Skipping: cannot disable $subsys_ctrl in $sysfs" >&2 + exit $ksft_skip + fi +} + +declare -a stresses +declare -a stress_pids +duration=5 +rc=0 +subsys_ctrl=cpuset +sysfs= + +while getopts c:d:hs: opt; do + case $opt in + c) + subsys_ctrl=$OPTARG + ;; + d) + duration=$OPTARG + ;; + h) + echo "Usage $0 [ -s stress ] ... [ -d duration ] [-c controller] cmd args .." + echo -e "\t default duration $duration seconds" + echo -e "\t default controller $subsys_ctrl" + exit + ;; + s) + func=stress_$OPTARG + if [ "x$(type -t $func)" != "xfunction" ] ; then + echo "Unknown stress $OPTARG" + exit 1 + fi + stresses+=($func) + ;; + esac +done +shift $((OPTIND - 1)) + +init_and_check + +for s in ${stresses[*]} ; do + $s & + stress_pids+=($!) +done + + +time=0 +start=$(date +%s) + +while [ $time -lt $duration ] ; do + $* + rc=$? + [ $rc -eq 0 ] || break + time=$(($(date +%s) - $start)) +done + +for pid in ${stress_pids[*]} ; do + kill -SIGTERM $pid + wait $pid +done + +exit $rc
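
A note on the pids.c hunk above: switching pids->limit from int64_t to atomic64_t closes a torn read/write window, since "pids.max" writers and pids_try_charge() intentionally run without a lock. The following is a minimal userspace C11 sketch of that lockless pattern, not the kernel code; try_charge() and the PIDS_MAX value here are stand-ins for illustration.

#include <stdatomic.h>
#include <stdio.h>

#define PIDS_MAX (1 << 22)	/* stand-in for the kernel's PIDS_MAX */

static atomic_llong counter;
static atomic_llong limit = PIDS_MAX;

static int try_charge(int num)
{
	long long new = atomic_fetch_add(&counter, num) + num;

	/* one atomic snapshot; a concurrent limit write is seen whole or not at all */
	if (new > atomic_load(&limit)) {
		atomic_fetch_sub(&counter, num);	/* revert, as pids_try_charge() does */
		return -1;
	}
	return 0;
}

int main(void)
{
	atomic_store(&limit, 2);	/* unsynchronized "pids.max" write */
	printf("%d ", try_charge(1));
	printf("%d ", try_charge(1));
	printf("%d\n", try_charge(1));	/* prints: 0 0 -1 */
	return 0;
}

A racing fork() may observe either the old or the new limit, which the patch comment deems acceptable; what the atomic64_t conversion rules out is observing half of each on 32bit architectures.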
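
On the rstat rework: cgroup_base_stat_flush() drops the old pending_bstat accumulator in favor of a last_bstat snapshot per level; each flush adds (current - last) to the aggregate and rolls the snapshot forward, first percpu to cgroup, then cgroup to parent. Below is a toy single-field model of that bookkeeping, not the kernel code; the names and the single fake CPU standing in for the percpu data are hypothetical.

#include <stdio.h>

struct bstat { long long utime; };

static void add(struct bstat *d, const struct bstat *s) { d->utime += s->utime; }
static void sub(struct bstat *d, const struct bstat *s) { d->utime -= s->utime; }

struct cg {
	struct bstat bstat, last_bstat;	/* level aggregate and its snapshot */
	struct bstat pcpu, pcpu_last;	/* one fake CPU's counter and snapshot */
	struct cg *parent;
};

static void flush(struct cg *cg)
{
	struct bstat delta;

	/* stage 1: percpu delta -> this cgroup's aggregate */
	delta = cg->pcpu;
	sub(&delta, &cg->pcpu_last);
	add(&cg->bstat, &delta);
	add(&cg->pcpu_last, &delta);

	/* stage 2: this cgroup's delta since its last flush -> parent */
	if (cg->parent) {
		delta = cg->bstat;
		sub(&delta, &cg->last_bstat);
		add(&cg->parent->bstat, &delta);
		add(&cg->last_bstat, &delta);
	}
}

int main(void)
{
	struct cg root = { { 0 } };
	struct cg child = { .parent = &root };

	child.pcpu.utime = 10;	/* task ran 10 units on the fake CPU */
	flush(&child);
	child.pcpu.utime = 25;	/* 15 more units */
	flush(&child);

	/* both levels converge on 25; no delta is counted twice */
	printf("child=%lld root=%lld\n", child.bstat.utime, root.bstat.utime);
	return 0;
}

Because every level owns its own snapshot, repeated flushes propagate each delta exactly once, which is what lets the patch delete pending_bstat and its memset.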
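
On the blk_log_action() format change: the cgid is now always printed as "LOW32,HIGH32" so that the FILEID_INO32_GEN handle userland builds from the two words maps back to the same 64bit id on both 32bit and 64bit ino setups. A standalone sketch of that round trip, with arbitrary example values only:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical id: on a 32bit-ino setup, ino=42 in the low word, gen=7 in the high */
	uint64_t id = (uint64_t)7 << 32 | 42;
	uint32_t low = id & UINT32_MAX;		/* printed first, as "%llx" of id & U32_MAX */
	uint32_t high = id >> 32;		/* printed second, as "%llx" of id >> 32 */

	printf("%" PRIx32 ",%" PRIx32 "\n", low, high);	/* prints: 2a,7 */

	/* userland side: rebuild the 64bit id from the two printed words */
	uint64_t rebuilt = (uint64_t)high << 32 | low;
	printf("%s\n", rebuilt == id ? "roundtrip ok" : "mismatch");
	return 0;
}

On 64bit-ino setups the high word simply carries the upper ino bits instead of a generation, so the same two-word output and reassembly keep working unchanged.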