From 9f6df573a4041f896cbf51f1b3743494196620a7 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 29 Jan 2016 02:54:04 -0600 Subject: kernfs: Add API to generate relative kernfs path The new function kernfs_path_from_node() generates and returns kernfs path of a given kernfs_node relative to a given parent kernfs_node. Signed-off-by: Aditya Kali Signed-off-by: Serge E. Hallyn Acked-by: Greg Kroah-Hartman Signed-off-by: Tejun Heo --- include/linux/kernfs.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index af51df35d749..716bfdede5f5 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -267,8 +267,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); size_t kernfs_path_len(struct kernfs_node *kn); -char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, - size_t buflen); +int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, + char *buf, size_t buflen); +char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); @@ -338,8 +339,8 @@ static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) static inline size_t kernfs_path_len(struct kernfs_node *kn) { return 0; } -static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, - size_t buflen) +static inline char *kernfs_path(struct kernfs_node *kn, char *buf, + size_t buflen) { return NULL; } static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { } -- cgit v1.2.3 From 5e2bec7c2248ae27c5b16cd97215ae05c1d39179 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 29 Jan 2016 02:54:05 -0600 Subject: sched: new clone flag CLONE_NEWCGROUP for cgroup namespace CLONE_NEWCGROUP will be used to create new cgroup namespace. Signed-off-by: Aditya Kali Signed-off-by: Serge Hallyn Signed-off-by: Tejun Heo --- include/uapi/linux/sched.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index cc89ddefa926..5f0fe019a720 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -21,8 +21,7 @@ #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) - and is now available for re-use. */ +#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ #define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ #define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ -- cgit v1.2.3 From a79a908fd2b080977b45bf103184b81c9d11ad07 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 29 Jan 2016 02:54:06 -0600 Subject: cgroup: introduce cgroup namespaces Introduce the ability to create new cgroup namespace. The newly created cgroup namespace remembers the cgroup of the process at the point of creation of the cgroup namespace (referred as cgroupns-root). The main purpose of cgroup namespace is to virtualize the contents of /proc/self/cgroup file. Processes inside a cgroup namespace are only able to see paths relative to their namespace root (unless they are moved outside of their cgroupns-root, at which point they will see a relative path from their cgroupns-root). For a correctly setup container this enables container-tools (like libcontainer, lxc, lmctfy, etc.) to create completely virtualized containers without leaking system level cgroup hierarchy to the task. This patch only implements the 'unshare' part of the cgroupns. Signed-off-by: Aditya Kali Signed-off-by: Serge Hallyn Signed-off-by: Tejun Heo --- fs/proc/namespaces.c | 3 + include/linux/cgroup.h | 49 ++++++++++++++ include/linux/nsproxy.h | 2 + include/linux/proc_ns.h | 4 ++ kernel/cgroup.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/cpuset.c | 8 +-- kernel/fork.c | 2 +- kernel/nsproxy.c | 19 +++++- 8 files changed, 250 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 276f12431dbf..72cb26f85d58 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -28,6 +28,9 @@ static const struct proc_ns_operations *ns_entries[] = { &userns_operations, #endif &mntns_operations, +#ifdef CONFIG_CGROUPS + &cgroupns_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2162dca88dc0..a20320c666fd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -17,6 +17,11 @@ #include #include #include +#include +#include +#include +#include +#include #include @@ -611,4 +616,48 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ +struct cgroup_namespace { + atomic_t count; + struct ns_common ns; + struct user_namespace *user_ns; + struct css_set *root_cset; +}; + +extern struct cgroup_namespace init_cgroup_ns; + +#ifdef CONFIG_CGROUPS + +void free_cgroup_ns(struct cgroup_namespace *ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns); + +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +#else /* !CONFIG_CGROUPS */ + +static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } +static inline struct cgroup_namespace * +copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + return old_ns; +} + +#endif /* !CONFIG_CGROUPS */ + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) +{ + if (ns) + atomic_inc(&ns->count); +} + +static inline void put_cgroup_ns(struct cgroup_namespace *ns) +{ + if (ns && atomic_dec_and_test(&ns->count)) + free_cgroup_ns(ns); +} + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 35fa08fd7739..ac0d65bef5d0 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -8,6 +8,7 @@ struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; +struct cgroup_namespace; struct fs_struct; /* @@ -33,6 +34,7 @@ struct nsproxy { struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; + struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 42dfc615dbf8..de0e7719d4c5 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -9,6 +9,8 @@ struct pid_namespace; struct nsproxy; struct path; +struct task_struct; +struct inode; struct proc_ns_operations { const char *name; @@ -24,6 +26,7 @@ extern const struct proc_ns_operations ipcns_operations; extern const struct proc_ns_operations pidns_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations cgroupns_operations; /* * We always define these enumerators @@ -34,6 +37,7 @@ enum { PROC_UTS_INIT_INO = 0xEFFFFFFEU, PROC_USER_INIT_INO = 0xEFFFFFFDU, PROC_PID_INIT_INO = 0xEFFFFFFCU, + PROC_CGROUP_INIT_INO = 0xEFFFFFFBU, }; #ifdef CONFIG_PROC_FS diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7ad61915967f..b001c5d36bec 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -59,6 +59,9 @@ #include #include #include +#include +#include +#include #include /* @@ -212,6 +215,15 @@ static unsigned long have_fork_callback __read_mostly; static unsigned long have_exit_callback __read_mostly; static unsigned long have_free_callback __read_mostly; +/* cgroup namespace for init task */ +struct cgroup_namespace init_cgroup_ns = { + .count = { .counter = 2, }, + .user_ns = &init_user_ns, + .ns.ops = &cgroupns_operations, + .ns.inum = PROC_CGROUP_INIT_INO, + .root_cset = &init_css_set, +}; + /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; @@ -2177,6 +2189,35 @@ static struct file_system_type cgroup2_fs_type = { .kill_sb = cgroup_kill_sb, }; +static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + int ret; + + ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); + if (ret < 0 || ret >= buflen) + return NULL; + return buf; +} + +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + char *ret; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(cgroup_path_ns); + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2204,7 +2245,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path(cgrp, buf, buflen); + path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ if (strlcpy(buf, "/", buflen) < buflen) @@ -5297,6 +5338,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + get_user_ns(init_cgroup_ns.user_ns); + mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ @@ -5438,7 +5481,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path(cgrp, buf, PATH_MAX); + path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + current->nsproxy->cgroup_ns); if (!path) { retval = -ENAMETOOLONG; goto out_unlock; @@ -5720,7 +5764,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; - path = cgroup_path(cgrp, pathbuf, PATH_MAX); + spin_lock_bh(&css_set_lock); + path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_bh(&css_set_lock); if (!path) goto out; @@ -5931,6 +5977,127 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ +/* cgroup namespaces */ + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns = NULL; + struct css_set *cset = NULL; + int err; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + err = -EPERM; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + goto err_out; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cset = task_css_set(current); + get_css_set(cset); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + err = -ENOMEM; + new_ns = alloc_cgroup_ns(); + if (!new_ns) + goto err_out; + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->root_cset = cset; + + return new_ns; + +err_out: + if (cset) + put_css_set(cset); + kfree(new_ns); + return ERR_PTR(err); +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, void *ns) +{ + pr_info("setns not supported for cgroup namespace"); + return -EINVAL; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 41989ab4db57..d393125b228c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2714,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, goto out; retval = -ENAMETOOLONG; - rcu_read_lock(); - css = task_css(tsk, cpuset_cgrp_id); - p = cgroup_path(css->cgroup, buf, PATH_MAX); - rcu_read_unlock(); + css = task_get_css(tsk, cpuset_cgrp_id); + p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); if (!p) goto out_free; seq_puts(m, p); diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..6611a6267949 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1884,7 +1884,7 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 49746c81ad8d..782102e59eed 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -25,6 +25,7 @@ #include #include #include +#include static struct kmem_cache *nsproxy_cachep; @@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_NET .net_ns = &init_net, #endif +#ifdef CONFIG_CGROUPS + .cgroup_ns = &init_cgroup_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } + new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns, + tsk->nsproxy->cgroup_ns); + if (IS_ERR(new_nsp->cgroup_ns)) { + err = PTR_ERR(new_nsp->cgroup_ns); + goto out_cgroup; + } + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); @@ -101,6 +112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: + put_cgroup_ns(new_nsp->cgroup_ns); +out_cgroup: if (new_nsp->pid_ns_for_children) put_pid_ns(new_nsp->pid_ns_for_children); out_pid: @@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *new_ns; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET)))) { + CLONE_NEWPID | CLONE_NEWNET | + CLONE_NEWCGROUP)))) { get_nsproxy(old_ns); return 0; } @@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); -- cgit v1.2.3 From fb3c8315650f89a1993fb3ae3e74e9c7e4a1c9c0 Mon Sep 17 00:00:00 2001 From: Aditya Kali Date: Fri, 29 Jan 2016 02:54:08 -0600 Subject: kernfs: define kernfs_node_dentry Add a new kernfs api is added to lookup the dentry for a particular kernfs path. Signed-off-by: Aditya Kali Signed-off-by: Serge E. Hallyn Acked-by: Greg Kroah-Hartman Signed-off-by: Tejun Heo --- fs/kernfs/mount.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kernfs.h | 2 ++ 2 files changed, 71 insertions(+) (limited to 'include') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 8eaf417187f1..b67dbccdaf88 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "kernfs-internal.h" @@ -62,6 +63,74 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) return NULL; } +/* + * find the next ancestor in the path down to @child, where @parent was the + * ancestor whose descendant we want to find. + * + * Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root + * node. If @parent is b, then we return the node for c. + * Passing in d as @parent is not ok. + */ +static struct kernfs_node *find_next_ancestor(struct kernfs_node *child, + struct kernfs_node *parent) +{ + if (child == parent) { + pr_crit_once("BUG in find_next_ancestor: called with parent == child"); + return NULL; + } + + while (child->parent != parent) { + if (!child->parent) + return NULL; + child = child->parent; + } + + return child; +} + +/** + * kernfs_node_dentry - get a dentry for the given kernfs_node + * @kn: kernfs_node for which a dentry is needed + * @sb: the kernfs super_block + */ +struct dentry *kernfs_node_dentry(struct kernfs_node *kn, + struct super_block *sb) +{ + struct dentry *dentry; + struct kernfs_node *knparent = NULL; + + BUG_ON(sb->s_op != &kernfs_sops); + + dentry = dget(sb->s_root); + + /* Check if this is the root kernfs_node */ + if (!kn->parent) + return dentry; + + knparent = find_next_ancestor(kn, NULL); + if (WARN_ON(!knparent)) + return ERR_PTR(-EINVAL); + + do { + struct dentry *dtmp; + struct kernfs_node *kntmp; + + if (kn == knparent) + return dentry; + kntmp = find_next_ancestor(kn, knparent); + if (WARN_ON(!kntmp)) + return ERR_PTR(-EINVAL); + mutex_lock(&d_inode(dentry)->i_mutex); + dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name)); + mutex_unlock(&d_inode(dentry)->i_mutex); + dput(dentry); + if (IS_ERR(dtmp)) + return dtmp; + knparent = kntmp; + dentry = dtmp; + } while (true); +} + static int kernfs_fill_super(struct super_block *sb, unsigned long magic) { struct kernfs_super_info *info = kernfs_info(sb); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 716bfdede5f5..c06c44242f39 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -284,6 +284,8 @@ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); +struct dentry *kernfs_node_dentry(struct kernfs_node *kn, + struct super_block *sb); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); -- cgit v1.2.3