diff options
-rw-r--r-- | Documentation/sysctl/README | 1 | ||||
-rw-r--r-- | Documentation/sysctl/user.txt | 66 | ||||
-rw-r--r-- | fs/devpts/inode.c | 71 | ||||
-rw-r--r-- | fs/mount.h | 1 | ||||
-rw-r--r-- | fs/namespace.c | 22 | ||||
-rw-r--r-- | fs/proc/proc_sysctl.c | 14 | ||||
-rw-r--r-- | include/linux/cgroup.h | 1 | ||||
-rw-r--r-- | include/linux/ipc_namespace.h | 1 | ||||
-rw-r--r-- | include/linux/pid_namespace.h | 1 | ||||
-rw-r--r-- | include/linux/sysctl.h | 3 | ||||
-rw-r--r-- | include/linux/user_namespace.h | 37 | ||||
-rw-r--r-- | include/linux/utsname.h | 1 | ||||
-rw-r--r-- | include/net/net_namespace.h | 1 | ||||
-rw-r--r-- | ipc/namespace.c | 45 | ||||
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/cgroup.c | 18 | ||||
-rw-r--r-- | kernel/fork.c | 5 | ||||
-rw-r--r-- | kernel/pid_namespace.c | 25 | ||||
-rw-r--r-- | kernel/ucount.c | 235 | ||||
-rw-r--r-- | kernel/user_namespace.c | 74 | ||||
-rw-r--r-- | kernel/utsname.c | 34 | ||||
-rw-r--r-- | net/core/net_namespace.c | 22 | ||||
-rw-r--r-- | net/sysctl_net.c | 4 |
23 files changed, 589 insertions, 95 deletions
diff --git a/Documentation/sysctl/README b/Documentation/sysctl/README index 8c3306e01d52..91f54ffa0077 100644 --- a/Documentation/sysctl/README +++ b/Documentation/sysctl/README @@ -69,6 +69,7 @@ proc/ <empty> sunrpc/ SUN Remote Procedure Call (NFS) vm/ memory management tuning buffer and cache management +user/ Per user per user namespace limits These are the subdirs I have on my system. There might be more or other subdirs in another setup. If you see another dir, I'd diff --git a/Documentation/sysctl/user.txt b/Documentation/sysctl/user.txt new file mode 100644 index 000000000000..1291c498f78f --- /dev/null +++ b/Documentation/sysctl/user.txt @@ -0,0 +1,66 @@ +Documentation for /proc/sys/user/* kernel version 4.9.0 + (c) 2016 Eric Biederman <ebiederm@xmission.com> + +============================================================== + +This file contains the documetation for the sysctl files in +/proc/sys/user. + +The files in this directory can be used to override the default +limits on the number of namespaces and other objects that have +per user per user namespace limits. + +The primary purpose of these limits is to stop programs that +malfunction and attempt to create a ridiculous number of objects, +before the malfunction becomes a system wide problem. It is the +intention that the defaults of these limits are set high enough that +no program in normal operation should run into these limits. + +The creation of per user per user namespace objects are charged to +the user in the user namespace who created the object and +verified to be below the per user limit in that user namespace. + +The creation of objects is also charged to all of the users +who created user namespaces the creation of the object happens +in (user namespaces can be nested) and verified to be below the per user +limits in the user namespaces of those users. + +This recursive counting of created objects ensures that creating a +user namespace does not allow a user to escape their current limits. + +Currently, these files are in /proc/sys/user: + +- max_cgroup_namespaces + + The maximum number of cgroup namespaces that any user in the current + user namespace may create. + +- max_ipc_namespaces + + The maximum number of ipc namespaces that any user in the current + user namespace may create. + +- max_mnt_namespaces + + The maximum number of mount namespaces that any user in the current + user namespace may create. + +- max_net_namespaces + + The maximum number of network namespaces that any user in the + current user namespace may create. + +- max_pid_namespaces + + The maximum number of pid namespaces that any user in the current + user namespace may create. + +- max_user_namespaces + + The maximum number of user namespaces that any user in the current + user namespace may create. + +- max_uts_namespaces + + The maximum number of user namespaces that any user in the current + user namespace may create. diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index d116453b0276..154cc45c19e8 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -272,13 +272,8 @@ static int mknod_ptmx(struct super_block *sb) struct dentry *root = sb->s_root; struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - kuid_t root_uid; - kgid_t root_gid; - - root_uid = make_kuid(current_user_ns(), 0); - root_gid = make_kgid(current_user_ns(), 0); - if (!uid_valid(root_uid) || !gid_valid(root_gid)) - return -EINVAL; + kuid_t ptmx_uid = current_fsuid(); + kgid_t ptmx_gid = current_fsgid(); inode_lock(d_inode(root)); @@ -309,8 +304,8 @@ static int mknod_ptmx(struct super_block *sb) mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); - inode->i_uid = root_uid; - inode->i_gid = root_gid; + inode->i_uid = ptmx_uid; + inode->i_gid = ptmx_gid; d_add(dentry, inode); @@ -336,7 +331,6 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) struct pts_fs_info *fsi = DEVPTS_SB(sb); struct pts_mount_opts *opts = &fsi->mount_opts; - sync_filesystem(sb); err = parse_mount_options(data, PARSE_REMOUNT, opts); /* @@ -395,6 +389,7 @@ static int devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; @@ -403,10 +398,16 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; + error = -ENOMEM; s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; + error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); + if (error) + goto fail; + + error = -ENOMEM; inode = new_inode(s); if (!inode) goto fail; @@ -418,13 +419,21 @@ devpts_fill_super(struct super_block *s, void *data, int silent) set_nlink(inode, 2); s->s_root = d_make_root(inode); - if (s->s_root) - return 0; + if (!s->s_root) { + pr_err("get root dentry failed\n"); + goto fail; + } - pr_err("get root dentry failed\n"); + error = mknod_ptmx(s); + if (error) + goto fail_dput; + return 0; +fail_dput: + dput(s->s_root); + s->s_root = NULL; fail: - return -ENOMEM; + return error; } /* @@ -436,43 +445,15 @@ fail: static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - int error; - struct pts_mount_opts opts; - struct super_block *s; - - error = parse_mount_options(data, PARSE_MOUNT, &opts); - if (error) - return ERR_PTR(error); - - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (!s->s_root) { - error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0); - if (error) - goto out_undo_sget; - s->s_flags |= MS_ACTIVE; - } - - memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); - - error = mknod_ptmx(s); - if (error) - goto out_undo_sget; - - return dget(s->s_root); - -out_undo_sget: - deactivate_locked_super(s); - return ERR_PTR(error); + return mount_nodev(fs_type, flags, data, devpts_fill_super); } static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); - ida_destroy(&fsi->allocated_ptys); + if (fsi) + ida_destroy(&fsi->allocated_ptys); kfree(fsi); kill_litter_super(sb); } diff --git a/fs/mount.h b/fs/mount.h index 14db05d424f7..e037981d8351 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -10,6 +10,7 @@ struct mnt_namespace { struct mount * root; struct list_head list; struct user_namespace *user_ns; + struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; diff --git a/fs/namespace.c b/fs/namespace.c index fea56f310547..8a0e90eb81d3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2719,9 +2719,20 @@ dput_out: return retval; } +static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); +} + +static void dec_mnt_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); +} + static void free_mnt_ns(struct mnt_namespace *ns) { ns_free_inum(&ns->ns); + dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); } @@ -2738,14 +2749,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + struct ucounts *ucounts; int ret; + ucounts = inc_mnt_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); - if (!new_ns) + if (!new_ns) { + dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); + } ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); + dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } new_ns->ns.ops = &mntns_operations; @@ -2756,6 +2775,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) init_waitqueue_head(&new_ns->poll); new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; return new_ns; } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 1b93650dda2f..a80acdfbe180 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(sysctl_lock); static void drop_sysctl_table(struct ctl_table_header *header); static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry, struct nsproxy *namespaces); + struct ctl_table **pentry); static int insert_links(struct ctl_table_header *head); static void put_links(struct ctl_table_header *header); @@ -319,11 +319,11 @@ static void sysctl_head_finish(struct ctl_table_header *head) } static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +lookup_header_set(struct ctl_table_root *root) { struct ctl_table_set *set = &root->default_set; if (root->lookup) - set = root->lookup(root, namespaces); + set = root->lookup(root); return set; } @@ -491,7 +491,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; if (S_ISLNK(p->mode)) { - ret = sysctl_follow_link(&h, &p, current->nsproxy); + ret = sysctl_follow_link(&h, &p); err = ERR_PTR(ret); if (ret) goto out; @@ -659,7 +659,7 @@ static bool proc_sys_link_fill_cache(struct file *file, if (S_ISLNK(table->mode)) { /* It is not an error if we can not follow the link ignore it */ - int err = sysctl_follow_link(&head, &table, current->nsproxy); + int err = sysctl_follow_link(&head, &table); if (err) goto out; } @@ -976,7 +976,7 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir) } static int sysctl_follow_link(struct ctl_table_header **phead, - struct ctl_table **pentry, struct nsproxy *namespaces) + struct ctl_table **pentry) { struct ctl_table_header *head; struct ctl_table_root *root; @@ -988,7 +988,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead, ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; - set = lookup_header_set(root, namespaces); + set = lookup_header_set(root); dir = xlate_dir(set, (*phead)->parent); if (IS_ERR(dir)) ret = PTR_ERR(dir); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 984f73b719a9..1ed92812785a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -621,6 +621,7 @@ struct cgroup_namespace { atomic_t count; struct ns_common ns; struct user_namespace *user_ns; + struct ucounts *ucounts; struct css_set *root_cset; }; diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index d10e54f03c09..848e5796400e 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -58,6 +58,7 @@ struct ipc_namespace { /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; + struct ucounts *ucounts; struct ns_common ns; }; diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 918b117a7cd3..34cce96741bc 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -40,6 +40,7 @@ struct pid_namespace { struct fs_pin *bacct; #endif struct user_namespace *user_ns; + struct ucounts *ucounts; struct work_struct proc_work; kgid_t pid_gid; int hide_pid; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 697e160c78d0..f166ca0203e2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -155,8 +155,7 @@ struct ctl_table_set { struct ctl_table_root { struct ctl_table_set default_set; - struct ctl_table_set *(*lookup)(struct ctl_table_root *root, - struct nsproxy *namespaces); + struct ctl_table_set *(*lookup)(struct ctl_table_root *root); int (*permissions)(struct ctl_table_header *head, struct ctl_table *table); }; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 190cf0760815..eb209d4523f5 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -22,6 +22,19 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED +struct ucounts; + +enum ucount_type { + UCOUNT_USER_NAMESPACES, + UCOUNT_PID_NAMESPACES, + UCOUNT_UTS_NAMESPACES, + UCOUNT_IPC_NAMESPACES, + UCOUNT_NET_NAMESPACES, + UCOUNT_MNT_NAMESPACES, + UCOUNT_CGROUP_NAMESPACES, + UCOUNT_COUNTS, +}; + struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; @@ -39,10 +52,30 @@ struct user_namespace { struct key *persistent_keyring_register; struct rw_semaphore persistent_keyring_register_sem; #endif + struct work_struct work; +#ifdef CONFIG_SYSCTL + struct ctl_table_set set; + struct ctl_table_header *sysctls; +#endif + struct ucounts *ucounts; + int ucount_max[UCOUNT_COUNTS]; +}; + +struct ucounts { + struct hlist_node node; + struct user_namespace *ns; + kuid_t uid; + atomic_t count; + atomic_t ucount[UCOUNT_COUNTS]; }; extern struct user_namespace init_user_ns; +bool setup_userns_sysctls(struct user_namespace *ns); +void retire_userns_sysctls(struct user_namespace *ns); +struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); +void dec_ucount(struct ucounts *ucounts, enum ucount_type type); + #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) @@ -54,12 +87,12 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); -extern void free_user_ns(struct user_namespace *ns); +extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && atomic_dec_and_test(&ns->count)) - free_user_ns(ns); + __put_user_ns(ns); } struct seq_operations; diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 5093f58ae192..60f0bb83b313 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -24,6 +24,7 @@ struct uts_namespace { struct kref kref; struct new_utsname name; struct user_namespace *user_ns; + struct ucounts *ucounts; struct ns_common ns; }; extern struct uts_namespace init_uts_ns; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 0933c7455a30..fc4f757107df 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -60,6 +60,7 @@ struct net { struct list_head exit_list; /* Use only net_mutex */ struct user_namespace *user_ns; /* Owning user namespace */ + struct ucounts *ucounts; spinlock_t nsid_lock; struct idr netns_ids; diff --git a/ipc/namespace.c b/ipc/namespace.c index 578d93be619d..0abdea496493 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -16,39 +16,61 @@ #include "util.h" +static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES); +} + +static void dec_ipc_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES); +} + static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, struct ipc_namespace *old_ns) { struct ipc_namespace *ns; + struct ucounts *ucounts; int err; + err = -ENOSPC; + ucounts = inc_ipc_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL); if (ns == NULL) - return ERR_PTR(-ENOMEM); + goto fail_dec; err = ns_alloc_inum(&ns->ns); - if (err) { - kfree(ns); - return ERR_PTR(err); - } + if (err) + goto fail_free; ns->ns.ops = &ipcns_operations; atomic_set(&ns->count, 1); ns->user_ns = get_user_ns(user_ns); + ns->ucounts = ucounts; err = mq_init_ns(ns); - if (err) { - put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); - return ERR_PTR(err); - } + if (err) + goto fail_put; sem_init_ns(ns); msg_init_ns(ns); shm_init_ns(ns); return ns; + +fail_put: + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); +fail_free: + kfree(ns); +fail_dec: + dec_ipc_namespaces(ucounts); +fail: + return ERR_PTR(err); } struct ipc_namespace *copy_ipcs(unsigned long flags, @@ -96,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) msg_exit_ns(ns); shm_exit_ns(ns); + dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); diff --git a/kernel/Makefile b/kernel/Makefile index e2ec54e2b952..eb26e12c6c2a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o + async.o range.o smpboot.o ucount.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86b0e8b16426..d6504338e284 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -6295,6 +6295,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) /* cgroup namespaces */ +static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES); +} + +static void dec_cgroup_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES); +} + static struct cgroup_namespace *alloc_cgroup_ns(void) { struct cgroup_namespace *new_ns; @@ -6316,6 +6326,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) void free_cgroup_ns(struct cgroup_namespace *ns) { put_css_set(ns->root_cset); + dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); @@ -6327,6 +6338,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct cgroup_namespace *old_ns) { struct cgroup_namespace *new_ns; + struct ucounts *ucounts; struct css_set *cset; BUG_ON(!old_ns); @@ -6340,6 +6352,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); + ucounts = inc_cgroup_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + /* It is not safe to take cgroup_mutex here */ spin_lock_irq(&css_set_lock); cset = task_css_set(current); @@ -6349,10 +6365,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, new_ns = alloc_cgroup_ns(); if (IS_ERR(new_ns)) { put_css_set(cset); + dec_cgroup_namespaces(ucounts); return new_ns; } new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; new_ns->root_cset = cset; return new_ns; diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4a866..3cb4853a59aa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -302,6 +302,7 @@ int arch_task_struct_size __read_mostly; void __init fork_init(void) { + int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -321,6 +322,10 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < UCOUNT_COUNTS; i++) { + init_user_ns.ucount_max[i] = max_threads/2; + } } int __weak arch_dup_task_struct(struct task_struct *dst, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 4fa2d56a936c..df9e8e9e0be7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work) /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 +static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); +} + +static void dec_pid_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); +} + static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; + struct ucounts *ucounts; int i; int err; - if (level > MAX_PID_NS_LEVEL) { - err = -EINVAL; + err = -ENOSPC; + if (level > MAX_PID_NS_LEVEL) + goto out; + ucounts = inc_pid_namespaces(user_ns); + if (!ucounts) goto out; - } err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) - goto out; + goto out_dec; ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!ns->pidmap[0].page) @@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + ns->ucounts = ucounts; ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); @@ -129,6 +143,8 @@ out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); +out_dec: + dec_pid_namespaces(ucounts); out: return ERR_PTR(err); } @@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); + dec_pid_namespaces(ns->ucounts); put_user_ns(ns->user_ns); call_rcu(&ns->rcu, delayed_free_pidns); } diff --git a/kernel/ucount.c b/kernel/ucount.c new file mode 100644 index 000000000000..9d20d5dd298a --- /dev/null +++ b/kernel/ucount.c @@ -0,0 +1,235 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <linux/stat.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/hash.h> +#include <linux/user_namespace.h> + +#define UCOUNTS_HASHTABLE_BITS 10 +static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; +static DEFINE_SPINLOCK(ucounts_lock); + +#define ucounts_hashfn(ns, uid) \ + hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ + UCOUNTS_HASHTABLE_BITS) +#define ucounts_hashentry(ns, uid) \ + (ucounts_hashtable + ucounts_hashfn(ns, uid)) + + +#ifdef CONFIG_SYSCTL +static struct ctl_table_set * +set_lookup(struct ctl_table_root *root) +{ + return ¤t_user_ns()->set; +} + +static int set_is_seen(struct ctl_table_set *set) +{ + return ¤t_user_ns()->set == set; +} + +static int set_permissions(struct ctl_table_header *head, + struct ctl_table *table) +{ + struct user_namespace *user_ns = + container_of(head->set, struct user_namespace, set); + int mode; + + /* Allow users with CAP_SYS_RESOURCE unrestrained access */ + if (ns_capable(user_ns, CAP_SYS_RESOURCE)) + mode = (table->mode & S_IRWXU) >> 6; + else + /* Allow all others at most read-only access */ + mode = table->mode & S_IROTH; + return (mode << 6) | (mode << 3) | mode; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, + .permissions = set_permissions, +}; + +static int zero = 0; +static int int_max = INT_MAX; +#define UCOUNT_ENTRY(name) \ + { \ + .procname = name, \ + .maxlen = sizeof(int), \ + .mode = 0644, \ + .proc_handler = proc_dointvec_minmax, \ + .extra1 = &zero, \ + .extra2 = &int_max, \ + } +static struct ctl_table user_table[] = { + UCOUNT_ENTRY("max_user_namespaces"), + UCOUNT_ENTRY("max_pid_namespaces"), + UCOUNT_ENTRY("max_uts_namespaces"), + UCOUNT_ENTRY("max_ipc_namespaces"), + UCOUNT_ENTRY("max_net_namespaces"), + UCOUNT_ENTRY("max_mnt_namespaces"), + UCOUNT_ENTRY("max_cgroup_namespaces"), + { } +}; +#endif /* CONFIG_SYSCTL */ + +bool setup_userns_sysctls(struct user_namespace *ns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + setup_sysctl_set(&ns->set, &set_root, set_is_seen); + tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL); + if (tbl) { + int i; + for (i = 0; i < UCOUNT_COUNTS; i++) { + tbl[i].data = &ns->ucount_max[i]; + } + ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl); + } + if (!ns->sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->set); + return false; + } +#endif + return true; +} + +void retire_userns_sysctls(struct user_namespace *ns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + + tbl = ns->sysctls->ctl_table_arg; + unregister_sysctl_table(ns->sysctls); + retire_sysctl_set(&ns->set); + kfree(tbl); +#endif +} + +static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) +{ + struct ucounts *ucounts; + + hlist_for_each_entry(ucounts, hashent, node) { + if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) + return ucounts; + } + return NULL; +} + +static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) +{ + struct hlist_head *hashent = ucounts_hashentry(ns, uid); + struct ucounts *ucounts, *new; + + spin_lock(&ucounts_lock); + ucounts = find_ucounts(ns, uid, hashent); + if (!ucounts) { + spin_unlock(&ucounts_lock); + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->ns = ns; + new->uid = uid; + atomic_set(&new->count, 0); + + spin_lock(&ucounts_lock); + ucounts = find_ucounts(ns, uid, hashent); + if (ucounts) { + kfree(new); + } else { + hlist_add_head(&new->node, hashent); + ucounts = new; + } + } + if (!atomic_add_unless(&ucounts->count, 1, INT_MAX)) + ucounts = NULL; + spin_unlock(&ucounts_lock); + return ucounts; +} + +static void put_ucounts(struct ucounts *ucounts) +{ + if (atomic_dec_and_test(&ucounts->count)) { + spin_lock(&ucounts_lock); + hlist_del_init(&ucounts->node); + spin_unlock(&ucounts_lock); + + kfree(ucounts); + } +} + +static inline bool atomic_inc_below(atomic_t *v, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c >= u)) + return false; + old = atomic_cmpxchg(v, c, c+1); + if (likely(old == c)) + return true; + c = old; + } +} + +struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, + enum ucount_type type) +{ + struct ucounts *ucounts, *iter, *bad; + struct user_namespace *tns; + ucounts = get_ucounts(ns, uid); + for (iter = ucounts; iter; iter = tns->ucounts) { + int max; + tns = iter->ns; + max = READ_ONCE(tns->ucount_max[type]); + if (!atomic_inc_below(&iter->ucount[type], max)) + goto fail; + } + return ucounts; +fail: + bad = iter; + for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) + atomic_dec(&iter->ucount[type]); + + put_ucounts(ucounts); + return NULL; +} + +void dec_ucount(struct ucounts *ucounts, enum ucount_type type) +{ + struct ucounts *iter; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + int dec = atomic_dec_if_positive(&iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + } + put_ucounts(ucounts); +} + +static __init int user_namespace_sysctl_init(void) +{ +#ifdef CONFIG_SYSCTL + static struct ctl_table_header *user_header; + static struct ctl_table empty[1]; + /* + * It is necessary to register the user directory in the + * default set so that registrations in the child sets work + * properly. + */ + user_header = register_sysctl("user", empty); + BUG_ON(!user_header); + BUG_ON(!setup_userns_sysctls(&init_user_ns)); +#endif + return 0; +} +subsys_initcall(user_namespace_sysctl_init); + + diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a58a219b99c6..86b7854fec8e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); +static void free_user_ns(struct work_struct *work); + +static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) +{ + return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); +} + +static void dec_user_namespaces(struct ucounts *ucounts) +{ + return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); +} static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { @@ -62,10 +73,16 @@ int create_user_ns(struct cred *new) struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; - int ret; + struct ucounts *ucounts; + int ret, i; + ret = -ENOSPC; if (parent_ns->level > 32) - return -EUSERS; + goto fail; + + ucounts = inc_user_namespaces(parent_ns, owner); + if (!ucounts) + goto fail; /* * Verify that we can not violate the policy of which files @@ -73,26 +90,27 @@ int create_user_ns(struct cred *new) * by verifing that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ + ret = -EPERM; if (current_chrooted()) - return -EPERM; + goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ + ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) - return -EPERM; + goto fail_dec; + ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) - return -ENOMEM; + goto fail_dec; ret = ns_alloc_inum(&ns->ns); - if (ret) { - kmem_cache_free(user_ns_cachep, ns); - return ret; - } + if (ret) + goto fail_free; ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); @@ -101,18 +119,37 @@ int create_user_ns(struct cred *new) ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; + INIT_WORK(&ns->work, free_user_ns); + for (i = 0; i < UCOUNT_COUNTS; i++) { + ns->ucount_max[i] = INT_MAX; + } + ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); - set_cred_user_ns(new, ns); - #ifdef CONFIG_PERSISTENT_KEYRINGS init_rwsem(&ns->persistent_keyring_register_sem); #endif + ret = -ENOMEM; + if (!setup_userns_sysctls(ns)) + goto fail_keyring; + + set_cred_user_ns(new, ns); return 0; +fail_keyring: +#ifdef CONFIG_PERSISTENT_KEYRINGS + key_put(ns->persistent_keyring_register); +#endif + ns_free_inum(&ns->ns); +fail_free: + kmem_cache_free(user_ns_cachep, ns); +fail_dec: + dec_user_namespaces(ucounts); +fail: + return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) @@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) return err; } -void free_user_ns(struct user_namespace *ns) +static void free_user_ns(struct work_struct *work) { - struct user_namespace *parent; + struct user_namespace *parent, *ns = + container_of(work, struct user_namespace, work); do { + struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); + dec_user_namespaces(ucounts); ns = parent; } while (atomic_dec_and_test(&parent->count)); } -EXPORT_SYMBOL(free_user_ns); + +void __put_user_ns(struct user_namespace *ns) +{ + schedule_work(&ns->work); +} +EXPORT_SYMBOL(__put_user_ns); static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { diff --git a/kernel/utsname.c b/kernel/utsname.c index e1211a8a5c18..6976cd47dcf6 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -17,6 +17,16 @@ #include <linux/user_namespace.h> #include <linux/proc_ns.h> +static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); +} + +static void dec_uts_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); +} + static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; @@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; + struct ucounts *ucounts; int err; + err = -ENOSPC; + ucounts = inc_uts_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; ns = create_uts_ns(); if (!ns) - return ERR_PTR(-ENOMEM); + goto fail_dec; err = ns_alloc_inum(&ns->ns); - if (err) { - kfree(ns); - return ERR_PTR(err); - } + if (err) + goto fail_free; + ns->ucounts = ucounts; ns->ns.ops = &utsns_operations; down_read(&uts_sem); @@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; + +fail_free: + kfree(ns); +fail_dec: + dec_uts_namespaces(ucounts); +fail: + return ERR_PTR(err); } /* @@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 861efa34f08c..e8be581b47b0 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -266,6 +266,16 @@ struct net *get_net_ns_by_id(struct net *net, int id) return peer; } +static struct ucounts *inc_net_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES); +} + +static void dec_net_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); +} + /* * setup_net runs the initializers for the network namespace object. */ @@ -351,19 +361,27 @@ void net_drop_ns(void *p) struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, struct net *old_net) { + struct ucounts *ucounts; struct net *net; int rv; if (!(flags & CLONE_NEWNET)) return get_net(old_net); + ucounts = inc_net_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + net = net_alloc(); - if (!net) + if (!net) { + dec_net_namespaces(ucounts); return ERR_PTR(-ENOMEM); + } get_user_ns(user_ns); mutex_lock(&net_mutex); + net->ucounts = ucounts; rv = setup_net(net, user_ns); if (rv == 0) { rtnl_lock(); @@ -372,6 +390,7 @@ struct net *copy_net_ns(unsigned long flags, } mutex_unlock(&net_mutex); if (rv < 0) { + dec_net_namespaces(ucounts); put_user_ns(user_ns); net_drop_ns(net); return ERR_PTR(rv); @@ -444,6 +463,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); + dec_net_namespaces(net->ucounts); put_user_ns(net->user_ns); net_drop_ns(net); } diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 46a71c701e7c..ba9b5d1a31df 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -27,9 +27,9 @@ #endif static struct ctl_table_set * -net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces) +net_ctl_header_lookup(struct ctl_table_root *root) { - return &namespaces->net_ns->sysctls; + return ¤t->nsproxy->net_ns->sysctls; } static int is_seen(struct ctl_table_set *set) |