From e3912ac37e07a13c70675cd75020694de4841c74 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:36:51 -0800 Subject: proc: use %u for pid printing and slightly less stack PROC_NUMBUF is 13 which is enough for "negative int + \n + \0". However PIDs and TGIDs are never negative and newline is not a concern, so use just 10 per integer. Link: http://lkml.kernel.org/r/20171120203005.GA27743@avx2 Signed-off-by: Alexey Dobriyan Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 16 ++++++++-------- fs/proc/fd.c | 2 +- fs/proc/self.c | 6 +++--- fs/proc/thread_self.c | 5 ++--- 4 files changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 60316b52d659..fe56f3c7002a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3018,11 +3018,11 @@ static const struct inode_operations proc_tgid_base_inode_operations = { static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) { struct dentry *dentry, *leader, *dir; - char buf[PROC_NUMBUF]; + char buf[10 + 1]; struct qstr name; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); + name.len = snprintf(buf, sizeof(buf), "%u", pid); /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { @@ -3034,7 +3034,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) return; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", tgid); + name.len = snprintf(buf, sizeof(buf), "%u", tgid); leader = d_hash_and_lookup(mnt->mnt_root, &name); if (!leader) goto out; @@ -3046,7 +3046,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) goto out_put_leader; name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%d", pid); + name.len = snprintf(buf, sizeof(buf), "%u", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { d_invalidate(dentry); @@ -3225,14 +3225,14 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; cond_resched(); if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) continue; - len = snprintf(name, sizeof(name), "%d", iter.tgid); + len = snprintf(name, sizeof(name), "%u", iter.tgid); ctx->pos = iter.tgid + TGID_OFFSET; if (!proc_fill_cache(file, ctx, name, len, proc_pid_instantiate, iter.task, NULL)) { @@ -3560,10 +3560,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; tid = task_pid_nr_ns(task, ns); - len = snprintf(name, sizeof(name), "%d", tid); + len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 96fc70225e54..6b80cd1e419a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -236,7 +236,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, for (fd = ctx->pos - 2; fd < files_fdtable(files)->max_fds; fd++, ctx->pos++) { - char name[PROC_NUMBUF]; + char name[10 + 1]; int len; if (!fcheck_files(files, fd)) diff --git a/fs/proc/self.c b/fs/proc/self.c index 31326bb23b8b..d30627aa440b 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -17,11 +17,11 @@ static const char *proc_self_get_link(struct dentry *dentry, if (!tgid) return ERR_PTR(-ENOENT); - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC); + /* max length of unsigned int in decimal + NULL term */ + name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d", tgid); + sprintf(name, "%u", tgid); set_delayed_call(done, kfree_link, name); return name; } diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index b813e3b529f2..6c1a54716337 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -18,11 +18,10 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, if (!pid) return ERR_PTR(-ENOENT); - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, - dentry ? GFP_KERNEL : GFP_ATOMIC); + name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); - sprintf(name, "%d/task/%d", tgid, pid); + sprintf(name, "%u/task/%u", tgid, pid); set_delayed_call(done, kfree_link, name); return name; } -- cgit v1.2.3 From 9f7118b2007d5e7c7a061550d2ca2ecb841537dc Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:36:55 -0800 Subject: proc: don't use READ_ONCE/WRITE_ONCE for /proc/*/fail-nth READ_ONCE and WRITE_ONCE are useless when there is only one read/write is being made. Link: http://lkml.kernel.org/r/20171120204033.GA9446@avx2 Signed-off-by: Alexey Dobriyan Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index fe56f3c7002a..373091249bdb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1370,7 +1370,7 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf, task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; - WRITE_ONCE(task->fail_nth, n); + task->fail_nth = n; put_task_struct(task); return count; @@ -1386,8 +1386,7 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, task = get_proc_task(file_inode(file)); if (!task) return -ESRCH; - len = snprintf(numbuf, sizeof(numbuf), "%u\n", - READ_ONCE(task->fail_nth)); + len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); len = simple_read_from_buffer(buf, count, ppos, numbuf, len); put_task_struct(task); -- cgit v1.2.3 From ac7f1061c2c11bb8936b1b6a94cdb48de732f7a4 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:36:59 -0800 Subject: proc: fix /proc/*/map_files lookup Current code does: if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) However sscanf() is broken garbage. It silently accepts whitespace between format specifiers (did you know that?). It silently accepts valid strings which result in integer overflow. Do not use sscanf() for any even remotely reliable parsing code. OK # readlink '/proc/1/map_files/55a23af39000-55a23b05b000' /lib/systemd/systemd broken # readlink '/proc/1/map_files/ 55a23af39000-55a23b05b000' /lib/systemd/systemd broken # readlink '/proc/1/map_files/55a23af39000-55a23b05b000 ' /lib/systemd/systemd very broken # readlink '/proc/1/map_files/1000000000000000055a23af39000-55a23b05b000' /lib/systemd/systemd Andrei said: : This patch breaks criu. It was a bug in criu. And this bug is on a minor : path, which works when memfd_create() isn't available. It is a reason why : I ask to not backport this patch to stable kernels. : : In CRIU this bug can be triggered, only if this patch will be backported : to a kernel which version is lower than v3.16. Link: http://lkml.kernel.org/r/20171120212706.GA14325@avx2 Signed-off-by: Alexey Dobriyan Cc: Pavel Emelyanov Cc: Andrei Vagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 373091249bdb..4c12cb2cd704 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -100,6 +100,8 @@ #include "internal.h" #include "fd.h" +#include "../../lib/kstrtox.h" + /* NOTE: * Implementing inode permission operations in /proc is almost * certainly an error. Permission checks need to happen during @@ -1906,8 +1908,33 @@ end_instantiate: static int dname_to_vma_addr(struct dentry *dentry, unsigned long *start, unsigned long *end) { - if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + const char *str = dentry->d_name.name; + unsigned long long sval, eval; + unsigned int len; + + len = _parse_integer(str, 16, &sval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (sval != (unsigned long)sval) + return -EINVAL; + str += len; + + if (*str != '-') return -EINVAL; + str++; + + len = _parse_integer(str, 16, &eval); + if (len & KSTRTOX_OVERFLOW) + return -EINVAL; + if (eval != (unsigned long)eval) + return -EINVAL; + str += len; + + if (*str != '\0') + return -EINVAL; + + *start = sval; + *end = eval; return 0; } -- cgit v1.2.3 From 593bc695a1102a540f1613c651e73693b17a7343 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:37:02 -0800 Subject: fs/proc/vmcore.c: simpler /proc/vmcore cleanup Iterators aren't necessary as you can just grab the first entry and delete it until no entries left. Link: http://lkml.kernel.org/r/20171121191121.GA20757@avx2 Signed-off-by: Alexey Dobriyan Cc: Mahesh Salgaonkar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 885d445afa0d..a45f0af22a60 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1178,18 +1178,16 @@ fs_initcall(vmcore_init); /* Cleanup function for vmcore module. */ void vmcore_cleanup(void) { - struct list_head *pos, *next; - if (proc_vmcore) { proc_remove(proc_vmcore); proc_vmcore = NULL; } /* clear the vmcore list. */ - list_for_each_safe(pos, next, &vmcore_list) { + while (!list_empty(&vmcore_list)) { struct vmcore *m; - m = list_entry(pos, struct vmcore, list); + m = list_first_entry(&vmcore_list, struct vmcore, list); list_del(&m->list); kfree(m); } -- cgit v1.2.3 From 20d28cde5558a2a211620254ec7bc53a4334167f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:37:06 -0800 Subject: proc: less memory for /proc/*/map_files readdir dentry name can be evaluated later, right before calling into VFS. Also, spend less time under ->mmap_sem. Link: http://lkml.kernel.org/r/20171110163034.GA2534@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 4c12cb2cd704..a3efc2427c74 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2026,9 +2026,9 @@ out: } struct map_files_info { + unsigned long start; + unsigned long end; fmode_t mode; - unsigned int len; - unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; /* @@ -2198,10 +2198,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) if (++pos <= ctx->pos) continue; + info.start = vma->vm_start; + info.end = vma->vm_end; info.mode = vma->vm_file->f_mode; - info.len = snprintf(info.name, - sizeof(info.name), "%lx-%lx", - vma->vm_start, vma->vm_end); if (flex_array_put(fa, i++, &info, GFP_KERNEL)) BUG(); } @@ -2209,9 +2208,13 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) up_read(&mm->mmap_sem); for (i = 0; i < nr_files; i++) { + char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ + unsigned int len; + p = flex_array_get(fa, i); + len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, - p->name, p->len, + buf, len, proc_map_files_instantiate, task, (void *)(unsigned long)p->mode)) -- cgit v1.2.3 From 171ef917dfe721b1437b0066f7bc5684d776bba8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:37:10 -0800 Subject: fs/proc/array.c: delete children_seq_release() It is 1:1 wrapper around seq_release(). Link: http://lkml.kernel.org/r/20171122171510.GA12161@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index d67a72dcb92c..598803576e4c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -736,16 +736,10 @@ static int children_seq_open(struct inode *inode, struct file *file) return ret; } -int children_seq_release(struct inode *inode, struct file *file) -{ - seq_release(inode, file); - return 0; -} - const struct file_operations proc_tid_children_operations = { .open = children_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = children_seq_release, + .release = seq_release, }; #endif /* CONFIG_PROC_CHILDREN */ -- cgit v1.2.3 From d0290bc20d4739b7a900ae37eb5d4cc3be2b393f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 6 Feb 2018 15:37:13 -0800 Subject: fs/proc/kcore.c: use probe_kernel_read() instead of memcpy() Commit df04abfd181a ("fs/proc/kcore.c: Add bounce buffer for ktext data") added a bounce buffer to avoid hardened usercopy checks. Copying to the bounce buffer was implemented with a simple memcpy() assuming that it is always valid to read from kernel memory iff the kern_addr_valid() check passed. A simple, but pointless, test case like "dd if=/proc/kcore of=/dev/null" now can easily crash the kernel, since the former execption handling on invalid kernel addresses now doesn't work anymore. Also adding a kern_addr_valid() implementation wouldn't help here. Most architectures simply return 1 here, while a couple implemented a page table walk to figure out if something is mapped at the address in question. With DEBUG_PAGEALLOC active mappings are established and removed all the time, so that relying on the result of kern_addr_valid() before executing the memcpy() also doesn't work. Therefore simply use probe_kernel_read() to copy to the bounce buffer. This also allows to simplify read_kcore(). At least on s390 this fixes the observed crashes and doesn't introduce warnings that were removed with df04abfd181a ("fs/proc/kcore.c: Add bounce buffer for ktext data"), even though the generic probe_kernel_read() implementation uses uaccess functions. While looking into this I'm also wondering if kern_addr_valid() could be completely removed...(?) Link: http://lkml.kernel.org/r/20171202132739.99971-1-heiko.carstens@de.ibm.com Fixes: df04abfd181a ("fs/proc/kcore.c: Add bounce buffer for ktext data") Fixes: f5509cc18daa ("mm: Hardened usercopy") Signed-off-by: Heiko Carstens Acked-by: Kees Cook Cc: Jiri Olsa Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 4bc85cb8be6a..e8a93bc8285d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -512,23 +512,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) return -EFAULT; } else { if (kern_addr_valid(start)) { - unsigned long n; - /* * Using bounce buffer to bypass the * hardened user copy kernel text checks. */ - memcpy(buf, (char *) start, tsz); - n = copy_to_user(buffer, buf, tsz); - /* - * We cannot distinguish between fault on source - * and fault on destination. When this happens - * we clear too and hope it will trigger the - * EFAULT again. - */ - if (n) { - if (clear_user(buffer + tsz - n, - n)) + if (probe_kernel_read(buf, (void *) start, tsz)) { + if (clear_user(buffer, tsz)) + return -EFAULT; + } else { + if (copy_to_user(buffer, buf, tsz)) return -EFAULT; } } else { -- cgit v1.2.3 From 163cf548db888710695d5dbe907cda4262d45b52 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:37:18 -0800 Subject: fs/proc/internal.h: rearrange struct proc_dir_entry struct proc_dir_entry became bit messy over years: * move 16-bit ->mode_t before namelen to get rid of padding * make ->in_use first field: it seems to be most used resulting in smaller code on x86_64 (defconfig): add/remove: 0/0 grow/shrink: 7/13 up/down: 24/-67 (-43) Function old new delta proc_readdir_de 451 455 +4 proc_get_inode 282 286 +4 pde_put 65 69 +4 remove_proc_subtree 294 297 +3 remove_proc_entry 297 300 +3 proc_register 295 298 +3 proc_notify_change 94 97 +3 unuse_pde 27 26 -1 proc_reg_write 89 85 -4 proc_reg_unlocked_ioctl 85 81 -4 proc_reg_read 89 85 -4 proc_reg_llseek 87 83 -4 proc_reg_get_unmapped_area 123 119 -4 proc_entry_rundown 139 135 -4 proc_reg_poll 91 85 -6 proc_reg_mmap 79 73 -6 proc_get_link 55 49 -6 proc_reg_release 108 101 -7 proc_reg_open 298 291 -7 close_pdeo 228 218 -10 * move writeable fields together to a first cacheline (on x86_64), those include * ->in_use: reference count, taken every open/read/write/close etc * ->count: reference count, taken at readdir on every entry * ->pde_openers: tracks (nearly) every open, dirtied * ->pde_unload_lock: spinlock protecting ->pde_openers * ->proc_iops, ->proc_fops, ->data: writeonce fields, used right together with previous group. * other rarely written fields go into 1st/2nd and 2nd/3rd cacheline on 32-bit and 64-bit respectively. Additionally on 32-bit, ->subdir, ->subdir_node, ->namelen, ->name go fully into 2nd cacheline, separated from writeable fields. They are all used during lookup. Link: http://lkml.kernel.org/r/20171220215914.GA7877@avx2 Signed-off-by: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 4a67188c8d74..a290a1e921a5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -31,24 +31,27 @@ struct mempolicy; * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { + /* + * number of callers into module in progress; + * negative -> it's going away RSN + */ + atomic_t in_use; + atomic_t count; /* use count */ + struct list_head pde_openers; /* who did ->open, but not ->release */ + spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + struct completion *pde_unload_completion; + const struct inode_operations *proc_iops; + const struct file_operations *proc_fops; + void *data; unsigned int low_ino; - umode_t mode; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; - const struct inode_operations *proc_iops; - const struct file_operations *proc_fops; struct proc_dir_entry *parent; struct rb_root_cached subdir; struct rb_node subdir_node; - void *data; - atomic_t count; /* use count */ - atomic_t in_use; /* number of callers into module in progress; */ - /* negative -> it's going away RSN */ - struct completion *pde_unload_completion; - struct list_head pde_openers; /* who did ->open, but not ->release */ - spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + umode_t mode; u8 namelen; char name[]; } __randomize_layout; -- cgit v1.2.3 From 53f63345d893df36b58e81ddb3d11dcd2e9cc966 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:37:21 -0800 Subject: fs/proc/internal.h: fix up comment Document what ->pde_unload_lock actually does. Link: http://lkml.kernel.org/r/20180103185120.GB31849@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a290a1e921a5..5ba317874f0d 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -38,7 +38,8 @@ struct proc_dir_entry { atomic_t in_use; atomic_t count; /* use count */ struct list_head pde_openers; /* who did ->open, but not ->release */ - spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */ + /* protects ->pde_openers and all struct pde_opener instances */ + spinlock_t pde_unload_lock; struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; -- cgit v1.2.3 From efb1a57d90cae6af1ddd32f1b920c924a711aba5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:37:24 -0800 Subject: fs/proc: use __ro_after_init /proc/self inode numbers, value of proc_inode_cache and st_nlink of /proc/$TGID are fixed constants. Link: http://lkml.kernel.org/r/20180103184707.GA31849@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 5 +++-- fs/proc/inode.c | 3 ++- fs/proc/self.c | 3 ++- fs/proc/thread_self.c | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index a3efc2427c74..9298324325ed 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include @@ -112,8 +113,8 @@ * in /proc for a task before it execs a suid executable. */ -static u8 nlink_tid; -static u8 nlink_tgid; +static u8 nlink_tid __ro_after_init; +static u8 nlink_tgid __ro_after_init; struct pid_entry { const char *name; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8dacaabb9f37..c5c8e7af5520 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -5,6 +5,7 @@ * Copyright (C) 1991, 1992 Linus Torvalds */ +#include #include #include #include @@ -52,7 +53,7 @@ static void proc_evict_inode(struct inode *inode) } } -static struct kmem_cache * proc_inode_cachep; +static struct kmem_cache *proc_inode_cachep __ro_after_init; static struct inode *proc_alloc_inode(struct super_block *sb) { diff --git a/fs/proc/self.c b/fs/proc/self.c index d30627aa440b..4d7d061696b3 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -30,7 +31,7 @@ static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; -static unsigned self_inum; +static unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 6c1a54716337..9d2efaca499f 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -30,7 +31,7 @@ static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; -static unsigned thread_self_inum; +static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { -- cgit v1.2.3 From 15b158b4e6274351fc3cf652cbabc57104efb547 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:37:28 -0800 Subject: proc: spread likely/unlikely a bit use_pde() is used at every open/read/write/... of every random /proc file. Negative refcount happens only if PDE is being deleted by module (read: never). So it gets "likely". unuse_pde() gets "unlikely" for the same reason. close_pdeo() gets unlikely as the completion is filled only if there is a race between PDE removal and close() (read: never ever). It even saves code on x86_64 defconfig: add/remove: 0/0 grow/shrink: 1/2 up/down: 2/-20 (-18) Function old new delta close_pdeo 183 185 +2 proc_reg_get_unmapped_area 119 111 -8 proc_reg_poll 85 73 -12 Link: http://lkml.kernel.org/r/20180104175657.GA5204@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index c5c8e7af5520..6e8724958116 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -129,12 +129,12 @@ enum {BIAS = -1U<<31}; static inline int use_pde(struct proc_dir_entry *pde) { - return atomic_inc_unless_negative(&pde->in_use); + return likely(atomic_inc_unless_negative(&pde->in_use)); } static void unuse_pde(struct proc_dir_entry *pde) { - if (atomic_dec_return(&pde->in_use) == BIAS) + if (unlikely(atomic_dec_return(&pde->in_use) == BIAS)) complete(pde->pde_unload_completion); } @@ -167,7 +167,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); - if (pdeo->c) + if (unlikely(pdeo->c)) complete(pdeo->c); kfree(pdeo); } @@ -421,7 +421,7 @@ static const char *proc_get_link(struct dentry *dentry, struct delayed_call *done) { struct proc_dir_entry *pde = PDE(inode); - if (unlikely(!use_pde(pde))) + if (!use_pde(pde)) return ERR_PTR(-EINVAL); set_delayed_call(done, proc_put_link, pde); return pde->data; -- cgit v1.2.3 From 93ad5bc6d4addb74e30d421cd3ba5249c961fb3e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:37:31 -0800 Subject: proc: rearrange args Rearrange args for smaller code. lookup revolves around memcmp() which gets len 3rd arg, so propagate length as 3rd arg. readdir and lookup add additional arg to VFS ->readdir and ->lookup, so better add it to the end. Space savings on x86_64: add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-18 (-18) Function old new delta proc_readdir 22 13 -9 proc_lookup 18 9 -9 proc_match() is smaller if not inlined, I promise! Link: http://lkml.kernel.org/r/20180104175958.GB5204@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 18 +++++++++--------- fs/proc/internal.h | 5 ++--- fs/proc/proc_net.c | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 793a67574668..5d709fa8f3a2 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -28,7 +28,7 @@ static DEFINE_RWLOCK(proc_subdir_lock); -static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) +static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) return -1; @@ -60,7 +60,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, struct proc_dir_entry *de = rb_entry(node, struct proc_dir_entry, subdir_node); - int result = proc_match(len, name, de); + int result = proc_match(name, de, len); if (result < 0) node = node->rb_left; @@ -84,7 +84,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *this = rb_entry(*new, struct proc_dir_entry, subdir_node); - int result = proc_match(de->namelen, de->name, this); + int result = proc_match(de->name, this, de->namelen); parent = *new; if (result < 0) @@ -211,8 +211,8 @@ void proc_free_inum(unsigned int inum) * Don't create negative dentries here, return -ENOENT by hand * instead. */ -struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, - struct dentry *dentry) +struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, + struct proc_dir_entry *de) { struct inode *inode; @@ -235,7 +235,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - return proc_lookup_de(PDE(dir), dir, dentry); + return proc_lookup_de(dir, dentry, PDE(dir)); } /* @@ -247,8 +247,8 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, * value of the readdir() call, as long as it's non-negative * for success.. */ -int proc_readdir_de(struct proc_dir_entry *de, struct file *file, - struct dir_context *ctx) +int proc_readdir_de(struct file *file, struct dir_context *ctx, + struct proc_dir_entry *de) { int i; @@ -292,7 +292,7 @@ int proc_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - return proc_readdir_de(PDE(inode), file, ctx); + return proc_readdir_de(file, ctx, PDE(inode)); } /* diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5ba317874f0d..d697c8ab0a14 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -153,10 +153,9 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i * generic.c */ extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); -extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, - struct dentry *); +struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); -extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *); +int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a2bf369c923d..68c06ae7888c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -135,7 +135,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { - de = proc_lookup_de(net->proc_net, dir, dentry); + de = proc_lookup_de(dir, dentry, net->proc_net); put_net(net); } return de; @@ -172,7 +172,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) ret = -EINVAL; net = get_proc_task_net(file_inode(file)); if (net != NULL) { - ret = proc_readdir_de(net->proc_net, file, ctx); + ret = proc_readdir_de(file, ctx, net->proc_net); put_net(net); } return ret; -- cgit v1.2.3 From 4bf8ba811ac1102d7de6f73af3b9f323463e16c0 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 6 Feb 2018 15:37:34 -0800 Subject: fs/proc/consoles.c: use seq_putc() in show_console_dev() A single character (line break) should be put into a sequence. Thus use the corresponding function "seq_putc". This issue was detected by using the Coccinelle software. Link: http://lkml.kernel.org/r/04fb69fe-d820-9141-820f-07e9a48f4635@users.sourceforge.net Signed-off-by: Markus Elfring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/consoles.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index 290ba85cb900..a8ac48aebd59 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -55,8 +55,7 @@ static int show_console_dev(struct seq_file *m, void *v) if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); - seq_printf(m, "\n"); - + seq_putc(m, '\n'); return 0; } -- cgit v1.2.3 From 60c9d92f887f4606d363fece7a36c92664dc64c6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 6 Feb 2018 15:39:13 -0800 Subject: elf: fix NT_FILE integer overflow If vm.max_map_count bumped above 2^26 (67+ mil) and system has enough RAM to allocate all the VMAs (~12.8 GB on Fedora 27 with 200-byte VMAs), then it should be possible to overflow 32-bit "size", pass paranoia check, allocate very little vmalloc space and oops while writing into vmalloc guard page... But I didn't test this, only coredump of regular process. Link: http://lkml.kernel.org/r/20180112203427.GA9109@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 83732fef510d..bdb201230bae 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1599,6 +1599,8 @@ static int fill_files_note(struct memelfnote *note) /* *Estimated* file count and total data size needed */ count = current->mm->map_count; + if (count > UINT_MAX / 64) + return -EINVAL; size = count * 64; names_ofs = (2 + 3 * count) * sizeof(data[0]); -- cgit v1.2.3 From fb04b91bc2c3a83e9e2ba9c5ce0f0124dd3ffef0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 6 Feb 2018 15:39:21 -0800 Subject: nilfs2: use time64_t internally The superblock and segment timestamps are used only internally in nilfs2 and can be read out using sysfs. Since we are using the old 'get_seconds()' interface and store the data as timestamps, the behavior differs slightly between 64-bit and 32-bit kernels, the latter will show incorrect timestamps after 2038 in sysfs, and presumably fail completely in 2106 as comparisons go wrong. This changes nilfs2 to use time64_t with ktime_get_real_seconds() to handle timestamps, making the behavior consistent and correct on both 32-bit and 64-bit machines. The on-disk format already uses 64-bit timestamps, so nothing changes there. Link: http://lkml.kernel.org/r/20180122211050.1286441-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Ryusuke Konishi Cc: Jens Axboe Cc: Ingo Molnar Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segbuf.c | 2 +- fs/nilfs2/segbuf.h | 4 ++-- fs/nilfs2/segment.c | 2 +- fs/nilfs2/segment.h | 2 +- fs/nilfs2/sufile.c | 2 +- fs/nilfs2/sufile.h | 2 +- fs/nilfs2/super.c | 4 ++-- fs/nilfs2/sysfs.c | 21 ++++++++++----------- fs/nilfs2/the_nilfs.h | 8 ++++---- 9 files changed, 23 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 6c5009cc4e6f..68cb9e4740b4 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -130,7 +130,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, } int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags, - time_t ctime, __u64 cno) + time64_t ctime, __u64 cno) { int err; diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 7bbccc099709..10e16935fff6 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -46,7 +46,7 @@ struct nilfs_segsum_info { unsigned long nfileblk; u64 seg_seq; __u64 cno; - time_t ctime; + time64_t ctime; sector_t next; }; @@ -120,7 +120,7 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); -int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t, +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time64_t, __u64); int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 9f3ffba41533..0953635e7d48 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2040,7 +2040,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) goto out; /* Update time stamp */ - sci->sc_seg_ctime = get_seconds(); + sci->sc_seg_ctime = ktime_get_real_seconds(); err = nilfs_segctor_collect(sci, nilfs, mode); if (unlikely(err)) diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 84084a4d9b3e..04634e3e3d58 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -157,7 +157,7 @@ struct nilfs_sc_info { unsigned long sc_blk_cnt; unsigned long sc_datablk_cnt; unsigned long sc_nblk_this_inc; - time_t sc_seg_ctime; + time64_t sc_seg_ctime; __u64 sc_cno; unsigned long sc_flags; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1341a41e7b43..c7fa139d50e8 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -526,7 +526,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) * @modtime: modification time (option) */ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, - unsigned long nblocks, time_t modtime) + unsigned long nblocks, time64_t modtime) { struct buffer_head *bh; struct nilfs_segment_usage *su; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 158a9190c8ec..673a891350f4 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -35,7 +35,7 @@ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); int nilfs_sufile_alloc(struct inode *, __u64 *); int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, - unsigned long nblocks, time_t modtime); + unsigned long nblocks, time64_t modtime); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int, size_t); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3073b646e1ba..6ffeca84d7c3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -283,10 +283,10 @@ int nilfs_commit_super(struct super_block *sb, int flag) { struct the_nilfs *nilfs = sb->s_fs_info; struct nilfs_super_block **sbp = nilfs->ns_sbp; - time_t t; + time64_t t; /* nilfs->ns_sem must be locked by the caller. */ - t = get_seconds(); + t = ktime_get_real_seconds(); nilfs->ns_sbwtime = t; sbp[0]->s_wtime = cpu_to_le64(t); sbp[0]->s_sum = 0; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 490303e3d517..4b25837e7724 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -31,7 +31,7 @@ static struct kset *nilfs_kset; #define NILFS_SHOW_TIME(time_t_val, buf) ({ \ struct tm res; \ int count = 0; \ - time_to_tm(time_t_val, 0, &res); \ + time64_to_tm(time_t_val, 0, &res); \ res.tm_year += 1900; \ res.tm_mon += 1; \ count = scnprintf(buf, PAGE_SIZE, \ @@ -579,7 +579,7 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t ctime; + time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; @@ -593,13 +593,13 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t ctime; + time64_t ctime; down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); + return snprintf(buf, PAGE_SIZE, "%llu\n", ctime); } static ssize_t @@ -607,7 +607,7 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t nongc_ctime; + time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; @@ -621,14 +621,13 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t nongc_ctime; + time64_t nongc_ctime; down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)nongc_ctime); + return snprintf(buf, PAGE_SIZE, "%llu\n", nongc_ctime); } static ssize_t @@ -728,7 +727,7 @@ nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t sbwtime; + time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; @@ -742,13 +741,13 @@ nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - time_t sbwtime; + time64_t sbwtime; down_read(&nilfs->ns_sem); sbwtime = nilfs->ns_sbwtime; up_read(&nilfs->ns_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime); + return snprintf(buf, PAGE_SIZE, "%llu\n", sbwtime); } static ssize_t diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 883d732b0259..36da1779f976 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -116,7 +116,7 @@ struct the_nilfs { */ struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; - time_t ns_sbwtime; + time64_t ns_sbwtime; unsigned int ns_sbwcount; unsigned int ns_sbsize; unsigned int ns_mount_state; @@ -131,8 +131,8 @@ struct the_nilfs { __u64 ns_nextnum; unsigned long ns_pseg_offset; __u64 ns_cno; - time_t ns_ctime; - time_t ns_nongc_ctime; + time64_t ns_ctime; + time64_t ns_nongc_ctime; atomic_t ns_ndirtyblks; /* @@ -267,7 +267,7 @@ struct nilfs_root { static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { - u64 t = get_seconds(); + u64 t = ktime_get_real_seconds(); return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; -- cgit v1.2.3 From b0cd38c7f54c9176f78cce2e2b23e1513bf5a224 Mon Sep 17 00:00:00 2001 From: "Ernesto A. Fernandez" Date: Tue, 6 Feb 2018 15:49:02 -0800 Subject: hfsplus: honor setgid flag on directories When creating a file inside a directory that has the setgid flag set, give the new file the group ID of the parent, and also the setgid flag if it is a directory itself. Link: http://lkml.kernel.org/r/20171204192705.GA6101@debian.home Signed-off-by: Ernesto A. Fernandez Reviewed-by: Vyacheslav Dubeyko Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/dir.c | 4 ++-- fs/hfsplus/hfsplus_fs.h | 3 ++- fs/hfsplus/inode.c | 7 +++---- fs/hfsplus/super.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index e8120a282435..15e06fb552da 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -444,7 +444,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); - inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); + inode = hfsplus_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO); if (!inode) goto out; @@ -486,7 +486,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); - inode = hfsplus_new_inode(dir->i_sb, mode); + inode = hfsplus_new_inode(dir->i_sb, dir, mode); if (!inode) goto out; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a015044daa05..d9255abafb81 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -478,7 +478,8 @@ extern const struct address_space_operations hfsplus_aops; extern const struct address_space_operations hfsplus_btree_aops; extern const struct dentry_operations hfsplus_dentry_operations; -struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode); +struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode); void hfsplus_delete_inode(struct inode *inode); void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 190c60efbc99..c0c8d433864f 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -354,7 +354,8 @@ static const struct file_operations hfsplus_file_operations = { .unlocked_ioctl = hfsplus_ioctl, }; -struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) +struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, + umode_t mode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); @@ -364,9 +365,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) return NULL; inode->i_ino = sbi->next_cnid++; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 1d458b716957..513c357c734b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -549,7 +549,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!sbi->hidden_dir) { mutex_lock(&sbi->vh_mutex); - sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR); + sbi->hidden_dir = hfsplus_new_inode(sb, root, S_IFDIR); if (!sbi->hidden_dir) { mutex_unlock(&sbi->vh_mutex); err = -ENOMEM; -- cgit v1.2.3 From 1a60e4d516defb5808967a8b42ac6675a3f05085 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 6 Feb 2018 15:40:59 -0800 Subject: vfs: remove might_sleep() from clear_inode() Commit 7994e6f72543 ("vfs: Move waiting for inode writeback from end_writeback() to evict_inode()") removed inode_sync_wait() from end_writeback() and commit dbd5768f87ff ("vfs: Rename end_writeback() to clear_inode()") renamed end_writeback() to clear_inode(). After these patches there is no sleeping operation in clear_inode(). So, remove might_sleep() from it. Link: http://lkml.kernel.org/r/20171108004354.40308-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Alexander Viro Cc: Greg Thelen Cc: Jan Kara Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index e2ca0f4b5151..ef362364d396 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -498,7 +498,6 @@ EXPORT_SYMBOL(__remove_inode_hash); void clear_inode(struct inode *inode) { - might_sleep(); /* * We have to cycle tree_lock here because reclaim can be still in the * process of removing the last page (in __delete_from_page_cache()) -- cgit v1.2.3 From 4c2e4befb3cc9ce42d506aa537c9ab504723e98c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 6 Feb 2018 15:41:45 -0800 Subject: pipe, sysctl: drop 'min' parameter from pipe-max-size converter Patch series "pipe: buffer limits fixes and cleanups", v2. This series simplifies the sysctl handler for pipe-max-size and fixes another set of bugs related to the pipe buffer limits: - The root user wasn't allowed to exceed the limits when creating new pipes. - There was an off-by-one error when checking the limits, so a limit of N was actually treated as N - 1. - F_SETPIPE_SZ accepted values over UINT_MAX. - Reading the pipe buffer limits could be racy. This patch (of 7): Before validating the given value against pipe_min_size, do_proc_dopipe_max_size_conv() calls round_pipe_size(), which rounds the value up to pipe_min_size. Therefore, the second check against pipe_min_size is redundant. Remove it. Link: http://lkml.kernel.org/r/20180111052902.14409-2-ebiggers3@gmail.com Signed-off-by: Eric Biggers Acked-by: Kees Cook Acked-by: Joe Lawrence Cc: Alexander Viro Cc: "Luis R . Rodriguez" Cc: Michael Kerrisk Cc: Mikulas Patocka Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 10 +++------- include/linux/pipe_fs_i.h | 2 +- kernel/sysctl.c | 15 +-------------- 3 files changed, 5 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index a449ca0ec0c6..1f6ec1051e59 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -35,11 +35,6 @@ */ unsigned int pipe_max_size = 1048576; -/* - * Minimum pipe size, as required by POSIX - */ -unsigned int pipe_min_size = PAGE_SIZE; - /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ @@ -1024,8 +1019,9 @@ unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; - if (size < pipe_min_size) - size = pipe_min_size; + /* Minimum pipe size, as required by POSIX */ + if (size < PAGE_SIZE) + size = PAGE_SIZE; nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; if (nr_pages == 0) diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 2dc5e9870fcd..7d9beda14584 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -167,7 +167,7 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_size, pipe_min_size; +extern unsigned int pipe_max_size; extern unsigned long pipe_user_pages_hard; extern unsigned long pipe_user_pages_soft; int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2fb4e27c636a..f21375aa6cf6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1813,7 +1813,6 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = &pipe_proc_fn, - .extra1 = &pipe_min_size, }, { .procname = "pipe-user-pages-hard", @@ -2615,16 +2614,10 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, ¶m); } -struct do_proc_dopipe_max_size_conv_param { - unsigned int *min; -}; - static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { - struct do_proc_dopipe_max_size_conv_param *param = data; - if (write) { unsigned int val; @@ -2635,9 +2628,6 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, if (val == 0) return -EINVAL; - if (param->min && *param->min > val) - return -ERANGE; - *valp = val; } else { unsigned int val = *valp; @@ -2650,11 +2640,8 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - struct do_proc_dopipe_max_size_conv_param param = { - .min = (unsigned int *) table->extra1, - }; return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, ¶m); + do_proc_dopipe_max_size_conv, NULL); } static void validate_coredump_safety(void) -- cgit v1.2.3 From 319e0a21bb7823abbb4818fe2724e572bbac77a2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 6 Feb 2018 15:41:49 -0800 Subject: pipe, sysctl: remove pipe_proc_fn() pipe_proc_fn() is no longer needed, as it only calls through to proc_dopipe_max_size(). Just put proc_dopipe_max_size() in the ctl_table entry directly, and remove the unneeded EXPORT_SYMBOL() and the ENOSYS stub for it. (The reason the ENOSYS stub isn't needed is that the pipe-max-size ctl_table entry is located directly in 'kern_table' rather than being registered separately. Therefore, the entry is already only defined when the kernel is built with sysctl support.) Link: http://lkml.kernel.org/r/20180111052902.14409-3-ebiggers3@gmail.com Signed-off-by: Eric Biggers Acked-by: Kees Cook Acked-by: Joe Lawrence Cc: Alexander Viro Cc: "Luis R . Rodriguez" Cc: Michael Kerrisk Cc: Mikulas Patocka Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 10 ---------- include/linux/pipe_fs_i.h | 1 - include/linux/sysctl.h | 3 --- kernel/sysctl.c | 15 +++++---------- 4 files changed, 5 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 1f6ec1051e59..61adbf0f5b53 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1120,16 +1120,6 @@ out_revert_acct: return ret; } -/* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size - * will return an error. - */ -int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, - size_t *lenp, loff_t *ppos) -{ - return proc_dopipe_max_size(table, write, buf, lenp, ppos); -} - /* * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same * location, so checking ->i_pipe is not enough to verify that this is a diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 7d9beda14584..5028bd4b2c96 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -170,7 +170,6 @@ void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); extern unsigned int pipe_max_size; extern unsigned long pipe_user_pages_hard; extern unsigned long pipe_user_pages_soft; -int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 992bc9948232..b769ecfcc3bd 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -51,9 +51,6 @@ extern int proc_dointvec_minmax(struct ctl_table *, int, extern int proc_douintvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -extern int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f21375aa6cf6..59f09ea3ca3e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -218,6 +218,8 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ @@ -1812,7 +1814,7 @@ static struct ctl_table fs_table[] = { .data = &pipe_max_size, .maxlen = sizeof(pipe_max_size), .mode = 0644, - .proc_handler = &pipe_proc_fn, + .proc_handler = proc_dopipe_max_size, }, { .procname = "pipe-user-pages-hard", @@ -2637,8 +2639,8 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, return 0; } -int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); @@ -3147,12 +3149,6 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } -int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3196,7 +3192,6 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); -EXPORT_SYMBOL_GPL(proc_dopipe_max_size); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); -- cgit v1.2.3 From 85c2dd5473b2718b4b63e74bfeb1ca876868e11f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 6 Feb 2018 15:41:53 -0800 Subject: pipe: actually allow root to exceed the pipe buffer limits pipe-user-pages-hard and pipe-user-pages-soft are only supposed to apply to unprivileged users, as documented in both Documentation/sysctl/fs.txt and the pipe(7) man page. However, the capabilities are actually only checked when increasing a pipe's size using F_SETPIPE_SZ, not when creating a new pipe. Therefore, if pipe-user-pages-hard has been set, the root user can run into it and be unable to create pipes. Similarly, if pipe-user-pages-soft has been set, the root user can run into it and have their pipes limited to 1 page each. Fix this by allowing the privileged override in both cases. Link: http://lkml.kernel.org/r/20180111052902.14409-4-ebiggers3@gmail.com Fixes: 759c01142a5d ("pipe: limit the per-user amount of pages allocated in pipes") Signed-off-by: Eric Biggers Acked-by: Kees Cook Acked-by: Joe Lawrence Cc: Alexander Viro Cc: "Luis R . Rodriguez" Cc: Michael Kerrisk Cc: Mikulas Patocka Cc: Willy Tarreau Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 61adbf0f5b53..04acfad4692b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -613,6 +613,11 @@ static bool too_many_pipe_buffers_hard(unsigned long user_bufs) return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard; } +static bool is_unprivileged_user(void) +{ + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); +} + struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; @@ -629,12 +634,12 @@ struct pipe_inode_info *alloc_pipe_info(void) user_bufs = account_pipe_buffers(user, 0, pipe_bufs); - if (too_many_pipe_buffers_soft(user_bufs)) { + if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, 1); pipe_bufs = 1; } - if (too_many_pipe_buffers_hard(user_bufs)) + if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), @@ -1065,7 +1070,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) if (nr_pages > pipe->buffers && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && - !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + is_unprivileged_user()) { ret = -EPERM; goto out_revert_acct; } -- cgit v1.2.3 From 9903a91c763ecdae333a04a9d89d79d2b8966503 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 6 Feb 2018 15:41:56 -0800 Subject: pipe: fix off-by-one error when checking buffer limits With pipe-user-pages-hard set to 'N', users were actually only allowed up to 'N - 1' buffers; and likewise for pipe-user-pages-soft. Fix this to allow up to 'N' buffers, as would be expected. Link: http://lkml.kernel.org/r/20180111052902.14409-5-ebiggers3@gmail.com Fixes: b0b91d18e2e9 ("pipe: fix limit checking in pipe_set_size()") Signed-off-by: Eric Biggers Acked-by: Willy Tarreau Acked-by: Kees Cook Acked-by: Joe Lawrence Cc: Alexander Viro Cc: "Luis R . Rodriguez" Cc: Michael Kerrisk Cc: Mikulas Patocka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 04acfad4692b..46c30ac777da 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -605,12 +605,12 @@ static unsigned long account_pipe_buffers(struct user_struct *user, static bool too_many_pipe_buffers_soft(unsigned long user_bufs) { - return pipe_user_pages_soft && user_bufs >= pipe_user_pages_soft; + return pipe_user_pages_soft && user_bufs > pipe_user_pages_soft; } static bool too_many_pipe_buffers_hard(unsigned long user_bufs) { - return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard; + return pipe_user_pages_hard && user_bufs > pipe_user_pages_hard; } static bool is_unprivileged_user(void) -- cgit v1.2.3 From 96e99be40e4cff870a83233731121ec0f7f95075 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 6 Feb 2018 15:42:00 -0800 Subject: pipe: reject F_SETPIPE_SZ with size over UINT_MAX A pipe's size is represented as an 'unsigned int'. As expected, writing a value greater than UINT_MAX to /proc/sys/fs/pipe-max-size fails with EINVAL. However, the F_SETPIPE_SZ fcntl silently truncates such values to 32 bits, rather than failing with EINVAL as expected. (It *does* fail with EINVAL for values above (1 << 31) but <= UINT_MAX.) Fix this by moving the check against UINT_MAX into round_pipe_size() which is called in both cases. Link: http://lkml.kernel.org/r/20180111052902.14409-6-ebiggers3@gmail.com Signed-off-by: Eric Biggers Acked-by: Kees Cook Acked-by: Joe Lawrence Cc: Alexander Viro Cc: "Luis R . Rodriguez" Cc: Michael Kerrisk Cc: Mikulas Patocka Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 5 ++++- include/linux/pipe_fs_i.h | 2 +- kernel/sysctl.c | 3 --- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 46c30ac777da..817393d36244 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1020,10 +1020,13 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned long size) { unsigned long nr_pages; + if (size > UINT_MAX) + return 0; + /* Minimum pipe size, as required by POSIX */ if (size < PAGE_SIZE) size = PAGE_SIZE; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 5028bd4b2c96..5a3bb3b7c9ad 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -190,6 +190,6 @@ long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file); int create_pipe_files(struct file **, int); -unsigned int round_pipe_size(unsigned int size); +unsigned int round_pipe_size(unsigned long size); #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 59f09ea3ca3e..f98f28c12020 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2623,9 +2623,6 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, if (write) { unsigned int val; - if (*lvalp > UINT_MAX) - return -EINVAL; - val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; -- cgit v1.2.3 From c4fed5a91fadc8a277b1eda474317b501651dd3e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 6 Feb 2018 15:42:05 -0800 Subject: pipe: simplify round_pipe_size() round_pipe_size() calculates the number of pages the requested size corresponds to, then rounds the page count up to the next power of 2. However, it also rounds everything < PAGE_SIZE up to PAGE_SIZE. Therefore, there's no need to actually translate the size into a page count; we just need to round the size up to the next power of 2. We do need to verify the size isn't greater than (1 << 31), since on 32-bit systems roundup_pow_of_two() would be undefined in that case. But that can just be combined with the UINT_MAX check which we need anyway now. Finally, update pipe_set_size() to not redundantly check the return value of round_pipe_size() for the "invalid size" case twice. Link: http://lkml.kernel.org/r/20180111052902.14409-7-ebiggers3@gmail.com Signed-off-by: Eric Biggers Acked-by: Kees Cook Acked-by: Joe Lawrence Cc: Alexander Viro Cc: "Luis R . Rodriguez" Cc: Michael Kerrisk Cc: Mikulas Patocka Cc: Willy Tarreau Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 817393d36244..8be52158c400 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1022,20 +1022,14 @@ const struct file_operations pipefifo_fops = { */ unsigned int round_pipe_size(unsigned long size) { - unsigned long nr_pages; - - if (size > UINT_MAX) + if (size > (1U << 31)) return 0; /* Minimum pipe size, as required by POSIX */ if (size < PAGE_SIZE) - size = PAGE_SIZE; - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages == 0) - return 0; + return PAGE_SIZE; - return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; + return roundup_pow_of_two(size); } /* @@ -1050,8 +1044,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) long ret = 0; size = round_pipe_size(arg); - if (size == 0) - return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) -- cgit v1.2.3 From f7340761812fc10313e6fcc115e0bc4f7a799112 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 6 Feb 2018 15:42:08 -0800 Subject: pipe: read buffer limits atomically The pipe buffer limits are accessed without any locking, and may be changed at any time by the sysctl handlers. In theory this could cause problems for expressions like the following: pipe_user_pages_hard && user_bufs > pipe_user_pages_hard ... since the assembly code might reference the 'pipe_user_pages_hard' memory location multiple times, and if the admin removes the limit by setting it to 0, there is a very brief window where processes could incorrectly observe the limit to be exceeded. Fix this by loading the limits with READ_ONCE() prior to use. Link: http://lkml.kernel.org/r/20180111052902.14409-8-ebiggers3@gmail.com Signed-off-by: Eric Biggers Acked-by: Kees Cook Acked-by: Joe Lawrence Cc: Alexander Viro Cc: Michael Kerrisk Cc: Willy Tarreau Cc: Mikulas Patocka Cc: "Luis R . Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 8be52158c400..0913aed7fd0d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -605,12 +605,16 @@ static unsigned long account_pipe_buffers(struct user_struct *user, static bool too_many_pipe_buffers_soft(unsigned long user_bufs) { - return pipe_user_pages_soft && user_bufs > pipe_user_pages_soft; + unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); + + return soft_limit && user_bufs > soft_limit; } static bool too_many_pipe_buffers_hard(unsigned long user_bufs) { - return pipe_user_pages_hard && user_bufs > pipe_user_pages_hard; + unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); + + return hard_limit && user_bufs > hard_limit; } static bool is_unprivileged_user(void) @@ -624,13 +628,14 @@ struct pipe_inode_info *alloc_pipe_info(void) unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); unsigned long user_bufs; + unsigned int max_size = READ_ONCE(pipe_max_size); pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe == NULL) goto out_free_uid; - if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE)) - pipe_bufs = pipe_max_size >> PAGE_SHIFT; + if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE)) + pipe_bufs = max_size >> PAGE_SHIFT; user_bufs = account_pipe_buffers(user, 0, pipe_bufs); -- cgit v1.2.3