From e3912ac37e07a13c70675cd75020694de4841c74 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:36:51 -0800
Subject: proc: use %u for pid printing and slightly less stack

PROC_NUMBUF is 13 which is enough for "negative int + \n + \0".

However PIDs and TGIDs are never negative and newline is not a concern,
so use just 10 per integer.

Link: http://lkml.kernel.org/r/20171120203005.GA27743@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alexander Viro <viro@ftp.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c        | 16 ++++++++--------
 fs/proc/fd.c          |  2 +-
 fs/proc/self.c        |  6 +++---
 fs/proc/thread_self.c |  5 ++---
 4 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 60316b52d659..fe56f3c7002a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3018,11 +3018,11 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 {
 	struct dentry *dentry, *leader, *dir;
-	char buf[PROC_NUMBUF];
+	char buf[10 + 1];
 	struct qstr name;
 
 	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%d", pid);
+	name.len = snprintf(buf, sizeof(buf), "%u", pid);
 	/* no ->d_hash() rejects on procfs */
 	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
 	if (dentry) {
@@ -3034,7 +3034,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 		return;
 
 	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%d", tgid);
+	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
 	leader = d_hash_and_lookup(mnt->mnt_root, &name);
 	if (!leader)
 		goto out;
@@ -3046,7 +3046,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 		goto out_put_leader;
 
 	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%d", pid);
+	name.len = snprintf(buf, sizeof(buf), "%u", pid);
 	dentry = d_hash_and_lookup(dir, &name);
 	if (dentry) {
 		d_invalidate(dentry);
@@ -3225,14 +3225,14 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 	for (iter = next_tgid(ns, iter);
 	     iter.task;
 	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
-		char name[PROC_NUMBUF];
+		char name[10 + 1];
 		int len;
 
 		cond_resched();
 		if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
 			continue;
 
-		len = snprintf(name, sizeof(name), "%d", iter.tgid);
+		len = snprintf(name, sizeof(name), "%u", iter.tgid);
 		ctx->pos = iter.tgid + TGID_OFFSET;
 		if (!proc_fill_cache(file, ctx, name, len,
 				     proc_pid_instantiate, iter.task, NULL)) {
@@ -3560,10 +3560,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
 	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
 	     task;
 	     task = next_tid(task), ctx->pos++) {
-		char name[PROC_NUMBUF];
+		char name[10 + 1];
 		int len;
 		tid = task_pid_nr_ns(task, ns);
-		len = snprintf(name, sizeof(name), "%d", tid);
+		len = snprintf(name, sizeof(name), "%u", tid);
 		if (!proc_fill_cache(file, ctx, name, len,
 				proc_task_instantiate, task, NULL)) {
 			/* returning this tgid failed, save it as the first
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 96fc70225e54..6b80cd1e419a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -236,7 +236,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 	for (fd = ctx->pos - 2;
 	     fd < files_fdtable(files)->max_fds;
 	     fd++, ctx->pos++) {
-		char name[PROC_NUMBUF];
+		char name[10 + 1];
 		int len;
 
 		if (!fcheck_files(files, fd))
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 31326bb23b8b..d30627aa440b 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -17,11 +17,11 @@ static const char *proc_self_get_link(struct dentry *dentry,
 
 	if (!tgid)
 		return ERR_PTR(-ENOENT);
-	/* 11 for max length of signed int in decimal + NULL term */
-	name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC);
+	/* max length of unsigned int in decimal + NULL term */
+	name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
 	if (unlikely(!name))
 		return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
-	sprintf(name, "%d", tgid);
+	sprintf(name, "%u", tgid);
 	set_delayed_call(done, kfree_link, name);
 	return name;
 }
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index b813e3b529f2..6c1a54716337 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -18,11 +18,10 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
 
 	if (!pid)
 		return ERR_PTR(-ENOENT);
-	name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF,
-				dentry ? GFP_KERNEL : GFP_ATOMIC);
+	name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
 	if (unlikely(!name))
 		return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
-	sprintf(name, "%d/task/%d", tgid, pid);
+	sprintf(name, "%u/task/%u", tgid, pid);
 	set_delayed_call(done, kfree_link, name);
 	return name;
 }
-- 
cgit v1.2.3


From 9f7118b2007d5e7c7a061550d2ca2ecb841537dc Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:36:55 -0800
Subject: proc: don't use READ_ONCE/WRITE_ONCE for /proc/*/fail-nth

READ_ONCE and WRITE_ONCE are useless when there is only one read/write
is being made.

Link: http://lkml.kernel.org/r/20171120204033.GA9446@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index fe56f3c7002a..373091249bdb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1370,7 +1370,7 @@ static ssize_t proc_fail_nth_write(struct file *file, const char __user *buf,
 	task = get_proc_task(file_inode(file));
 	if (!task)
 		return -ESRCH;
-	WRITE_ONCE(task->fail_nth, n);
+	task->fail_nth = n;
 	put_task_struct(task);
 
 	return count;
@@ -1386,8 +1386,7 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf,
 	task = get_proc_task(file_inode(file));
 	if (!task)
 		return -ESRCH;
-	len = snprintf(numbuf, sizeof(numbuf), "%u\n",
-			READ_ONCE(task->fail_nth));
+	len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth);
 	len = simple_read_from_buffer(buf, count, ppos, numbuf, len);
 	put_task_struct(task);
 
-- 
cgit v1.2.3


From ac7f1061c2c11bb8936b1b6a94cdb48de732f7a4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:36:59 -0800
Subject: proc: fix /proc/*/map_files lookup

Current code does:

	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)

However sscanf() is broken garbage.

It silently accepts whitespace between format specifiers
(did you know that?).

It silently accepts valid strings which result in integer overflow.

Do not use sscanf() for any even remotely reliable parsing code.

	OK
	# readlink '/proc/1/map_files/55a23af39000-55a23b05b000'
	/lib/systemd/systemd

	broken
	# readlink '/proc/1/map_files/               55a23af39000-55a23b05b000'
	/lib/systemd/systemd

	broken
	# readlink '/proc/1/map_files/55a23af39000-55a23b05b000    '
	/lib/systemd/systemd

	very broken
	# readlink '/proc/1/map_files/1000000000000000055a23af39000-55a23b05b000'
	/lib/systemd/systemd

Andrei said:

: This patch breaks criu.  It was a bug in criu.  And this bug is on a minor
: path, which works when memfd_create() isn't available.  It is a reason why
: I ask to not backport this patch to stable kernels.
:
: In CRIU this bug can be triggered, only if this patch will be backported
: to a kernel which version is lower than v3.16.

Link: http://lkml.kernel.org/r/20171120212706.GA14325@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 373091249bdb..4c12cb2cd704 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -100,6 +100,8 @@
 #include "internal.h"
 #include "fd.h"
 
+#include "../../lib/kstrtox.h"
+
 /* NOTE:
  *	Implementing inode permission operations in /proc is almost
  *	certainly an error.  Permission checks need to happen during
@@ -1906,8 +1908,33 @@ end_instantiate:
 static int dname_to_vma_addr(struct dentry *dentry,
 			     unsigned long *start, unsigned long *end)
 {
-	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+	const char *str = dentry->d_name.name;
+	unsigned long long sval, eval;
+	unsigned int len;
+
+	len = _parse_integer(str, 16, &sval);
+	if (len & KSTRTOX_OVERFLOW)
+		return -EINVAL;
+	if (sval != (unsigned long)sval)
+		return -EINVAL;
+	str += len;
+
+	if (*str != '-')
 		return -EINVAL;
+	str++;
+
+	len = _parse_integer(str, 16, &eval);
+	if (len & KSTRTOX_OVERFLOW)
+		return -EINVAL;
+	if (eval != (unsigned long)eval)
+		return -EINVAL;
+	str += len;
+
+	if (*str != '\0')
+		return -EINVAL;
+
+	*start = sval;
+	*end = eval;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 593bc695a1102a540f1613c651e73693b17a7343 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:37:02 -0800
Subject: fs/proc/vmcore.c: simpler /proc/vmcore cleanup

Iterators aren't necessary as you can just grab the first entry and delete
it until no entries left.

Link: http://lkml.kernel.org/r/20171121191121.GA20757@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 885d445afa0d..a45f0af22a60 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1178,18 +1178,16 @@ fs_initcall(vmcore_init);
 /* Cleanup function for vmcore module. */
 void vmcore_cleanup(void)
 {
-	struct list_head *pos, *next;
-
 	if (proc_vmcore) {
 		proc_remove(proc_vmcore);
 		proc_vmcore = NULL;
 	}
 
 	/* clear the vmcore list. */
-	list_for_each_safe(pos, next, &vmcore_list) {
+	while (!list_empty(&vmcore_list)) {
 		struct vmcore *m;
 
-		m = list_entry(pos, struct vmcore, list);
+		m = list_first_entry(&vmcore_list, struct vmcore, list);
 		list_del(&m->list);
 		kfree(m);
 	}
-- 
cgit v1.2.3


From 20d28cde5558a2a211620254ec7bc53a4334167f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:37:06 -0800
Subject: proc: less memory for /proc/*/map_files readdir

dentry name can be evaluated later, right before calling into VFS.

Also, spend less time under ->mmap_sem.

Link: http://lkml.kernel.org/r/20171110163034.GA2534@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4c12cb2cd704..a3efc2427c74 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2026,9 +2026,9 @@ out:
 }
 
 struct map_files_info {
+	unsigned long	start;
+	unsigned long	end;
 	fmode_t		mode;
-	unsigned int	len;
-	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
 
 /*
@@ -2198,10 +2198,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
 			if (++pos <= ctx->pos)
 				continue;
 
+			info.start = vma->vm_start;
+			info.end = vma->vm_end;
 			info.mode = vma->vm_file->f_mode;
-			info.len = snprintf(info.name,
-					sizeof(info.name), "%lx-%lx",
-					vma->vm_start, vma->vm_end);
 			if (flex_array_put(fa, i++, &info, GFP_KERNEL))
 				BUG();
 		}
@@ -2209,9 +2208,13 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
 	up_read(&mm->mmap_sem);
 
 	for (i = 0; i < nr_files; i++) {
+		char buf[4 * sizeof(long) + 2];	/* max: %lx-%lx\0 */
+		unsigned int len;
+
 		p = flex_array_get(fa, i);
+		len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
 		if (!proc_fill_cache(file, ctx,
-				      p->name, p->len,
+				      buf, len,
 				      proc_map_files_instantiate,
 				      task,
 				      (void *)(unsigned long)p->mode))
-- 
cgit v1.2.3


From 171ef917dfe721b1437b0066f7bc5684d776bba8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:37:10 -0800
Subject: fs/proc/array.c: delete children_seq_release()

It is 1:1 wrapper around seq_release().

Link: http://lkml.kernel.org/r/20171122171510.GA12161@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index d67a72dcb92c..598803576e4c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -736,16 +736,10 @@ static int children_seq_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-int children_seq_release(struct inode *inode, struct file *file)
-{
-	seq_release(inode, file);
-	return 0;
-}
-
 const struct file_operations proc_tid_children_operations = {
 	.open    = children_seq_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-	.release = children_seq_release,
+	.release = seq_release,
 };
 #endif /* CONFIG_PROC_CHILDREN */
-- 
cgit v1.2.3


From d0290bc20d4739b7a900ae37eb5d4cc3be2b393f Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 6 Feb 2018 15:37:13 -0800
Subject: fs/proc/kcore.c: use probe_kernel_read() instead of memcpy()

Commit df04abfd181a ("fs/proc/kcore.c: Add bounce buffer for ktext
data") added a bounce buffer to avoid hardened usercopy checks.  Copying
to the bounce buffer was implemented with a simple memcpy() assuming
that it is always valid to read from kernel memory iff the
kern_addr_valid() check passed.

A simple, but pointless, test case like "dd if=/proc/kcore of=/dev/null"
now can easily crash the kernel, since the former execption handling on
invalid kernel addresses now doesn't work anymore.

Also adding a kern_addr_valid() implementation wouldn't help here.  Most
architectures simply return 1 here, while a couple implemented a page
table walk to figure out if something is mapped at the address in
question.

With DEBUG_PAGEALLOC active mappings are established and removed all the
time, so that relying on the result of kern_addr_valid() before
executing the memcpy() also doesn't work.

Therefore simply use probe_kernel_read() to copy to the bounce buffer.
This also allows to simplify read_kcore().

At least on s390 this fixes the observed crashes and doesn't introduce
warnings that were removed with df04abfd181a ("fs/proc/kcore.c: Add
bounce buffer for ktext data"), even though the generic
probe_kernel_read() implementation uses uaccess functions.

While looking into this I'm also wondering if kern_addr_valid() could be
completely removed...(?)

Link: http://lkml.kernel.org/r/20171202132739.99971-1-heiko.carstens@de.ibm.com
Fixes: df04abfd181a ("fs/proc/kcore.c: Add bounce buffer for ktext data")
Fixes: f5509cc18daa ("mm: Hardened usercopy")
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 4bc85cb8be6a..e8a93bc8285d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -512,23 +512,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 				return -EFAULT;
 		} else {
 			if (kern_addr_valid(start)) {
-				unsigned long n;
-
 				/*
 				 * Using bounce buffer to bypass the
 				 * hardened user copy kernel text checks.
 				 */
-				memcpy(buf, (char *) start, tsz);
-				n = copy_to_user(buffer, buf, tsz);
-				/*
-				 * We cannot distinguish between fault on source
-				 * and fault on destination. When this happens
-				 * we clear too and hope it will trigger the
-				 * EFAULT again.
-				 */
-				if (n) { 
-					if (clear_user(buffer + tsz - n,
-								n))
+				if (probe_kernel_read(buf, (void *) start, tsz)) {
+					if (clear_user(buffer, tsz))
+						return -EFAULT;
+				} else {
+					if (copy_to_user(buffer, buf, tsz))
 						return -EFAULT;
 				}
 			} else {
-- 
cgit v1.2.3


From 163cf548db888710695d5dbe907cda4262d45b52 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:37:18 -0800
Subject: fs/proc/internal.h: rearrange struct proc_dir_entry

struct proc_dir_entry became bit messy over years:

* move 16-bit ->mode_t before namelen to get rid of padding
* make ->in_use first field: it seems to be most used resulting in
  smaller code on x86_64 (defconfig):

	add/remove: 0/0 grow/shrink: 7/13 up/down: 24/-67 (-43)
	Function                                     old     new   delta
	proc_readdir_de                              451     455      +4
	proc_get_inode                               282     286      +4
	pde_put                                       65      69      +4
	remove_proc_subtree                          294     297      +3
	remove_proc_entry                            297     300      +3
	proc_register                                295     298      +3
	proc_notify_change                            94      97      +3
	unuse_pde                                     27      26      -1
	proc_reg_write                                89      85      -4
	proc_reg_unlocked_ioctl                       85      81      -4
	proc_reg_read                                 89      85      -4
	proc_reg_llseek                               87      83      -4
	proc_reg_get_unmapped_area                   123     119      -4
	proc_entry_rundown                           139     135      -4
	proc_reg_poll                                 91      85      -6
	proc_reg_mmap                                 79      73      -6
	proc_get_link                                 55      49      -6
	proc_reg_release                             108     101      -7
	proc_reg_open                                298     291      -7
	close_pdeo                                   228     218     -10

* move writeable fields together to a first cacheline (on x86_64),
  those include
	* ->in_use: reference count, taken every open/read/write/close etc
	* ->count: reference count, taken at readdir on every entry
	* ->pde_openers: tracks (nearly) every open, dirtied
	* ->pde_unload_lock: spinlock protecting ->pde_openers
	* ->proc_iops, ->proc_fops, ->data: writeonce fields,
	  used right together with previous group.

* other rarely written fields go into 1st/2nd and 2nd/3rd cacheline on
  32-bit and 64-bit respectively.

Additionally on 32-bit, ->subdir, ->subdir_node, ->namelen, ->name go
fully into 2nd cacheline, separated from writeable fields.  They are all
used during lookup.

Link: http://lkml.kernel.org/r/20171220215914.GA7877@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/internal.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 4a67188c8d74..a290a1e921a5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -31,24 +31,27 @@ struct mempolicy;
  * subdir_node is used to build the rb tree "subdir" of the parent.
  */
 struct proc_dir_entry {
+	/*
+	 * number of callers into module in progress;
+	 * negative -> it's going away RSN
+	 */
+	atomic_t in_use;
+	atomic_t count;		/* use count */
+	struct list_head pde_openers;	/* who did ->open, but not ->release */
+	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+	struct completion *pde_unload_completion;
+	const struct inode_operations *proc_iops;
+	const struct file_operations *proc_fops;
+	void *data;
 	unsigned int low_ino;
-	umode_t mode;
 	nlink_t nlink;
 	kuid_t uid;
 	kgid_t gid;
 	loff_t size;
-	const struct inode_operations *proc_iops;
-	const struct file_operations *proc_fops;
 	struct proc_dir_entry *parent;
 	struct rb_root_cached subdir;
 	struct rb_node subdir_node;
-	void *data;
-	atomic_t count;		/* use count */
-	atomic_t in_use;	/* number of callers into module in progress; */
-			/* negative -> it's going away RSN */
-	struct completion *pde_unload_completion;
-	struct list_head pde_openers;	/* who did ->open, but not ->release */
-	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+	umode_t mode;
 	u8 namelen;
 	char name[];
 } __randomize_layout;
-- 
cgit v1.2.3


From 53f63345d893df36b58e81ddb3d11dcd2e9cc966 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:37:21 -0800
Subject: fs/proc/internal.h: fix up comment

Document what ->pde_unload_lock actually does.

Link: http://lkml.kernel.org/r/20180103185120.GB31849@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/internal.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a290a1e921a5..5ba317874f0d 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -38,7 +38,8 @@ struct proc_dir_entry {
 	atomic_t in_use;
 	atomic_t count;		/* use count */
 	struct list_head pde_openers;	/* who did ->open, but not ->release */
-	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+	/* protects ->pde_openers and all struct pde_opener instances */
+	spinlock_t pde_unload_lock;
 	struct completion *pde_unload_completion;
 	const struct inode_operations *proc_iops;
 	const struct file_operations *proc_fops;
-- 
cgit v1.2.3


From efb1a57d90cae6af1ddd32f1b920c924a711aba5 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:37:24 -0800
Subject: fs/proc: use __ro_after_init

/proc/self inode numbers, value of proc_inode_cache and st_nlink of
/proc/$TGID are fixed constants.

Link: http://lkml.kernel.org/r/20180103184707.GA31849@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c        | 5 +++--
 fs/proc/inode.c       | 3 ++-
 fs/proc/self.c        | 3 ++-
 fs/proc/thread_self.c | 3 ++-
 4 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a3efc2427c74..9298324325ed 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -75,6 +75,7 @@
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
 #include <linux/printk.h>
+#include <linux/cache.h>
 #include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/audit.h>
@@ -112,8 +113,8 @@
  *	in /proc for a task before it execs a suid executable.
  */
 
-static u8 nlink_tid;
-static u8 nlink_tgid;
+static u8 nlink_tid __ro_after_init;
+static u8 nlink_tgid __ro_after_init;
 
 struct pid_entry {
 	const char *name;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8dacaabb9f37..c5c8e7af5520 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -5,6 +5,7 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
 
+#include <linux/cache.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
@@ -52,7 +53,7 @@ static void proc_evict_inode(struct inode *inode)
 	}
 }
 
-static struct kmem_cache * proc_inode_cachep;
+static struct kmem_cache *proc_inode_cachep __ro_after_init;
 
 static struct inode *proc_alloc_inode(struct super_block *sb)
 {
diff --git a/fs/proc/self.c b/fs/proc/self.c
index d30627aa440b..4d7d061696b3 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/cache.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/pid_namespace.h>
@@ -30,7 +31,7 @@ static const struct inode_operations proc_self_inode_operations = {
 	.get_link	= proc_self_get_link,
 };
 
-static unsigned self_inum;
+static unsigned self_inum __ro_after_init;
 
 int proc_setup_self(struct super_block *s)
 {
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 6c1a54716337..9d2efaca499f 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/cache.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/pid_namespace.h>
@@ -30,7 +31,7 @@ static const struct inode_operations proc_thread_self_inode_operations = {
 	.get_link	= proc_thread_self_get_link,
 };
 
-static unsigned thread_self_inum;
+static unsigned thread_self_inum __ro_after_init;
 
 int proc_setup_thread_self(struct super_block *s)
 {
-- 
cgit v1.2.3


From 15b158b4e6274351fc3cf652cbabc57104efb547 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:37:28 -0800
Subject: proc: spread likely/unlikely a bit

use_pde() is used at every open/read/write/...  of every random /proc
file.  Negative refcount happens only if PDE is being deleted by module
(read: never).  So it gets "likely".

unuse_pde() gets "unlikely" for the same reason.

close_pdeo() gets unlikely as the completion is filled only if there is a
race between PDE removal and close() (read: never ever).

It even saves code on x86_64 defconfig:

	add/remove: 0/0 grow/shrink: 1/2 up/down: 2/-20 (-18)
	Function                                     old     new   delta
	close_pdeo                                   183     185      +2
	proc_reg_get_unmapped_area                   119     111      -8
	proc_reg_poll                                 85      73     -12

Link: http://lkml.kernel.org/r/20180104175657.GA5204@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index c5c8e7af5520..6e8724958116 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -129,12 +129,12 @@ enum {BIAS = -1U<<31};
 
 static inline int use_pde(struct proc_dir_entry *pde)
 {
-	return atomic_inc_unless_negative(&pde->in_use);
+	return likely(atomic_inc_unless_negative(&pde->in_use));
 }
 
 static void unuse_pde(struct proc_dir_entry *pde)
 {
-	if (atomic_dec_return(&pde->in_use) == BIAS)
+	if (unlikely(atomic_dec_return(&pde->in_use) == BIAS))
 		complete(pde->pde_unload_completion);
 }
 
@@ -167,7 +167,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
 		spin_lock(&pde->pde_unload_lock);
 		/* After ->release. */
 		list_del(&pdeo->lh);
-		if (pdeo->c)
+		if (unlikely(pdeo->c))
 			complete(pdeo->c);
 		kfree(pdeo);
 	}
@@ -421,7 +421,7 @@ static const char *proc_get_link(struct dentry *dentry,
 				 struct delayed_call *done)
 {
 	struct proc_dir_entry *pde = PDE(inode);
-	if (unlikely(!use_pde(pde)))
+	if (!use_pde(pde))
 		return ERR_PTR(-EINVAL);
 	set_delayed_call(done, proc_put_link, pde);
 	return pde->data;
-- 
cgit v1.2.3


From 93ad5bc6d4addb74e30d421cd3ba5249c961fb3e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:37:31 -0800
Subject: proc: rearrange args

Rearrange args for smaller code.

lookup revolves around memcmp() which gets len 3rd arg, so propagate
length as 3rd arg.

readdir and lookup add additional arg to VFS ->readdir and ->lookup, so
better add it to the end.

Space savings on x86_64:

	add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-18 (-18)
	Function                                     old     new   delta
	proc_readdir                                  22      13      -9
	proc_lookup                                   18       9      -9

proc_match() is smaller if not inlined, I promise!

Link: http://lkml.kernel.org/r/20180104175958.GB5204@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/generic.c  | 18 +++++++++---------
 fs/proc/internal.h |  5 ++---
 fs/proc/proc_net.c |  4 ++--
 3 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 793a67574668..5d709fa8f3a2 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -28,7 +28,7 @@
 
 static DEFINE_RWLOCK(proc_subdir_lock);
 
-static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
+static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len)
 {
 	if (len < de->namelen)
 		return -1;
@@ -60,7 +60,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
 		struct proc_dir_entry *de = rb_entry(node,
 						     struct proc_dir_entry,
 						     subdir_node);
-		int result = proc_match(len, name, de);
+		int result = proc_match(name, de, len);
 
 		if (result < 0)
 			node = node->rb_left;
@@ -84,7 +84,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir,
 		struct proc_dir_entry *this = rb_entry(*new,
 						       struct proc_dir_entry,
 						       subdir_node);
-		int result = proc_match(de->namelen, de->name, this);
+		int result = proc_match(de->name, this, de->namelen);
 
 		parent = *new;
 		if (result < 0)
@@ -211,8 +211,8 @@ void proc_free_inum(unsigned int inum)
  * Don't create negative dentries here, return -ENOENT by hand
  * instead.
  */
-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
-		struct dentry *dentry)
+struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
+			      struct proc_dir_entry *de)
 {
 	struct inode *inode;
 
@@ -235,7 +235,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
 		unsigned int flags)
 {
-	return proc_lookup_de(PDE(dir), dir, dentry);
+	return proc_lookup_de(dir, dentry, PDE(dir));
 }
 
 /*
@@ -247,8 +247,8 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
  * value of the readdir() call, as long as it's non-negative
  * for success..
  */
-int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
-		    struct dir_context *ctx)
+int proc_readdir_de(struct file *file, struct dir_context *ctx,
+		    struct proc_dir_entry *de)
 {
 	int i;
 
@@ -292,7 +292,7 @@ int proc_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
 
-	return proc_readdir_de(PDE(inode), file, ctx);
+	return proc_readdir_de(file, ctx, PDE(inode));
 }
 
 /*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5ba317874f0d..d697c8ab0a14 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -153,10 +153,9 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i
  * generic.c
  */
 extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
-extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
-				     struct dentry *);
+struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
 extern int proc_readdir(struct file *, struct dir_context *);
-extern int proc_readdir_de(struct proc_dir_entry *, struct file *, struct dir_context *);
+int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);
 
 static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
 {
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a2bf369c923d..68c06ae7888c 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -135,7 +135,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir,
 	de = ERR_PTR(-ENOENT);
 	net = get_proc_task_net(dir);
 	if (net != NULL) {
-		de = proc_lookup_de(net->proc_net, dir, dentry);
+		de = proc_lookup_de(dir, dentry, net->proc_net);
 		put_net(net);
 	}
 	return de;
@@ -172,7 +172,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
 	ret = -EINVAL;
 	net = get_proc_task_net(file_inode(file));
 	if (net != NULL) {
-		ret = proc_readdir_de(net->proc_net, file, ctx);
+		ret = proc_readdir_de(file, ctx, net->proc_net);
 		put_net(net);
 	}
 	return ret;
-- 
cgit v1.2.3


From 4bf8ba811ac1102d7de6f73af3b9f323463e16c0 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 6 Feb 2018 15:37:34 -0800
Subject: fs/proc/consoles.c: use seq_putc() in show_console_dev()

A single character (line break) should be put into a sequence.  Thus use
the corresponding function "seq_putc".

This issue was detected by using the Coccinelle software.

Link: http://lkml.kernel.org/r/04fb69fe-d820-9141-820f-07e9a48f4635@users.sourceforge.net
Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/consoles.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index 290ba85cb900..a8ac48aebd59 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -55,8 +55,7 @@ static int show_console_dev(struct seq_file *m, void *v)
 	if (dev)
 		seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
 
-	seq_printf(m, "\n");
-
+	seq_putc(m, '\n');
 	return 0;
 }
 
-- 
cgit v1.2.3


From 60c9d92f887f4606d363fece7a36c92664dc64c6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 6 Feb 2018 15:39:13 -0800
Subject: elf: fix NT_FILE integer overflow

If vm.max_map_count bumped above 2^26 (67+ mil) and system has enough RAM
to allocate all the VMAs (~12.8 GB on Fedora 27 with 200-byte VMAs), then
it should be possible to overflow 32-bit "size", pass paranoia check,
allocate very little vmalloc space and oops while writing into vmalloc
guard page...

But I didn't test this, only coredump of regular process.

Link: http://lkml.kernel.org/r/20180112203427.GA9109@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 83732fef510d..bdb201230bae 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1599,6 +1599,8 @@ static int fill_files_note(struct memelfnote *note)
 
 	/* *Estimated* file count and total data size needed */
 	count = current->mm->map_count;
+	if (count > UINT_MAX / 64)
+		return -EINVAL;
 	size = count * 64;
 
 	names_ofs = (2 + 3 * count) * sizeof(data[0]);
-- 
cgit v1.2.3


From fb04b91bc2c3a83e9e2ba9c5ce0f0124dd3ffef0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 6 Feb 2018 15:39:21 -0800
Subject: nilfs2: use time64_t internally

The superblock and segment timestamps are used only internally in nilfs2
and can be read out using sysfs.

Since we are using the old 'get_seconds()' interface and store the data
as timestamps, the behavior differs slightly between 64-bit and 32-bit
kernels, the latter will show incorrect timestamps after 2038 in sysfs,
and presumably fail completely in 2106 as comparisons go wrong.

This changes nilfs2 to use time64_t with ktime_get_real_seconds() to
handle timestamps, making the behavior consistent and correct on both
32-bit and 64-bit machines.

The on-disk format already uses 64-bit timestamps, so nothing changes
there.

Link: http://lkml.kernel.org/r/20180122211050.1286441-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/segbuf.c    |  2 +-
 fs/nilfs2/segbuf.h    |  4 ++--
 fs/nilfs2/segment.c   |  2 +-
 fs/nilfs2/segment.h   |  2 +-
 fs/nilfs2/sufile.c    |  2 +-
 fs/nilfs2/sufile.h    |  2 +-
 fs/nilfs2/super.c     |  4 ++--
 fs/nilfs2/sysfs.c     | 21 ++++++++++-----------
 fs/nilfs2/the_nilfs.h |  8 ++++----
 9 files changed, 23 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 6c5009cc4e6f..68cb9e4740b4 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -130,7 +130,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
 }
 
 int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags,
-		       time_t ctime, __u64 cno)
+		       time64_t ctime, __u64 cno)
 {
 	int err;
 
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 7bbccc099709..10e16935fff6 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -46,7 +46,7 @@ struct nilfs_segsum_info {
 	unsigned long		nfileblk;
 	u64			seg_seq;
 	__u64			cno;
-	time_t			ctime;
+	time64_t		ctime;
 	sector_t		next;
 };
 
@@ -120,7 +120,7 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf,
 			   struct nilfs_segment_buffer *prev);
 void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
 				  struct the_nilfs *);
-int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t,
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time64_t,
 		       __u64);
 int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
 int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9f3ffba41533..0953635e7d48 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2040,7 +2040,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 			goto out;
 
 		/* Update time stamp */
-		sci->sc_seg_ctime = get_seconds();
+		sci->sc_seg_ctime = ktime_get_real_seconds();
 
 		err = nilfs_segctor_collect(sci, nilfs, mode);
 		if (unlikely(err))
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 84084a4d9b3e..04634e3e3d58 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -157,7 +157,7 @@ struct nilfs_sc_info {
 	unsigned long		sc_blk_cnt;
 	unsigned long		sc_datablk_cnt;
 	unsigned long		sc_nblk_this_inc;
-	time_t			sc_seg_ctime;
+	time64_t		sc_seg_ctime;
 	__u64			sc_cno;
 	unsigned long		sc_flags;
 
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 1341a41e7b43..c7fa139d50e8 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -526,7 +526,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
  * @modtime: modification time (option)
  */
 int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
-				   unsigned long nblocks, time_t modtime)
+				   unsigned long nblocks, time64_t modtime)
 {
 	struct buffer_head *bh;
 	struct nilfs_segment_usage *su;
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 158a9190c8ec..673a891350f4 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -35,7 +35,7 @@ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end);
 int nilfs_sufile_alloc(struct inode *, __u64 *);
 int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
 int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
-				   unsigned long nblocks, time_t modtime);
+				   unsigned long nblocks, time64_t modtime);
 int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
 ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int,
 				size_t);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 3073b646e1ba..6ffeca84d7c3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -283,10 +283,10 @@ int nilfs_commit_super(struct super_block *sb, int flag)
 {
 	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_super_block **sbp = nilfs->ns_sbp;
-	time_t t;
+	time64_t t;
 
 	/* nilfs->ns_sem must be locked by the caller. */
-	t = get_seconds();
+	t = ktime_get_real_seconds();
 	nilfs->ns_sbwtime = t;
 	sbp[0]->s_wtime = cpu_to_le64(t);
 	sbp[0]->s_sum = 0;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 490303e3d517..4b25837e7724 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -31,7 +31,7 @@ static struct kset *nilfs_kset;
 #define NILFS_SHOW_TIME(time_t_val, buf) ({ \
 		struct tm res; \
 		int count = 0; \
-		time_to_tm(time_t_val, 0, &res); \
+		time64_to_tm(time_t_val, 0, &res); \
 		res.tm_year += 1900; \
 		res.tm_mon += 1; \
 		count = scnprintf(buf, PAGE_SIZE, \
@@ -579,7 +579,7 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr,
 					struct the_nilfs *nilfs,
 					char *buf)
 {
-	time_t ctime;
+	time64_t ctime;
 
 	down_read(&nilfs->ns_segctor_sem);
 	ctime = nilfs->ns_ctime;
@@ -593,13 +593,13 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
 					    struct the_nilfs *nilfs,
 					    char *buf)
 {
-	time_t ctime;
+	time64_t ctime;
 
 	down_read(&nilfs->ns_segctor_sem);
 	ctime = nilfs->ns_ctime;
 	up_read(&nilfs->ns_segctor_sem);
 
-	return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime);
+	return snprintf(buf, PAGE_SIZE, "%llu\n", ctime);
 }
 
 static ssize_t
@@ -607,7 +607,7 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr,
 					 struct the_nilfs *nilfs,
 					 char *buf)
 {
-	time_t nongc_ctime;
+	time64_t nongc_ctime;
 
 	down_read(&nilfs->ns_segctor_sem);
 	nongc_ctime = nilfs->ns_nongc_ctime;
@@ -621,14 +621,13 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
 						struct the_nilfs *nilfs,
 						char *buf)
 {
-	time_t nongc_ctime;
+	time64_t nongc_ctime;
 
 	down_read(&nilfs->ns_segctor_sem);
 	nongc_ctime = nilfs->ns_nongc_ctime;
 	up_read(&nilfs->ns_segctor_sem);
 
-	return snprintf(buf, PAGE_SIZE, "%llu\n",
-			(unsigned long long)nongc_ctime);
+	return snprintf(buf, PAGE_SIZE, "%llu\n", nongc_ctime);
 }
 
 static ssize_t
@@ -728,7 +727,7 @@ nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr,
 				     struct the_nilfs *nilfs,
 				     char *buf)
 {
-	time_t sbwtime;
+	time64_t sbwtime;
 
 	down_read(&nilfs->ns_sem);
 	sbwtime = nilfs->ns_sbwtime;
@@ -742,13 +741,13 @@ nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr,
 					 struct the_nilfs *nilfs,
 					 char *buf)
 {
-	time_t sbwtime;
+	time64_t sbwtime;
 
 	down_read(&nilfs->ns_sem);
 	sbwtime = nilfs->ns_sbwtime;
 	up_read(&nilfs->ns_sem);
 
-	return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime);
+	return snprintf(buf, PAGE_SIZE, "%llu\n", sbwtime);
 }
 
 static ssize_t
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 883d732b0259..36da1779f976 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -116,7 +116,7 @@ struct the_nilfs {
 	 */
 	struct buffer_head     *ns_sbh[2];
 	struct nilfs_super_block *ns_sbp[2];
-	time_t			ns_sbwtime;
+	time64_t		ns_sbwtime;
 	unsigned int		ns_sbwcount;
 	unsigned int		ns_sbsize;
 	unsigned int		ns_mount_state;
@@ -131,8 +131,8 @@ struct the_nilfs {
 	__u64			ns_nextnum;
 	unsigned long		ns_pseg_offset;
 	__u64			ns_cno;
-	time_t			ns_ctime;
-	time_t			ns_nongc_ctime;
+	time64_t		ns_ctime;
+	time64_t		ns_nongc_ctime;
 	atomic_t		ns_ndirtyblks;
 
 	/*
@@ -267,7 +267,7 @@ struct nilfs_root {
 
 static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
 {
-	u64 t = get_seconds();
+	u64 t = ktime_get_real_seconds();
 
 	return t < nilfs->ns_sbwtime ||
 		t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq;
-- 
cgit v1.2.3


From b0cd38c7f54c9176f78cce2e2b23e1513bf5a224 Mon Sep 17 00:00:00 2001
From: "Ernesto A. Fernandez" <ernesto.mnd.fernandez@gmail.com>
Date: Tue, 6 Feb 2018 15:49:02 -0800
Subject: hfsplus: honor setgid flag on directories

When creating a file inside a directory that has the setgid flag set, give
the new file the group ID of the parent, and also the setgid flag if it is
a directory itself.

Link: http://lkml.kernel.org/r/20171204192705.GA6101@debian.home
Signed-off-by: Ernesto A. Fernandez <ernesto.mnd.fernandez@gmail.com>
Reviewed-by: Vyacheslav Dubeyko <slava@dubeyko.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hfsplus/dir.c        | 4 ++--
 fs/hfsplus/hfsplus_fs.h | 3 ++-
 fs/hfsplus/inode.c      | 7 +++----
 fs/hfsplus/super.c      | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index e8120a282435..15e06fb552da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -444,7 +444,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
 	int res = -ENOMEM;
 
 	mutex_lock(&sbi->vh_mutex);
-	inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
+	inode = hfsplus_new_inode(dir->i_sb, dir, S_IFLNK | S_IRWXUGO);
 	if (!inode)
 		goto out;
 
@@ -486,7 +486,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
 	int res = -ENOMEM;
 
 	mutex_lock(&sbi->vh_mutex);
-	inode = hfsplus_new_inode(dir->i_sb, mode);
+	inode = hfsplus_new_inode(dir->i_sb, dir, mode);
 	if (!inode)
 		goto out;
 
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index a015044daa05..d9255abafb81 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -478,7 +478,8 @@ extern const struct address_space_operations hfsplus_aops;
 extern const struct address_space_operations hfsplus_btree_aops;
 extern const struct dentry_operations hfsplus_dentry_operations;
 
-struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode);
+struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
+				umode_t mode);
 void hfsplus_delete_inode(struct inode *inode);
 void hfsplus_inode_read_fork(struct inode *inode,
 			     struct hfsplus_fork_raw *fork);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 190c60efbc99..c0c8d433864f 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -354,7 +354,8 @@ static const struct file_operations hfsplus_file_operations = {
 	.unlocked_ioctl = hfsplus_ioctl,
 };
 
-struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
+struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
+				umode_t mode)
 {
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
 	struct inode *inode = new_inode(sb);
@@ -364,9 +365,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
 		return NULL;
 
 	inode->i_ino = sbi->next_cnid++;
-	inode->i_mode = mode;
-	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+	inode_init_owner(inode, dir, mode);
 	set_nlink(inode, 1);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1d458b716957..513c357c734b 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -549,7 +549,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 
 		if (!sbi->hidden_dir) {
 			mutex_lock(&sbi->vh_mutex);
-			sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
+			sbi->hidden_dir = hfsplus_new_inode(sb, root, S_IFDIR);
 			if (!sbi->hidden_dir) {
 				mutex_unlock(&sbi->vh_mutex);
 				err = -ENOMEM;
-- 
cgit v1.2.3


From 1a60e4d516defb5808967a8b42ac6675a3f05085 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeelb@google.com>
Date: Tue, 6 Feb 2018 15:40:59 -0800
Subject: vfs: remove might_sleep() from clear_inode()

Commit 7994e6f72543 ("vfs: Move waiting for inode writeback from
end_writeback() to evict_inode()") removed inode_sync_wait() from
end_writeback() and commit dbd5768f87ff ("vfs: Rename end_writeback() to
clear_inode()") renamed end_writeback() to clear_inode().

After these patches there is no sleeping operation in clear_inode().
So, remove might_sleep() from it.

Link: http://lkml.kernel.org/r/20171108004354.40308-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Greg Thelen <gthelen@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index e2ca0f4b5151..ef362364d396 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -498,7 +498,6 @@ EXPORT_SYMBOL(__remove_inode_hash);
 
 void clear_inode(struct inode *inode)
 {
-	might_sleep();
 	/*
 	 * We have to cycle tree_lock here because reclaim can be still in the
 	 * process of removing the last page (in __delete_from_page_cache())
-- 
cgit v1.2.3


From 4c2e4befb3cc9ce42d506aa537c9ab504723e98c Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 6 Feb 2018 15:41:45 -0800
Subject: pipe, sysctl: drop 'min' parameter from pipe-max-size converter

Patch series "pipe: buffer limits fixes and cleanups", v2.

This series simplifies the sysctl handler for pipe-max-size and fixes
another set of bugs related to the pipe buffer limits:

- The root user wasn't allowed to exceed the limits when creating new
  pipes.

- There was an off-by-one error when checking the limits, so a limit of
  N was actually treated as N - 1.

- F_SETPIPE_SZ accepted values over UINT_MAX.

- Reading the pipe buffer limits could be racy.

This patch (of 7):

Before validating the given value against pipe_min_size,
do_proc_dopipe_max_size_conv() calls round_pipe_size(), which rounds the
value up to pipe_min_size.  Therefore, the second check against
pipe_min_size is redundant.  Remove it.

Link: http://lkml.kernel.org/r/20180111052902.14409-2-ebiggers3@gmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Willy Tarreau <w@1wt.eu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c                 | 10 +++-------
 include/linux/pipe_fs_i.h |  2 +-
 kernel/sysctl.c           | 15 +--------------
 3 files changed, 5 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index a449ca0ec0c6..1f6ec1051e59 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -35,11 +35,6 @@
  */
 unsigned int pipe_max_size = 1048576;
 
-/*
- * Minimum pipe size, as required by POSIX
- */
-unsigned int pipe_min_size = PAGE_SIZE;
-
 /* Maximum allocatable pages per user. Hard limit is unset by default, soft
  * matches default values.
  */
@@ -1024,8 +1019,9 @@ unsigned int round_pipe_size(unsigned int size)
 {
 	unsigned long nr_pages;
 
-	if (size < pipe_min_size)
-		size = pipe_min_size;
+	/* Minimum pipe size, as required by POSIX */
+	if (size < PAGE_SIZE)
+		size = PAGE_SIZE;
 
 	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (nr_pages == 0)
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 2dc5e9870fcd..7d9beda14584 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -167,7 +167,7 @@ void pipe_lock(struct pipe_inode_info *);
 void pipe_unlock(struct pipe_inode_info *);
 void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
 
-extern unsigned int pipe_max_size, pipe_min_size;
+extern unsigned int pipe_max_size;
 extern unsigned long pipe_user_pages_hard;
 extern unsigned long pipe_user_pages_soft;
 int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2fb4e27c636a..f21375aa6cf6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1813,7 +1813,6 @@ static struct ctl_table fs_table[] = {
 		.maxlen		= sizeof(pipe_max_size),
 		.mode		= 0644,
 		.proc_handler	= &pipe_proc_fn,
-		.extra1		= &pipe_min_size,
 	},
 	{
 		.procname	= "pipe-user-pages-hard",
@@ -2615,16 +2614,10 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 				 do_proc_douintvec_minmax_conv, &param);
 }
 
-struct do_proc_dopipe_max_size_conv_param {
-	unsigned int *min;
-};
-
 static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
 					unsigned int *valp,
 					int write, void *data)
 {
-	struct do_proc_dopipe_max_size_conv_param *param = data;
-
 	if (write) {
 		unsigned int val;
 
@@ -2635,9 +2628,6 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
 		if (val == 0)
 			return -EINVAL;
 
-		if (param->min && *param->min > val)
-			return -ERANGE;
-
 		*valp = val;
 	} else {
 		unsigned int val = *valp;
@@ -2650,11 +2640,8 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
 int proc_dopipe_max_size(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct do_proc_dopipe_max_size_conv_param param = {
-		.min = (unsigned int *) table->extra1,
-	};
 	return do_proc_douintvec(table, write, buffer, lenp, ppos,
-				 do_proc_dopipe_max_size_conv, &param);
+				 do_proc_dopipe_max_size_conv, NULL);
 }
 
 static void validate_coredump_safety(void)
-- 
cgit v1.2.3


From 319e0a21bb7823abbb4818fe2724e572bbac77a2 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 6 Feb 2018 15:41:49 -0800
Subject: pipe, sysctl: remove pipe_proc_fn()

pipe_proc_fn() is no longer needed, as it only calls through to
proc_dopipe_max_size().  Just put proc_dopipe_max_size() in the ctl_table
entry directly, and remove the unneeded EXPORT_SYMBOL() and the ENOSYS
stub for it.

(The reason the ENOSYS stub isn't needed is that the pipe-max-size
ctl_table entry is located directly in 'kern_table' rather than being
registered separately.  Therefore, the entry is already only defined when
the kernel is built with sysctl support.)

Link: http://lkml.kernel.org/r/20180111052902.14409-3-ebiggers3@gmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Willy Tarreau <w@1wt.eu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c                 | 10 ----------
 include/linux/pipe_fs_i.h |  1 -
 include/linux/sysctl.h    |  3 ---
 kernel/sysctl.c           | 15 +++++----------
 4 files changed, 5 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 1f6ec1051e59..61adbf0f5b53 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1120,16 +1120,6 @@ out_revert_acct:
 	return ret;
 }
 
-/*
- * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size
- * will return an error.
- */
-int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
-		 size_t *lenp, loff_t *ppos)
-{
-	return proc_dopipe_max_size(table, write, buf, lenp, ppos);
-}
-
 /*
  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
  * location, so checking ->i_pipe is not enough to verify that this is a
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 7d9beda14584..5028bd4b2c96 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -170,7 +170,6 @@ void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
 extern unsigned int pipe_max_size;
 extern unsigned long pipe_user_pages_hard;
 extern unsigned long pipe_user_pages_soft;
-int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 
 /* Drop the inode semaphore and wait for a pipe event, atomically */
 void pipe_wait(struct pipe_inode_info *pipe);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 992bc9948232..b769ecfcc3bd 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -51,9 +51,6 @@ extern int proc_dointvec_minmax(struct ctl_table *, int,
 extern int proc_douintvec_minmax(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos);
-extern int proc_dopipe_max_size(struct ctl_table *table, int write,
-				void __user *buffer, size_t *lenp,
-				loff_t *ppos);
 extern int proc_dointvec_jiffies(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
 extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f21375aa6cf6..59f09ea3ca3e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -218,6 +218,8 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
 static int proc_dostring_coredump(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
 
 #ifdef CONFIG_MAGIC_SYSRQ
 /* Note: sysrq code uses it's own private copy */
@@ -1812,7 +1814,7 @@ static struct ctl_table fs_table[] = {
 		.data		= &pipe_max_size,
 		.maxlen		= sizeof(pipe_max_size),
 		.mode		= 0644,
-		.proc_handler	= &pipe_proc_fn,
+		.proc_handler	= proc_dopipe_max_size,
 	},
 	{
 		.procname	= "pipe-user-pages-hard",
@@ -2637,8 +2639,8 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
 	return 0;
 }
 
-int proc_dopipe_max_size(struct ctl_table *table, int write,
-			 void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_dopipe_max_size(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	return do_proc_douintvec(table, write, buffer, lenp, ppos,
 				 do_proc_dopipe_max_size_conv, NULL);
@@ -3147,12 +3149,6 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
-int proc_dopipe_max_size(struct ctl_table *table, int write,
-			 void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return -ENOSYS;
-}
-
 int proc_dointvec_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -3196,7 +3192,6 @@ EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
-EXPORT_SYMBOL_GPL(proc_dopipe_max_size);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
 EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
 EXPORT_SYMBOL(proc_dostring);
-- 
cgit v1.2.3


From 85c2dd5473b2718b4b63e74bfeb1ca876868e11f Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 6 Feb 2018 15:41:53 -0800
Subject: pipe: actually allow root to exceed the pipe buffer limits

pipe-user-pages-hard and pipe-user-pages-soft are only supposed to apply
to unprivileged users, as documented in both Documentation/sysctl/fs.txt
and the pipe(7) man page.

However, the capabilities are actually only checked when increasing a
pipe's size using F_SETPIPE_SZ, not when creating a new pipe.  Therefore,
if pipe-user-pages-hard has been set, the root user can run into it and be
unable to create pipes.  Similarly, if pipe-user-pages-soft has been set,
the root user can run into it and have their pipes limited to 1 page each.

Fix this by allowing the privileged override in both cases.

Link: http://lkml.kernel.org/r/20180111052902.14409-4-ebiggers3@gmail.com
Fixes: 759c01142a5d ("pipe: limit the per-user amount of pages allocated in pipes")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Willy Tarreau <w@1wt.eu>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 61adbf0f5b53..04acfad4692b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -613,6 +613,11 @@ static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
 	return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard;
 }
 
+static bool is_unprivileged_user(void)
+{
+	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+}
+
 struct pipe_inode_info *alloc_pipe_info(void)
 {
 	struct pipe_inode_info *pipe;
@@ -629,12 +634,12 @@ struct pipe_inode_info *alloc_pipe_info(void)
 
 	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
 
-	if (too_many_pipe_buffers_soft(user_bufs)) {
+	if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
 		user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
 		pipe_bufs = 1;
 	}
 
-	if (too_many_pipe_buffers_hard(user_bufs))
+	if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
 		goto out_revert_acct;
 
 	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
@@ -1065,7 +1070,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	if (nr_pages > pipe->buffers &&
 			(too_many_pipe_buffers_hard(user_bufs) ||
 			 too_many_pipe_buffers_soft(user_bufs)) &&
-			!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+			is_unprivileged_user()) {
 		ret = -EPERM;
 		goto out_revert_acct;
 	}
-- 
cgit v1.2.3


From 9903a91c763ecdae333a04a9d89d79d2b8966503 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 6 Feb 2018 15:41:56 -0800
Subject: pipe: fix off-by-one error when checking buffer limits

With pipe-user-pages-hard set to 'N', users were actually only allowed up
to 'N - 1' buffers; and likewise for pipe-user-pages-soft.

Fix this to allow up to 'N' buffers, as would be expected.

Link: http://lkml.kernel.org/r/20180111052902.14409-5-ebiggers3@gmail.com
Fixes: b0b91d18e2e9 ("pipe: fix limit checking in pipe_set_size()")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Willy Tarreau <w@1wt.eu>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 04acfad4692b..46c30ac777da 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -605,12 +605,12 @@ static unsigned long account_pipe_buffers(struct user_struct *user,
 
 static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
 {
-	return pipe_user_pages_soft && user_bufs >= pipe_user_pages_soft;
+	return pipe_user_pages_soft && user_bufs > pipe_user_pages_soft;
 }
 
 static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
 {
-	return pipe_user_pages_hard && user_bufs >= pipe_user_pages_hard;
+	return pipe_user_pages_hard && user_bufs > pipe_user_pages_hard;
 }
 
 static bool is_unprivileged_user(void)
-- 
cgit v1.2.3


From 96e99be40e4cff870a83233731121ec0f7f95075 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 6 Feb 2018 15:42:00 -0800
Subject: pipe: reject F_SETPIPE_SZ with size over UINT_MAX

A pipe's size is represented as an 'unsigned int'.  As expected, writing a
value greater than UINT_MAX to /proc/sys/fs/pipe-max-size fails with
EINVAL.  However, the F_SETPIPE_SZ fcntl silently truncates such values to
32 bits, rather than failing with EINVAL as expected.  (It *does* fail
with EINVAL for values above (1 << 31) but <= UINT_MAX.)

Fix this by moving the check against UINT_MAX into round_pipe_size() which
is called in both cases.

Link: http://lkml.kernel.org/r/20180111052902.14409-6-ebiggers3@gmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Willy Tarreau <w@1wt.eu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c                 | 5 ++++-
 include/linux/pipe_fs_i.h | 2 +-
 kernel/sysctl.c           | 3 ---
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 46c30ac777da..817393d36244 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1020,10 +1020,13 @@ const struct file_operations pipefifo_fops = {
  * Currently we rely on the pipe array holding a power-of-2 number
  * of pages. Returns 0 on error.
  */
-unsigned int round_pipe_size(unsigned int size)
+unsigned int round_pipe_size(unsigned long size)
 {
 	unsigned long nr_pages;
 
+	if (size > UINT_MAX)
+		return 0;
+
 	/* Minimum pipe size, as required by POSIX */
 	if (size < PAGE_SIZE)
 		size = PAGE_SIZE;
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 5028bd4b2c96..5a3bb3b7c9ad 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -190,6 +190,6 @@ long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
 struct pipe_inode_info *get_pipe_info(struct file *file);
 
 int create_pipe_files(struct file **, int);
-unsigned int round_pipe_size(unsigned int size);
+unsigned int round_pipe_size(unsigned long size);
 
 #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 59f09ea3ca3e..f98f28c12020 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2623,9 +2623,6 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
 	if (write) {
 		unsigned int val;
 
-		if (*lvalp > UINT_MAX)
-			return -EINVAL;
-
 		val = round_pipe_size(*lvalp);
 		if (val == 0)
 			return -EINVAL;
-- 
cgit v1.2.3


From c4fed5a91fadc8a277b1eda474317b501651dd3e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 6 Feb 2018 15:42:05 -0800
Subject: pipe: simplify round_pipe_size()

round_pipe_size() calculates the number of pages the requested size
corresponds to, then rounds the page count up to the next power of 2.

However, it also rounds everything < PAGE_SIZE up to PAGE_SIZE.
Therefore, there's no need to actually translate the size into a page
count; we just need to round the size up to the next power of 2.

We do need to verify the size isn't greater than (1 << 31), since on
32-bit systems roundup_pow_of_two() would be undefined in that case.  But
that can just be combined with the UINT_MAX check which we need anyway
now.

Finally, update pipe_set_size() to not redundantly check the return value
of round_pipe_size() for the "invalid size" case twice.

Link: http://lkml.kernel.org/r/20180111052902.14409-7-ebiggers3@gmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Willy Tarreau <w@1wt.eu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 817393d36244..8be52158c400 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1022,20 +1022,14 @@ const struct file_operations pipefifo_fops = {
  */
 unsigned int round_pipe_size(unsigned long size)
 {
-	unsigned long nr_pages;
-
-	if (size > UINT_MAX)
+	if (size > (1U << 31))
 		return 0;
 
 	/* Minimum pipe size, as required by POSIX */
 	if (size < PAGE_SIZE)
-		size = PAGE_SIZE;
-
-	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (nr_pages == 0)
-		return 0;
+		return PAGE_SIZE;
 
-	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+	return roundup_pow_of_two(size);
 }
 
 /*
@@ -1050,8 +1044,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	long ret = 0;
 
 	size = round_pipe_size(arg);
-	if (size == 0)
-		return -EINVAL;
 	nr_pages = size >> PAGE_SHIFT;
 
 	if (!nr_pages)
-- 
cgit v1.2.3


From f7340761812fc10313e6fcc115e0bc4f7a799112 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 6 Feb 2018 15:42:08 -0800
Subject: pipe: read buffer limits atomically

The pipe buffer limits are accessed without any locking, and may be
changed at any time by the sysctl handlers.  In theory this could cause
problems for expressions like the following:

    pipe_user_pages_hard && user_bufs > pipe_user_pages_hard

...  since the assembly code might reference the 'pipe_user_pages_hard'
memory location multiple times, and if the admin removes the limit by
setting it to 0, there is a very brief window where processes could
incorrectly observe the limit to be exceeded.

Fix this by loading the limits with READ_ONCE() prior to use.

Link: http://lkml.kernel.org/r/20180111052902.14409-8-ebiggers3@gmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Willy Tarreau <w@1wt.eu>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 8be52158c400..0913aed7fd0d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -605,12 +605,16 @@ static unsigned long account_pipe_buffers(struct user_struct *user,
 
 static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
 {
-	return pipe_user_pages_soft && user_bufs > pipe_user_pages_soft;
+	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
+
+	return soft_limit && user_bufs > soft_limit;
 }
 
 static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
 {
-	return pipe_user_pages_hard && user_bufs > pipe_user_pages_hard;
+	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
+
+	return hard_limit && user_bufs > hard_limit;
 }
 
 static bool is_unprivileged_user(void)
@@ -624,13 +628,14 @@ struct pipe_inode_info *alloc_pipe_info(void)
 	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
 	struct user_struct *user = get_current_user();
 	unsigned long user_bufs;
+	unsigned int max_size = READ_ONCE(pipe_max_size);
 
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
 	if (pipe == NULL)
 		goto out_free_uid;
 
-	if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE))
-		pipe_bufs = pipe_max_size >> PAGE_SHIFT;
+	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
+		pipe_bufs = max_size >> PAGE_SHIFT;
 
 	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
 
-- 
cgit v1.2.3