From 085d16d5f949b64713d5e960d6c9bbf51bc1d511 Mon Sep 17 00:00:00 2001
From: Dan Aloni <dan.aloni@vastdata.com>
Date: Sun, 8 May 2022 15:54:50 +0300
Subject: nfs: fix broken handling of the softreval mount option

Turns out that ever since this mount option was added, passing
`softreval` in NFS mount options cancelled all other flags while not
affecting the underlying flag `NFS_MOUNT_SOFTREVAL`.

Fixes: c74dfe97c104 ("NFS: Add mount option 'softreval'")
Signed-off-by: Dan Aloni <dan.aloni@vastdata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/fs_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index e2d59bb5e6bb..9a16897e8dc6 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -517,7 +517,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
 		if (result.negated)
 			ctx->flags &= ~NFS_MOUNT_SOFTREVAL;
 		else
-			ctx->flags &= NFS_MOUNT_SOFTREVAL;
+			ctx->flags |= NFS_MOUNT_SOFTREVAL;
 		break;
 	case Opt_posix:
 		if (result.negated)
-- 
cgit v1.2.3


From 1927e498aee1757b3df755a194cbfc5cc0f2b663 Mon Sep 17 00:00:00 2001
From: Kalesh Singh <kaleshsingh@google.com>
Date: Mon, 9 May 2022 17:34:28 -0700
Subject: procfs: prevent unprivileged processes accessing fdinfo dir

The file permissions on the fdinfo dir from were changed from
S_IRUSR|S_IXUSR to S_IRUGO|S_IXUGO, and a PTRACE_MODE_READ check was added
for opening the fdinfo files [1].  However, the ptrace permission check
was not added to the directory, allowing anyone to get the open FD numbers
by reading the fdinfo directory.

Add the missing ptrace permission check for opening the fdinfo directory.

[1] https://lkml.kernel.org/r/20210308170651.919148-1-kaleshsingh@google.com

Link: https://lkml.kernel.org/r/20210713162008.1056986-1-kaleshsingh@google.com
Fixes: 7bc3fa0172a4 ("procfs: allow reading fdinfo with PTRACE_MODE_READ")
Signed-off-by: Kalesh Singh <kaleshsingh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Hridya Valsaraju <hridya@google.com>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/fd.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 172c86270b31..913bef0d2a36 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -72,7 +72,7 @@ out:
 	return 0;
 }
 
-static int seq_fdinfo_open(struct inode *inode, struct file *file)
+static int proc_fdinfo_access_allowed(struct inode *inode)
 {
 	bool allowed = false;
 	struct task_struct *task = get_proc_task(inode);
@@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
 	if (!allowed)
 		return -EACCES;
 
+	return 0;
+}
+
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+	int ret = proc_fdinfo_access_allowed(inode);
+
+	if (ret)
+		return ret;
+
 	return single_open(file, seq_show, inode);
 }
 
@@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
 				  proc_fdinfo_instantiate);
 }
 
+static int proc_open_fdinfo(struct inode *inode, struct file *file)
+{
+	int ret = proc_fdinfo_access_allowed(inode);
+
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
 const struct inode_operations proc_fdinfo_inode_operations = {
 	.lookup		= proc_lookupfdinfo,
 	.setattr	= proc_setattr,
 };
 
 const struct file_operations proc_fdinfo_operations = {
+	.open		= proc_open_fdinfo,
 	.read		= generic_read_dir,
 	.iterate_shared	= proc_readfdinfo,
 	.llseek		= generic_file_llseek,
-- 
cgit v1.2.3


From 620239d9a32e9fe27c9204ec11e40058671aeeb6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 25 Apr 2022 15:54:27 -0400
Subject: ceph: fix setting of xattrs on async created inodes

Currently when we create a file, we spin up an xattr buffer to send
along with the create request. If we end up doing an async create
however, then we currently pass down a zero-length xattr buffer.

Fix the code to send down the xattr buffer in req->r_pagelist. If the
xattrs span more than a page, however give up and don't try to do an
async create.

Cc: stable@vger.kernel.org
URL: https://bugzilla.redhat.com/show_bug.cgi?id=2063929
Fixes: 9a8d03ca2e2c ("ceph: attempt to do async create when possible")
Reported-by: John Fortin <fortinj66@gmail.com>
Reported-by: Sri Ramanujam <sri@ramanujam.io>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/file.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6c9e837aa1d3..8c8226c0feac 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -629,9 +629,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
 	iinfo.change_attr = 1;
 	ceph_encode_timespec64(&iinfo.btime, &now);
 
-	iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
-	iinfo.xattr_data = xattr_buf;
-	memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+	if (req->r_pagelist) {
+		iinfo.xattr_len = req->r_pagelist->length;
+		iinfo.xattr_data = req->r_pagelist->mapped_tail;
+	} else {
+		/* fake it */
+		iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+		iinfo.xattr_data = xattr_buf;
+		memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+	}
 
 	in.ino = cpu_to_le64(vino.ino);
 	in.snapid = cpu_to_le64(CEPH_NOSNAP);
@@ -743,6 +749,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
 		err = ceph_security_init_secctx(dentry, mode, &as_ctx);
 		if (err < 0)
 			goto out_ctx;
+		/* Async create can't handle more than a page of xattrs */
+		if (as_ctx.pagelist &&
+		    !list_is_singular(&as_ctx.pagelist->head))
+			try_async = false;
 	} else if (!d_in_lookup(dentry)) {
 		/* If it's not being looked up, it's negative */
 		return -ENOENT;
-- 
cgit v1.2.3


From 642d51fb0775a41dd6bb3d99b5a40c24df131c20 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Thu, 5 May 2022 18:53:09 +0800
Subject: ceph: check folio PG_private bit instead of folio->private

The pages in the file mapping maybe reclaimed and reused by other
subsystems and the page->private maybe used as flags field or
something else, if later that pages are used by page caches again
the page->private maybe not cleared as expected.

Here will check the PG_private bit instead of the folio->private.

Cc: stable@vger.kernel.org
URL: https://tracker.ceph.com/issues/55421
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Luis Henriques <lhenriques@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index aa25bffd4823..b6edcf89a429 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -85,7 +85,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
 	if (folio_test_dirty(folio)) {
 		dout("%p dirty_folio %p idx %lu -- already dirty\n",
 		     mapping->host, folio, folio->index);
-		BUG_ON(!folio_get_private(folio));
+		VM_BUG_ON_FOLIO(!folio_test_private(folio), folio);
 		return false;
 	}
 
@@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
 	 * Reference snap context in folio->private.  Also set
 	 * PagePrivate so that we get invalidate_folio callback.
 	 */
-	BUG_ON(folio_get_private(folio));
+	VM_BUG_ON_FOLIO(folio_test_private(folio), folio);
 	folio_attach_private(folio, snapc);
 
 	return ceph_fscache_dirty_folio(mapping, folio);
@@ -150,7 +150,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
 	}
 
 	WARN_ON(!folio_test_locked(folio));
-	if (folio_get_private(folio)) {
+	if (folio_test_private(folio)) {
 		dout("%p invalidate_folio idx %lu full dirty page\n",
 		     inode, folio->index);
 
@@ -729,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req)
 
 	/* clean all pages */
 	for (i = 0; i < req->r_num_ops; i++) {
-		if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
+		if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) {
+			pr_warn("%s incorrect op %d req %p index %d tid %llu\n",
+				__func__, req->r_ops[i].op, req, i, req->r_tid);
 			break;
+		}
 
 		osd_data = osd_req_op_extent_osd_data(req, i);
 		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
-- 
cgit v1.2.3


From d031a8866e709c9d1ee5537a321b6192b4d2dc5b Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 14 Apr 2022 17:52:39 +0200
Subject: gfs2: Fix filesystem block deallocation for short writes

When a write cannot be carried out in full, gfs2_iomap_end() releases
blocks that have been allocated for this write but haven't been used.

To compute the end of the allocation, gfs2_iomap_end() incorrectly
rounded the end of the attempted write down to the next block boundary
to arrive at the end of the allocation.  It would have to round up, but
the end of the allocation is also available as iomap->offset +
iomap->length, so just use that instead.

In addition, use round_up() for computing the start of the unused range.

Fixes: 64bc06bb32ee ("gfs2: iomap buffered write support")
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/bmap.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 39080b2d6cf8..b6697333bb2b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1153,13 +1153,12 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
 
 	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
 		/* Deallocate blocks that were just allocated. */
-		loff_t blockmask = i_blocksize(inode) - 1;
-		loff_t end = (pos + length) & ~blockmask;
+		loff_t hstart = round_up(pos + written, i_blocksize(inode));
+		loff_t hend = iomap->offset + iomap->length;
 
-		pos = (pos + written + blockmask) & ~blockmask;
-		if (pos < end) {
-			truncate_pagecache_range(inode, pos, end - 1);
-			punch_hole(ip, pos, end - pos);
+		if (hstart < hend) {
+			truncate_pagecache_range(inode, hstart, hend - 1);
+			punch_hole(ip, hstart, hend - hstart);
 		}
 	}
 
-- 
cgit v1.2.3


From 42e4c3bdcae7833eeeaed7bf0c000c2de17dd291 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Wed, 27 Apr 2022 13:53:42 +0200
Subject: gfs2: Variable rename

Instead of counting the number of bytes read from the filesystem,
functions gfs2_file_direct_read and gfs2_file_read_iter count the number
of bytes written into the user buffer.  Conversely, functions
gfs2_file_direct_write and gfs2_file_buffered_write count the number of
bytes read from the user buffer.  This is nothing but confusing, so
change the read functions to count how many bytes they have read, and
the write functions to count how many bytes they have written.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/file.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 48f01323c37c..4d36c01727ad 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -807,7 +807,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
 	struct file *file = iocb->ki_filp;
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 	size_t prev_count = 0, window_size = 0;
-	size_t written = 0;
+	size_t read = 0;
 	ssize_t ret;
 
 	/*
@@ -839,11 +839,11 @@ retry_under_glock:
 	pagefault_disable();
 	to->nofault = true;
 	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
-			   IOMAP_DIO_PARTIAL, written);
+			   IOMAP_DIO_PARTIAL, read);
 	to->nofault = false;
 	pagefault_enable();
 	if (ret > 0)
-		written = ret;
+		read = ret;
 
 	if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
 		size_t leftover;
@@ -863,7 +863,7 @@ out_uninit:
 	gfs2_holder_uninit(gh);
 	if (ret < 0)
 		return ret;
-	return written;
+	return read;
 }
 
 static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -873,7 +873,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
 	struct inode *inode = file->f_mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	size_t prev_count = 0, window_size = 0;
-	size_t read = 0;
+	size_t written = 0;
 	ssize_t ret;
 
 	/*
@@ -906,13 +906,13 @@ retry_under_glock:
 
 	from->nofault = true;
 	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
-			   IOMAP_DIO_PARTIAL, read);
+			   IOMAP_DIO_PARTIAL, written);
 	from->nofault = false;
 
 	if (ret == -ENOTBLK)
 		ret = 0;
 	if (ret > 0)
-		read = ret;
+		written = ret;
 
 	if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
 		size_t leftover;
@@ -933,7 +933,7 @@ out_uninit:
 	gfs2_holder_uninit(gh);
 	if (ret < 0)
 		return ret;
-	return read;
+	return written;
 }
 
 static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -941,7 +941,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	struct gfs2_inode *ip;
 	struct gfs2_holder gh;
 	size_t prev_count = 0, window_size = 0;
-	size_t written = 0;
+	size_t read = 0;
 	ssize_t ret;
 
 	/*
@@ -962,7 +962,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (ret >= 0) {
 		if (!iov_iter_count(to))
 			return ret;
-		written = ret;
+		read = ret;
 	} else if (ret != -EFAULT) {
 		if (ret != -EAGAIN)
 			return ret;
@@ -980,7 +980,7 @@ retry_under_glock:
 	ret = generic_file_read_iter(iocb, to);
 	pagefault_enable();
 	if (ret > 0)
-		written += ret;
+		read += ret;
 
 	if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
 		size_t leftover;
@@ -998,7 +998,7 @@ retry_under_glock:
 		gfs2_glock_dq(&gh);
 out_uninit:
 	gfs2_holder_uninit(&gh);
-	return written ? written : ret;
+	return read ? read : ret;
 }
 
 static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
@@ -1012,7 +1012,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
 	struct gfs2_holder *statfs_gh = NULL;
 	size_t prev_count = 0, window_size = 0;
 	size_t orig_count = iov_iter_count(from);
-	size_t read = 0;
+	size_t written = 0;
 	ssize_t ret;
 
 	/*
@@ -1050,13 +1050,13 @@ retry_under_glock:
 	current->backing_dev_info = NULL;
 	if (ret > 0) {
 		iocb->ki_pos += ret;
-		read += ret;
+		written += ret;
 	}
 
 	if (inode == sdp->sd_rindex)
 		gfs2_glock_dq_uninit(statfs_gh);
 
-	from->count = orig_count - read;
+	from->count = orig_count - written;
 	if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
 		size_t leftover;
 
@@ -1077,8 +1077,8 @@ out_uninit:
 	gfs2_holder_uninit(gh);
 	if (statfs_gh)
 		kfree(statfs_gh);
-	from->count = orig_count - read;
-	return read ? read : ret;
+	from->count = orig_count - written;
+	return written ? written : ret;
 }
 
 /**
-- 
cgit v1.2.3


From 6d22ff471070e21e24667be70764ee5abdfe5608 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 5 May 2022 12:37:49 +0200
Subject: gfs2: Clean up use of fault_in_iov_iter_{read,write}able

No need to store the return value of the fault_in functions in separate
variables.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/file.c | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4d36c01727ad..acc0c1d41564 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -846,12 +846,10 @@ retry_under_glock:
 		read = ret;
 
 	if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
-		size_t leftover;
-
 		gfs2_holder_allow_demote(gh);
-		leftover = fault_in_iov_iter_writeable(to, window_size);
+		window_size -= fault_in_iov_iter_writeable(to, window_size);
 		gfs2_holder_disallow_demote(gh);
-		if (leftover != window_size) {
+		if (window_size) {
 			if (gfs2_holder_queued(gh))
 				goto retry_under_glock;
 			goto retry;
@@ -915,12 +913,10 @@ retry_under_glock:
 		written = ret;
 
 	if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
-		size_t leftover;
-
 		gfs2_holder_allow_demote(gh);
-		leftover = fault_in_iov_iter_readable(from, window_size);
+		window_size -= fault_in_iov_iter_readable(from, window_size);
 		gfs2_holder_disallow_demote(gh);
-		if (leftover != window_size) {
+		if (window_size) {
 			if (gfs2_holder_queued(gh))
 				goto retry_under_glock;
 			goto retry;
@@ -983,12 +979,10 @@ retry_under_glock:
 		read += ret;
 
 	if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
-		size_t leftover;
-
 		gfs2_holder_allow_demote(&gh);
-		leftover = fault_in_iov_iter_writeable(to, window_size);
+		window_size -= fault_in_iov_iter_writeable(to, window_size);
 		gfs2_holder_disallow_demote(&gh);
-		if (leftover != window_size) {
+		if (window_size) {
 			if (gfs2_holder_queued(&gh))
 				goto retry_under_glock;
 			goto retry;
@@ -1058,13 +1052,11 @@ retry_under_glock:
 
 	from->count = orig_count - written;
 	if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
-		size_t leftover;
-
 		gfs2_holder_allow_demote(gh);
-		leftover = fault_in_iov_iter_readable(from, window_size);
+		window_size -= fault_in_iov_iter_readable(from, window_size);
 		gfs2_holder_disallow_demote(gh);
-		if (leftover != window_size) {
-			from->count = min(from->count, window_size - leftover);
+		if (window_size) {
+			from->count = min(from->count, window_size);
 			if (gfs2_holder_queued(gh))
 				goto retry_under_glock;
 			goto retry;
-- 
cgit v1.2.3


From 72382264502d9348ead372f82ecc3044de5c82d2 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 5 May 2022 12:53:26 +0200
Subject: gfs2: Pull return value test out of should_fault_in_pages

Pull the return value test of the previous read or write operation out
of should_fault_in_pages().  In a following patch, we'll fault in pages
before the I/O and there will be no return value to check.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/file.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index acc0c1d41564..ea87bef7314d 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -770,7 +770,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 	return ret ? ret : ret1;
 }
 
-static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
+static inline bool should_fault_in_pages(struct iov_iter *i,
 					 size_t *prev_count,
 					 size_t *window_size)
 {
@@ -779,8 +779,6 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
 
 	if (likely(!count))
 		return false;
-	if (ret <= 0 && ret != -EFAULT)
-		return false;
 	if (!iter_is_iovec(i))
 		return false;
 
@@ -842,10 +840,12 @@ retry_under_glock:
 			   IOMAP_DIO_PARTIAL, read);
 	to->nofault = false;
 	pagefault_enable();
+	if (ret <= 0 && ret != -EFAULT)
+		goto out_unlock;
 	if (ret > 0)
 		read = ret;
 
-	if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+	if (should_fault_in_pages(to, &prev_count, &window_size)) {
 		gfs2_holder_allow_demote(gh);
 		window_size -= fault_in_iov_iter_writeable(to, window_size);
 		gfs2_holder_disallow_demote(gh);
@@ -855,6 +855,7 @@ retry_under_glock:
 			goto retry;
 		}
 	}
+out_unlock:
 	if (gfs2_holder_queued(gh))
 		gfs2_glock_dq(gh);
 out_uninit:
@@ -899,20 +900,23 @@ retry:
 		goto out_uninit;
 	/* Silently fall back to buffered I/O when writing beyond EOF */
 	if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
-		goto out;
+		goto out_unlock;
 retry_under_glock:
 
 	from->nofault = true;
 	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
 			   IOMAP_DIO_PARTIAL, written);
 	from->nofault = false;
-
-	if (ret == -ENOTBLK)
-		ret = 0;
+	if (ret <= 0) {
+		if (ret == -ENOTBLK)
+			ret = 0;
+		if (ret != -EFAULT)
+			goto out_unlock;
+	}
 	if (ret > 0)
 		written = ret;
 
-	if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+	if (should_fault_in_pages(from, &prev_count, &window_size)) {
 		gfs2_holder_allow_demote(gh);
 		window_size -= fault_in_iov_iter_readable(from, window_size);
 		gfs2_holder_disallow_demote(gh);
@@ -922,7 +926,7 @@ retry_under_glock:
 			goto retry;
 		}
 	}
-out:
+out_unlock:
 	if (gfs2_holder_queued(gh))
 		gfs2_glock_dq(gh);
 out_uninit:
@@ -975,10 +979,12 @@ retry_under_glock:
 	pagefault_disable();
 	ret = generic_file_read_iter(iocb, to);
 	pagefault_enable();
+	if (ret <= 0 && ret != -EFAULT)
+		goto out_unlock;
 	if (ret > 0)
 		read += ret;
 
-	if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+	if (should_fault_in_pages(to, &prev_count, &window_size)) {
 		gfs2_holder_allow_demote(&gh);
 		window_size -= fault_in_iov_iter_writeable(to, window_size);
 		gfs2_holder_disallow_demote(&gh);
@@ -988,6 +994,7 @@ retry_under_glock:
 			goto retry;
 		}
 	}
+out_unlock:
 	if (gfs2_holder_queued(&gh))
 		gfs2_glock_dq(&gh);
 out_uninit:
@@ -1050,8 +1057,11 @@ retry_under_glock:
 	if (inode == sdp->sd_rindex)
 		gfs2_glock_dq_uninit(statfs_gh);
 
+	if (ret <= 0 && ret != -EFAULT)
+		goto out_unlock;
+
 	from->count = orig_count - written;
-	if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+	if (should_fault_in_pages(from, &prev_count, &window_size)) {
 		gfs2_holder_allow_demote(gh);
 		window_size -= fault_in_iov_iter_readable(from, window_size);
 		gfs2_holder_disallow_demote(gh);
-- 
cgit v1.2.3


From 324d116c5a5c8204dc00e63f725a3c5ed09afb53 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 5 May 2022 13:32:23 +0200
Subject: gfs2: Align read and write chunks to the page cache

Align the chunks that reads and writes are carried out in to the page
cache rather than the user buffers.  This will be more efficient in
general, especially for allocating writes.  Optimizing the case that the
user buffer is gfs2 backed isn't very useful; we only need to make sure
we won't deadlock.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/file.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ea87bef7314d..11c46407d4a8 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -771,6 +771,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 }
 
 static inline bool should_fault_in_pages(struct iov_iter *i,
+					 struct kiocb *iocb,
 					 size_t *prev_count,
 					 size_t *window_size)
 {
@@ -783,15 +784,13 @@ static inline bool should_fault_in_pages(struct iov_iter *i,
 		return false;
 
 	size = PAGE_SIZE;
-	offs = offset_in_page(i->iov[0].iov_base + i->iov_offset);
+	offs = offset_in_page(iocb->ki_pos);
 	if (*prev_count != count || !*window_size) {
 		size_t nr_dirtied;
 
-		size = ALIGN(offs + count, PAGE_SIZE);
-		size = min_t(size_t, size, SZ_1M);
 		nr_dirtied = max(current->nr_dirtied_pause -
 				 current->nr_dirtied, 8);
-		size = min(size, nr_dirtied << PAGE_SHIFT);
+		size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT);
 	}
 
 	*prev_count = count;
@@ -845,7 +844,7 @@ retry_under_glock:
 	if (ret > 0)
 		read = ret;
 
-	if (should_fault_in_pages(to, &prev_count, &window_size)) {
+	if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
 		gfs2_holder_allow_demote(gh);
 		window_size -= fault_in_iov_iter_writeable(to, window_size);
 		gfs2_holder_disallow_demote(gh);
@@ -916,7 +915,7 @@ retry_under_glock:
 	if (ret > 0)
 		written = ret;
 
-	if (should_fault_in_pages(from, &prev_count, &window_size)) {
+	if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
 		gfs2_holder_allow_demote(gh);
 		window_size -= fault_in_iov_iter_readable(from, window_size);
 		gfs2_holder_disallow_demote(gh);
@@ -984,7 +983,7 @@ retry_under_glock:
 	if (ret > 0)
 		read += ret;
 
-	if (should_fault_in_pages(to, &prev_count, &window_size)) {
+	if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
 		gfs2_holder_allow_demote(&gh);
 		window_size -= fault_in_iov_iter_writeable(to, window_size);
 		gfs2_holder_disallow_demote(&gh);
@@ -1061,7 +1060,7 @@ retry_under_glock:
 		goto out_unlock;
 
 	from->count = orig_count - written;
-	if (should_fault_in_pages(from, &prev_count, &window_size)) {
+	if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
 		gfs2_holder_allow_demote(gh);
 		window_size -= fault_in_iov_iter_readable(from, window_size);
 		gfs2_holder_disallow_demote(gh);
-- 
cgit v1.2.3


From fa5dfa645d85910d747f4e0c97f19e5e97d1c270 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Wed, 4 May 2022 23:37:30 +0200
Subject: gfs2: buffered write prefaulting

In gfs2_file_buffered_write, to increase the likelihood that all the
user memory we're trying to write will be resident in memory, carry out
the write in chunks and fault in each chunk of user memory before trying
to write it.  Otherwise, some workloads will trigger frequent short
"internal" writes, causing filesystem blocks to be allocated and then
partially deallocated again when writing into holes, which is wasteful
and breaks reservations.

Neither the chunked writes nor any of the short "internal" writes are
user visible.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/file.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 11c46407d4a8..5eda1bcc85e3 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -778,7 +778,7 @@ static inline bool should_fault_in_pages(struct iov_iter *i,
 	size_t count = iov_iter_count(i);
 	size_t size, offs;
 
-	if (likely(!count))
+	if (!count)
 		return false;
 	if (!iter_is_iovec(i))
 		return false;
@@ -1033,7 +1033,20 @@ retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
+	if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
 retry_under_glock:
+		gfs2_holder_allow_demote(gh);
+		window_size -= fault_in_iov_iter_readable(from, window_size);
+		gfs2_holder_disallow_demote(gh);
+		if (!window_size) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+		if (!gfs2_holder_queued(gh))
+			goto retry;
+		from->count = min(from->count, window_size);
+	}
+
 	if (inode == sdp->sd_rindex) {
 		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 
@@ -1060,17 +1073,8 @@ retry_under_glock:
 		goto out_unlock;
 
 	from->count = orig_count - written;
-	if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
-		gfs2_holder_allow_demote(gh);
-		window_size -= fault_in_iov_iter_readable(from, window_size);
-		gfs2_holder_disallow_demote(gh);
-		if (window_size) {
-			from->count = min(from->count, window_size);
-			if (gfs2_holder_queued(gh))
-				goto retry_under_glock;
-			goto retry;
-		}
-	}
+	if (should_fault_in_pages(from, iocb, &prev_count, &window_size))
+		goto retry_under_glock;
 out_unlock:
 	if (gfs2_holder_queued(gh))
 		gfs2_glock_dq(gh);
-- 
cgit v1.2.3


From e1fa9ea85ce89207d2ac0316da35a4a7736801f9 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Wed, 11 May 2022 18:27:12 +0200
Subject: gfs2: Stop using glock holder auto-demotion for now

We're having unresolved issues with the glock holder auto-demotion mechanism
introduced in commit dc732906c245.  This mechanism was assumed to be essential
for avoiding frequent short reads and writes until commit 296abc0d91d8
("gfs2: No short reads or writes upon glock contention").  Since then,
when the inode glock is lost, it is simply re-acquired and the operation
is resumed.  This means that apart from the performance penalty, we
might as well drop the inode glock before faulting in pages, and
re-acquire it afterwards.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/file.c | 46 ++++++++++++++--------------------------------
 1 file changed, 14 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 5eda1bcc85e3..2556ae1f92ea 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -832,7 +832,6 @@ retry:
 	ret = gfs2_glock_nq(gh);
 	if (ret)
 		goto out_uninit;
-retry_under_glock:
 	pagefault_disable();
 	to->nofault = true;
 	ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
@@ -845,14 +844,10 @@ retry_under_glock:
 		read = ret;
 
 	if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
-		gfs2_holder_allow_demote(gh);
+		gfs2_glock_dq(gh);
 		window_size -= fault_in_iov_iter_writeable(to, window_size);
-		gfs2_holder_disallow_demote(gh);
-		if (window_size) {
-			if (gfs2_holder_queued(gh))
-				goto retry_under_glock;
+		if (window_size)
 			goto retry;
-		}
 	}
 out_unlock:
 	if (gfs2_holder_queued(gh))
@@ -900,7 +895,6 @@ retry:
 	/* Silently fall back to buffered I/O when writing beyond EOF */
 	if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
 		goto out_unlock;
-retry_under_glock:
 
 	from->nofault = true;
 	ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
@@ -916,14 +910,10 @@ retry_under_glock:
 		written = ret;
 
 	if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
-		gfs2_holder_allow_demote(gh);
+		gfs2_glock_dq(gh);
 		window_size -= fault_in_iov_iter_readable(from, window_size);
-		gfs2_holder_disallow_demote(gh);
-		if (window_size) {
-			if (gfs2_holder_queued(gh))
-				goto retry_under_glock;
+		if (window_size)
 			goto retry;
-		}
 	}
 out_unlock:
 	if (gfs2_holder_queued(gh))
@@ -974,7 +964,6 @@ retry:
 	ret = gfs2_glock_nq(&gh);
 	if (ret)
 		goto out_uninit;
-retry_under_glock:
 	pagefault_disable();
 	ret = generic_file_read_iter(iocb, to);
 	pagefault_enable();
@@ -984,14 +973,10 @@ retry_under_glock:
 		read += ret;
 
 	if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) {
-		gfs2_holder_allow_demote(&gh);
+		gfs2_glock_dq(&gh);
 		window_size -= fault_in_iov_iter_writeable(to, window_size);
-		gfs2_holder_disallow_demote(&gh);
-		if (window_size) {
-			if (gfs2_holder_queued(&gh))
-				goto retry_under_glock;
+		if (window_size)
 			goto retry;
-		}
 	}
 out_unlock:
 	if (gfs2_holder_queued(&gh))
@@ -1030,22 +1015,17 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
 retry:
-	ret = gfs2_glock_nq(gh);
-	if (ret)
-		goto out_uninit;
 	if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
-retry_under_glock:
-		gfs2_holder_allow_demote(gh);
 		window_size -= fault_in_iov_iter_readable(from, window_size);
-		gfs2_holder_disallow_demote(gh);
 		if (!window_size) {
 			ret = -EFAULT;
-			goto out_unlock;
+			goto out_uninit;
 		}
-		if (!gfs2_holder_queued(gh))
-			goto retry;
 		from->count = min(from->count, window_size);
 	}
+	ret = gfs2_glock_nq(gh);
+	if (ret)
+		goto out_uninit;
 
 	if (inode == sdp->sd_rindex) {
 		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -1073,8 +1053,10 @@ retry_under_glock:
 		goto out_unlock;
 
 	from->count = orig_count - written;
-	if (should_fault_in_pages(from, iocb, &prev_count, &window_size))
-		goto retry_under_glock;
+	if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) {
+		gfs2_glock_dq(gh);
+		goto retry;
+	}
 out_unlock:
 	if (gfs2_holder_queued(gh))
 		gfs2_glock_dq(gh);
-- 
cgit v1.2.3


From aa184e8671f0f911fc2fb3f68cd506e4d7838faa Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 17 May 2022 12:32:05 -0600
Subject: io_uring: don't attempt to IOPOLL for MSG_RING requests

We gate whether to IOPOLL for a request on whether the opcode is allowed
on a ring setup for IOPOLL and if it's got a file assigned. MSG_RING
is the only one that allows a file yet isn't pollable, it's merely
supported to allow communication on an IOPOLL ring, not because we can
poll for completion of it.

Put the assigned file early and clear it, so we don't attempt to poll
for it.

Reported-by: syzbot+1a0a53300ce782f8b3ad@syzkaller.appspotmail.com
Fixes: 3f1d52abf098 ("io_uring: defer msg-ring file validity check until command issue")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 91de361ea9ab..e0823f58f795 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4481,6 +4481,9 @@ done:
 	if (ret < 0)
 		req_set_fail(req);
 	__io_req_complete(req, issue_flags, ret, 0);
+	/* put file to avoid an attempt to IOPOLL the req */
+	io_put_file(req->file);
+	req->file = NULL;
 	return 0;
 }
 
-- 
cgit v1.2.3