From 085d16d5f949b64713d5e960d6c9bbf51bc1d511 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Sun, 8 May 2022 15:54:50 +0300 Subject: nfs: fix broken handling of the softreval mount option Turns out that ever since this mount option was added, passing `softreval` in NFS mount options cancelled all other flags while not affecting the underlying flag `NFS_MOUNT_SOFTREVAL`. Fixes: c74dfe97c104 ("NFS: Add mount option 'softreval'") Signed-off-by: Dan Aloni Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index e2d59bb5e6bb..9a16897e8dc6 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -517,7 +517,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, if (result.negated) ctx->flags &= ~NFS_MOUNT_SOFTREVAL; else - ctx->flags &= NFS_MOUNT_SOFTREVAL; + ctx->flags |= NFS_MOUNT_SOFTREVAL; break; case Opt_posix: if (result.negated) -- cgit v1.2.3 From 1927e498aee1757b3df755a194cbfc5cc0f2b663 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 9 May 2022 17:34:28 -0700 Subject: procfs: prevent unprivileged processes accessing fdinfo dir The file permissions on the fdinfo dir from were changed from S_IRUSR|S_IXUSR to S_IRUGO|S_IXUGO, and a PTRACE_MODE_READ check was added for opening the fdinfo files [1]. However, the ptrace permission check was not added to the directory, allowing anyone to get the open FD numbers by reading the fdinfo directory. Add the missing ptrace permission check for opening the fdinfo directory. [1] https://lkml.kernel.org/r/20210308170651.919148-1-kaleshsingh@google.com Link: https://lkml.kernel.org/r/20210713162008.1056986-1-kaleshsingh@google.com Fixes: 7bc3fa0172a4 ("procfs: allow reading fdinfo with PTRACE_MODE_READ") Signed-off-by: Kalesh Singh Cc: Kees Cook Cc: Eric W. Biederman Cc: Christian Brauner Cc: Suren Baghdasaryan Cc: Hridya Valsaraju Cc: Jann Horn Signed-off-by: Andrew Morton --- fs/proc/fd.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 172c86270b31..913bef0d2a36 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -72,7 +72,7 @@ out: return 0; } -static int seq_fdinfo_open(struct inode *inode, struct file *file) +static int proc_fdinfo_access_allowed(struct inode *inode) { bool allowed = false; struct task_struct *task = get_proc_task(inode); @@ -86,6 +86,16 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file) if (!allowed) return -EACCES; + return 0; +} + +static int seq_fdinfo_open(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + return single_open(file, seq_show, inode); } @@ -348,12 +358,23 @@ static int proc_readfdinfo(struct file *file, struct dir_context *ctx) proc_fdinfo_instantiate); } +static int proc_open_fdinfo(struct inode *inode, struct file *file) +{ + int ret = proc_fdinfo_access_allowed(inode); + + if (ret) + return ret; + + return 0; +} + const struct inode_operations proc_fdinfo_inode_operations = { .lookup = proc_lookupfdinfo, .setattr = proc_setattr, }; const struct file_operations proc_fdinfo_operations = { + .open = proc_open_fdinfo, .read = generic_read_dir, .iterate_shared = proc_readfdinfo, .llseek = generic_file_llseek, -- cgit v1.2.3 From 620239d9a32e9fe27c9204ec11e40058671aeeb6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 25 Apr 2022 15:54:27 -0400 Subject: ceph: fix setting of xattrs on async created inodes Currently when we create a file, we spin up an xattr buffer to send along with the create request. If we end up doing an async create however, then we currently pass down a zero-length xattr buffer. Fix the code to send down the xattr buffer in req->r_pagelist. If the xattrs span more than a page, however give up and don't try to do an async create. Cc: stable@vger.kernel.org URL: https://bugzilla.redhat.com/show_bug.cgi?id=2063929 Fixes: 9a8d03ca2e2c ("ceph: attempt to do async create when possible") Reported-by: John Fortin Reported-by: Sri Ramanujam Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6c9e837aa1d3..8c8226c0feac 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -629,9 +629,15 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, iinfo.change_attr = 1; ceph_encode_timespec64(&iinfo.btime, &now); - iinfo.xattr_len = ARRAY_SIZE(xattr_buf); - iinfo.xattr_data = xattr_buf; - memset(iinfo.xattr_data, 0, iinfo.xattr_len); + if (req->r_pagelist) { + iinfo.xattr_len = req->r_pagelist->length; + iinfo.xattr_data = req->r_pagelist->mapped_tail; + } else { + /* fake it */ + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + } in.ino = cpu_to_le64(vino.ino); in.snapid = cpu_to_le64(CEPH_NOSNAP); @@ -743,6 +749,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_security_init_secctx(dentry, mode, &as_ctx); if (err < 0) goto out_ctx; + /* Async create can't handle more than a page of xattrs */ + if (as_ctx.pagelist && + !list_is_singular(&as_ctx.pagelist->head)) + try_async = false; } else if (!d_in_lookup(dentry)) { /* If it's not being looked up, it's negative */ return -ENOENT; -- cgit v1.2.3 From 642d51fb0775a41dd6bb3d99b5a40c24df131c20 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 5 May 2022 18:53:09 +0800 Subject: ceph: check folio PG_private bit instead of folio->private The pages in the file mapping maybe reclaimed and reused by other subsystems and the page->private maybe used as flags field or something else, if later that pages are used by page caches again the page->private maybe not cleared as expected. Here will check the PG_private bit instead of the folio->private. Cc: stable@vger.kernel.org URL: https://tracker.ceph.com/issues/55421 Signed-off-by: Xiubo Li Reviewed-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index aa25bffd4823..b6edcf89a429 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -85,7 +85,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) if (folio_test_dirty(folio)) { dout("%p dirty_folio %p idx %lu -- already dirty\n", mapping->host, folio, folio->index); - BUG_ON(!folio_get_private(folio)); + VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); return false; } @@ -122,7 +122,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) * Reference snap context in folio->private. Also set * PagePrivate so that we get invalidate_folio callback. */ - BUG_ON(folio_get_private(folio)); + VM_BUG_ON_FOLIO(folio_test_private(folio), folio); folio_attach_private(folio, snapc); return ceph_fscache_dirty_folio(mapping, folio); @@ -150,7 +150,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset, } WARN_ON(!folio_test_locked(folio)); - if (folio_get_private(folio)) { + if (folio_test_private(folio)) { dout("%p invalidate_folio idx %lu full dirty page\n", inode, folio->index); @@ -729,8 +729,11 @@ static void writepages_finish(struct ceph_osd_request *req) /* clean all pages */ for (i = 0; i < req->r_num_ops; i++) { - if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) { + pr_warn("%s incorrect op %d req %p index %d tid %llu\n", + __func__, req->r_ops[i].op, req, i, req->r_tid); break; + } osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); -- cgit v1.2.3 From d031a8866e709c9d1ee5537a321b6192b4d2dc5b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 14 Apr 2022 17:52:39 +0200 Subject: gfs2: Fix filesystem block deallocation for short writes When a write cannot be carried out in full, gfs2_iomap_end() releases blocks that have been allocated for this write but haven't been used. To compute the end of the allocation, gfs2_iomap_end() incorrectly rounded the end of the attempted write down to the next block boundary to arrive at the end of the allocation. It would have to round up, but the end of the allocation is also available as iomap->offset + iomap->length, so just use that instead. In addition, use round_up() for computing the start of the unused range. Fixes: 64bc06bb32ee ("gfs2: iomap buffered write support") Signed-off-by: Andreas Gruenbacher --- fs/gfs2/bmap.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 39080b2d6cf8..b6697333bb2b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1153,13 +1153,12 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (length != written && (iomap->flags & IOMAP_F_NEW)) { /* Deallocate blocks that were just allocated. */ - loff_t blockmask = i_blocksize(inode) - 1; - loff_t end = (pos + length) & ~blockmask; + loff_t hstart = round_up(pos + written, i_blocksize(inode)); + loff_t hend = iomap->offset + iomap->length; - pos = (pos + written + blockmask) & ~blockmask; - if (pos < end) { - truncate_pagecache_range(inode, pos, end - 1); - punch_hole(ip, pos, end - pos); + if (hstart < hend) { + truncate_pagecache_range(inode, hstart, hend - 1); + punch_hole(ip, hstart, hend - hstart); } } -- cgit v1.2.3 From 42e4c3bdcae7833eeeaed7bf0c000c2de17dd291 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 27 Apr 2022 13:53:42 +0200 Subject: gfs2: Variable rename Instead of counting the number of bytes read from the filesystem, functions gfs2_file_direct_read and gfs2_file_read_iter count the number of bytes written into the user buffer. Conversely, functions gfs2_file_direct_write and gfs2_file_buffered_write count the number of bytes read from the user buffer. This is nothing but confusing, so change the read functions to count how many bytes they have read, and the write functions to count how many bytes they have written. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 48f01323c37c..4d36c01727ad 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -807,7 +807,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, struct file *file = iocb->ki_filp; struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -839,11 +839,11 @@ retry_under_glock: pagefault_disable(); to->nofault = true; ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, written); + IOMAP_DIO_PARTIAL, read); to->nofault = false; pagefault_enable(); if (ret > 0) - written = ret; + read = ret; if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { size_t leftover; @@ -863,7 +863,7 @@ out_uninit: gfs2_holder_uninit(gh); if (ret < 0) return ret; - return written; + return read; } static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, @@ -873,7 +873,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct inode *inode = file->f_mapping->host; struct gfs2_inode *ip = GFS2_I(inode); size_t prev_count = 0, window_size = 0; - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -906,13 +906,13 @@ retry_under_glock: from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, - IOMAP_DIO_PARTIAL, read); + IOMAP_DIO_PARTIAL, written); from->nofault = false; if (ret == -ENOTBLK) ret = 0; if (ret > 0) - read = ret; + written = ret; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { size_t leftover; @@ -933,7 +933,7 @@ out_uninit: gfs2_holder_uninit(gh); if (ret < 0) return ret; - return read; + return written; } static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -941,7 +941,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct gfs2_inode *ip; struct gfs2_holder gh; size_t prev_count = 0, window_size = 0; - size_t written = 0; + size_t read = 0; ssize_t ret; /* @@ -962,7 +962,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (ret >= 0) { if (!iov_iter_count(to)) return ret; - written = ret; + read = ret; } else if (ret != -EFAULT) { if (ret != -EAGAIN) return ret; @@ -980,7 +980,7 @@ retry_under_glock: ret = generic_file_read_iter(iocb, to); pagefault_enable(); if (ret > 0) - written += ret; + read += ret; if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { size_t leftover; @@ -998,7 +998,7 @@ retry_under_glock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - return written ? written : ret; + return read ? read : ret; } static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, @@ -1012,7 +1012,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, struct gfs2_holder *statfs_gh = NULL; size_t prev_count = 0, window_size = 0; size_t orig_count = iov_iter_count(from); - size_t read = 0; + size_t written = 0; ssize_t ret; /* @@ -1050,13 +1050,13 @@ retry_under_glock: current->backing_dev_info = NULL; if (ret > 0) { iocb->ki_pos += ret; - read += ret; + written += ret; } if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); - from->count = orig_count - read; + from->count = orig_count - written; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { size_t leftover; @@ -1077,8 +1077,8 @@ out_uninit: gfs2_holder_uninit(gh); if (statfs_gh) kfree(statfs_gh); - from->count = orig_count - read; - return read ? read : ret; + from->count = orig_count - written; + return written ? written : ret; } /** -- cgit v1.2.3 From 6d22ff471070e21e24667be70764ee5abdfe5608 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 5 May 2022 12:37:49 +0200 Subject: gfs2: Clean up use of fault_in_iov_iter_{read,write}able No need to store the return value of the fault_in functions in separate variables. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4d36c01727ad..acc0c1d41564 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -846,12 +846,10 @@ retry_under_glock: read = ret; if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_writeable(to, window_size); + window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { + if (window_size) { if (gfs2_holder_queued(gh)) goto retry_under_glock; goto retry; @@ -915,12 +913,10 @@ retry_under_glock: written = ret; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); + window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { + if (window_size) { if (gfs2_holder_queued(gh)) goto retry_under_glock; goto retry; @@ -983,12 +979,10 @@ retry_under_glock: read += ret; if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { - size_t leftover; - gfs2_holder_allow_demote(&gh); - leftover = fault_in_iov_iter_writeable(to, window_size); + window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(&gh); - if (leftover != window_size) { + if (window_size) { if (gfs2_holder_queued(&gh)) goto retry_under_glock; goto retry; @@ -1058,13 +1052,11 @@ retry_under_glock: from->count = orig_count - written; if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { - size_t leftover; - gfs2_holder_allow_demote(gh); - leftover = fault_in_iov_iter_readable(from, window_size); + window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); - if (leftover != window_size) { - from->count = min(from->count, window_size - leftover); + if (window_size) { + from->count = min(from->count, window_size); if (gfs2_holder_queued(gh)) goto retry_under_glock; goto retry; -- cgit v1.2.3 From 72382264502d9348ead372f82ecc3044de5c82d2 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 5 May 2022 12:53:26 +0200 Subject: gfs2: Pull return value test out of should_fault_in_pages Pull the return value test of the previous read or write operation out of should_fault_in_pages(). In a following patch, we'll fault in pages before the I/O and there will be no return value to check. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index acc0c1d41564..ea87bef7314d 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -770,7 +770,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret ? ret : ret1; } -static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, +static inline bool should_fault_in_pages(struct iov_iter *i, size_t *prev_count, size_t *window_size) { @@ -779,8 +779,6 @@ static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, if (likely(!count)) return false; - if (ret <= 0 && ret != -EFAULT) - return false; if (!iter_is_iovec(i)) return false; @@ -842,10 +840,12 @@ retry_under_glock: IOMAP_DIO_PARTIAL, read); to->nofault = false; pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; if (ret > 0) read = ret; - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { + if (should_fault_in_pages(to, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(gh); @@ -855,6 +855,7 @@ retry_under_glock: goto retry; } } +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: @@ -899,20 +900,23 @@ retry: goto out_uninit; /* Silently fall back to buffered I/O when writing beyond EOF */ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) - goto out; + goto out_unlock; retry_under_glock: from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, IOMAP_DIO_PARTIAL, written); from->nofault = false; - - if (ret == -ENOTBLK) - ret = 0; + if (ret <= 0) { + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EFAULT) + goto out_unlock; + } if (ret > 0) written = ret; - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { + if (should_fault_in_pages(from, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); @@ -922,7 +926,7 @@ retry_under_glock: goto retry; } } -out: +out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); out_uninit: @@ -975,10 +979,12 @@ retry_under_glock: pagefault_disable(); ret = generic_file_read_iter(iocb, to); pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; if (ret > 0) read += ret; - if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { + if (should_fault_in_pages(to, &prev_count, &window_size)) { gfs2_holder_allow_demote(&gh); window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(&gh); @@ -988,6 +994,7 @@ retry_under_glock: goto retry; } } +out_unlock: if (gfs2_holder_queued(&gh)) gfs2_glock_dq(&gh); out_uninit: @@ -1050,8 +1057,11 @@ retry_under_glock: if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; + from->count = orig_count - written; - if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { + if (should_fault_in_pages(from, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); -- cgit v1.2.3 From 324d116c5a5c8204dc00e63f725a3c5ed09afb53 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 5 May 2022 13:32:23 +0200 Subject: gfs2: Align read and write chunks to the page cache Align the chunks that reads and writes are carried out in to the page cache rather than the user buffers. This will be more efficient in general, especially for allocating writes. Optimizing the case that the user buffer is gfs2 backed isn't very useful; we only need to make sure we won't deadlock. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index ea87bef7314d..11c46407d4a8 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -771,6 +771,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, } static inline bool should_fault_in_pages(struct iov_iter *i, + struct kiocb *iocb, size_t *prev_count, size_t *window_size) { @@ -783,15 +784,13 @@ static inline bool should_fault_in_pages(struct iov_iter *i, return false; size = PAGE_SIZE; - offs = offset_in_page(i->iov[0].iov_base + i->iov_offset); + offs = offset_in_page(iocb->ki_pos); if (*prev_count != count || !*window_size) { size_t nr_dirtied; - size = ALIGN(offs + count, PAGE_SIZE); - size = min_t(size_t, size, SZ_1M); nr_dirtied = max(current->nr_dirtied_pause - current->nr_dirtied, 8); - size = min(size, nr_dirtied << PAGE_SHIFT); + size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT); } *prev_count = count; @@ -845,7 +844,7 @@ retry_under_glock: if (ret > 0) read = ret; - if (should_fault_in_pages(to, &prev_count, &window_size)) { + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(gh); @@ -916,7 +915,7 @@ retry_under_glock: if (ret > 0) written = ret; - if (should_fault_in_pages(from, &prev_count, &window_size)) { + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); @@ -984,7 +983,7 @@ retry_under_glock: if (ret > 0) read += ret; - if (should_fault_in_pages(to, &prev_count, &window_size)) { + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { gfs2_holder_allow_demote(&gh); window_size -= fault_in_iov_iter_writeable(to, window_size); gfs2_holder_disallow_demote(&gh); @@ -1061,7 +1060,7 @@ retry_under_glock: goto out_unlock; from->count = orig_count - written; - if (should_fault_in_pages(from, &prev_count, &window_size)) { + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); gfs2_holder_disallow_demote(gh); -- cgit v1.2.3 From fa5dfa645d85910d747f4e0c97f19e5e97d1c270 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 4 May 2022 23:37:30 +0200 Subject: gfs2: buffered write prefaulting In gfs2_file_buffered_write, to increase the likelihood that all the user memory we're trying to write will be resident in memory, carry out the write in chunks and fault in each chunk of user memory before trying to write it. Otherwise, some workloads will trigger frequent short "internal" writes, causing filesystem blocks to be allocated and then partially deallocated again when writing into holes, which is wasteful and breaks reservations. Neither the chunked writes nor any of the short "internal" writes are user visible. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 11c46407d4a8..5eda1bcc85e3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -778,7 +778,7 @@ static inline bool should_fault_in_pages(struct iov_iter *i, size_t count = iov_iter_count(i); size_t size, offs; - if (likely(!count)) + if (!count) return false; if (!iter_is_iovec(i)) return false; @@ -1033,7 +1033,20 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { retry_under_glock: + gfs2_holder_allow_demote(gh); + window_size -= fault_in_iov_iter_readable(from, window_size); + gfs2_holder_disallow_demote(gh); + if (!window_size) { + ret = -EFAULT; + goto out_unlock; + } + if (!gfs2_holder_queued(gh)) + goto retry; + from->count = min(from->count, window_size); + } + if (inode == sdp->sd_rindex) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -1060,17 +1073,8 @@ retry_under_glock: goto out_unlock; from->count = orig_count - written; - if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { - gfs2_holder_allow_demote(gh); - window_size -= fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (window_size) { - from->count = min(from->count, window_size); - if (gfs2_holder_queued(gh)) - goto retry_under_glock; - goto retry; - } - } + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) + goto retry_under_glock; out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); -- cgit v1.2.3 From e1fa9ea85ce89207d2ac0316da35a4a7736801f9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 11 May 2022 18:27:12 +0200 Subject: gfs2: Stop using glock holder auto-demotion for now We're having unresolved issues with the glock holder auto-demotion mechanism introduced in commit dc732906c245. This mechanism was assumed to be essential for avoiding frequent short reads and writes until commit 296abc0d91d8 ("gfs2: No short reads or writes upon glock contention"). Since then, when the inode glock is lost, it is simply re-acquired and the operation is resumed. This means that apart from the performance penalty, we might as well drop the inode glock before faulting in pages, and re-acquire it afterwards. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/file.c | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 5eda1bcc85e3..2556ae1f92ea 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -832,7 +832,6 @@ retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); to->nofault = true; ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, @@ -845,14 +844,10 @@ retry_under_glock: read = ret; if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { - gfs2_holder_allow_demote(gh); + gfs2_glock_dq(gh); window_size -= fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(gh); - if (window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (window_size) goto retry; - } } out_unlock: if (gfs2_holder_queued(gh)) @@ -900,7 +895,6 @@ retry: /* Silently fall back to buffered I/O when writing beyond EOF */ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) goto out_unlock; -retry_under_glock: from->nofault = true; ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, @@ -916,14 +910,10 @@ retry_under_glock: written = ret; if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { - gfs2_holder_allow_demote(gh); + gfs2_glock_dq(gh); window_size -= fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); - if (window_size) { - if (gfs2_holder_queued(gh)) - goto retry_under_glock; + if (window_size) goto retry; - } } out_unlock: if (gfs2_holder_queued(gh)) @@ -974,7 +964,6 @@ retry: ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; -retry_under_glock: pagefault_disable(); ret = generic_file_read_iter(iocb, to); pagefault_enable(); @@ -984,14 +973,10 @@ retry_under_glock: read += ret; if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { - gfs2_holder_allow_demote(&gh); + gfs2_glock_dq(&gh); window_size -= fault_in_iov_iter_writeable(to, window_size); - gfs2_holder_disallow_demote(&gh); - if (window_size) { - if (gfs2_holder_queued(&gh)) - goto retry_under_glock; + if (window_size) goto retry; - } } out_unlock: if (gfs2_holder_queued(&gh)) @@ -1030,22 +1015,17 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); retry: - ret = gfs2_glock_nq(gh); - if (ret) - goto out_uninit; if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { -retry_under_glock: - gfs2_holder_allow_demote(gh); window_size -= fault_in_iov_iter_readable(from, window_size); - gfs2_holder_disallow_demote(gh); if (!window_size) { ret = -EFAULT; - goto out_unlock; + goto out_uninit; } - if (!gfs2_holder_queued(gh)) - goto retry; from->count = min(from->count, window_size); } + ret = gfs2_glock_nq(gh); + if (ret) + goto out_uninit; if (inode == sdp->sd_rindex) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -1073,8 +1053,10 @@ retry_under_glock: goto out_unlock; from->count = orig_count - written; - if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) - goto retry_under_glock; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + goto retry; + } out_unlock: if (gfs2_holder_queued(gh)) gfs2_glock_dq(gh); -- cgit v1.2.3 From aa184e8671f0f911fc2fb3f68cd506e4d7838faa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 May 2022 12:32:05 -0600 Subject: io_uring: don't attempt to IOPOLL for MSG_RING requests We gate whether to IOPOLL for a request on whether the opcode is allowed on a ring setup for IOPOLL and if it's got a file assigned. MSG_RING is the only one that allows a file yet isn't pollable, it's merely supported to allow communication on an IOPOLL ring, not because we can poll for completion of it. Put the assigned file early and clear it, so we don't attempt to poll for it. Reported-by: syzbot+1a0a53300ce782f8b3ad@syzkaller.appspotmail.com Fixes: 3f1d52abf098 ("io_uring: defer msg-ring file validity check until command issue") Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/io_uring.c b/fs/io_uring.c index 91de361ea9ab..e0823f58f795 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4481,6 +4481,9 @@ done: if (ret < 0) req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); + /* put file to avoid an attempt to IOPOLL the req */ + io_put_file(req->file); + req->file = NULL; return 0; } -- cgit v1.2.3