summaryrefslogtreecommitdiffstats
path: root/fs/gfs2/file.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2021-11-02 12:25:03 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2021-11-02 12:25:03 -0700
commit: c03098d4b9ad76bca2966a8769dcfe59f7f85103 (patch)
tree: e7e2a6a0a84ad29baa14c018e3d4dcb12bd08fd6 /fs/gfs2/file.c
parent: ab2e7f4b46bf8fccf088ec496b3bb26b43e91340 (diff)
parent: b01b2d72da25c000aeb124bc78daf3fb998be2b6 (diff)
download: linux-c03098d4b9ad76bca2966a8769dcfe59f7f85103.tar.bz2
Merge tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2
Pull gfs2 mmap + page fault deadlocks fixes from Andreas Gruenbacher:

"Functions gfs2_file_read_iter and gfs2_file_write_iter are both accessing the user buffer to write to or read from while holding the inode glock. In the most basic deadlock scenario, that buffer will not be resident and it will be mapped to the same file. Accessing the buffer will trigger a page fault, and gfs2 will deadlock trying to take the same inode glock again while trying to handle that fault.

Fix that and similar, more complex scenarios by disabling page faults while accessing user buffers. To make this work, introduce a small amount of new infrastructure and fix some bugs that didn't trigger so far, with page faults enabled"

* tag 'gfs2-v5.15-rc5-mmap-fault' of git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2:
  gfs2: Fix mmap + page fault deadlocks for direct I/O
  iov_iter: Introduce nofault flag to disable page faults
  gup: Introduce FOLL_NOFAULT flag to disable page faults
  iomap: Add done_before argument to iomap_dio_rw
  iomap: Support partial direct I/O on user copy failures
  iomap: Fix iomap_dio_rw return value for user copies
  gfs2: Fix mmap + page fault deadlocks for buffered I/O
  gfs2: Eliminate ip->i_gh
  gfs2: Move the inode glock locking to gfs2_file_buffered_write
  gfs2: Introduce flag for glock holder auto-demotion
  gfs2: Clean up function may_grant
  gfs2: Add wrapper for iomap_file_buffered_write
  iov_iter: Introduce fault_in_iov_iter_writeable
  iov_iter: Turn iov_iter_fault_in_readable into fault_in_iov_iter_readable
  gup: Turn fault_in_pages_{readable,writeable} into fault_in_{readable,writeable}
  powerpc/kvm: Fix kvm_use_magic_page
  iov_iter: Fix iov_iter_get_pages{,_alloc} page fault return value
Diffstat (limited to 'fs/gfs2/file.c')
-rw-r--r--fs/gfs2/file.c252
1 files changed, 229 insertions, 23 deletions
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 5436a688157a..8f4627a19359 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -776,27 +776,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
return ret ? ret : ret1;
}
+/*
+ * should_fault_in_pages - decide whether to fault in user pages and retry
+ * @ret: result of the preceding I/O attempt
+ * @i: iterator describing the remaining user buffer
+ * @prev_count: in/out; bytes that were left in @i at the previous call
+ * @window_size: in/out; number of bytes the caller should fault in
+ *
+ * Returns false when nothing is left to transfer, when @ret is zero or an
+ * error other than -EFAULT, or when @i is not an iovec iterator.  Otherwise
+ * stores the current byte count in @prev_count, sets @window_size and
+ * returns true.
+ *
+ * The window is widened beyond a single page only when the remaining count
+ * changed since the previous call or when no window has been set yet; it is
+ * then capped at BIO_MAX_VECS pages and at the difference between the
+ * task's nr_dirtied_pause and nr_dirtied (at least one page).  Otherwise a
+ * single page is requested.  In both cases the window is shortened by the
+ * buffer's offset within its first page.
+ */
+static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
+					 size_t *prev_count,
+					 size_t *window_size)
+{
+	size_t count = iov_iter_count(i);
+	char __user *p;
+	int pages = 1;
+
+	if (likely(!count))
+		return false;
+	if (ret <= 0 && ret != -EFAULT)
+		return false;
+	if (!iter_is_iovec(i))
+		return false;
+
+	/* Only look at i->iov once we know this is an iovec iterator. */
+	p = i->iov[0].iov_base + i->iov_offset;
+
+	if (*prev_count != count || !*window_size) {
+		int nr_dirtied;
+
+		/*
+		 * Assign to the function-scope 'pages'.  Redeclaring it in
+		 * this block would shadow it and leave the window stuck at
+		 * a single page.
+		 */
+		pages = min_t(int, BIO_MAX_VECS,
+			      DIV_ROUND_UP(count, PAGE_SIZE));
+		nr_dirtied = max(current->nr_dirtied_pause -
+				 current->nr_dirtied, 1);
+		pages = min(pages, nr_dirtied);
+	}
+
+	*prev_count = count;
+	*window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
+	return true;
+}
+
static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
struct gfs2_holder *gh)
{
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
- size_t count = iov_iter_count(to);
+ size_t prev_count = 0, window_size = 0;
+ size_t written = 0;
ssize_t ret;
- if (!count)
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ *
+ * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
+ * physical as well as manual page faults, and we need to disable both
+ * kinds.
+ *
+ * For direct I/O, gfs2 takes the inode glock in deferred mode. This
+ * locking mode is compatible with other deferred holders, so multiple
+ * processes and nodes can do direct I/O to a file at the same time.
+ * There's no guarantee that reads or writes will be atomic. Any
+ * coordination among readers and writers needs to happen externally.
+ *
+ * 'written' carries the progress made so far: it is handed to
+ * iomap_dio_rw as its last argument on every pass, and it is what
+ * this function returns unless the final pass left a negative error
+ * in 'ret', in which case that error is returned instead.
+ */
+
+ if (!iov_iter_count(to))
return 0; /* skip atime */
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry: /* (re)acquire the inode glock before issuing the I/O */
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
+retry_under_glock: /* holder is still queued: repeat the I/O without re-acquiring */
+ pagefault_disable();
+ to->nofault = true;
+ ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
+ IOMAP_DIO_PARTIAL, written);
+ to->nofault = false;
+ pagefault_enable();
+ if (ret > 0)
+ written = ret; /* assigned, not added: presumably ret already includes the count passed in above -- confirm */
+
+ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+ size_t leftover;
- ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
- gfs2_glock_dq(gh);
+ gfs2_holder_allow_demote(gh); /* the glock may be dropped while pages are faulted in */
+ leftover = fault_in_iov_iter_writeable(to, window_size);
+ gfs2_holder_disallow_demote(gh);
+ if (leftover != window_size) { /* presumably: at least part of the window was faulted in */
+ if (!gfs2_holder_queued(gh)) /* holder was dequeued in the meantime */
+ goto retry;
+ goto retry_under_glock;
+ }
+ }
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- return ret;
+ if (ret < 0)
+ return ret;
+ return written;
}
static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
@@ -805,11 +877,21 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- size_t len = iov_iter_count(from);
- loff_t offset = iocb->ki_pos;
+ size_t prev_count = 0, window_size = 0;
+ size_t read = 0;
ssize_t ret;
/*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ *
+ * For writes, iomap_dio_rw only triggers manual page faults, so we
+ * don't need to disable physical ones.
+ */
+
+ /*
* Deferred lock, even if its a write, since we do no allocation on
* this path. All we need to change is the atime, and this lock mode
* ensures that other nodes have flushed their buffered read caches
@@ -818,31 +900,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
* VFS does.
*/
gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
+retry:
ret = gfs2_glock_nq(gh);
if (ret)
goto out_uninit;
-
+retry_under_glock:
/* Silently fall back to buffered I/O when writing beyond EOF */
- if (offset + len > i_size_read(&ip->i_inode))
+ if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
goto out;
- ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
+ from->nofault = true;
+ ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
+ IOMAP_DIO_PARTIAL, read);
+ from->nofault = false;
+
if (ret == -ENOTBLK)
ret = 0;
+ if (ret > 0)
+ read = ret;
+
+ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+ size_t leftover;
+
+ gfs2_holder_allow_demote(gh);
+ leftover = fault_in_iov_iter_readable(from, window_size);
+ gfs2_holder_disallow_demote(gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(gh))
+ goto retry;
+ goto retry_under_glock;
+ }
+ }
out:
- gfs2_glock_dq(gh);
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
out_uninit:
gfs2_holder_uninit(gh);
- return ret;
+ if (ret < 0)
+ return ret;
+ return read;
}
static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct gfs2_inode *ip;
struct gfs2_holder gh;
+ size_t prev_count = 0, window_size = 0;
size_t written = 0;
ssize_t ret;
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ */
+
if (iocb->ki_flags & IOCB_DIRECT) {
ret = gfs2_file_direct_read(iocb, to, &gh);
if (likely(ret != -ENOTBLK))
@@ -864,18 +977,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
ip = GFS2_I(iocb->ki_filp->f_mapping->host);
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+retry:
ret = gfs2_glock_nq(&gh);
if (ret)
goto out_uninit;
+retry_under_glock:
+ pagefault_disable();
ret = generic_file_read_iter(iocb, to);
+ pagefault_enable();
if (ret > 0)
written += ret;
- gfs2_glock_dq(&gh);
+
+ if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
+ size_t leftover;
+
+ gfs2_holder_allow_demote(&gh);
+ leftover = fault_in_iov_iter_writeable(to, window_size);
+ gfs2_holder_disallow_demote(&gh);
+ if (leftover != window_size) {
+ if (!gfs2_holder_queued(&gh)) {
+ if (written)
+ goto out_uninit;
+ goto retry;
+ }
+ goto retry_under_glock;
+ }
+ }
+ if (gfs2_holder_queued(&gh))
+ gfs2_glock_dq(&gh);
out_uninit:
gfs2_holder_uninit(&gh);
return written ? written : ret;
}
+static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from,
+ struct gfs2_holder *gh)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_holder *statfs_gh = NULL;
+ size_t prev_count = 0, window_size = 0;
+ size_t read = 0;
+ ssize_t ret;
+
+ /*
+ * In this function, we disable page faults when we're holding the
+ * inode glock while doing I/O. If a page fault occurs, we indicate
+ * that the inode glock may be dropped, fault in the pages manually,
+ * and retry.
+ *
+ * The inode glock is taken in exclusive mode through 'gh'. When the
+ * target inode is sdp->sd_rindex, the glock of sdp->sd_statfs_inode
+ * is additionally held (exclusive, GL_NOCACHE) around each write
+ * attempt, using a separately allocated holder.
+ *
+ * Returns 'read' (the bytes consumed from 'from' so far) when it is
+ * non-zero; otherwise returns 'ret' from the last step that ran.
+ * Returns -ENOMEM if the statfs holder cannot be allocated.
+ */
+
+ if (inode == sdp->sd_rindex) {
+ statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
+ if (!statfs_gh)
+ return -ENOMEM;
+ }
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
+retry: /* (re)acquire the inode glock before writing */
+ ret = gfs2_glock_nq(gh);
+ if (ret)
+ goto out_uninit;
+retry_under_glock: /* holder is still queued: repeat the write without re-acquiring */
+ if (inode == sdp->sd_rindex) {
+ struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+ ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+ GL_NOCACHE, statfs_gh);
+ if (ret)
+ goto out_unlock;
+ }
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ pagefault_disable();
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ pagefault_enable();
+ current->backing_dev_info = NULL;
+ if (ret > 0) {
+ iocb->ki_pos += ret; /* the file position is advanced here, per attempt */
+ read += ret;
+ }
+
+ if (inode == sdp->sd_rindex) /* drop the statfs glock taken for this attempt */
+ gfs2_glock_dq_uninit(statfs_gh);
+
+ if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
+ size_t leftover;
+
+ gfs2_holder_allow_demote(gh); /* the glock may be dropped while pages are faulted in */
+ leftover = fault_in_iov_iter_readable(from, window_size);
+ gfs2_holder_disallow_demote(gh);
+ if (leftover != window_size) { /* presumably: at least part of the window was faulted in */
+ if (!gfs2_holder_queued(gh)) {
+ if (read) /* holder was dequeued after partial progress: return the short count */
+ goto out_uninit;
+ goto retry;
+ }
+ goto retry_under_glock;
+ }
+ }
+out_unlock:
+ if (gfs2_holder_queued(gh))
+ gfs2_glock_dq(gh);
+out_uninit:
+ gfs2_holder_uninit(gh);
+ if (statfs_gh) /* NOTE(review): kfree(NULL) is a no-op, so this guard is redundant */
+ kfree(statfs_gh);
+ return read ? read : ret;
+}
+
/**
* gfs2_file_write_iter - Perform a write to a file
* @iocb: The io context
@@ -927,9 +1140,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out_unlock;
iocb->ki_flags |= IOCB_DSYNC;
- current->backing_dev_info = inode_to_bdi(inode);
- buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- current->backing_dev_info = NULL;
+ buffered = gfs2_file_buffered_write(iocb, from, &gh);
if (unlikely(buffered <= 0)) {
if (!ret)
ret = buffered;
@@ -943,7 +1154,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* the direct I/O range as we don't know if the buffered pages
* made it to disk.
*/
- iocb->ki_pos += buffered;
ret2 = generic_write_sync(iocb, buffered);
invalidate_mapping_pages(mapping,
(iocb->ki_pos - buffered) >> PAGE_SHIFT,
@@ -951,13 +1161,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!ret || ret2 > 0)
ret += ret2;
} else {
- current->backing_dev_info = inode_to_bdi(inode);
- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- current->backing_dev_info = NULL;
- if (likely(ret > 0)) {
- iocb->ki_pos += ret;
+ ret = gfs2_file_buffered_write(iocb, from, &gh);
+ if (likely(ret > 0))
ret = generic_write_sync(iocb, ret);
- }
}
out_unlock: