From bb737bbe48bea9854455cb61ea1dc06e92ce586c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 20 Apr 2020 17:01:34 +0200 Subject: virtiofs: schedule blocking async replies in separate worker In virtiofs (unlike in regular fuse) processing of async replies is serialized. This can result in a deadlock in rare corner cases when there's a circular dependency between the completion of two or more async replies. Such a deadlock can be reproduced with xfstests:generic/503 if TEST_DIR == SCRATCH_MNT (which is a misconfiguration): - Process A is waiting for page lock in worker thread context and blocked (virtio_fs_requests_done_work()). - Process B is holding page lock and waiting for pending writes to finish (fuse_wait_on_page_writeback()). - Write requests are waiting in virtqueue and can't complete because worker thread is blocked on page lock (process A). Fix this by creating a unique work_struct for each async reply that can block (O_DIRECT read). Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/fuse/file.c') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d67b830fb7a..d400b71b98d5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -712,6 +712,7 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, spin_unlock(&io->lock); ia->ap.args.end = fuse_aio_complete_req; + ia->ap.args.may_block = io->should_dirty; err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); if (err) fuse_aio_complete_req(fc, &ia->ap.args, err); -- cgit v1.2.3 From cf576c58b3a283333fc6e9a7c1c8e5342fa59b97 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 12 May 2020 10:29:04 +0800 Subject: fuse: invalidate inode attr in writeback cache mode Under writeback mode, inode->i_blocks is not updated, making utils du read st.blocks as 0. For example, when using virtiofs (cache=always & nondax mode) with writeback_cache enabled, writing a new file and check its disk usage with du, du reports 0 usage. # uname -r 5.6.0-rc6+ # mount -t virtiofs virtiofs /mnt/virtiofs # rm -f /mnt/virtiofs/testfile # create new file and do extend write # xfs_io -fc "pwrite 0 4k" /mnt/virtiofs/testfile wrote 4096/4096 bytes at offset 0 4 KiB, 1 ops; 0.0001 sec (28.103 MiB/sec and 7194.2446 ops/sec) # du -k /mnt/virtiofs/testfile 0 <==== disk usage is 0 # stat -c %s,%b /mnt/virtiofs/testfile 4096,0 <==== i_size is correct, but st_blocks is 0 Fix it by invalidating attr in fuse_flush(), so we get up-to-date attr from server on next getattr. Signed-off-by: Eryu Guan Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/fuse/file.c') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d400b71b98d5..262c5e20f324 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -445,8 +445,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (is_bad_inode(inode)) return -EIO; + err = 0; if (fc->no_flush) - return 0; + goto inval_attr_out; err = write_inode_now(inode, 1); if (err) @@ -475,6 +476,14 @@ static int fuse_flush(struct file *file, fl_owner_t id) fc->no_flush = 1; err = 0; } + +inval_attr_out: + /* + * In memory i_blocks is not maintained by fuse, if writeback cache is + * enabled, i_blocks from cached attr may not be accurate. + */ + if (!err && fc->writeback_cache) + fuse_invalidate_attr(inode); return err; } -- cgit v1.2.3 From 614c026e8a46636198da93ec30719f93975bb26a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2020 14:50:37 +0200 Subject: fuse: always flush dirty data on close(2) We want cached data to synced with the userspace filesystem on close(), for example to allow getting correct st_blocks value. Do this regardless of whether the userspace filesystem implements a FLUSH method or not. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/fuse/file.c') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 262c5e20f324..4aa750d08d62 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -445,10 +445,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (is_bad_inode(inode)) return -EIO; - err = 0; - if (fc->no_flush) - goto inval_attr_out; - err = write_inode_now(inode, 1); if (err) return err; @@ -461,6 +457,10 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; + err = 0; + if (fc->no_flush) + goto inval_attr_out; + memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); -- cgit v1.2.3 From 6b2fb79963fbed7db3ef850926d913518fd5c62f Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 19 Sep 2019 17:11:20 +0300 Subject: fuse: optimize writepages search Re-work fi->writepages, replacing list with rb-tree. This improves performance because kernel fuse iterates through fi->writepages for each writeback page and typical number of entries is about 800 (for 100MB of fuse writeback). Before patch: 10240+0 records in 10240+0 records out 10737418240 bytes (11 GB) copied, 41.3473 s, 260 MB/s 2 1 0 57445400 40416 6323676 0 0 33 374743 8633 19210 1 8 88 3 0 29.86% [kernel] [k] _raw_spin_lock 26.62% [fuse] [k] fuse_page_is_writeback After patch: 10240+0 records in 10240+0 records out 10737418240 bytes (11 GB) copied, 21.4954 s, 500 MB/s 2 9 0 53676040 31744 10265984 0 0 64 854790 10956 48387 1 6 88 6 0 23.55% [kernel] [k] copy_user_enhanced_fast_string 9.87% [kernel] [k] __memcpy 3.10% [kernel] [k] _raw_spin_lock Signed-off-by: Maxim Patlasov Signed-off-by: Vasily Averin Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 62 ++++++++++++++++++++++++++++++++++++++++++++------------ fs/fuse/fuse_i.h | 2 +- 2 files changed, 50 insertions(+), 14 deletions(-) (limited to 'fs/fuse/file.c') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4aa750d08d62..15812a8b3834 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -357,7 +357,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct list_head writepages_entry; + struct rb_node writepages_entry; struct list_head queue_entry; struct fuse_writepage_args *next; struct inode *inode; @@ -366,17 +366,23 @@ struct fuse_writepage_args { static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_writepage_args *wpa; + struct rb_node *n; + + n = fi->writepages.rb_node; - list_for_each_entry(wpa, &fi->writepages, writepages_entry) { + while (n) { + struct fuse_writepage_args *wpa; pgoff_t curr_index; + wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + wpa->ia.ap.num_pages && - curr_index <= idx_to) { + if (idx_from >= curr_index + wpa->ia.ap.num_pages) + n = n->rb_right; + else if (idx_to < curr_index) + n = n->rb_left; + else return wpa; - } } return NULL; } @@ -1624,7 +1630,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&wpa->writepages_entry); + rb_erase(&wpa->writepages_entry, &fi->writepages); for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); @@ -1712,6 +1718,36 @@ __acquires(fi->lock) } } +static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +{ + pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + + WARN_ON(!wpa->ia.ap.num_pages); + while (*p) { + struct fuse_writepage_args *curr; + pgoff_t curr_index; + + parent = *p; + curr = rb_entry(parent, struct fuse_writepage_args, + writepages_entry); + WARN_ON(curr->inode != wpa->inode); + curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; + + if (idx_from >= curr_index + curr->ia.ap.num_pages) + p = &(*p)->rb_right; + else if (idx_to < curr_index) + p = &(*p)->rb_left; + else + return (void) WARN_ON(true); + } + + rb_link_node(&wpa->writepages_entry, parent, p); + rb_insert_color(&wpa->writepages_entry, root); +} + static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, int error) { @@ -1730,7 +1766,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, wpa->next = next->next; next->next = NULL; next->ia.ff = fuse_file_get(wpa->ia.ff); - list_add(&next->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, next); /* * Skip fuse_flush_writepages() to make it easy to crop requests @@ -1865,7 +1901,7 @@ static int fuse_writepage_locked(struct page *page) inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1977,10 +2013,10 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, WARN_ON(new_ap->num_pages != 0); spin_lock(&fi->lock); - list_del(&new_wpa->writepages_entry); + rb_erase(&new_wpa->writepages_entry, &fi->writepages); old_wpa = fuse_find_writeback(fi, page->index, page->index); if (!old_wpa) { - list_add(&new_wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, new_wpa); spin_unlock(&fi->lock); return false; } @@ -2095,7 +2131,7 @@ static int fuse_writepages_fill(struct page *page, wpa->inode = inode; spin_lock(&fi->lock); - list_add(&wpa->writepages_entry, &fi->writepages); + tree_insert(&fi->writepages, wpa); spin_unlock(&fi->lock); data->wpa = wpa; @@ -3405,5 +3441,5 @@ void fuse_init_file_inode(struct inode *inode) INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); - INIT_LIST_HEAD(&fi->writepages); + fi->writepages = RB_ROOT; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d7cde216fc87..740a8a7d7ae6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -111,7 +111,7 @@ struct fuse_inode { wait_queue_head_t page_waitq; /* List of writepage requestst (pending or sent) */ - struct list_head writepages; + struct rb_root writepages; }; /* readdir cache (directory only) */ -- cgit v1.2.3 From 2c4656dfd994538176db30ce09c02cc0dfc361ae Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 20 May 2020 11:39:35 +0200 Subject: fuse: fix copy_file_range cache issues a) Dirty cache needs to be written back not just in the writeback_cache case, since the dirty pages may come from memory maps. b) The fuse_writeback_range() helper takes an inclusive interval, so the end position needs to be pos+len-1 instead of pos+len. Fixes: 88bc7d5097a1 ("fuse: add support for copy_file_range()") Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'fs/fuse/file.c') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 15812a8b3834..d365008fda44 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3325,13 +3325,11 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) return -EXDEV; - if (fc->writeback_cache) { - inode_lock(inode_in); - err = fuse_writeback_range(inode_in, pos_in, pos_in + len); - inode_unlock(inode_in); - if (err) - return err; - } + inode_lock(inode_in); + err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1); + inode_unlock(inode_in); + if (err) + return err; inode_lock(inode_out); @@ -3339,11 +3337,9 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; - if (fc->writeback_cache) { - err = fuse_writeback_range(inode_out, pos_out, pos_out + len); - if (err) - goto out; - } + err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); + if (err) + goto out; if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); -- cgit v1.2.3 From 9b46418c40fe910e6537618f9932a8be78a3dd6c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 20 May 2020 11:39:35 +0200 Subject: fuse: copy_file_range should truncate cache After the copy operation completes the cache is not up-to-date. Truncate all pages in the interval that has successfully been copied. Truncating completely copied dirty pages is okay, since the data has been overwritten anyway. Truncating partially copied dirty pages is not okay; add a comment for now. Fixes: 88bc7d5097a1 ("fuse: add support for copy_file_range()") Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs/fuse/file.c') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d365008fda44..336d1cf72da0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3337,6 +3337,24 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; + /* + * Write out dirty pages in the destination file before sending the COPY + * request to userspace. After the request is completed, truncate off + * pages (including partial ones) from the cache that have been copied, + * since these contain stale data at that point. + * + * This should be mostly correct, but if the COPY writes to partial + * pages (at the start or end) and the parts not covered by the COPY are + * written through a memory map after calling fuse_writeback_range(), + * then these partial page modifications will be lost on truncation. + * + * It is unlikely that someone would rely on such mixed style + * modifications. Yet this does give less guarantees than if the + * copying was performed with write(2). + * + * To fix this a i_mmap_sem style lock could be used to prevent new + * faults while the copy is ongoing. + */ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); if (err) goto out; @@ -3360,6 +3378,10 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (err) goto out; + truncate_inode_pages_range(inode_out->i_mapping, + ALIGN_DOWN(pos_out, PAGE_SIZE), + ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + if (fc->writeback_cache) { fuse_write_update_size(inode_out, pos_out + outarg.size); file_update_time(file_out); -- cgit v1.2.3