summaryrefslogtreecommitdiffstats
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-03-31 13:00:16 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-03-31 13:00:16 -0700
commit15c981d16d70e8a5be297fa4af07a64ab7e080ed (patch)
tree9487ba1525d75501cd1a3896dac4e6321efd3a55 /fs/btrfs/file.c
parent1455c69900c8c6442b182a74087931f4ffb1cac4 (diff)
parent6ff06729c22ec0b7498d900d79cc88cfb8aceaeb (diff)
downloadlinux-15c981d16d70e8a5be297fa4af07a64ab7e080ed.tar.bz2
Merge tag 'for-5.7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba: "A number of core changes that make things work better in general, code is simpler and cleaner. Core changes: - per-inode file extent tree, for in memory tracking of contiguous extent ranges to make sure i_size adjustments are accurate - tree root structures are protected by reference counts, replacing SRCU that did not cover some cases - leak detector for tree root structures - per-transaction pinned extent tracking - buffer heads are replaced by bios for super block access - speedup of extent back reference resolution, on an example test scenario the runtime of send went down from a hour to minutes - factor out locking scheme used for subvolume writer and NOCOW exclusion, abstracted as DREW lock, double reader-writer exclusion (allow either readers or writers) - cleanup and abstract extent allocation policies, preparation for zoned device support - make reflink/clone_range work on inline extents - add more cancellation point for relocation, improves long response from 'balance cancel' - add page migration callback for data pages - switch to guid for uuids, with additional cleanups of the interface - make ranged full fsyncs more efficient - removal of obsolete ioctl flag BTRFS_SUBVOL_CREATE_ASYNC - remove b-tree readahead from delayed refs paths, avoiding seek and read unnecessary blocks Features: - v2 of ioctl to delete subvolumes, allowing to delete by id and more future extensions Fixes: - fix qgroup rescan worker that could block umount - fix crash during unmount due to race with delayed inode workers - fix dellaloc flushing logic that could create unnecessary chunks under heavy load - fix missing file extent item for hole after ranged fsync - several fixes in relocation error handling Other: - more documentation of relocation, device replace, space reservations - many random cleanups" * tag 'for-5.7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (210 commits) btrfs: fix missing semaphore unlock in btrfs_sync_file btrfs: use nofs allocations for running delayed items btrfs: sysfs: Use scnprintf() instead of snprintf() btrfs: do not resolve backrefs for roots that are being deleted btrfs: track reloc roots based on their commit root bytenr btrfs: restart relocate_tree_blocks properly btrfs: reloc: reorder reservation before root selection btrfs: do not readahead in build_backref_tree btrfs: do not use readahead for running delayed refs btrfs: Remove async_transid from btrfs_mksubvol/create_subvol/create_snapshot btrfs: Remove transid argument from btrfs_ioctl_snap_create_transid btrfs: Remove BTRFS_SUBVOL_CREATE_ASYNC support btrfs: kill the subvol_srcu btrfs: make btrfs_cleanup_fs_roots use the radix tree lock btrfs: don't take an extra root ref at allocation time btrfs: hold a ref on the root on the dead roots list btrfs: make inodes hold a ref on their roots btrfs: move the root freeing stuff into btrfs_put_root btrfs: move ino_cache_inode dropping out of btrfs_free_fs_root btrfs: make the extent buffer leak check per fs info ...
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c80
1 files changed, 53 insertions, 27 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a16da274c9aa..8a144f9cb7ac 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -27,6 +27,7 @@
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
+#include "reflink.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -277,7 +278,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct btrfs_key key;
struct btrfs_ioctl_defrag_range_args range;
int num_defrag;
- int index;
int ret;
/* get the inode */
@@ -285,9 +285,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
- index = srcu_read_lock(&fs_info->subvol_srcu);
-
- inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ inode_root = btrfs_get_fs_root(fs_info, &key, true);
if (IS_ERR(inode_root)) {
ret = PTR_ERR(inode_root);
goto cleanup;
@@ -297,11 +295,11 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, inode_root);
+ btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
}
- srcu_read_unlock(&fs_info->subvol_srcu, index);
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@@ -337,7 +335,6 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
iput(inode);
return 0;
cleanup:
- srcu_read_unlock(&fs_info->subvol_srcu, index);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
}
@@ -1552,15 +1549,14 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
u64 num_bytes;
int ret;
- ret = btrfs_start_write_no_snapshotting(root);
- if (!ret)
+ if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
- btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart,
+ btrfs_lock_and_flush_ordered_range(inode, lockstart,
lockend, NULL);
num_bytes = lockend - lockstart + 1;
@@ -1568,7 +1564,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
@@ -1674,7 +1670,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
data_reserved, pos,
write_bytes);
else
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
break;
}
@@ -1778,7 +1774,7 @@ again:
release_bytes = 0;
if (only_release_metadata)
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
if (only_release_metadata && copied > 0) {
lockstart = round_down(pos,
@@ -1807,7 +1803,7 @@ again:
if (release_bytes) {
if (only_release_metadata) {
- btrfs_end_write_no_snapshotting(root);
+ btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
@@ -2071,6 +2067,16 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_init_log_ctx(&ctx, inode);
/*
+ * Set the range to full if the NO_HOLES feature is not enabled.
+ * This is to avoid missing file extent items representing holes after
+ * replaying the log.
+ */
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ start = 0;
+ end = LLONG_MAX;
+ }
+
+ /*
* We write the dirty pages in the range and wait until they complete
* out of the ->i_mutex. If so, we can flush the dirty pages by
* multi-task, and make the performance up. See
@@ -2092,19 +2098,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges, and assertion failures during
- * hole detection. Do this while holding the inode lock, to avoid races
- * with other tasks.
- */
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
-
- /*
* Before we acquired the inode's lock, someone may have dirtied more
* pages in the target range. We need to make sure that writeback for
* any such pages does not start while we are logging the inode, because
@@ -2124,6 +2117,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
@@ -2486,6 +2480,11 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
+ ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
+ clone_info->file_offset, clone_len);
+ if (ret)
+ return ret;
+
/* If it's a hole, nothing more needs to be done. */
if (clone_info->disk_offset == 0)
return 0;
@@ -2596,6 +2595,24 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
+ } else if (!clone_info && cur_offset < drop_end) {
+ /*
+ * We are past the i_size here, but since we didn't
+ * insert holes we need to clear the mapped area so we
+ * know to not set disk_i_size in this area until a new
+ * file extent is inserted here.
+ */
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ cur_offset, drop_end - cur_offset);
+ if (ret) {
+ /*
+ * We couldn't clear our area, so we could
+ * presumably adjust up and corrupt the fs, so
+ * we need to abort.
+ */
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
}
if (clone_info && drop_end > clone_info->file_offset) {
@@ -2686,6 +2703,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
+ } else if (!clone_info && cur_offset < drop_end) {
+ /* See the comment in the loop above for the reasoning here. */
+ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ cur_offset, drop_end - cur_offset);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_trans;
+ }
+
}
if (clone_info) {
ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
@@ -2935,7 +2961,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
inode->i_ctime = current_time(inode);
i_size_write(inode, end);
- btrfs_ordered_update_i_size(inode, end, NULL);
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode(trans, root, inode);
ret2 = btrfs_end_transaction(trans);