From 78192442d383fb47ff5fa066f23c1d1bd382e526 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 15 May 2019 07:33:48 +0800 Subject: btrfs: extent-tree: Refactor add_pinned_bytes() to add|sub_pinned_bytes() Instead of using @sign to determine whether we're adding or subtracting. Even it only has 3 callers, it's still (and in fact already caused problem in the past) confusing to use. Refactor add_pinned_bytes() to add_pinned_bytes() and sub_pinned_bytes() to explicitly show what we're doing. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5faf057f6f37..caaa79a3aa48 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -756,27 +756,38 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, return NULL; } -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref, int sign) +static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) { - struct btrfs_space_info *space_info; - s64 num_bytes; - u64 flags; - - ASSERT(sign == 1 || sign == -1); - num_bytes = sign * ref->len; if (ref->type == BTRFS_REF_METADATA) { if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) - flags = BTRFS_BLOCK_GROUP_SYSTEM; + return BTRFS_BLOCK_GROUP_SYSTEM; else - flags = BTRFS_BLOCK_GROUP_METADATA; - } else { - flags = BTRFS_BLOCK_GROUP_DATA; + return BTRFS_BLOCK_GROUP_METADATA; } + return BTRFS_BLOCK_GROUP_DATA; +} + +static void add_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) +{ + struct btrfs_space_info *space_info; + u64 flags = generic_ref_to_space_flags(ref); + + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, + BTRFS_TOTAL_BYTES_PINNED_BATCH); +} + +static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) +{ + struct btrfs_space_info *space_info; + u64 flags = generic_ref_to_space_flags(ref); space_info = __find_space_info(fs_info, flags); ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, + percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, BTRFS_TOTAL_BYTES_PINNED_BATCH); } @@ -2065,7 +2076,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_ref_tree_mod(fs_info, generic_ref); if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - add_pinned_bytes(fs_info, generic_ref, -1); + sub_pinned_bytes(fs_info, generic_ref); return ret; } @@ -7191,7 +7202,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } out: if (pin) - add_pinned_bytes(fs_info, &generic_ref, 1); + add_pinned_bytes(fs_info, &generic_ref); if (last_ref) { /* @@ -7239,7 +7250,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) btrfs_ref_tree_mod(fs_info, ref); if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, ref, 1); + add_pinned_bytes(fs_info, ref); return ret; } -- cgit v1.2.3 From 9b4e675a9978800995f83af0ed90e890ca501f31 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 16 May 2019 13:39:59 +0200 Subject: btrfs: detect fast implementation of crc32c on all architectures Currently, there's only check for fast crc32c implementation on X86, based on the CPU flags. 
This is used to decide if checksumming should be offloaded to worker threads or can be calculated by the caller. As there are more architectures that implement a faster version of crc32c (ARM, SPARC, s390, MIPS, PowerPC), also there are specialized hw cards. The detection is based on driver name, all generic C implementations contain 'generic', while the specialized versions do not. Alternatively the priority could be used, but this is not currently provided by the crypto API. The flag is set per-filesystem at mount time and used for the offloading decisions. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ++++++ fs/btrfs/disk-io.c | 13 ++++--------- fs/btrfs/super.c | 2 ++ 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0a61dff27f57..02870c1bb68a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -791,6 +791,12 @@ enum { /* Indicate that the cleaner thread is awake and doing something. */ BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, }; struct btrfs_fs_info { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index deb74a8c191a..024c9fadeaeb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -40,10 +40,6 @@ #include "tree-checker.h" #include "ref-verify.h" -#ifdef CONFIG_X86 -#include -#endif - #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ BTRFS_SUPER_FLAG_ERROR |\ @@ -873,14 +869,13 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, return btree_csum_one_bio(bio); } -static int check_async_write(struct btrfs_inode *bi) +static int check_async_write(struct btrfs_fs_info *fs_info, + struct btrfs_inode *bi) { if (atomic_read(&bi->sync_writers)) return 0; -#ifdef CONFIG_X86 - if (static_cpu_has(X86_FEATURE_XMM4_2)) + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) return 0; -#endif return 1; } @@ -889,7 +884,7 @@ static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(BTRFS_I(inode)); + int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0645ec428b4f..db410c6b5f5f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1553,6 +1553,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); btrfs_sb(s)->bdev_holder = fs_type; + if (!strstr(crc32c_impl(), "generic")) + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); error = btrfs_fill_super(s, fs_devices, data); } if (!error) -- cgit v1.2.3 From 5911c8fe05c54c9f74a6467650e6493e4808cd01 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 15 May 2019 15:31:04 +0200 Subject: btrfs: fiemap: preallocate ulists for btrfs_check_shared btrfs_check_shared looks up parents of a given extent and uses ulists for that. These are allocated and freed repeatedly. Preallocation in the caller will avoid the overhead and also allow us to use the GFP_KERNEL as it is happens before the extent locks are taken. 
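An illustrative, condensed view of the new calling convention (not part of the patch; the exact hunks follow below): the caller in extent_fiemap() allocates both ulists with GFP_KERNEL while no extent locks are held yet, btrfs_check_shared() only initializes and releases them, and the caller frees them once on exit.

	struct ulist *roots = ulist_alloc(GFP_KERNEL);	/* no locks held yet */
	struct ulist *tmp_ulist = ulist_alloc(GFP_KERNEL);

	if (!roots || !tmp_ulist) {
		ret = -ENOMEM;
		goto out_free_ulist;
	}

	/* ... extent range locked, extents walked ... */
	ret = btrfs_check_shared(root, btrfs_ino(BTRFS_I(inode)), bytenr,
				 roots, tmp_ulist);
	/* ... */

out_free_ulist:
	ulist_free(roots);
	ulist_free(tmp_ulist);
	return ret;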
Reviewed-by: Nikolay Borisov Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 17 ++++++----------- fs/btrfs/backref.h | 3 ++- fs/btrfs/extent_io.c | 17 +++++++++++++++-- 3 files changed, 23 insertions(+), 14 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 982152d3f920..89116afda7a2 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1465,12 +1465,11 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) +int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + struct ulist *roots, struct ulist *tmp) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - struct ulist *tmp = NULL; - struct ulist *roots = NULL; struct ulist_iterator uiter; struct ulist_node *node; struct seq_list elem = SEQ_LIST_INIT(elem); @@ -1481,12 +1480,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) .share_count = 0, }; - tmp = ulist_alloc(GFP_NOFS); - roots = ulist_alloc(GFP_NOFS); - if (!tmp || !roots) { - ret = -ENOMEM; - goto out; - } + ulist_init(roots); + ulist_init(tmp); trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { @@ -1527,8 +1522,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) up_read(&fs_info->commit_root_sem); } out: - ulist_free(tmp); - ulist_free(roots); + ulist_release(roots); + ulist_release(tmp); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 54d58988483a..777f61dc081e 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -57,7 +57,8 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); +int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + struct ulist *roots, struct ulist *tmp_ulist); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index db337e53aab3..441f6c47cebd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4542,6 +4542,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; struct fiemap_cache cache = { 0 }; + struct ulist *roots; + struct ulist *tmp_ulist; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4555,6 +4557,13 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + roots = ulist_alloc(GFP_KERNEL); + tmp_ulist = ulist_alloc(GFP_KERNEL); + if (!roots || !tmp_ulist) { + ret = -ENOMEM; + goto out_free_ulist; + } + start = round_down(start, btrfs_inode_sectorsize(inode)); len = round_up(max, btrfs_inode_sectorsize(inode)) - start; @@ -4566,7 +4575,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, btrfs_ino(BTRFS_I(inode)), -1, 0); if (ret < 0) { btrfs_free_path(path); - return ret; + goto out_free_ulist; } else { WARN_ON(!ret); if (ret == 1) @@ -4675,7 +4684,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ ret = btrfs_check_shared(root, btrfs_ino(BTRFS_I(inode)), - bytenr); + bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; if (ret) @@ -4721,6 +4730,10 @@ out: 
btrfs_free_path(path); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + +out_free_ulist: + ulist_free(roots); + ulist_free(tmp_ulist); return ret; } -- cgit v1.2.3 From ddb93784692fd41d7f1e74cf8b441428b7e57a64 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 14 May 2019 13:54:38 +0300 Subject: btrfs: Don't opencode sync_blockdev in btrfs_init_dev_replace_tgtdev Using sync_blockdev makes it plain obvious what's happening. No functional changes. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ee0989c7e3a9..1b83787176f9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -201,7 +201,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return PTR_ERR(bdev); } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { -- cgit v1.2.3 From b0d9e1ea17fdd3c1a13e2f9dffe7d808d6c9641a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 14 May 2019 13:54:39 +0300 Subject: btrfs: Reduce critical section in btrfs_init_dev_replace_tgtdev There is no point in holding btrfs_fs_devices::device_list_mutex while initialising fields of the not-yet-published device. Instead, hold the mutex only when the newly initialised device is being published. I think holding device_list_mutex here is redundant altogether, because at this point BTRFS_FS_EXCL_OP is set which prevents device removal/addition/balance/resize to occur. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1b83787176f9..2fa826b12d40 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -237,7 +237,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } rcu_assign_pointer(device->name, name); - mutex_lock(&fs_info->fs_devices->device_list_mutex); set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; device->io_width = fs_info->sectorsize; @@ -256,6 +255,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; fs_info->fs_devices->open_devices++; -- cgit v1.2.3 From 419684b2c217d2eba6a4a4cf0548c346be7879d7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 14 May 2019 13:54:40 +0300 Subject: btrfs: dev-replace: Remove impossible WARN_ON This WARN_ON can never trigger because src_device cannot be null. btrfs_find_device_by_devspec always returns either an error or a valid pointer to the device. Just remove it. 
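As an aside (illustrative sketch, not the exact surrounding code): the caller already handles the only failure mode of the lookup, so the pointer can never be NULL by the time the removed check ran. Roughly:

	src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, srcdev_name);
	if (IS_ERR(src_device))
		return PTR_ERR(src_device);
	/* src_device is a valid device from here on; WARN_ON(!src_device) was dead code */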
Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2fa826b12d40..eb8b5cb2c40d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -447,7 +447,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, } dev_replace->cont_reading_from_srcdev_mode = read_src; - WARN_ON(!src_device); dev_replace->srcdev = src_device; dev_replace->tgtdev = tgt_device; -- cgit v1.2.3 From e1e0eb43ce1fd7bbdd9590715623cb3799896434 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 14 May 2019 13:54:41 +0300 Subject: btrfs: Ensure btrfs_init_dev_replace_tgtdev sees up to date values btrfs_init_dev_replace_tgtdev reads certain values from the source device (such as commit_total_bytes) which are updated during transaction commit. Currently this function is called before committing any pending transaction, leading to possibly reading outdated values. Fix this by moving the function below the transaction commit, at this point the EXCL_OP bit it set hence once transaction is complete the total size of the device cannot be changed (it's usually changed by resize/remove ops which are blocked). Fixes: 9e271ae27e44 ("Btrfs: kernel operation should come after user input has been verified") Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index eb8b5cb2c40d..149e6139182b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -414,11 +414,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return -ETXTBSY; } - ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, - src_device, &tgt_device); - if (ret) - return ret; - /* * Here we commit the transaction to make sure commit_total_bytes * of all the devices are updated. @@ -432,6 +427,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(trans); } + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, + src_device, &tgt_device); + if (ret) + return ret; + need_unlock = true; down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { -- cgit v1.2.3 From fa19452a4039539a5846198b9aecb8be51ce61ca Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 14 May 2019 13:54:42 +0300 Subject: btrfs: Streamline replace sem unlock in btrfs_dev_replace_start There are only 2 branches which goto leave label with need_unlock set to true. Essentially need_unlock is used as a substitute for directly calling up_write. Since the branches needing this are only 2 and their context is not that big it's more clear to just call up_write where required. No functional changes. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 149e6139182b..94f000b0ca79 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -400,7 +400,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int ret; struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - bool need_unlock; src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, srcdev_name); @@ -432,7 +431,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) return ret; - need_unlock = true; down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -443,6 +441,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: ASSERT(0); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + up_write(&dev_replace->rwsem); goto leave; } @@ -471,7 +470,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -483,12 +481,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - need_unlock = true; down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; + up_write(&dev_replace->rwsem); goto leave; } @@ -510,8 +508,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; leave: - if (need_unlock) - up_write(&dev_replace->rwsem); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } -- cgit v1.2.3 From f232ab04f65bb757f2d992a074a46df85c8f1797 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 14 May 2019 13:54:43 +0300 Subject: btrfs: Explicitly reserve space for devreplace item Part of device replace involves writing an item to the device root containing information about pending replace operations. Currently space for this item is not being explicitly reserved so this works thanks to presence of global reserve. While not fatal it's not a good practice. Let's be explicit about space requirement of device replace and reserve space when starting the transaction. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 94f000b0ca79..45c5ba7d80ed 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -477,8 +477,8 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* force writing the updated state information to disk */ - trans = btrfs_start_transaction(root, 0); + /* Commit dev_replace state and reserve 1 item for it. 
*/ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); down_write(&dev_replace->rwsem); -- cgit v1.2.3 From 2ed95d2d59b065f4db2fd42752cbe9a00969f2bd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 14 May 2019 13:54:45 +0300 Subject: btrfs: Remove redundant assignment of tgt_device->commit_total_bytes This is already done in btrfs_init_dev_replace_tgtdev which is the first phase of device replace, called before doing scrub. During that time exclusive lock is held. Additionally btrfs_fs_device::commit_total_bytes is always set based on the size of the underlying block device which shouldn't change once set. This makes the 2nd assignment of the variable in the finishing phase redundant. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 45c5ba7d80ed..b0ba5839ec08 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -674,7 +674,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_device_set_disk_total_bytes(tgt_device, src_device->disk_total_bytes); btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); - tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; btrfs_assign_next_active_device(src_device, tgt_device); -- cgit v1.2.3 From 4c094c33c9ed4b8d0d814bd1d7ff78e123d15d00 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 3 May 2019 08:30:54 +0800 Subject: btrfs: tree-checker: Check if the file extent end overflows Under certain conditions, we could have strange file extent item in log tree like: item 18 key (69599 108 397312) itemoff 15208 itemsize 53 extent data disk bytenr 0 nr 0 extent data offset 0 nr 18446744073709547520 ram 18446744073709547520 The num_bytes + ram_bytes overflow 64 bit type. For num_bytes part, we can detect such overflow along with file offset (key->offset), as file_offset + num_bytes should never go beyond u64. For ram_bytes part, it's about the decompressed size of the extent, not directly related to the size. In theory it is OK to have a large value, and put extra limitation on RAM bytes may cause unexpected false alerts. So in tree-checker, we only check if the file offset and num bytes overflow. 
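For reference, check_add_overflow() reports wraparound of the unsigned 64-bit addition. A minimal standalone illustration of the same check, using the compiler builtin the kernel helper is normally built on and the numbers from the example item above (illustrative only, not part of the patch):

	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	int main(void)
	{
		uint64_t file_offset = 397312;			/* key->offset */
		uint64_t num_bytes = 18446744073709547520ULL;	/* bogus extent num_bytes */
		uint64_t extent_end;

		/* true when file_offset + num_bytes does not fit in u64 */
		bool overflow = __builtin_add_overflow(num_bytes, file_offset, &extent_end);

		printf("overflow=%d extent_end=%llu\n", overflow,
		       (unsigned long long)extent_end);
		return 0;
	}

Running it prints overflow=1, i.e. exactly the case the new tree-checker test rejects with -EUCLEAN.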
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 96fce4bef4e7..ccd5706199d7 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -132,6 +132,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; u32 item_size = btrfs_item_size_nr(leaf, slot); + u64 extent_end; if (!IS_ALIGNED(key->offset, sectorsize)) { file_extent_err(leaf, slot, @@ -207,6 +208,16 @@ static int check_extent_data_item(struct extent_buffer *leaf, CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) return -EUCLEAN; + /* Catch extent end overflow */ + if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), + key->offset, &extent_end)) { + file_extent_err(leaf, slot, + "extent end overflow, have file offset %llu extent num bytes %llu", + key->offset, + btrfs_file_extent_num_bytes(leaf, fi)); + return -EUCLEAN; + } + /* * Check that no two consecutive file extent items, in the same leaf, * present ranges that overlap each other. -- cgit v1.2.3 From 8f63a84051e8e42aa99291197e4adae67241a705 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Wed, 28 Nov 2018 11:21:12 +0800 Subject: btrfs: switch order of unlocks of space_info and bg in do_trimming() In function do_trimming(), block_group->lock should be unlocked first, as the locks should be released in the reverse order. This does not cause problems but should follow the best practices. Reviewed-by: Nikolay Borisov Signed-off-by: Su Yue Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f74dc259307b..9a76e7671938 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3166,8 +3166,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, space_info->bytes_readonly += reserved_bytes; block_group->reserved -= reserved_bytes; space_info->bytes_reserved -= reserved_bytes; - spin_unlock(&space_info->lock); spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); } return ret; -- cgit v1.2.3 From cebf05ca65d68d0de869a74754e0a0be35056243 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 5 Oct 2018 07:26:15 -0500 Subject: btrfs: Remove unused variable mode in btrfs_mount This is a leftover from 312c89fbca06 ("btrfs: cleanup btrfs_mount() using btrfs_mount_root()"), the mode was used for opening devices that's not done here anymore. 
Reviewed-by: Nikolay Borisov Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index db410c6b5f5f..526dbae5c4cf 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1603,14 +1603,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct vfsmount *mnt_root; struct dentry *root; - fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; int error = 0; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - error = btrfs_parse_subvol_options(data, &subvol_name, &subvol_objectid); if (error) { -- cgit v1.2.3 From be9b8dfa9c478a5bef22aeaa69d3f12d1033bbe4 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 12 Sep 2018 06:06:25 +0800 Subject: Btrfs: remove unused variables in __btrfs_unlink_inode This code was first introduced in 5f39d397dfbe ("Btrfs: Create extent_buffer interface for large blocksizes") and the function was named btrfs_unlink_trans. It later got renamed to __btrfs_unlink_inode and finally commit 16cdcec736cd ("btrfs: implement delayed inode items operation") changed the way inodes are deleted and obviated the need for those two members. Signed-off-by: Liu Bo Reviewed-by: David Sterba [ replace changelog by Nikolay's version ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2aabdb85226..d324e2a06240 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3935,9 +3935,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; - struct extent_buffer *leaf; struct btrfs_dir_item *di; - struct btrfs_key key; u64 index; u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); @@ -3955,8 +3953,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = di ? PTR_ERR(di) : -ENOENT; goto err; } - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; -- cgit v1.2.3 From 0ee5f8ae082e1f675a2fb6db601c31ac9958a134 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:13 +0200 Subject: btrfs: fix minimum number of chunk errors for DUP The list of profiles in btrfs_chunk_max_errors lists DUP as a profile DUP able to tolerate 1 device missing. Though this profile is special with 2 copies, it still needs the device, unlike the others. Looking at the history of changes, thre's no clear reason why DUP is there, functions were refactored and blocks of code merged to one helper. 
d20983b40e828 Btrfs: fix writing data into the seed filesystem - factor code to a helper de11cc12df173 Btrfs: don't pre-allocate btrfs bio - unrelated change, DUP still in the list with max errors 1 a236aed14ccb0 Btrfs: Deal with failed writes in mirrored configurations - introduced the max errors, leaves DUP and RAID1 in the same group Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1c2a6e4b39da..8508f6028c8d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5328,8 +5328,7 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map) if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_DUP)) { + BTRFS_BLOCK_GROUP_RAID5)) { max_errors = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { max_errors = 2; -- cgit v1.2.3 From 49cc180ca916cc3af9ab109a0497e86ccf20d641 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:15 +0200 Subject: btrfs: raid56: allow the exact minimum number of devices for balance convert The minimum number of devices for RAID5 is 2, though this is only a bit expensive RAID1, and for RAID6 it's 3, which is a triple copy that works only 3 devices. mkfs.btrfs allows that and mounting such filesystem also works, so the conversion via balance filters is inconsistent with the others and we should not prevent it. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8508f6028c8d..10f7de0cc7e6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4080,11 +4080,12 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - if (num_devices > 2) + if (num_devices >= 2) allowed |= BTRFS_BLOCK_GROUP_RAID5; + if (num_devices >= 3) + allowed |= BTRFS_BLOCK_GROUP_RAID6; if (num_devices > 3) - allowed |= (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID6); + allowed |= BTRFS_BLOCK_GROUP_RAID10; if (validate_convert_profile(&bctl->data, allowed)) { int index = btrfs_bg_flags_to_raid_index(bctl->data.target); -- cgit v1.2.3 From c8bf1b67039556884d0532f7b06acd524c90ed87 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:17 +0200 Subject: btrfs: remove mapping tree structures indirection fs_info::mapping_tree is the physical<->logical mapping tree and uses the same underlying structure as extents, but is embedded to another structure. There are no other members and this indirection is useless. No functional change. 
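Condensed effect at the call sites (taken from the hunks that follow; shown here only for illustration):

	/* before: one extra level of indirection through the wrapper struct */
	em_tree = &fs_info->mapping_tree.map_tree;

	/* after: mapping_tree is the extent_map_tree itself */
	em_tree = &fs_info->mapping_tree;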
Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 +---- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 14 ++++++------ fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/scrub.c | 8 +++---- fs/btrfs/volumes.c | 53 ++++++++++++++++++++------------------------- fs/btrfs/volumes.h | 3 +-- 8 files changed, 40 insertions(+), 50 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 02870c1bb68a..1baa8cc39571 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -99,10 +99,6 @@ static inline u32 count_max_extents(u64 size) return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); } -struct btrfs_mapping_tree { - struct extent_map_tree map_tree; -}; - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -830,7 +826,7 @@ struct btrfs_fs_info { struct extent_io_tree *pinned_extents; /* logical->physical extent mapping */ - struct btrfs_mapping_tree mapping_tree; + struct extent_map_tree mapping_tree; /* * block reservation for extent, checksum, root tree and diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index b0ba5839ec08..6b2e9aa83ffa 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -723,7 +723,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; u64 start = 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 024c9fadeaeb..03e703d70701 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2684,7 +2684,7 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); - btrfs_mapping_init(&fs_info->mapping_tree); + extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index caaa79a3aa48..cbe6a7278008 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9960,7 +9960,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree; struct extent_map *em; - em_tree = &root->fs_info->mapping_tree.map_tree; + em_tree = &root->fs_info->mapping_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, found_key.objectid, found_key.offset); @@ -10254,21 +10254,21 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct btrfs_block_group_cache *bg; u64 start = 0; int ret = 0; while (1) { - read_lock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); /* * lookup_extent_mapping will return the first extent map * intersecting the range, so setting @len to 1 is enough to * get the first chunk. 
*/ - em = lookup_extent_mapping(&map_tree->map_tree, start, 1); - read_unlock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(map_tree, start, 1); + read_unlock(&map_tree->lock); if (!em) break; @@ -10864,7 +10864,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (remove_em) { struct extent_map_tree *em_tree; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); @@ -10882,7 +10882,7 @@ struct btrfs_trans_handle * btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, const u64 chunk_offset) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; unsigned int num_items; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9a76e7671938..db53ac88e159 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3358,7 +3358,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) if (cleanup) { mutex_lock(&fs_info->chunk_mutex); - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, block_group->key.objectid, 1); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f7b29f9db5e2..0827bdf4faf1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3410,15 +3410,15 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_block_group_cache *cache) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; int ret = 0; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, chunk_offset, 1); + read_unlock(&map_tree->lock); if (!em) { /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 10f7de0cc7e6..a3fa741c8534 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1818,7 +1818,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info) struct rb_node *n; u64 ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); n = rb_last(&em_tree->map.rb_root); if (n) { @@ -2941,7 +2941,7 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree; struct extent_map *em; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, length); read_unlock(&em_tree->lock); @@ -5144,7 +5144,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em->block_len = em->len; em->orig_block_len = stripe_size; - em_tree = &info->mapping_tree.map_tree; + em_tree = &info->mapping_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); if (ret) { @@ -5378,21 +5378,16 @@ end: return readonly; } -void btrfs_mapping_init(struct btrfs_mapping_tree *tree) -{ - extent_map_tree_init(&tree->map_tree); -} - -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +void btrfs_mapping_tree_free(struct extent_map_tree *tree) { struct extent_map *em; while (1) { - write_lock(&tree->map_tree.lock); 
- em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, 0, (u64)-1); if (em) - remove_extent_mapping(&tree->map_tree, em); - write_unlock(&tree->map_tree.lock); + remove_extent_mapping(tree, em); + write_unlock(&tree->lock); if (!em) break; /* once for us */ @@ -6687,7 +6682,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; u64 logical; @@ -6712,9 +6707,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, return ret; } - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, logical, 1); + read_unlock(&map_tree->lock); /* already mapped? */ if (em && em->start <= logical && em->start + em->len > logical) { @@ -6783,9 +6778,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, } - write_lock(&map_tree->map_tree.lock); - ret = add_extent_mapping(&map_tree->map_tree, em, 0); - write_unlock(&map_tree->map_tree.lock); + write_lock(&map_tree->lock); + ret = add_extent_mapping(map_tree, em, 0); + write_unlock(&map_tree->lock); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", @@ -7103,14 +7098,14 @@ out_short_read: bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; u64 next_start = 0; bool ret = true; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); - read_unlock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, 0, (u64)-1); + read_unlock(&map_tree->lock); /* No chunk at all? 
Return false anyway */ if (!em) { ret = false; @@ -7148,10 +7143,10 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, next_start = extent_map_end(em); free_extent_map(em); - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, next_start, + read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, next_start, (u64)(-1) - next_start); - read_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->lock); } out: return ret; @@ -7612,7 +7607,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; struct btrfs_device *dev; @@ -7701,7 +7696,7 @@ out: static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct rb_node *node; int ret = 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 136a3eb64604..07156d974ac4 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -413,8 +413,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); -void btrfs_mapping_init(struct btrfs_mapping_tree *tree); -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct extent_map_tree *tree); blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, -- cgit v1.2.3 From 9fa02ac75ba14683b341ec7430202cccfc8c3fab Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:20 +0200 Subject: btrfs: use raid_attr table in get_profile_num_devs The dev_max constraints are defined in the raid_attr table, use it instead of open-coding it. Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cbe6a7278008..326fd8660efb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4336,15 +4336,9 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; - if (type & (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) + num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; + if (!num_dev) num_dev = fs_info->fs_devices->rw_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_dev = 2; - else - num_dev = 1; /* DUP or single */ return num_dev; } -- cgit v1.2.3 From fc9a2ac77c05ba060dcfb084a752aa3ee37f9e2a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:22 +0200 Subject: btrfs: use raid_attr in btrfs_chunk_max_errors The number of tolerated failures is stored in the raid_attr table, use it. 
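For context, the tolerated_failures column of btrfs_raid_array that the helper now reads looks like this (summary for illustration only):

	SINGLE	0
	DUP	0
	RAID0	0
	RAID1	1
	RAID10	1
	RAID5	1
	RAID6	2

This matches the open-coded logic being removed, with DUP already dropped from the one-failure group by the earlier patch in this series.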
Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a3fa741c8534..995a15a816f2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5325,19 +5325,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) static inline int btrfs_chunk_max_errors(struct map_lookup *map) { - int max_errors; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5)) { - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { - max_errors = 2; - } else { - max_errors = 0; - } + const int index = btrfs_bg_flags_to_raid_index(map->type); - return max_errors; + return btrfs_raid_array[index].tolerated_failures; } int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) -- cgit v1.2.3 From 081db89b13cb26f9ef1f5b1752f7fcb775191cbe Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:27 +0200 Subject: btrfs: use raid_attr to get allowed profiles for balance conversion Iterate over the table and gather all allowed profiles for a given number of devices, instead of open coding. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 995a15a816f2..301e60b11dd0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4047,6 +4047,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, u64 num_devices; unsigned seq; bool reducing_integrity; + int i; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -4076,16 +4077,11 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } num_devices = btrfs_num_devices(fs_info); + allowed = 0; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) + if (num_devices >= btrfs_raid_array[i].devs_min) + allowed |= btrfs_raid_array[i].bg_flag; - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; - if (num_devices > 1) - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - if (num_devices >= 2) - allowed |= BTRFS_BLOCK_GROUP_RAID5; - if (num_devices >= 3) - allowed |= BTRFS_BLOCK_GROUP_RAID6; - if (num_devices > 3) - allowed |= BTRFS_BLOCK_GROUP_RAID10; if (validate_convert_profile(&bctl->data, allowed)) { int index = btrfs_bg_flags_to_raid_index(bctl->data.target); -- cgit v1.2.3 From 6079e12cdb8f9afb02b9f4f3e4d13c076e1e99e9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:29 +0200 Subject: btrfs: use raid_attr table to find profiles for integrity lowering Replace open coded list of the profiles by selecting them from the raid_attr table. The criteria are now more explicit, we need profiles that have more than 1 copy of the data or can reconstruct the data with a missing device. 
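Spelled out against the raid_attr table (illustration only, not part of the patch):

	DUP, RAID1, RAID10  -> selected because ncopies >= 2
	RAID5, RAID6        -> selected because tolerated_failures >= 1
	resulting mask: DUP | RAID1 | RAID10 | RAID5 | RAID6

which is the same set of profiles as the open-coded list this replaces.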
Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 301e60b11dd0..71add22e041a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4110,11 +4110,16 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, goto out; } - /* allow to reduce meta or sys integrity only if force set */ - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6; + /* + * Allow to reduce metadata or system integrity only if force set for + * profiles with redundancy (copies, parity) + */ + allowed = 0; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { + if (btrfs_raid_array[i].ncopies >= 2 || + btrfs_raid_array[i].tolerated_failures >= 1) + allowed |= btrfs_raid_array[i].bg_flag; + } do { seq = read_seqbegin(&fs_info->profiles_lock); -- cgit v1.2.3 From 44b28adafd1f814fbde8b53a99eee83f3b31b2d9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:31 +0200 Subject: btrfs: use raid_attr table for btrfs_bg_type_to_factor The factor is the number of copies. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 71add22e041a..bbc195c97c74 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7586,10 +7586,9 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) */ int btrfs_bg_type_to_factor(u64 flags) { - if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10)) - return 2; - return 1; + const int index = btrfs_bg_flags_to_raid_index(flags); + + return btrfs_raid_array[index].ncopies; } -- cgit v1.2.3 From 946c9256c6fabe8d3fe8e076df1c4290dcd778f2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:34 +0200 Subject: btrfs: factor out helper for counting data stripes Factor the sequence of ifs to a helper, the 'data stripes' here means the number of stripes without redundancy and parity. 
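A few worked examples of the helper's arithmetic (illustrative; the per-profile constants come from the raid_attr table):

	RAID6,  num_stripes = 6: nparity = 2               -> 6 - 2 = 4 data stripes
	RAID5,  num_stripes = 3: nparity = 1               -> 3 - 1 = 2 data stripes
	RAID10, num_stripes = 4: nparity = 0, ncopies = 2  -> 4 / 2 = 2 data stripes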
Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bbc195c97c74..e9caba25ddb6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3474,6 +3474,18 @@ static int chunk_devid_filter(struct extent_buffer *leaf, return 1; } +static u64 calc_data_stripes(u64 type, int num_stripes) +{ + const int index = btrfs_bg_flags_to_raid_index(type); + const int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; + + if (nparity) + return num_stripes - nparity; + else + return num_stripes / ncopies; +} + /* [pstart, pend) */ static int chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, @@ -3483,22 +3495,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); u64 stripe_offset; u64 stripe_length; + u64 type; int factor; int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) return 0; - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { - factor = num_stripes / 2; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { - factor = num_stripes - 1; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { - factor = num_stripes - 2; - } else { - factor = num_stripes; - } + type = btrfs_chunk_type(leaf, chunk); + factor = calc_data_stripes(type, num_stripes); for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); -- cgit v1.2.3 From 8c3e3582a4f0dbdaea49cfd71640a626c8bb7134 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:36 +0200 Subject: btrfs: use u8 for raid_array members The raid_attr table is now 7 * 56 = 392 bytes long, consisting of just small numbers so we don't have to use ints. New size is 7 * 32 = 224, saving 3 cachelines. Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/volumes.h | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 03e703d70701..fe14b971440f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3704,7 +3704,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) - min_tolerated = min(min_tolerated, + min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[BTRFS_RAID_SINGLE]. tolerated_failures); @@ -3713,7 +3713,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) continue; if (!(flags & btrfs_raid_array[raid_type].bg_flag)) continue; - min_tolerated = min(min_tolerated, + min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[raid_type]. 
tolerated_failures); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 07156d974ac4..73520a6ed90a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -336,16 +336,16 @@ struct btrfs_device_info { }; struct btrfs_raid_attr { - int sub_stripes; /* sub_stripes info for map */ - int dev_stripes; /* stripes per dev */ - int devs_max; /* max devs to use */ - int devs_min; /* min devs needed */ - int tolerated_failures; /* max tolerated fail devs */ - int devs_increment; /* ndevs has to be a multiple of this */ - int ncopies; /* how many copies to data has */ - int nparity; /* number of stripes worth of bytes to store + u8 sub_stripes; /* sub_stripes info for map */ + u8 dev_stripes; /* stripes per dev */ + u8 devs_max; /* max devs to use */ + u8 devs_min; /* min devs needed */ + u8 tolerated_failures; /* max tolerated fail devs */ + u8 devs_increment; /* ndevs has to be a multiple of this */ + u8 ncopies; /* how many copies to data has */ + u8 nparity; /* number of stripes worth of bytes to store * parity information */ - int mindev_error; /* error code if min devs requisite is unmet */ + u8 mindev_error; /* error code if min devs requisite is unmet */ const char raid_name[8]; /* name of the raid */ u64 bg_flag; /* block group flag of the raid */ }; -- cgit v1.2.3 From e3ecdb3fdecf2d505bd742ffccf1b0a2854f2812 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:38 +0200 Subject: btrfs: factor out devs_max setting in __btrfs_alloc_chunk Merge the repeated code before the if-else block. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e9caba25ddb6..2e0860d14ad2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4956,6 +4956,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sub_stripes = btrfs_raid_array[index].sub_stripes; dev_stripes = btrfs_raid_array[index].dev_stripes; devs_max = btrfs_raid_array[index].devs_max; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info); devs_min = btrfs_raid_array[index].devs_min; devs_increment = btrfs_raid_array[index].devs_increment; ncopies = btrfs_raid_array[index].ncopies; @@ -4964,8 +4966,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = SZ_1G; max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) @@ -4973,13 +4973,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, else max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; } else { btrfs_err(info, "invalid chunk type 0x%llx requested", type); -- cgit v1.2.3 From 158da513b13deb04b266e4ad171871d1a8531f26 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:41 +0200 Subject: btrfs: refactor helper for bg flags to name conversion The helper lacks the btrfs_ prefix and the parameter is the raw blockgroup type, so none of the callers has to do the flags -> index conversion. 
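Condensed effect at a call site (taken from the hunks that follow; for illustration only):

	/* before: every caller converts flags to an index first */
	int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
	btrfs_err(fs_info, "balance: invalid convert data profile %s",
		  get_raid_name(index));

	/* after: pass the raw block group flags directly */
	btrfs_err(fs_info, "balance: invalid convert data profile %s",
		  btrfs_bg_type_to_raid_name(bctl->data.target));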
Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 +--- fs/btrfs/volumes.c | 34 +++++++++++++--------------------- fs/btrfs/volumes.h | 3 +-- 3 files changed, 15 insertions(+), 26 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 326fd8660efb..af73264e33a5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10146,7 +10146,6 @@ void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) struct btrfs_space_info *space_info; struct raid_kobject *rkobj; LIST_HEAD(list); - int index; int ret = 0; spin_lock(&fs_info->pending_raid_kobjs_lock); @@ -10155,10 +10154,9 @@ void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) list_for_each_entry(rkobj, &list, list) { space_info = __find_space_info(fs_info, rkobj->flags); - index = btrfs_bg_flags_to_raid_index(rkobj->flags); ret = kobject_add(&rkobj->kobj, &space_info->kobj, - "%s", get_raid_name(index)); + "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); if (ret) { kobject_put(&rkobj->kobj); break; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2e0860d14ad2..eb737bcd7aac 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -123,12 +123,14 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const char *get_raid_name(enum btrfs_raid_types type) +const char *btrfs_bg_type_to_raid_name(u64 flags) { - if (type >= BTRFS_NR_RAID_TYPES) + const int index = btrfs_bg_flags_to_raid_index(flags); + + if (index >= BTRFS_NR_RAID_TYPES) return NULL; - return btrfs_raid_array[type].raid_name; + return btrfs_raid_array[index].raid_name; } /* @@ -3926,11 +3928,9 @@ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, bp += ret; \ } while (0) - if (flags & BTRFS_BALANCE_ARGS_CONVERT) { - int index = btrfs_bg_flags_to_raid_index(bargs->target); - - CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); - } + if (flags & BTRFS_BALANCE_ARGS_CONVERT) + CHECK_APPEND_1ARG("convert=%s,", + btrfs_bg_type_to_raid_name(bargs->target)); if (flags & BTRFS_BALANCE_ARGS_SOFT) CHECK_APPEND_NOARG("soft,"); @@ -4088,29 +4088,23 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, allowed |= btrfs_raid_array[i].bg_flag; if (validate_convert_profile(&bctl->data, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->data.target); - btrfs_err(fs_info, "balance: invalid convert data profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->data.target)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->meta, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); - btrfs_err(fs_info, "balance: invalid convert metadata profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->meta.target)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->sys, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); - btrfs_err(fs_info, "balance: invalid convert system profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->sys.target)); ret = -EINVAL; goto out; } @@ -4159,12 +4153,10 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { - int meta_index = btrfs_bg_flags_to_raid_index(meta_target); - int data_index = btrfs_bg_flags_to_raid_index(data_target); - btrfs_warn(fs_info, "balance: metadata profile %s has lower redundancy than data profile %s", - get_raid_name(meta_index), get_raid_name(data_index)); + 
btrfs_bg_type_to_raid_name(meta_target), + btrfs_bg_type_to_raid_name(data_target)); } ret = insert_balance_item(fs_info, bctl); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 73520a6ed90a..4a7a4d90ded8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -556,8 +556,6 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } -const char *get_raid_name(enum btrfs_raid_types type); - void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head *btrfs_get_fs_uuids(void); @@ -567,6 +565,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); int btrfs_bg_type_to_factor(u64 flags); +const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); #endif -- cgit v1.2.3 From 72ad813157c58416c29fa554928e21b9bdae2582 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:43 +0200 Subject: btrfs: constify map parameter for nr_parity_stripes and nr_data_stripes Signed-off-by: David Sterba --- fs/btrfs/raid56.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index f5d4c13a8dbc..2503485db859 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,7 +7,7 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H -static inline int nr_parity_stripes(struct map_lookup *map) +static inline int nr_parity_stripes(const struct map_lookup *map) { if (map->type & BTRFS_BLOCK_GROUP_RAID5) return 1; @@ -17,7 +17,7 @@ static inline int nr_parity_stripes(struct map_lookup *map) return 0; } -static inline int nr_data_stripes(struct map_lookup *map) +static inline int nr_data_stripes(const struct map_lookup *map) { return map->num_stripes - nr_parity_stripes(map); } -- cgit v1.2.3 From cff8267228c14e56ded8407787d4328603423b3e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 17 May 2019 11:43:45 +0200 Subject: btrfs: read number of data stripes from map only once There are several places that call nr_data_stripes, but this value does not change. 
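The pattern is simply hoisting a loop-invariant call into a local; a minimal sketch (identifiers as used in the hunks below, loop bodies abridged):

	/* Before: nr_data_stripes(map) evaluated on every use. */
	for (i = 0; i < nr_data_stripes(map); i++)
		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));

	/* After: read it once, the value cannot change for a given map. */
	const int data_stripes = nr_data_stripes(map);

	for (i = 0; i < data_stripes; i++)
		stripe_nr = div_u64(stripe_nr, data_stripes);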
Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 ++++---- fs/btrfs/volumes.c | 17 +++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0827bdf4faf1..e51929a55af4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2660,18 +2660,18 @@ static int get_raid56_logic_offset(u64 physical, int num, u64 last_offset; u32 stripe_index; u32 rot; + const int data_stripes = nr_data_stripes(map); - last_offset = (physical - map->stripes[num].physical) * - nr_data_stripes(map); + last_offset = (physical - map->stripes[num].physical) * data_stripes; if (stripe_start) *stripe_start = last_offset; *offset = last_offset; - for (i = 0; i < nr_data_stripes(map); i++) { + for (i = 0; i < data_stripes; i++) { *offset = last_offset + i * map->stripe_len; stripe_nr = div64_u64(*offset, map->stripe_len); - stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64(stripe_nr, data_stripes); /* Work out the disk rotation on this stripe-set */ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index eb737bcd7aac..5514ae801f4c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5918,6 +5918,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 stripe_nr; u64 stripe_len; u32 stripe_index; + int data_stripes; int i; int ret = 0; int num_stripes; @@ -5949,6 +5950,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * to get to this block */ stripe_nr = div64_u64(stripe_nr, stripe_len); + data_stripes = nr_data_stripes(map); stripe_offset = stripe_nr * stripe_len; if (offset < stripe_offset) { @@ -5965,7 +5967,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* if we're here for raid56, we need to know the stripe aligned start */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); + unsigned long full_stripe_len = stripe_len * data_stripes; raid56_full_stripe_start = offset; /* allow a write of a full stripe, but make sure we don't @@ -5983,7 +5985,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe (on a single disk). */ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && (op == BTRFS_MAP_WRITE)) { - max_len = stripe_len * nr_data_stripes(map) - + max_len = stripe_len * data_stripes - (offset - raid56_full_stripe_start); } else { /* we limit the length of each bio to what fits in a stripe */ @@ -6073,7 +6075,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, - stripe_len * nr_data_stripes(map)); + stripe_len * data_stripes); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -6089,10 +6091,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * Mirror #3 is RAID6 Q block. 
*/ stripe_nr = div_u64_rem(stripe_nr, - nr_data_stripes(map), &stripe_index); + data_stripes, &stripe_index); if (mirror_num > 1) - stripe_index = nr_data_stripes(map) + - mirror_num - 2; + stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, @@ -6150,8 +6151,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, div_u64_rem(stripe_nr, num_stripes, &rot); /* Fill in the logical address of each stripe */ - tmp = stripe_nr * nr_data_stripes(map); - for (i = 0; i < nr_data_stripes(map); i++) + tmp = stripe_nr * data_stripes; + for (i = 0; i < data_stripes; i++) bbio->raid_map[(i+rot) % num_stripes] = em->start + (tmp + i) * map->stripe_len; -- cgit v1.2.3 From 0185f364cb65855cdf5dfd0de11f8f9622940250 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 29 Apr 2019 14:03:32 +0800 Subject: btrfs: extent-tree: Add lockdep assert when updating space info Just add a safe net for btrfs_space_info member updating. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index af73264e33a5..f37daddbeabe 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -58,6 +58,7 @@ enum { static inline void update_##name(struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ + lockdep_assert_held(&sinfo->lock); \ if (bytes < 0 && sinfo->name < -bytes) { \ WARN_ON(1); \ sinfo->name = 0; \ -- cgit v1.2.3 From 480b9b4d847fe18f4a559f5cac718d9b2cbcdcaf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 29 Apr 2019 14:03:33 +0800 Subject: btrfs: extent-tree: Add trace events for space info numbers update Add trace event for update_bytes_pinned() and update_bytes_may_use() to detect underflow better. The output would be something like (only showing data part): ## Buffered write start, 16K total ## 2255.954 xfs_io/860 btrfs:update_bytes_may_use:(nil)U: type=DATA old=0 diff=4096 2257.169 sudo/860 btrfs:update_bytes_may_use:(nil)U: type=DATA old=4096 diff=4096 2257.346 sudo/860 btrfs:update_bytes_may_use:(nil)U: type=DATA old=8192 diff=4096 2257.542 sudo/860 btrfs:update_bytes_may_use:(nil)U: type=DATA old=12288 diff=4096 ## Delalloc start ## 3727.853 kworker/u8:3-e/700 btrfs:update_bytes_may_use:(nil)U: type=DATA old=16384 diff=-16384 ## Space cache update ## 3733.132 sudo/862 btrfs:update_bytes_may_use:(nil)U: type=DATA old=0 diff=65536 3733.169 sudo/862 btrfs:update_bytes_may_use:(nil)U: type=DATA old=65536 diff=-65536 3739.868 sudo/862 btrfs:update_bytes_may_use:(nil)U: type=DATA old=0 diff=65536 3739.891 sudo/862 btrfs:update_bytes_may_use:(nil)U: type=DATA old=65536 diff=-65536 These two trace events will allow bcc tool to probe btrfs_space_info changes and detect underflow with more details (e.g. backtrace for each update). 
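For readability, this is roughly what the updated macro generates for the bytes_may_use member after this patch (a sketch of the expansion, not literal preprocessor output):

	static inline void update_bytes_may_use(struct btrfs_fs_info *fs_info,
						struct btrfs_space_info *sinfo,
						s64 bytes)
	{
		lockdep_assert_held(&sinfo->lock);
		/* new: emit the tracepoint with the old value and the delta */
		trace_update_bytes_may_use(fs_info, sinfo, sinfo->bytes_may_use,
					   bytes);
		if (bytes < 0 && sinfo->bytes_may_use < -bytes) {
			WARN_ON(1);
			sinfo->bytes_may_use = 0;
			return;
		}
		sinfo->bytes_may_use += bytes;
	}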
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 36 ++++++++++++++++++++---------------- include/trace/events/btrfs.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f37daddbeabe..f63dfb777e4a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -55,10 +55,12 @@ enum { * Declare a helper function to detect underflow of various space info members */ #define DECLARE_SPACE_INFO_UPDATE(name) \ -static inline void update_##name(struct btrfs_space_info *sinfo, \ +static inline void update_##name(struct btrfs_fs_info *fs_info, \ + struct btrfs_space_info *sinfo, \ s64 bytes) \ { \ lockdep_assert_held(&sinfo->lock); \ + trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ if (bytes < 0 && sinfo->name < -bytes) { \ WARN_ON(1); \ sinfo->name = 0; \ @@ -4209,7 +4211,7 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - update_bytes_may_use(data_sinfo, bytes); + update_bytes_may_use(fs_info, data_sinfo, bytes); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); @@ -4262,7 +4264,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - update_bytes_may_use(data_sinfo, -len); + update_bytes_may_use(fs_info, data_sinfo, -len); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); @@ -5169,13 +5171,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * If not things get more complicated. */ if (used + orig_bytes <= space_info->total_bytes) { - update_bytes_may_use(space_info, orig_bytes); + update_bytes_may_use(fs_info, space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, system_chunk)) { - update_bytes_may_use(space_info, orig_bytes); + update_bytes_may_use(fs_info, space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; @@ -5503,7 +5505,7 @@ again: flush = BTRFS_RESERVE_FLUSH_ALL; goto again; } - update_bytes_may_use(space_info, -num_bytes); + update_bytes_may_use(fs_info, space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); spin_unlock(&space_info->lock); @@ -5531,7 +5533,8 @@ again: ticket->bytes, 1); list_del_init(&ticket->list); num_bytes -= ticket->bytes; - update_bytes_may_use(space_info, ticket->bytes); + update_bytes_may_use(fs_info, space_info, + ticket->bytes); ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); @@ -5539,7 +5542,7 @@ again: trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 1); - update_bytes_may_use(space_info, num_bytes); + update_bytes_may_use(fs_info, space_info, num_bytes); ticket->bytes -= num_bytes; num_bytes = 0; } @@ -5832,14 +5835,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = min(num_bytes, block_rsv->size - block_rsv->reserved); block_rsv->reserved += num_bytes; - update_bytes_may_use(sinfo, num_bytes); + update_bytes_may_use(fs_info, sinfo, num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 1); } } else if (block_rsv->reserved 
> block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - update_bytes_may_use(sinfo, -num_bytes); + update_bytes_may_use(fs_info, sinfo, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; @@ -6302,7 +6305,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; - update_bytes_pinned(cache->space_info, num_bytes); + update_bytes_pinned(info, cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); @@ -6377,7 +6380,7 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - update_bytes_pinned(cache->space_info, num_bytes); + update_bytes_pinned(fs_info, cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6586,7 +6589,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - update_bytes_may_use(space_info, -ram_bytes); + update_bytes_may_use(cache->fs_info, space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; } @@ -6742,7 +6745,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - update_bytes_pinned(space_info, -len); + update_bytes_pinned(fs_info, space_info, -len); trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, len, 0); @@ -6763,7 +6766,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - update_bytes_may_use(space_info, to_add); + update_bytes_may_use(fs_info, space_info, + to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; trace_btrfs_space_reservation(fs_info, @@ -11024,7 +11028,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - update_bytes_pinned(space_info, -block_group->pinned); + update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; percpu_counter_add_batch(&space_info->total_bytes_pinned, -block_group->pinned, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index f9eff010fc7e..2f6a669408bb 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -29,6 +29,7 @@ struct btrfs_qgroup_extent_record; struct btrfs_qgroup; struct extent_io_tree; struct prelim_ref; +struct btrfs_space_info; TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS_NR); TRACE_DEFINE_ENUM(FLUSH_DELAYED_ITEMS); @@ -2091,6 +2092,45 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock); DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock); DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic); +DECLARE_EVENT_CLASS(btrfs__space_info_update, + + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *sinfo, u64 old, s64 diff), + + TP_ARGS(fs_info, sinfo, old, diff), + + TP_STRUCT__entry_btrfs( + __field( u64, type ) + __field( u64, old ) + __field( s64, diff ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->type = sinfo->flags; + __entry->old = old; + __entry->diff = diff; + ), + 
TP_printk_btrfs("type=%s old=%llu diff=%lld", + __print_flags(__entry->type, "|", BTRFS_GROUP_FLAGS), + __entry->old, __entry->diff) +); + +DEFINE_EVENT(btrfs__space_info_update, update_bytes_may_use, + + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *sinfo, u64 old, s64 diff), + + TP_ARGS(fs_info, sinfo, old, diff) +); + +DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned, + + TP_PROTO(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *sinfo, u64 old, s64 diff), + + TP_ARGS(fs_info, sinfo, old, diff) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 1200b51f57dca934cfd78969f460d636ec0fb838 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 9 May 2019 15:31:50 +0800 Subject: btrfs: remove the incorrect comment on RO fs when btrfs_run_delalloc_range() fails At the context of btrfs_run_delalloc_range(), we haven't started/joined a transaction, thus even something went wrong, we can't and won't abort transaction, thus no way to make the fs RO. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 441f6c47cebd..dd3f659f223e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3310,7 +3310,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); - /* File system has been set read-only */ if (ret) { SetPageError(page); /* -- cgit v1.2.3 From ffa87214c1100c7ea38e85a3cf981d0d47b1c744 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 7 May 2019 10:19:22 +0300 Subject: btrfs: add new helper btrfs_lock_and_flush_ordered_range There is a certain idiom used in multiple places in btrfs' codebase, dealing with flushing an ordered range. Factor this in a separate function that can be reused. Future patches will replace the existing code with that function. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.h | 4 ++++ 2 files changed, 37 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 52889da69113..37401cc04a6b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -962,6 +962,39 @@ out: return index; } +/* + * btrfs_flush_ordered_range - Lock the passed range and ensures all pending + * ordered extents in it are run to completion. + * + * @tree: IO tree used for locking out other users of the range + * @inode: Inode whose ordered tree is to be searched + * @start: Beginning of range to flush + * @end: Last byte of range to lock + * @cached_state: If passed, will return the extent state responsible for the + * locked range. It's the caller's responsibility to free the cached state. + * + * This function always returns with the given range locked, ensuring after it's + * called no order extent can be pending. 
+ */ +void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, + struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state) +{ + struct btrfs_ordered_extent *ordered; + + while (1) { + lock_extent_bits(tree, start, end, cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) + break; + unlock_extent_cached(tree, start, end, cached_state); + btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } +} + int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4c5991c3de14..9b68179d580f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -188,6 +188,10 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); +void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, + struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state); int __init ordered_data_init(void); void __cold ordered_data_exit(void); -- cgit v1.2.3 From 23d31bd476d1d096d0f073483547872ec155ab34 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 7 May 2019 10:19:23 +0300 Subject: btrfs: Use newly introduced btrfs_lock_and_flush_ordered_range There several functions which open code btrfs_lock_and_flush_ordered_range, just replace them with a call to the function. No functional changes. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 29 ++++------------------------- fs/btrfs/file.c | 14 ++------------ fs/btrfs/inode.c | 17 ++--------------- 3 files changed, 8 insertions(+), 52 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dd3f659f223e..9591ecfaa57e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3204,21 +3204,10 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, unsigned long *bio_flags, u64 *prev_em_start) { - struct inode *inode; - struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; - inode = pages[0]->mapping->host; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - end - start + 1); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, @@ -3234,22 +3223,12 @@ static int __extent_read_full_page(struct extent_io_tree *tree, unsigned long *bio_flags, unsigned int read_flags) { - struct inode *inode = page->mapping->host; - struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; int ret; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - PAGE_SIZE); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + 
btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, bio_flags, read_flags, NULL); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 89f5be2bfb43..007652ff5cb6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1550,7 +1550,6 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; - struct btrfs_ordered_extent *ordered; u64 lockstart, lockend; u64 num_bytes; int ret; @@ -1563,17 +1562,8 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - while (1) { - lock_extent(&inode->io_tree, lockstart, lockend); - ordered = btrfs_lookup_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (!ordered) { - break; - } - unlock_extent(&inode->io_tree, lockstart, lockend); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, + lockend, NULL); num_bytes = lockend - lockstart + 1; ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d324e2a06240..9137bafc9376 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5004,21 +5004,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (size <= hole_start) return 0; - while (1) { - struct btrfs_ordered_extent *ordered; - - lock_extent_bits(io_tree, hole_start, block_end - 1, - &cached_state); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, - block_end - hole_start); - if (!ordered) - break; - unlock_extent_cached(io_tree, hole_start, block_end - 1, - &cached_state); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - + btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, + block_end - 1, &cached_state); cur_offset = hole_start; while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, -- cgit v1.2.3 From bd80d94efb83acd67d48f9f3f07483c8306085aa Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 7 May 2019 10:19:24 +0300 Subject: btrfs: Always use a cached extent_state in btrfs_lock_and_flush_ordered_range In case no cached_state argument is passed to btrfs_lock_and_flush_ordered_range use one locally in the function. This optimises the case when an ordered extent is found since the unlock function will be able to unlock that state directly without searching for it again. 
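A usage sketch of the two calling conventions after this change (the caller shown is illustrative rather than taken from the patch):

	/* Caller that wants to reuse the cached state for a later unlock. */
	struct extent_state *cached_state = NULL;

	btrfs_lock_and_flush_ordered_range(io_tree, inode, start, end,
					   &cached_state);
	/* ... work on the locked range ... */
	unlock_extent_cached(io_tree, start, end, &cached_state);

	/*
	 * Caller that does not care: pass NULL, the helper now keeps a local
	 * cached state and drops the extra reference itself before returning.
	 */
	btrfs_lock_and_flush_ordered_range(io_tree, inode, start, end, NULL);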
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 37401cc04a6b..df02ed25b7db 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -982,14 +982,26 @@ void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, struct extent_state **cached_state) { struct btrfs_ordered_extent *ordered; + struct extent_state *cachedp = NULL; + + if (cached_state) + cachedp = *cached_state; while (1) { - lock_extent_bits(tree, start, end, cached_state); + lock_extent_bits(tree, start, end, &cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); - if (!ordered) + if (!ordered) { + /* + * If no external cached_state has been passed then + * decrement the extra ref taken for cachedp since we + * aren't exposing it outside of this function + */ + if (!cached_state) + refcount_dec(&cachedp->refs); break; - unlock_extent_cached(tree, start, end, cached_state); + } + unlock_extent_cached(tree, start, end, &cachedp); btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); btrfs_put_ordered_extent(ordered); } -- cgit v1.2.3 From 0b6f5d408bb5df3101eb1944b1f301dea50c7e7e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 9 May 2019 18:11:11 +0300 Subject: btrfs: Add comments on locking of several device-related fields Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 +++- fs/btrfs/volumes.h | 11 ++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5514ae801f4c..776f5c7ca7c5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -239,7 +239,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * chunk_mutex * ----------- * protects chunks, adding or removing during allocation, trim or when a new - * device is added/removed + * device is added/removed. Additionally it also protects post_commit_list of + * individual devices, since they can be added to the transaction's + * post_commit_list only with chunk_mutex held. * * cleaner_mutex * ------------- diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 4a7a4d90ded8..fea7b65a712e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -43,8 +43,8 @@ struct btrfs_pending_bios { #define BTRFS_DEV_STATE_FLUSH_SENT (4) struct btrfs_device { - struct list_head dev_list; - struct list_head dev_alloc_list; + struct list_head dev_list; /* device_list_mutex */ + struct list_head dev_alloc_list; /* chunk mutex */ struct list_head post_commit_list; /* chunk mutex */ struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; @@ -229,9 +229,14 @@ struct btrfs_fs_devices { * this mutex lock. */ struct mutex device_list_mutex; + + /* List of all devices, protected by device_list_mutex */ struct list_head devices; - /* devices not currently being allocated */ + /* + * Devices which can satisfy space allocation. 
Protected by + * chunk_mutex + */ struct list_head alloc_list; struct btrfs_fs_devices *seed; -- cgit v1.2.3 From 5f791ec31f535f62a2dc06b37c3a69e5e268b5db Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 7 May 2019 10:23:46 +0300 Subject: btrfs: Return EAGAIN if we can't start no snpashot write in check_can_nocow The first thing code does in check_can_nocow is trying to block concurrent snapshots. If this fails (due to snpashot already being in progress) the function returns ENOSPC which makes no sense. Instead return EAGAIN. Despite this return value not being propagated to callers it's good practice to return the closest in terms of semantics error code. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 007652ff5cb6..5370152ea7e3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1556,7 +1556,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, ret = btrfs_start_write_no_snapshotting(root); if (!ret) - return -ENOSPC; + return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, -- cgit v1.2.3 From a94d1d0cb3bf1983fcdf05b59d914dbff4f1f52c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 May 2019 18:49:58 +0800 Subject: btrfs: Flush before reflinking any extent to prevent NOCOW write falling back to COW without data reservation [BUG] The following script can cause unexpected fsync failure: #!/bin/bash dev=/dev/test/test mnt=/mnt/btrfs mkfs.btrfs -f $dev -b 512M > /dev/null mount $dev $mnt -o nospace_cache # Prealloc one extent xfs_io -f -c "falloc 8k 64m" $mnt/file1 # Fill the remaining data space xfs_io -f -c "pwrite 0 -b 4k 512M" $mnt/padding sync # Write into the prealloc extent xfs_io -c "pwrite 1m 16m" $mnt/file1 # Reflink then fsync, fsync would fail due to ENOSPC xfs_io -c "reflink $mnt/file1 8k 0 4k" -c "fsync" $mnt/file1 umount $dev The fsync fails with ENOSPC, and the last page of the buffered write is lost. [CAUSE] This is caused by: - Btrfs' back reference only has extent level granularity So write into shared extent must be COWed even only part of the extent is shared. So for above script we have: - fallocate Create a preallocated extent where we can do NOCOW write. - fill all the remaining data and unallocated space - buffered write into preallocated space As we have not enough space available for data and the extent is not shared (yet) we fall into NOCOW mode. - reflink Now part of the large preallocated extent is shared, later write into that extent must be COWed. - fsync triggers writeback But now the extent is shared and therefore we must fallback into COW mode, which fails with ENOSPC since there's not enough space to allocate data extents. [WORKAROUND] The workaround is to ensure any buffered write in the related extents (not just the reflink source range) get flushed before reflink/dedupe, so that NOCOW writes succeed that happened before reflinking succeed. The workaround is expensive, we could do it better by only flushing NOCOW range, but that needs extra accounting for NOCOW range. For now, fix the possible data loss first. 
Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2a1be0d1a698..5b4beebf138c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3999,6 +3999,27 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (!same_inode) inode_dio_wait(inode_out); + /* + * Workaround to make sure NOCOW buffered write reach disk as NOCOW. + * + * Btrfs' back references do not have a block level granularity, they + * work at the whole extent level. + * NOCOW buffered write without data space reserved may not be able + * to fall back to CoW due to lack of data space, thus could cause + * data loss. + * + * Here we take a shortcut by flushing the whole inode, so that all + * nocow write should reach disk as nocow before we increase the + * reference of the extent. We could do better by only flushing NOCOW + * data, but that needs extra accounting. + * + * Also we don't need to check ASYNC_EXTENT, as async extent will be + * CoWed anyway, not affecting nocow part. + */ + ret = filemap_flush(inode_in->i_mapping); + if (ret < 0) + return ret; + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) -- cgit v1.2.3 From 5852c8b961542f997aa5aeee9c80e745e7af2d63 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 May 2019 10:18:58 +0200 Subject: btrfs: use btrfs_csum_data() instead of directly calling crc32c btrfsic_test_for_metadata() directly calls the crc32c() library function for calculating the CRC32C checksum, but then uses btrfs_csum_final() to invert the result. To ease further refactoring and development around checksumming in BTRFS convert to calling btrfs_csum_data(), which is a wrapper around crc32c(). Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b0c8094528d1..85774e2fa3e5 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1728,7 +1728,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, size_t sublen = i ? PAGE_SIZE : (PAGE_SIZE - BTRFS_CSUM_SIZE); - crc = crc32c(crc, data, sublen); + crc = btrfs_csum_data(data, crc, sublen); } btrfs_csum_final(crc, csum); if (memcmp(csum, h->csum, state->csum_size)) -- cgit v1.2.3 From 65019df8c3b0efa363c30ca4dd69a1a370a3ebe8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 May 2019 10:18:59 +0200 Subject: btrfs: resurrect btrfs_crc32c() Commit 9678c54388b6 ("btrfs: Remove custom crc32c init code") removed the btrfs_crc32c() function, because it was a duplicate of the crc32c() library function we already have in the kernel. Resurrect it as a shim wrapper over crc32c() to make following transformations of the checksumming code in btrfs easier. Also provide a btrfs_crc32_final() to ease following transformations. 
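A short usage sketch of the resurrected helpers, showing the seed and inversion convention they preserve (buf and len are placeholders, not from the patch):

	u8 result[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;			/* conventional all-ones seed */

	crc = btrfs_crc32c(crc, buf, len);	/* thin wrapper around crc32c() */
	btrfs_crc32c_final(crc, result);	/* stores ~crc as little endian */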
Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 11 +++++++++++ fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/send.c | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1baa8cc39571..3691cd177ead 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -2644,6 +2645,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) +static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) +{ + return crc32c(crc, address, length); +} + +static inline void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + static inline u64 btrfs_name_hash(const char *name, int len) { return crc32c((u32)~1, name, len); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f63dfb777e4a..5d736d8ae3e3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1135,11 +1135,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f7fe4770f0e5..49edcc709a99 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -686,7 +686,7 @@ static int send_cmd(struct send_ctx *sctx) hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, -- cgit v1.2.3 From 4bb3c2e2b5bb0d57f7d11f2da829357c5ca628a4 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 May 2019 10:19:00 +0200 Subject: btrfs: use btrfs_crc32c{,_final}() in for free space cache The CRC checksum in the free space cache is not dependant on the super block's csum_type field but always a CRC32C. So use btrfs_crc32c() and btrfs_crc32c_final() instead of btrfs_csum_data() and btrfs_csum_final() for computing these checksums. 
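One detail worth noting in the hunks below: the finalized value is written back over the same u32, so what ends up on disk is the bit-inverted CRC in little-endian byte order. A tiny annotated sketch of that idiom:

	u32 crc = ~(u32)0;

	crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
	/* reuse the variable as the destination: crc now holds ~crc as le32 */
	btrfs_crc32c_final(crc, (u8 *)&crc);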
Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index db53ac88e159..6814fa42eba3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -465,9 +465,8 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); tmp += index; @@ -493,9 +492,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, "csum mismatch on free space cache"); -- cgit v1.2.3 From 1e25a2e3ca0dab0ed1030570e95d98af47113eae Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 May 2019 10:19:01 +0200 Subject: btrfs: don't assume ordered sums to be 4 bytes BTRFS has the implicit assumption that a checksum in btrfs_orderd_sums is 4 bytes. While this is true for CRC32C, it is not for any other checksum. Change the data type to be a byte array and adjust loop index calculation accordingly. This includes moving the adjustment of 'index' by 'ins_size' in btrfs_csum_file_blocks() before dividing 'ins_size' by the checksum size, because before this patch the 'sums' member of 'struct btrfs_ordered_sum' was 4 Bytes in size and afterwards it is only one byte. 
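The essence of the conversion is the index arithmetic: with a u8 array the index advances in bytes rather than in u32 slots. A simplified sketch (not literal patch code; dst and nr_sectors are placeholders):

	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	u8 *csum = sums->sums;
	int index = 0;
	int i;

	for (i = 0; i < nr_sectors; i++) {
		/* each sector's checksum now occupies csum_size bytes */
		memcpy(dst + index, csum + index, csum_size);
		index += csum_size;	/* was: index++ on a u32 array */
	}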
Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/ctree.h | 3 ++- fs/btrfs/file-item.c | 35 +++++++++++++++++++++-------------- fs/btrfs/ordered-data.c | 10 ++++++---- fs/btrfs/ordered-data.h | 4 ++-- fs/btrfs/scrub.c | 2 +- 6 files changed, 34 insertions(+), 24 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 84dd4a8980c5..a8e551ca8798 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -631,7 +631,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, - sums); + (u8 *)sums); BUG_ON(ret); /* -ENOMEM */ } sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, @@ -657,7 +657,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, (u8 *) sums); BUG_ON(ret); /* -ENOMEM */ } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3691cd177ead..a66ed58058d9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3199,7 +3199,8 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u8 *dst); blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d431ea8198e4..0464dd4c0579 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -22,9 +22,13 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \ - sizeof(struct btrfs_ordered_sum)) / \ - sizeof(u32) * (fs_info)->sectorsize) +static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, + u16 csum_size) +{ + u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; + + return ncsums * fs_info->sectorsize; +} int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -144,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 logical_offset, u32 *dst, int dio) + u64 logical_offset, u8 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio_vec bvec; @@ -211,7 +215,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio if (!dio) offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - (u32 *)csum, nblocks); + csum, nblocks); if (count) goto found; @@ -283,7 +287,8 @@ next: return 0; } -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u8 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } @@ -374,7 +379,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 
end, struct btrfs_csum_item); while (start < csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(fs_info)); + max_ordered_sum_bytes(fs_info, csum_size)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { @@ -439,6 +444,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, int i; u64 offset; unsigned nofs_flag; + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); nofs_flag = memalloc_nofs_save(); sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), @@ -473,6 +479,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, - 1); for (i = 0; i < nr_sectors; i++) { + u32 tmp; + if (offset >= ordered->file_offset + ordered->len || offset < ordered->file_offset) { unsigned long bytes_left; @@ -498,17 +506,16 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, index = 0; } - sums->sums[index] = ~(u32)0; + memset(&sums->sums[index], 0xff, csum_size); data = kmap_atomic(bvec.bv_page); - sums->sums[index] - = btrfs_csum_data(data + bvec.bv_offset + tmp = btrfs_csum_data(data + bvec.bv_offset + (i * fs_info->sectorsize), - sums->sums[index], + *(u32 *)&sums->sums[index], fs_info->sectorsize); kunmap_atomic(data); - btrfs_csum_final(sums->sums[index], + btrfs_csum_final(tmp, (char *)(sums->sums + index)); - index++; + index += csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; total_bytes += fs_info->sectorsize; @@ -904,9 +911,9 @@ found: write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, ins_size); + index += ins_size; ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - index += ins_size; btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index df02ed25b7db..3f67b7827393 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -924,14 +924,16 @@ out: * be reclaimed before their checksum is actually put into the btree */ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum, int len) + u8 *sum, int len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent *ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = btrfs_inode_sectorsize(inode); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); @@ -947,10 +949,10 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, num_sectors = ordered_sum->len >> inode->i_sb->s_blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); - memcpy(sum + index, ordered_sum->sums + i, - num_sectors); + memcpy(sum + index, ordered_sum->sums + i * csum_size, + num_sectors * csum_size); - index += (int)num_sectors; + index += (int)num_sectors * csum_size; if (index == len) goto out; disk_bytenr += num_sectors * sectorsize; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 9b68179d580f..5204171ea962 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -23,7 +23,7 @@ struct btrfs_ordered_sum { int len; struct list_head list; /* last field is a variable length array of csums */ - u32 sums[]; + u8 sums[]; }; /* @@ -183,7 +183,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( int btrfs_ordered_update_i_size(struct inode 
*inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum, int len); + u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e51929a55af4..0e77bffd2a5a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2448,7 +2448,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) ASSERT(index < UINT_MAX); num_sectors = sum->len / sctx->fs_info->sectorsize; - memcpy(csum, sum->sums + index, sctx->csum_size); + memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); -- cgit v1.2.3 From 10fe6ca80d9d25eca9fd6d98eccf6c795532fe96 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 May 2019 10:19:02 +0200 Subject: btrfs: don't assume compressed_bio sums to be 4 bytes BTRFS has the implicit assumption that a checksum in compressed_bio is 4 bytes. While this is true for CRC32C, it is not for any other checksum. Change the data type to be a byte array and adjust loop index calculation accordingly. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 27 +++++++++++++++++---------- fs/btrfs/compression.h | 2 +- fs/btrfs/file-item.c | 2 +- 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index a8e551ca8798..92291f266324 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -57,12 +57,14 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct compressed_bio *cb, u64 disk_start) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int ret; struct page *page; unsigned long i; char *kaddr; u32 csum; - u32 *cb_sum = &cb->sums; + u8 *cb_sum = cb->sums; if (inode->flags & BTRFS_INODE_NODATASUM) return 0; @@ -76,13 +78,13 @@ static int check_compressed_csum(struct btrfs_inode *inode, btrfs_csum_final(csum, (u8 *)&csum); kunmap_atomic(kaddr); - if (csum != *cb_sum) { + if (memcmp(&csum, cb_sum, csum_size)) { btrfs_print_data_csum_error(inode, disk_start, csum, - *cb_sum, cb->mirror_num); + *(u32 *)cb_sum, cb->mirror_num); ret = -EIO; goto fail; } - cb_sum++; + cb_sum += csum_size; } ret = 0; @@ -536,7 +538,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; - u32 *sums; + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -558,7 +561,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; - sums = &cb->sums; + sums = cb->sums; cb->start = em->orig_start; em_len = em->len; @@ -617,6 +620,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + unsigned int nr_sectors; + ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -631,11 +636,13 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags 
& BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, - (u8 *)sums); + sums); BUG_ON(ret); /* -ENOMEM */ } - sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - fs_info->sectorsize); + + nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, + fs_info->sectorsize); + sums += csum_size * nr_sectors; ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { @@ -657,7 +664,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, (u8 *) sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9976fe0f7526..191e5f4e3523 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -61,7 +61,7 @@ struct compressed_bio { * the start of a variable length array of checksums only * used by reads */ - u32 sums; + u8 sums[]; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 0464dd4c0579..de89fd1310a6 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -186,7 +186,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio } csum = btrfs_bio->csum; } else { - csum = (u8 *)dst; + csum = dst; } if (bio->bi_iter.bi_size > PAGE_SIZE * 8) -- cgit v1.2.3 From 7ebc7e5f2c81f658b1d2f95fcde15dfd1a84b370 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 3 Jun 2019 16:58:52 +0200 Subject: btrfs: format checksums according to type for printing Add a small helper for btrfs_print_data_csum_error() which formats the checksum according to it's type for pretty printing. 
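For reference, %*phN is the kernel's printf extension for dumping a byte array as contiguous hex, so a 4-byte CRC32C renders as 0x followed by eight hex digits. A hedged usage sketch (the message text is illustrative):

	u8 csum[BTRFS_CSUM_SIZE];	/* filled in elsewhere by the caller */
	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);

	btrfs_warn(fs_info, "checksum mismatch, got " CSUM_FMT,
		   CSUM_FMT_VALUE(csum_size, csum));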
Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba [ shorten macro name ] Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d5b438706b77..de13be2a31a1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -337,22 +337,34 @@ static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); } +/* Array of bytes with variable length, hexadecimal format 0x1234 */ +#define CSUM_FMT "0x%*phN" +#define CSUM_FMT_VALUE(size, bytes) size, bytes + static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u32 csum, u32 csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; + struct btrfs_super_block *sb = root->fs_info->super_copy; + const u16 csum_size = btrfs_super_csum_size(sb); /* Output minus objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) btrfs_warn_rl(root->fs_info, - "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", +"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), - logical_start, csum, csum_expected, mirror_num); + logical_start, + CSUM_FMT_VALUE(csum_size, &csum), + CSUM_FMT_VALUE(csum_size, &csum_expected), + mirror_num); else btrfs_warn_rl(root->fs_info, - "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", +"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), - logical_start, csum, csum_expected, mirror_num); + logical_start, + CSUM_FMT_VALUE(csum_size, &csum), + CSUM_FMT_VALUE(csum_size, &csum_expected), + mirror_num); } #endif -- cgit v1.2.3 From e7e16f4882edb5935ff0ba81a0df25ae0b80b549 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 3 Jun 2019 16:58:53 +0200 Subject: btrfs: add common checksum type validation Currently btrfs is only supporting CRC32C as checksumming algorithm. As this is about to change provide a function to validate the checksum type in the superblock against all possible algorithms. This makes adding new algorithms easier as there are fewer places to adjust when adding new algorithms. Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fe14b971440f..3f3bb70ca437 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -352,6 +352,16 @@ out: return ret; } +static bool btrfs_supported_super_csum(u16 csum_type) +{ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + return true; + default: + return false; + } +} + /* * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. 
@@ -362,7 +372,12 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, struct btrfs_super_block *disk_sb = (struct btrfs_super_block *)raw_disk_sb; u16 csum_type = btrfs_super_csum_type(disk_sb); - int ret = 0; + + if (!btrfs_supported_super_csum(csum_type)) { + btrfs_err(fs_info, "unsupported checksum algorithm %u", + csum_type); + return 1; + } if (csum_type == BTRFS_CSUM_TYPE_CRC32) { u32 crc = ~(u32)0; @@ -378,16 +393,10 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, btrfs_csum_final(crc, result); if (memcmp(raw_disk_sb, result, sizeof(result))) - ret = 1; + return 1; } - if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { - btrfs_err(fs_info, "unsupported checksum algorithm %u", - csum_type); - ret = 1; - } - - return ret; + return 0; } int btrfs_verify_level_key(struct extent_buffer *eb, int level, @@ -2572,7 +2581,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, ret = validate_super(fs_info, sb, -1); if (ret < 0) goto out; - if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { + if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); -- cgit v1.2.3 From 8dc3f22c8ba02c5a5b889406259c50b3eaa61c65 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 3 Jun 2019 16:58:54 +0200 Subject: btrfs: check for supported superblock checksum type before checksum validation Now that we have factorerd out the superblock checksum type validation, we can check for supported superblock checksum types before doing the actual validation of the superblock read from disk. This leads the path to further simplifications of btrfs_check_super_csum() later on. Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba [ add comment ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f3bb70ca437..ce6dc95effae 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2816,6 +2816,20 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + /* + * Verify the type first, if that or the the checksum value are + * corrupted, we'll find out + */ + if (!btrfs_supported_super_csum(btrfs_super_csum_type( + (struct btrfs_super_block *) bh->b_data))) { + btrfs_err(fs_info, "unsupported checksum algorithm: %u", + btrfs_super_csum_type((struct btrfs_super_block *) + bh->b_data)); + err = -EINVAL; + brelse(bh); + goto fail_alloc; + } + /* * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). -- cgit v1.2.3 From 51bce6c9b97729835bb55ceb37febd5c8fa962dd Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 3 Jun 2019 16:58:55 +0200 Subject: btrfs: Simplify btrfs_check_super_csum() and get rid of size assumptions Now that we have already checked for a valid checksum type before calling btrfs_check_super_csum(), it can be simplified even further. While at it get rid of the implicit size assumption of the resulting checksum as well. This is a preparation for changing all checksum functionality to use the crypto layer later. 
Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ce6dc95effae..c9ce0b002008 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -371,30 +371,20 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, { struct btrfs_super_block *disk_sb = (struct btrfs_super_block *)raw_disk_sb; - u16 csum_type = btrfs_super_csum_type(disk_sb); - - if (!btrfs_supported_super_csum(csum_type)) { - btrfs_err(fs_info, "unsupported checksum algorithm %u", - csum_type); - return 1; - } - - if (csum_type == BTRFS_CSUM_TYPE_CRC32) { - u32 crc = ~(u32)0; - char result[sizeof(crc)]; + u32 crc = ~(u32)0; + char result[BTRFS_CSUM_SIZE]; - /* - * The super_block structure does not span the whole - * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space - * is filled with zeros and is included in the checksum. - */ - crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, - crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, result); + /* + * The super_block structure does not span the whole + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is + * filled with zeros and is included in the checksum. + */ + crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, + crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, result); - if (memcmp(raw_disk_sb, result, sizeof(result))) - return 1; - } + if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) + return 1; return 0; } @@ -2611,6 +2601,7 @@ int open_ctree(struct super_block *sb, u32 stripesize; u64 generation; u64 features; + u16 csum_type; struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; @@ -2820,11 +2811,10 @@ int open_ctree(struct super_block *sb, * Verify the type first, if that or the the checksum value are * corrupted, we'll find out */ - if (!btrfs_supported_super_csum(btrfs_super_csum_type( - (struct btrfs_super_block *) bh->b_data))) { + csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); + if (!btrfs_supported_super_csum(csum_type)) { btrfs_err(fs_info, "unsupported checksum algorithm: %u", - btrfs_super_csum_type((struct btrfs_super_block *) - bh->b_data)); + csum_type); err = -EINVAL; brelse(bh); goto fail_alloc; -- cgit v1.2.3 From 6d97c6e31b553bc9f58b83ac3c4c79c17affbda8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 3 Jun 2019 16:58:56 +0200 Subject: btrfs: add boilerplate code for directly including the crypto framework Add boilerplate code for directly including the crypto framework. This helps us flipping the switch for new algorithms. 
Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 ++++++++ fs/btrfs/disk-io.c | 46 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 47 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a66ed58058d9..2e908c557fb2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -73,6 +73,7 @@ struct btrfs_ref; /* four bytes for CRC32 */ static const int btrfs_csum_sizes[] = { 4 }; +static const char *btrfs_csum_names[] = { "crc32c" }; #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1163,6 +1164,8 @@ struct btrfs_fs_info { spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; + struct crypto_shash *csum_shash; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -2454,6 +2457,11 @@ static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) return btrfs_csum_sizes[t]; } +static inline const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csum_names[csum_type]; +} /* * The leaf data grows from end-to-front in the node. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c9ce0b002008..34222bbe4b48 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -2256,6 +2257,29 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, return 0; } +static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) +{ + struct crypto_shash *csum_shash; + const char *csum_name = btrfs_super_csum_name(csum_type); + + csum_shash = crypto_alloc_shash(csum_name, 0, 0); + + if (IS_ERR(csum_shash)) { + btrfs_err(fs_info, "error allocating %s hash for checksum", + csum_name); + return PTR_ERR(csum_shash); + } + + fs_info->csum_shash = csum_shash; + + return 0; +} + +static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) +{ + crypto_free_shash(fs_info->csum_shash); +} + static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { @@ -2820,6 +2844,12 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + ret = btrfs_init_csum_hash(fs_info, csum_type); + if (ret) { + err = ret; + goto fail_alloc; + } + /* * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). @@ -2828,7 +2858,7 @@ int open_ctree(struct super_block *sb, btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); - goto fail_alloc; + goto fail_csum; } /* @@ -2865,11 +2895,11 @@ int open_ctree(struct super_block *sb, if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } if (!btrfs_super_root(disk_super)) - goto fail_alloc; + goto fail_csum; /* check FS state, whether FS is broken. 
*/ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -2891,7 +2921,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; - goto fail_alloc; + goto fail_csum; } features = btrfs_super_incompat_flags(disk_super) & @@ -2901,7 +2931,7 @@ int open_ctree(struct super_block *sb, "cannot mount because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } features = btrfs_super_incompat_flags(disk_super); @@ -2945,7 +2975,7 @@ int open_ctree(struct super_block *sb, btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", nodesize, sectorsize); - goto fail_alloc; + goto fail_csum; } /* @@ -2961,7 +2991,7 @@ int open_ctree(struct super_block *sb, "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3339,6 +3369,8 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); +fail_csum: + btrfs_free_csum_hash(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); -- cgit v1.2.3 From d5178578bcd461cc79118c7a139882350fe505aa Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 3 Jun 2019 16:58:57 +0200 Subject: btrfs: directly call into crypto framework for checksumming Currently btrfs_csum_data() relies on the crc32c() wrapper around the crypto framework for calculating the CRCs. As we have our own crypto_shash structure in the fs_info now, we can directly call into the crypto framework without going through the wrapper. This way we can even remove the btrfs_csum_data() and btrfs_csum_final() wrappers. The module dependency on crc32c is preserved via MODULE_SOFTDEP("pre: crc32c"), which was previously provided by the LIBCRC32C config option.
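All call sites follow the usual shash pattern; roughly, the sequence the conversion moves to looks like this (an illustrative sketch only, not one of the hunks below; data and len stand in for whatever buffer is being checksummed):

    SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
    u8 result[BTRFS_CSUM_SIZE];

    shash->tfm = fs_info->csum_shash;
    crypto_shash_init(shash);
    crypto_shash_update(shash, data, len);    /* may be repeated for multiple buffers */
    crypto_shash_final(shash, result);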
Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 3 ++- fs/btrfs/check-integrity.c | 11 +++++++---- fs/btrfs/compression.c | 17 +++++++++++------ fs/btrfs/disk-io.c | 46 +++++++++++++++++++++++----------------------- fs/btrfs/disk-io.h | 2 -- fs/btrfs/file-item.c | 18 +++++++++--------- fs/btrfs/inode.c | 23 +++++++++++++++-------- fs/btrfs/scrub.c | 30 +++++++++++++++++++++--------- fs/btrfs/super.c | 1 + 9 files changed, 89 insertions(+), 62 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 23537bc8c827..212b4a854f2c 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -2,7 +2,8 @@ config BTRFS_FS tristate "Btrfs filesystem support" - select LIBCRC32C + select CRYPTO + select CRYPTO_CRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 85774e2fa3e5..81a9731959a9 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -83,7 +83,7 @@ #include #include #include -#include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1710,9 +1710,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages) { struct btrfs_fs_info *fs_info = state->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_header *h; u8 csum[BTRFS_CSUM_SIZE]; - u32 crc = ~(u32)0; unsigned int i; if (num_pages * PAGE_SIZE < state->metablock_size) @@ -1723,14 +1723,17 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) return 1; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + for (i = 0; i < num_pages; i++) { u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); size_t sublen = i ? 
PAGE_SIZE : (PAGE_SIZE - BTRFS_CSUM_SIZE); - crc = btrfs_csum_data(data, crc, sublen); + crypto_shash_update(shash, data, sublen); } - btrfs_csum_final(crc, csum); + crypto_shash_final(shash, csum); if (memcmp(csum, h->csum, state->csum_size)) return 1; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 92291f266324..935c0c564c02 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -58,29 +59,33 @@ static int check_compressed_csum(struct btrfs_inode *inode, u64 disk_start) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int ret; struct page *page; unsigned long i; char *kaddr; - u32 csum; + u8 csum[BTRFS_CSUM_SIZE]; u8 *cb_sum = cb->sums; if (inode->flags & BTRFS_INODE_NODATASUM) return 0; + shash->tfm = fs_info->csum_shash; + for (i = 0; i < cb->nr_pages; i++) { page = cb->compressed_pages[i]; - csum = ~(u32)0; + crypto_shash_init(shash); kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); - btrfs_csum_final(csum, (u8 *)&csum); + crypto_shash_update(shash, kaddr, PAGE_SIZE); kunmap_atomic(kaddr); + crypto_shash_final(shash, (u8 *)&csum); if (memcmp(&csum, cb_sum, csum_size)) { - btrfs_print_data_csum_error(inode, disk_start, csum, - *(u32 *)cb_sum, cb->mirror_num); + btrfs_print_data_csum_error(inode, disk_start, + *(u32 *)csum, *(u32 *)cb_sum, + cb->mirror_num); ret = -EIO; goto fail; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 34222bbe4b48..6c7dc24d4031 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -246,16 +246,6 @@ out: return em; } -u32 btrfs_csum_data(const char *data, u32 seed, size_t len) -{ - return crc32c(seed, data, len); -} - -void btrfs_csum_final(u32 crc, u8 *result) -{ - put_unaligned_le32(~crc, result); -} - /* * Compute the csum of a btree block and store the result to provided buffer. 
* @@ -263,6 +253,8 @@ void btrfs_csum_final(u32 crc, u8 *result) */ static int csum_tree_block(struct extent_buffer *buf, u8 *result) { + struct btrfs_fs_info *fs_info = buf->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@ -270,9 +262,12 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) unsigned long map_start; unsigned long map_len; int err; - u32 crc = ~(u32)0; + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); len = buf->len - offset; + while (len > 0) { /* * Note: we don't need to check for the err == 1 case here, as @@ -285,14 +280,13 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) if (WARN_ON(err)) return err; cur_len = min(len, map_len - (offset - map_start)); - crc = btrfs_csum_data(kaddr + offset - map_start, - crc, cur_len); + crypto_shash_update(shash, kaddr + offset - map_start, cur_len); len -= cur_len; offset += cur_len; } memset(result, 0, BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, result); + crypto_shash_final(shash, result); return 0; } @@ -372,17 +366,20 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, { struct btrfs_super_block *disk_sb = (struct btrfs_super_block *)raw_disk_sb; - u32 crc = ~(u32)0; char result[BTRFS_CSUM_SIZE]; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is * filled with zeros and is included in the checksum. */ - crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, - crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, result); + crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + crypto_shash_final(shash, result); if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) return 1; @@ -3512,17 +3509,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { + struct btrfs_fs_info *fs_info = device->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct buffer_head *bh; int i; int ret; int errors = 0; - u32 crc; u64 bytenr; int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; + shash->tfm = fs_info->csum_shash; + for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3531,10 +3531,10 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); - crc = ~(u32)0; - crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, sb->csum); + crypto_shash_init(shash); + crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + crypto_shash_final(shash, sb->csum); /* One reference for us, and we leave it for the caller */ bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a0161aa1ea0b..e80f7c45a307 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -115,8 +115,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); -u32 btrfs_csum_data(const char *data, u32 
seed, size_t len); -void btrfs_csum_final(u32 crc, u8 *result); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index de89fd1310a6..1a599f50837b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -432,6 +433,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; char *data; @@ -465,6 +467,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; + shash->tfm = fs_info->csum_shash; + bio_for_each_segment(bvec, bio, iter) { if (!contig) offset = page_offset(bvec.bv_page) + bvec.bv_offset; @@ -479,8 +483,6 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, - 1); for (i = 0; i < nr_sectors; i++) { - u32 tmp; - if (offset >= ordered->file_offset + ordered->len || offset < ordered->file_offset) { unsigned long bytes_left; @@ -506,15 +508,13 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, index = 0; } - memset(&sums->sums[index], 0xff, csum_size); + crypto_shash_init(shash); data = kmap_atomic(bvec.bv_page); - tmp = btrfs_csum_data(data + bvec.bv_offset - + (i * fs_info->sectorsize), - *(u32 *)&sums->sums[index], - fs_info->sectorsize); + crypto_shash_update(shash, data + bvec.bv_offset + + (i * fs_info->sectorsize), + fs_info->sectorsize); kunmap_atomic(data); - btrfs_csum_final(tmp, - (char *)(sums->sums + index)); + crypto_shash_final(shash, (char *)(sums->sums + index)); index += csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9137bafc9376..3d356a0f8990 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3203,23 +3203,30 @@ static int __readpage_endio_check(struct inode *inode, int icsum, struct page *page, int pgoff, u64 start, size_t len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; - u32 csum_expected; - u32 csum = ~(u32)0; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u8 *csum_expected; + u8 csum[BTRFS_CSUM_SIZE]; - csum_expected = *(((u32 *)io_bio->csum) + icsum); + csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + pgoff, csum, len); - btrfs_csum_final(csum, (u8 *)&csum); - if (csum != csum_expected) + shash->tfm = fs_info->csum_shash; + + crypto_shash_init(shash); + crypto_shash_update(shash, kaddr + pgoff, len); + crypto_shash_final(shash, csum); + + if (memcmp(csum, csum_expected, csum_size)) goto zeroit; kunmap_atomic(kaddr); return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - io_bio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), start, *(u32 *)csum, + *(u32 *)csum_expected, io_bio->mirror_num); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); kunmap_atomic(kaddr); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0e77bffd2a5a..9f0297d529d4 100644 --- a/fs/btrfs/scrub.c +++ 
b/fs/btrfs/scrub.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "ctree.h" #include "volumes.h" #include "disk-io.h" @@ -1787,11 +1788,12 @@ static int scrub_checksum(struct scrub_block *sblock) static int scrub_checksum_data(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 csum[BTRFS_CSUM_SIZE]; u8 *on_disk_csum; struct page *page; void *buffer; - u32 crc = ~(u32)0; u64 len; int index; @@ -1799,6 +1801,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) if (!sblock->pagev[0]->have_csum) return 0; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + on_disk_csum = sblock->pagev[0]->csum; page = sblock->pagev[0]->page; buffer = kmap_atomic(page); @@ -1808,7 +1813,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); - crc = btrfs_csum_data(buffer, crc, l); + crypto_shash_update(shash, buffer, l); kunmap_atomic(buffer); len -= l; if (len == 0) @@ -1820,7 +1825,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) buffer = kmap_atomic(page); } - btrfs_csum_final(crc, csum); + crypto_shash_final(shash, csum); if (memcmp(csum, on_disk_csum, sctx->csum_size)) sblock->checksum_error = 1; @@ -1832,16 +1837,19 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; void *mapped_buffer; u64 mapped_size; void *p; - u32 crc = ~(u32)0; u64 len; int index; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); @@ -1875,7 +1883,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(p, crc, l); + crypto_shash_update(shash, p, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1889,7 +1897,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) p = mapped_buffer; } - btrfs_csum_final(crc, calculated_csum); + crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) sblock->checksum_error = 1; @@ -1900,18 +1908,22 @@ static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; void *mapped_buffer; u64 mapped_size; void *p; - u32 crc = ~(u32)0; int fail_gen = 0; int fail_cor = 0; u64 len; int index; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); @@ -1934,7 +1946,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(p, crc, l); + crypto_shash_update(shash, p, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1948,7 +1960,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) p = mapped_buffer; } - btrfs_csum_final(crc, calculated_csum); + crypto_shash_final(shash, calculated_csum); if 
(memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++fail_cor; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 526dbae5c4cf..6e196b8a0820 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2464,3 +2464,4 @@ late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); -- cgit v1.2.3 From ea41d6b2785f6aaeefffaaab62c3c4164b5628ce Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 3 Jun 2019 16:58:58 +0200 Subject: btrfs: remove assumption about csum type form btrfs_print_data_csum_error() btrfs_print_data_csum_error() still assumed checksums to be 32 bit in size. Make it size agnostic. Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 10 +++++----- fs/btrfs/compression.c | 3 +-- fs/btrfs/inode.c | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index de13be2a31a1..f853835c409c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -342,7 +342,7 @@ static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) #define CSUM_FMT_VALUE(size, bytes) size, bytes static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, - u64 logical_start, u32 csum, u32 csum_expected, int mirror_num) + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; struct btrfs_super_block *sb = root->fs_info->super_copy; @@ -354,16 +354,16 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), logical_start, - CSUM_FMT_VALUE(csum_size, &csum), - CSUM_FMT_VALUE(csum_size, &csum_expected), + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); else btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), logical_start, - CSUM_FMT_VALUE(csum_size, &csum), - CSUM_FMT_VALUE(csum_size, &csum_expected), + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 935c0c564c02..66e21a4e9ea2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -84,8 +84,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, if (memcmp(&csum, cb_sum, csum_size)) { btrfs_print_data_csum_error(inode, disk_start, - *(u32 *)csum, *(u32 *)cb_sum, - cb->mirror_num); + csum, cb_sum, cb->mirror_num); ret = -EIO; goto fail; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d356a0f8990..7126493d8d8c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3225,8 +3225,8 @@ static int __readpage_endio_check(struct inode *inode, kunmap_atomic(kaddr); return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, *(u32 *)csum, - *(u32 *)csum_expected, io_bio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, + io_bio->mirror_num); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); kunmap_atomic(kaddr); -- cgit v1.2.3 From 6f8e4fd43073406aa5e2d7f27ffc45510778ac3e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 30 May 2019 18:43:53 +0200 Subject: btrfs: use file:line format for assertion report The filename:line 
format is commonly understood by editors and can be copy&pasted more easily than the current format. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2e908c557fb2..fb7435533478 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3536,8 +3536,7 @@ __cold static inline void assfail(const char *expr, const char *file, int line) { if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { - pr_err("assertion failed: %s, file: %s, line: %d\n", - expr, file, line); + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); BUG(); } } -- cgit v1.2.3 From 53460a4572585b508dc4cb6f09653ac50ba3fc49 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Jun 2019 13:06:01 +0300 Subject: btrfs: trim: make reserved device area adjustments more explicit Currently the first megabyte on a device housing a btrfs filesystem is exempt from allocation and trimming. At the moment this is not a problem since 'start' is set to 1M at the beginning of btrfs_trim_free_extents and find_first_clear_extent_bit always returns a range that is >= start. However, in a follow-up patch find_first_clear_extent_bit will be changed such that it will return a range containing 'start', and this range may very well start below 1M, so 'start' needs to be adjusted. Future-proof the sole user of find_first_clear_extent_bit by adjusting 'start' after the function is called. No functional changes. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5d736d8ae3e3..862df2066e7a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -11179,12 +11179,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) find_first_clear_extent_bit(&device->alloc_state, start, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); + + /* Ensure we skip the reserved area in the first 1M */ + start = max_t(u64, start, SZ_1M); + /* * If find_first_clear_extent_bit find a range that spans the * end of the device it will set end to -1, in this case it's up * to the caller to trim the value to the size of the device. */ end = min(end, device->total_bytes - 1); + len = end - start + 1; /* We didn't find any extents */ -- cgit v1.2.3 From 1eaebb341d2b4183d0112b76e31ccff3e1fe3092 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Jun 2019 13:06:02 +0300 Subject: btrfs: Don't trim returned range based on input value in find_first_clear_extent_bit Currently find_first_clear_extent_bit always returns a range whose starting value is >= passed 'start'. This implicit trimming behavior is somewhat subtle and an implementation detail. Instead, this patch modifies the function such that now it always returns the range which contains the passed 'start' and has the given bits unset. This range can either come from an existing record that contains 'start' but has the bits unset, or from the absence of any record that contains the given starting offset. This patch also adds test cases which cover find_first_clear_extent_bit since they were missing up until now.
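To illustrate the new semantics (a sketch that mirrors the selftest added by this patch; the offsets are examples only):

    set_extent_bits(&tree, SZ_1M, SZ_4M - 1, CHUNK_TRIMMED | CHUNK_ALLOCATED);

    /* 'start' lies below any set range, the returned range now contains it */
    find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
                                CHUNK_TRIMMED | CHUNK_ALLOCATED);
    /* result: start == 0, end == SZ_1M - 1, i.e. the range begins before 512K */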
Reviewed-by: Qu Wenruo Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 52 +++++++++++++++++++++--- fs/btrfs/tests/extent-io-tests.c | 87 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9591ecfaa57e..105b3d2e86f2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1537,8 +1537,8 @@ out: } /** - * find_first_clear_extent_bit - finds the first range that has @bits not set - * and that starts after @start + * find_first_clear_extent_bit - find the first range that has @bits not set. + * This range could start before @start. * * @tree - the tree to search * @start - the offset at/after which the found extent should start @@ -1578,12 +1578,52 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, goto out; } } + /* + * At this point 'node' either contains 'start' or start is + * before 'node' + */ state = rb_entry(node, struct extent_state, rb_node); - if (in_range(start, state->start, state->end - state->start + 1) && - (state->state & bits)) { - start = state->end + 1; + + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { + /* + * |--range with bits sets--| + * | + * start + */ + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as + * the beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; + } } else { - *start_ret = start; + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) { + state = rb_entry(prev, struct extent_state, + rb_node); + *start_ret = state->end + 1; + } else { + *start_ret = 0; + } break; } } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 7bf4d5734dbe..f0ccfb6449d2 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -432,6 +432,89 @@ out: return ret; } +static int test_find_first_clear_extent_bit(void) +{ + struct extent_io_tree tree; + u64 start, end; + + test_msg("running find_first_clear_extent_bit test"); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + + /* + * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between + * 4M-32M + */ + set_extent_bits(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != 0 || end != SZ_1M -1) + test_err("error finding beginning range: start %llu end %llu", + start, end); + + /* Now add 32M-64M so that we have a hole between 4M-32M */ + set_extent_bits(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + /* + * Request first hole starting at 12M, we should get 4M-32M + */ + find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M - 1) + test_err("error finding trimmed range: start %llu end %llu", + start, end); + + /* + * Search in the middle of allocated range, should get the next one + * available, which happens to be unallocated -> 4M-32M + */ + find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M -1) + test_err("error finding next unalloc range: 
start %llu end %llu", + start, end); + + /* + * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag + * being unset in this range, we should get the entry in range 64M-72M + */ + set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED); + find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, + CHUNK_TRIMMED); + + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + test_err("error finding exact range: start %llu end %llu", + start, end); + + find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, + CHUNK_TRIMMED); + + /* + * Search in the middle of set range whose immediate neighbour doesn't + * have the bits set so it must be returned + */ + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + test_err("error finding next alloc range: start %llu end %llu", + start, end); + + /* + * Search beyond any known range, shall return after last known range + * and end should be -1 + */ + find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); + if (start != SZ_64M + SZ_8M || end != -1) + test_err( + "error handling beyond end of range search: start %llu end %llu", + start, end); + + return 0; +} + int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; @@ -442,6 +525,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) if (ret) goto out; + ret = test_find_first_clear_extent_bit(); + if (ret) + goto out; + ret = test_eb_bitmaps(sectorsize, nodesize); out: return ret; -- cgit v1.2.3 From 8666e638b07e2be143b91129d3499d2c25f26a5f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 5 Jun 2019 14:50:04 +0300 Subject: btrfs: Document __etree_search The function has a lot of return values and specific conventions making it cumbersome to understand what's returned. Have a go at documenting its parameters and return values. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 105b3d2e86f2..43761ecd1201 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -359,6 +359,24 @@ do_insert: return NULL; } +/** + * __etree_search - searche @tree for an entry that contains @offset. Such + * entry would have entry->start <= offset && entry->end >= offset. + * + * @tree - the tree to search + * @offset - offset that should fall within an entry in @tree + * @next_ret - pointer to the first entry whose range ends after @offset + * @prev - pointer to the first entry whose range begins before @offset + * @p_ret - pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret - points to entry which would have been the parent of the entry, + * containing @offset + * + * This function returns a pointer to the entry that contains @offset byte + * address. If no such entry exists, then NULL is returned and the other + * pointer arguments to the function are filled, otherwise the found entry is + * returned and other pointers are left untouched. 
+ */ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, struct rb_node **next_ret, struct rb_node **prev_ret, -- cgit v1.2.3 From 93ead46b038037cd717cf46c9170af2531a6eb58 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 3 Apr 2019 19:32:56 +0200 Subject: btrfs: tests: add locks around add_extent_mapping There are no concerns about locking during the selftests so the locks are not necessary, but following patches will add lockdep assertions to add_extent_mapping so this is needed in tests too. Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/tests/extent-map-tests.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 87aeabe9d610..4a7f796c9900 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -66,7 +66,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 16K)"); goto out; @@ -85,7 +87,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->len = SZ_4K; em->block_start = SZ_32K; /* avoid merging */ em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [16K, 20K)"); goto out; @@ -104,7 +108,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->len = len; em->block_start = start; em->block_len = len; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + write_unlock(&em_tree->lock); if (ret) { test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); goto out; @@ -148,7 +154,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 1K)"); goto out; @@ -167,7 +175,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->len = SZ_4K; em->block_start = SZ_4K; em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); goto out; @@ -186,7 +196,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + write_unlock(&em_tree->lock); if (ret) { test_err("case2 [0 1K]: ret %d", ret); goto out; @@ -225,7 +237,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->len = SZ_4K; em->block_start = SZ_4K; em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); goto out; @@ -244,7 +258,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + write_unlock(&em_tree->lock); if (ret) { test_err("case3 [0x%llx 0x%llx): ret %d", start, 
start + len, ret); @@ -320,7 +336,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->len = SZ_8K; em->block_start = 0; em->block_len = SZ_8K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 8K)"); goto out; @@ -339,7 +357,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->len = 24 * SZ_1K; em->block_start = SZ_16K; /* avoid merging */ em->block_len = 24 * SZ_1K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [8K, 32K)"); goto out; @@ -357,7 +377,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->len = SZ_32K; em->block_start = 0; em->block_len = SZ_32K; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + write_unlock(&em_tree->lock); if (ret) { test_err("case4 [0x%llx 0x%llx): ret %d", start, len, ret); -- cgit v1.2.3 From 38e9372e391deabf19c76c88b59b5e16f304627a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 27 Mar 2019 16:19:55 +0100 Subject: btrfs: assert delayed ref lock in btrfs_find_delayed_ref_head Turn the comment about required lock into an assertion. Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a73fc23e2961..a94fae897b3f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -957,13 +957,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, } /* - * this does a simple search for the head node for a given extent. - * It must be called with the delayed ref spinlock held, and it returns - * the head node if any where found, or NULL if not. + * This does a simple search for the head node for a given extent. Returns the + * head node if found, or NULL if not. */ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { + lockdep_assert_held(&delayed_refs->lock); + return find_ref_head(delayed_refs, bytenr, false); } -- cgit v1.2.3 From 06297d8cefcaa2029c4cb71b79285d2bfff06d4d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 May 2019 16:47:23 +0200 Subject: btrfs: switch extent_buffer blocking_writers from atomic to int The blocking_writers is either 0 or 1 and always updated under the lock, so we don't need the atomic_t semantics. 
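In other words, the field is only ever modified with eb->lock held for writing, and lockless readers just use it as a hint that is re-checked once the lock is taken, e.g. (a sketch of the pattern, not the exact hunks below):

    write_lock(&eb->lock);
    eb->blocking_writers++;    /* plain int, serialized by eb->lock */
    write_unlock(&eb->lock);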
Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/locking.c | 46 ++++++++++++++++++++-------------------------- fs/btrfs/print-tree.c | 2 +- 4 files changed, 23 insertions(+), 29 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43761ecd1201..6c49db9bfcf1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4857,7 +4857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->blocking_readers, 0); - atomic_set(&eb->blocking_writers, 0); + eb->blocking_writers = 0; eb->lock_nested = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index aa18a16a6ed7..201da61dfc21 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -167,7 +167,7 @@ struct extent_buffer { struct rcu_head rcu_head; pid_t lock_owner; - atomic_t blocking_writers; + int blocking_writers; atomic_t blocking_readers; bool lock_nested; /* >= 0 if eb belongs to a log tree, -1 otherwise */ diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2f6c3c7851ed..5feb01147e19 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -111,10 +111,10 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - if (atomic_read(&eb->blocking_writers) == 0) { + if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); btrfs_assert_tree_locked(eb); - atomic_inc(&eb->blocking_writers); + eb->blocking_writers++; write_unlock(&eb->lock); } } @@ -148,12 +148,11 @@ void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); + BUG_ON(eb->blocking_writers != 1); btrfs_assert_spinning_writers_get(eb); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_writers)) - cond_wake_up_nomb(&eb->write_lock_wq); + if (--eb->blocking_writers == 0) + cond_wake_up(&eb->write_lock_wq); } /* @@ -167,12 +166,10 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) if (trace_btrfs_tree_read_lock_enabled()) start_ns = ktime_get_ns(); again: - BUG_ON(!atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner); - read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner) { + BUG_ON(eb->blocking_writers == 0 && + current->pid == eb->lock_owner); + if (eb->blocking_writers && current->pid == eb->lock_owner) { /* * This extent is already write-locked by our thread. 
We allow * an additional read lock to be added because it's for the same @@ -185,10 +182,10 @@ again: trace_btrfs_tree_read_lock(eb, start_ns); return; } - if (atomic_read(&eb->blocking_writers)) { + if (eb->blocking_writers) { read_unlock(&eb->lock); wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); + eb->blocking_writers == 0); goto again; } btrfs_assert_tree_read_locks_get(eb); @@ -203,11 +200,11 @@ again: */ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers)) + if (eb->blocking_writers) return 0; read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers)) { + if (eb->blocking_writers) { read_unlock(&eb->lock); return 0; } @@ -223,13 +220,13 @@ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers)) + if (eb->blocking_writers) return 0; if (!read_trylock(&eb->lock)) return 0; - if (atomic_read(&eb->blocking_writers)) { + if (eb->blocking_writers) { read_unlock(&eb->lock); return 0; } @@ -245,13 +242,11 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) + if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) return 0; write_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) { + if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); return 0; } @@ -322,10 +317,9 @@ void btrfs_tree_lock(struct extent_buffer *eb) WARN_ON(eb->lock_owner == current->pid); again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + wait_event(eb->write_lock_wq, eb->blocking_writers == 0); write_lock(&eb->lock); - if (atomic_read(&eb->blocking_readers) || - atomic_read(&eb->blocking_writers)) { + if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) { write_unlock(&eb->lock); goto again; } @@ -340,7 +334,7 @@ again: */ void btrfs_tree_unlock(struct extent_buffer *eb) { - int blockers = atomic_read(&eb->blocking_writers); + int blockers = eb->blocking_writers; BUG_ON(blockers > 1); @@ -351,7 +345,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { btrfs_assert_no_spinning_writers(eb); - atomic_dec(&eb->blocking_writers); + eb->blocking_writers--; /* Use the lighter barrier after atomic */ smp_mb__after_atomic(); cond_wake_up_nomb(&eb->write_lock_wq); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 1141ca5fae6a..7cb4f1fbe043 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -155,7 +155,7 @@ static void print_eb_refs_lock(struct extent_buffer *eb) "refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", atomic_read(&eb->refs), atomic_read(&eb->write_locks), atomic_read(&eb->read_locks), - atomic_read(&eb->blocking_writers), + eb->blocking_writers, atomic_read(&eb->blocking_readers), atomic_read(&eb->spinning_writers), atomic_read(&eb->spinning_readers), -- cgit v1.2.3 From f3dc24c52a28c700e35757dce7b38456888071e1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 May 2019 16:51:53 +0200 Subject: btrfs: switch extent_buffer spinning_writers from atomic to int The spinning_writers is either 0 or 1 and always updated under the lock, so we don't need the atomic_t semantics. 
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/locking.c | 10 +++++----- fs/btrfs/print-tree.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6c49db9bfcf1..fef346c63203 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4876,7 +4876,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); #ifdef CONFIG_BTRFS_DEBUG - atomic_set(&eb->spinning_writers, 0); + eb->spinning_writers = 0; atomic_set(&eb->spinning_readers, 0); atomic_set(&eb->read_locks, 0); atomic_set(&eb->write_locks, 0); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 201da61dfc21..5616b96c365d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -187,7 +187,7 @@ struct extent_buffer { wait_queue_head_t read_lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG - atomic_t spinning_writers; + int spinning_writers; atomic_t spinning_readers; atomic_t read_locks; atomic_t write_locks; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5feb01147e19..270667627977 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -15,19 +15,19 @@ #ifdef CONFIG_BTRFS_DEBUG static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); + WARN_ON(eb->spinning_writers); + eb->spinning_writers++; } static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); + WARN_ON(eb->spinning_writers != 1); + eb->spinning_writers--; } static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { - WARN_ON(atomic_read(&eb->spinning_writers)); + WARN_ON(eb->spinning_writers); } static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 7cb4f1fbe043..c5cc435ed39a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -157,7 +157,7 @@ static void print_eb_refs_lock(struct extent_buffer *eb) atomic_read(&eb->read_locks), eb->blocking_writers, atomic_read(&eb->blocking_readers), - atomic_read(&eb->spinning_writers), + eb->spinning_writers, atomic_read(&eb->spinning_readers), eb->lock_owner, current->pid); #endif -- cgit v1.2.3 From 00801ae4bb2be5f5af46502ef239ac5f4b536094 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 May 2019 16:53:47 +0200 Subject: btrfs: switch extent_buffer write_locks from atomic to int The write_locks is either 0 or 1 and always updated under the lock, so we don't need the atomic_t semantics. 
Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/locking.c | 6 +++--- fs/btrfs/print-tree.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fef346c63203..464e6b761a9c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4879,7 +4879,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->spinning_writers = 0; atomic_set(&eb->spinning_readers, 0); atomic_set(&eb->read_locks, 0); - atomic_set(&eb->write_locks, 0); + eb->write_locks = 0; #endif return eb; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5616b96c365d..844e595cde5b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -190,7 +190,7 @@ struct extent_buffer { int spinning_writers; atomic_t spinning_readers; atomic_t read_locks; - atomic_t write_locks; + int write_locks; struct list_head leak_list; #endif }; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 270667627977..98fccce4208c 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -58,17 +58,17 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { - atomic_inc(&eb->write_locks); + eb->write_locks++; } static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { - atomic_dec(&eb->write_locks); + eb->write_locks--; } void btrfs_assert_tree_locked(struct extent_buffer *eb) { - BUG_ON(!atomic_read(&eb->write_locks)); + BUG_ON(!eb->write_locks); } #else diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index c5cc435ed39a..9cb50577d982 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -153,7 +153,7 @@ static void print_eb_refs_lock(struct extent_buffer *eb) #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", atomic_read(&eb->refs), eb->write_locks, atomic_read(&eb->read_locks), eb->blocking_writers, atomic_read(&eb->blocking_readers), -- cgit v1.2.3 From 6d58a55a894e8632a6153e5b98792e7bdda13be8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 11 Jun 2019 14:31:01 +0200 Subject: btrfs: raid56: clear incompat block group flags after removing the last one The incompat bit for RAID56 is set either at mount time or automatically when the profile is used by balance. The code that removes the bit again is missing, which can be unexpected or undesired when the filesystem has to remain usable on an older kernel. This patch will drop the incompat bit after the following commands, assuming that the RAID5 profile is not used by system or metadata block groups: $ btrfs balance start -dconvert=raid5 /mnt $ btrfs balance start -dconvert=raid1 /mnt This will print "clearing 128 feature flag" to the system log. The patch is safe for backporting to older kernels.
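One way to verify the result from user space (an example only, assuming btrfs-progs is available; the device name is illustrative):

    $ btrfs inspect-internal dump-super /dev/sdb | grep incompat_flags

After the second balance above, the RAID56 bit (0x80, the "128" in the message) should no longer be listed.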
Reported-by: Hugo Mills Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 862df2066e7a..f24ef9020323 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10606,6 +10606,35 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } +/* + * Clear incompat bits for the following feature(s): + * + * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group + * in the whole filesystem + */ +static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { + struct list_head *head = &fs_info->space_info; + struct btrfs_space_info *sinfo; + + list_for_each_entry_rcu(sinfo, head, list) { + bool found = false; + + down_read(&sinfo->groups_sem); + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) + found = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) + found = true; + up_read(&sinfo->groups_sem); + + if (found) + return; + } + btrfs_clear_fs_incompat(fs_info, RAID56); + } +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em) { @@ -10752,6 +10781,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, clear_avail_alloc_bits(fs_info, block_group->flags); } up_write(&block_group->space_info->groups_sem); + clear_incompat_bg_bits(fs_info, block_group->flags); if (kobj) { kobject_del(kobj); kobject_put(kobj); -- cgit v1.2.3 From d1d832a0b51dd9570429bb4b81b2a6c1759e681a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 7 Jun 2019 11:25:24 +0100 Subject: Btrfs: fix data loss after inode eviction, renaming it, and fsync it When we log an inode, regardless of logging it completely or only that it exists, we always update it as logged (logged_trans and last_log_commit fields of the inode are updated). This is generally fine and avoids future attempts to log it from having to do repeated work that brings no value. However, if we write data to a file, then evict its inode after all the delalloc was flushed (and ordered extents completed), rename the file and fsync it, we end up not logging the new extents, since the rename may result in logging that the inode exists in case the parent directory was logged before. The following reproducer shows and explains how this can happen: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir /mnt/dir $ touch /mnt/dir/foo $ touch /mnt/dir/bar # Do a direct IO write instead of a buffered write because with a # buffered write we would need to make sure delalloc gets flushed and # complete before we do the inode eviction later, and we can not do that # from user space with a call to things such as sync(2) since that results # in a transaction commit as well. $ xfs_io -d -c "pwrite -S 0xd3 0 4K" /mnt/dir/bar # Keep the directory dir in use while we evict inodes. We want our file # bar's inode to be evicted but we don't want our directory's inode to # be evicted (if it were evicted too, we would not be able to reproduce # the issue since the first fsync below, of file foo, would result in a # transaction commit). $ ( cd /mnt/dir; while true; do :; done ) & $ pid=$! # Wait a bit to give time for the background process to chdir. $ sleep 0.1 # Evict all inodes, except the inode for the directory dir because it is # currently in use by our background process.
$ echo 2 > /proc/sys/vm/drop_caches # fsync file foo, which ends up persisting information about the parent # directory because it is a new inode. $ xfs_io -c fsync /mnt/dir/foo # Rename bar, this results in logging that this inode exists (inode item, # names, xattrs) because the parent directory is in the log. $ mv /mnt/dir/bar /mnt/dir/baz # Now fsync baz, which ends up doing absolutely nothing because of the # rename operation which logged that the inode exists only. $ xfs_io -c fsync /mnt/dir/baz $ mount /dev/sdb /mnt $ od -t x1 -A d /mnt/dir/baz 0000000 --> Empty file, data we wrote is missing. Fix this by not updating last_sub_trans of an inode when we are logging only that it exists and the inode was not yet logged since it was loaded from disk (full_sync bit set), this is enough to make btrfs_inode_in_log() return false for this scenario and make us log the inode. The logged_trans of the inode is still always setsince that alone is used to track if names need to be deleted as part of unlink operations. Fixes: 257c62e1bce03e ("Btrfs: avoid tree log commit when there are no changes") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3fc8d854d7fb..4a04659fded7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5420,9 +5420,19 @@ log_extents: } } + /* + * Don't update last_log_commit if we logged that an inode exists after + * it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, then + * the inode gets evicted after all delalloc was flushed, then we log + * it exists (due to a rename for example) and then fsync it. This last + * fsync would do nothing (not logging the extents previously written). + */ spin_lock(&inode->lock); inode->logged_trans = trans->transid; - inode->last_log_commit = inode->last_sub_trans; + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); out_unlock: mutex_unlock(&inode->log_mutex); -- cgit v1.2.3 From aa53e3bfac7205fb3a8815ac1c937fd6ed01b41e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 6 Jun 2019 12:07:15 +0200 Subject: btrfs: correctly validate compression type Nikolay reported the following KASAN splat when running btrfs/048: [ 1843.470920] ================================================================== [ 1843.471971] BUG: KASAN: slab-out-of-bounds in strncmp+0x66/0xb0 [ 1843.472775] Read of size 1 at addr ffff888111e369e2 by task btrfs/3979 [ 1843.473904] CPU: 3 PID: 3979 Comm: btrfs Not tainted 5.2.0-rc3-default #536 [ 1843.475009] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 1843.476322] Call Trace: [ 1843.476674] dump_stack+0x7c/0xbb [ 1843.477132] ? strncmp+0x66/0xb0 [ 1843.477587] print_address_description+0x114/0x320 [ 1843.478256] ? strncmp+0x66/0xb0 [ 1843.478740] ? strncmp+0x66/0xb0 [ 1843.479185] __kasan_report+0x14e/0x192 [ 1843.479759] ? 
strncmp+0x66/0xb0 [ 1843.480209] kasan_report+0xe/0x20 [ 1843.480679] strncmp+0x66/0xb0 [ 1843.481105] prop_compression_validate+0x24/0x70 [ 1843.481798] btrfs_xattr_handler_set_prop+0x65/0x160 [ 1843.482509] __vfs_setxattr+0x71/0x90 [ 1843.483012] __vfs_setxattr_noperm+0x84/0x130 [ 1843.483606] vfs_setxattr+0xac/0xb0 [ 1843.484085] setxattr+0x18c/0x230 [ 1843.484546] ? vfs_setxattr+0xb0/0xb0 [ 1843.485048] ? __mod_node_page_state+0x1f/0xa0 [ 1843.485672] ? _raw_spin_unlock+0x24/0x40 [ 1843.486233] ? __handle_mm_fault+0x988/0x1290 [ 1843.486823] ? lock_acquire+0xb4/0x1e0 [ 1843.487330] ? lock_acquire+0xb4/0x1e0 [ 1843.487842] ? mnt_want_write_file+0x3c/0x80 [ 1843.488442] ? debug_lockdep_rcu_enabled+0x22/0x40 [ 1843.489089] ? rcu_sync_lockdep_assert+0xe/0x70 [ 1843.489707] ? __sb_start_write+0x158/0x200 [ 1843.490278] ? mnt_want_write_file+0x3c/0x80 [ 1843.490855] ? __mnt_want_write+0x98/0xe0 [ 1843.491397] __x64_sys_fsetxattr+0xba/0xe0 [ 1843.492201] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 1843.493201] do_syscall_64+0x6c/0x230 [ 1843.493988] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 1843.495041] RIP: 0033:0x7fa7a8a7707a [ 1843.495819] Code: 48 8b 0d 21 de 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 be 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ee dd 2b 00 f7 d8 64 89 01 48 [ 1843.499203] RSP: 002b:00007ffcb73bca38 EFLAGS: 00000202 ORIG_RAX: 00000000000000be [ 1843.500210] RAX: ffffffffffffffda RBX: 00007ffcb73bda9d RCX: 00007fa7a8a7707a [ 1843.501170] RDX: 00007ffcb73bda9d RSI: 00000000006dc050 RDI: 0000000000000003 [ 1843.502152] RBP: 00000000006dc050 R08: 0000000000000000 R09: 0000000000000000 [ 1843.503109] R10: 0000000000000002 R11: 0000000000000202 R12: 00007ffcb73bda91 [ 1843.504055] R13: 0000000000000003 R14: 00007ffcb73bda82 R15: ffffffffffffffff [ 1843.505268] Allocated by task 3979: [ 1843.505771] save_stack+0x19/0x80 [ 1843.506211] __kasan_kmalloc.constprop.5+0xa0/0xd0 [ 1843.506836] setxattr+0xeb/0x230 [ 1843.507264] __x64_sys_fsetxattr+0xba/0xe0 [ 1843.507886] do_syscall_64+0x6c/0x230 [ 1843.508429] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 1843.509558] Freed by task 0: [ 1843.510188] (stack is not available) [ 1843.511309] The buggy address belongs to the object at ffff888111e369e0 which belongs to the cache kmalloc-8 of size 8 [ 1843.514095] The buggy address is located 2 bytes inside of 8-byte region [ffff888111e369e0, ffff888111e369e8) [ 1843.516524] The buggy address belongs to the page: [ 1843.517561] page:ffff88813f478d80 refcount:1 mapcount:0 mapping:ffff88811940c300 index:0xffff888111e373b8 compound_mapcount: 0 [ 1843.519993] flags: 0x4404000010200(slab|head) [ 1843.520951] raw: 0004404000010200 ffff88813f48b008 ffff888119403d50 ffff88811940c300 [ 1843.522616] raw: ffff888111e373b8 000000000016000f 00000001ffffffff 0000000000000000 [ 1843.524281] page dumped because: kasan: bad access detected [ 1843.525936] Memory state around the buggy address: [ 1843.526975] ffff888111e36880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1843.528479] ffff888111e36900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1843.530138] >ffff888111e36980: fc fc fc fc fc fc fc fc fc fc fc fc 02 fc fc fc [ 1843.531877] ^ [ 1843.533287] ffff888111e36a00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1843.534874] ffff888111e36a80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1843.536468] ================================================================== This is caused by supplying a too short compression value ('lz') in 
the test-case and comparing it to 'lzo' with strncmp() and a length of 3. strncmp() read past the 'lz' when looking for the 'o' and thus caused an out-of-bounds read. Introduce a new check 'btrfs_compress_is_valid_type()' which not only checks the user-supplied value against known compression types, but also employs checks for too short values. Reported-by: Nikolay Borisov Fixes: 272e5326c783 ("btrfs: prop: fix vanished compression property after failed set") CC: stable@vger.kernel.org # 5.1+ Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 16 ++++++++++++++++ fs/btrfs/compression.h | 1 + fs/btrfs/props.c | 6 +----- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 66e21a4e9ea2..db41315f11eb 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -43,6 +43,22 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type) return NULL; } +bool btrfs_compress_is_valid_type(const char *str, size_t len) +{ + int i; + + for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) { + size_t comp_len = strlen(btrfs_compress_types[i]); + + if (len < comp_len) + continue; + + if (!strncmp(btrfs_compress_types[i], str, comp_len)) + return true; + } + return false; +} + static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 191e5f4e3523..2035b8eb1290 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -173,6 +173,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress; extern const struct btrfs_compress_op btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); +bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index a9e2e66152ee..af109c0ba720 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -257,11 +257,7 @@ static int prop_compression_validate(const char *value, size_t len) if (!value) return 0; - if (!strncmp("lzo", value, 3)) - return 0; - else if (!strncmp("zlib", value, 4)) - return 0; - else if (!strncmp("zstd", value, 4)) + if (btrfs_compress_is_valid_type(value, len)) return 0; return -EINVAL; -- cgit v1.2.3 From e88439debd0a7f969b3ddba6f147152cd0732676 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 13 Jun 2019 17:31:24 +0800 Subject: btrfs: qgroup: Don't hold qgroup_ioctl_lock in btrfs_qgroup_inherit() [BUG] Lockdep will report the following circular locking dependency: WARNING: possible circular locking dependency detected 5.2.0-rc2-custom #24 Tainted: G O ------------------------------------------------------ btrfs/8631 is trying to acquire lock: 000000002536438c (&fs_info->qgroup_ioctl_lock#2){+.+.}, at: btrfs_qgroup_inherit+0x40/0x620 [btrfs] but task is already holding lock: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs] which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (&fs_info->tree_log_mutex){+.+.}: __mutex_lock+0x76/0x940 mutex_lock_nested+0x1b/0x20 btrfs_commit_transaction+0x475/0xa00 [btrfs] btrfs_commit_super+0x71/0x80 [btrfs] close_ctree+0x2bd/0x320 [btrfs] btrfs_put_super+0x15/0x20 [btrfs] generic_shutdown_super+0x72/0x110 kill_anon_super+0x18/0x30 btrfs_kill_super+0x16/0xa0 [btrfs] deactivate_locked_super+0x3a/0x80 deactivate_super+0x51/0x60 cleanup_mnt+0x3f/0x80 __cleanup_mnt+0x12/0x20 task_work_run+0x94/0xb0 exit_to_usermode_loop+0xd8/0xe0 do_syscall_64+0x210/0x240 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #1 (&fs_info->reloc_mutex){+.+.}: __mutex_lock+0x76/0x940 mutex_lock_nested+0x1b/0x20 btrfs_commit_transaction+0x40d/0xa00 [btrfs] btrfs_quota_enable+0x2da/0x730 [btrfs] btrfs_ioctl+0x2691/0x2b40 [btrfs] do_vfs_ioctl+0xa9/0x6d0 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x65/0x240 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (&fs_info->qgroup_ioctl_lock#2){+.+.}: lock_acquire+0xa7/0x190 __mutex_lock+0x76/0x940 mutex_lock_nested+0x1b/0x20 btrfs_qgroup_inherit+0x40/0x620 [btrfs] create_pending_snapshot+0x9d7/0xe60 [btrfs] create_pending_snapshots+0x94/0xb0 [btrfs] btrfs_commit_transaction+0x415/0xa00 [btrfs] btrfs_mksubvol+0x496/0x4e0 [btrfs] btrfs_ioctl_snap_create_transid+0x174/0x180 [btrfs] btrfs_ioctl_snap_create_v2+0x11c/0x180 [btrfs] btrfs_ioctl+0xa90/0x2b40 [btrfs] do_vfs_ioctl+0xa9/0x6d0 ksys_ioctl+0x67/0x90 __x64_sys_ioctl+0x1a/0x20 do_syscall_64+0x65/0x240 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Chain exists of: &fs_info->qgroup_ioctl_lock#2 --> &fs_info->reloc_mutex --> &fs_info->tree_log_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_info->tree_log_mutex); lock(&fs_info->reloc_mutex); lock(&fs_info->tree_log_mutex); lock(&fs_info->qgroup_ioctl_lock#2); *** DEADLOCK *** 6 locks held by btrfs/8631: #0: 00000000ed8f23f6 (sb_writers#12){.+.+}, at: mnt_want_write_file+0x28/0x60 #1: 000000009fb1597a (&type->i_mutex_dir_key#10/1){+.+.}, at: btrfs_mksubvol+0x70/0x4e0 [btrfs] #2: 0000000088c5ad88 (&fs_info->subvol_sem){++++}, at: btrfs_mksubvol+0x128/0x4e0 [btrfs] #3: 000000009606fc3e (sb_internal#2){.+.+}, at: start_transaction+0x37a/0x520 [btrfs] #4: 00000000f82bbdf5 (&fs_info->reloc_mutex){+.+.}, at: btrfs_commit_transaction+0x40d/0xa00 [btrfs] #5: 000000003d52cc23 (&fs_info->tree_log_mutex){+.+.}, at: create_pending_snapshot+0x8b6/0xe60 [btrfs] [CAUSE] Due to the delayed subvolume creation, we need to call btrfs_qgroup_inherit() inside commit transaction code, with a lot of other mutex hold. This hell of lock chain can lead to above problem. [FIX] On the other hand, we don't really need to hold qgroup_ioctl_lock if we're in the context of create_pending_snapshot(). As in that context, we're the only one being able to modify qgroup. All other qgroup functions which needs qgroup_ioctl_lock are either holding a transaction handle, or will start a new transaction: Functions will start a new transaction(): * btrfs_quota_enable() * btrfs_quota_disable() Functions hold a transaction handler: * btrfs_add_qgroup_relation() * btrfs_del_qgroup_relation() * btrfs_create_qgroup() * btrfs_remove_qgroup() * btrfs_limit_qgroup() * btrfs_qgroup_inherit() call inside create_subvol() So we have a higher level protection provided by transaction, thus we don't need to always hold qgroup_ioctl_lock in btrfs_qgroup_inherit(). 
Only the btrfs_qgroup_inherit() call in create_subvol() needs to hold qgroup_ioctl_lock, while the btrfs_qgroup_inherit() call in create_pending_snapshot() is already protected by transaction. So the fix is to detect the context by checking trans->transaction->state. If we're at TRANS_STATE_COMMIT_DOING, then we're in commit transaction context and no need to get the mutex. Reported-by: Nikolay Borisov Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3e6ffbbd8b0a..f8a3c1b0a15a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2614,6 +2614,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, int ret = 0; int i; u64 *i_qgroups; + bool committing = false; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; @@ -2621,7 +2622,25 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u32 level_size = 0; u64 nums; - mutex_lock(&fs_info->qgroup_ioctl_lock); + /* + * There are only two callers of this function. + * + * One in create_subvol() in the ioctl context, which needs to hold + * the qgroup_ioctl_lock. + * + * The other one in create_pending_snapshot() where no other qgroup + * code can modify the fs as they all need to either start a new trans + * or hold a trans handler, thus we don't need to hold + * qgroup_ioctl_lock. + * This would avoid long and complex lock chain and make lockdep happy. + */ + spin_lock(&fs_info->trans_lock); + if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) + committing = true; + spin_unlock(&fs_info->trans_lock); + + if (!committing) + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) goto out; @@ -2785,7 +2804,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, unlock: spin_unlock(&fs_info->qgroup_lock); out: - mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (!committing) + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } -- cgit v1.2.3 From c7369b3faea230cf6009449147ed755c45e74afd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 31 May 2019 15:39:31 +0200 Subject: btrfs: add mask for all RAID1 types Preparatory patch for additional RAID1 profiles with more copies. The mask will contain 3-copy and 4-copy, most of the checks for plain RAID1 work the same for the other profiles. 
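As an illustration of where this is heading (the extra profiles do not exist yet, so the C3/C4 flag names below are only placeholders), the mask is expected to eventually grow to something like:

  #define BTRFS_BLOCK_GROUP_RAID1_MASK	(BTRFS_BLOCK_GROUP_RAID1 |	\
					 BTRFS_BLOCK_GROUP_RAID1C3 |	\
					 BTRFS_BLOCK_GROUP_RAID1C4)

so that the call sites converted below to test BTRFS_BLOCK_GROUP_RAID1_MASK keep working unchanged once the 3-copy and 4-copy profiles are introduced.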
Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 ++++---- fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.c | 8 ++++---- include/uapi/linux/btrfs_tree.h | 2 ++ 4 files changed, 11 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f24ef9020323..13c17f94f15d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7873,7 +7873,7 @@ search: */ if (!block_group_bits(block_group, flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10; @@ -9564,7 +9564,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; @@ -9575,7 +9575,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) return stripped; /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)) return stripped | BTRFS_BLOCK_GROUP_DUP; } else { @@ -10445,7 +10445,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list_for_each_entry_rcu(space_info, &info->space_info, list) { if (!(get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_DUP))) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9f0297d529d4..0c99cf9fb595 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3091,7 +3091,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; mirror_num = num % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 776f5c7ca7c5..9e5167a0e406 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5400,7 +5400,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; @@ -5474,7 +5474,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev; ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -5663,7 +5663,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, &remaining_stripes); div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); last_stripe *= sub_stripes; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { num_stripes = map->num_stripes; } else { @@ -6035,7 +6035,7 @@ static int __btrfs_map_block(struct btrfs_fs_info 
*fs_info, &stripe_index); if (!need_full_stripe(op)) mirror_num = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 421239b98db2..34d5b34286fa 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -866,6 +866,8 @@ enum btrfs_raid_types { #define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ BTRFS_BLOCK_GROUP_RAID6) +#define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1) + /* * We need a bit for restriper to be able to tell when chunks of type * SINGLE are available. This "extended" profile format is used in -- cgit v1.2.3 From a07e8a468e6a2f626b61078a4d1e63c181f099eb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 31 May 2019 16:54:26 +0200 Subject: btrfs: use mask for RAID56 profiles We don't need to enumerate the profiles, use the mask for consistency. Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 13c17f94f15d..99fae9e4ab71 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7874,8 +7874,7 @@ search: if (!block_group_bits(block_group, flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_RAID10; /* @@ -9562,8 +9561,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) num_devices = fs_info->fs_devices->rw_devices; - stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | + stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; if (num_devices == 1) { @@ -10446,8 +10444,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) if (!(get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_DUP))) continue; /* -- cgit v1.2.3 From 71a9c4885e1dec0f8ec5e4b05ecd6fddd27fc6c1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 31 May 2019 17:22:59 +0200 Subject: btrfs: document BTRFS_MAX_MIRRORS The real meaning of that constant is not clear from the context due to the target device inclusion. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fb7435533478..31198499f175 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -46,7 +46,16 @@ struct btrfs_ref; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ -#define BTRFS_MAX_MIRRORS 3 +/* + * Maximum number of mirrors that can be available for all profiles counting + * the target device of dev-replace as one. During an active device replace + * procedure, the target device of the copy operation is a mirror for the + * filesystem data as well that can be used to read data in order to repair + * read errors on other disks. + * + * Current value is derived from RAID1 with 2 copies. 
+ */ +#define BTRFS_MAX_MIRRORS (2 + 1) #define BTRFS_MAX_LEVEL 8 -- cgit v1.2.3 From 9e967495e0e0ae8bb08f52aa71b29affc7075d31 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 22 Apr 2019 16:44:09 +0100 Subject: Btrfs: prevent send failures and crashes due to concurrent relocation Send always operates on read-only trees and always expected that while it is in progress, nothing changes in those trees. Due to that expectation and the fact that send is a read-only operation, it operates on commit roots and does not hold transaction handles. However relocation can COW nodes and leafs from read-only trees, which can cause unexpected failures and crashes (hitting BUG_ONs). while send using a node/leaf, it gets COWed, the transaction used to COW it is committed, a new transaction starts, the extent previously used for that node/leaf gets allocated, possibly for another tree, and the respective extent buffer' content changes while send is still using it. When this happens send normally fails with EIO being returned to user space and messages like the following are found in dmesg/syslog: [ 3408.699121] BTRFS error (device sdc): parent transid verify failed on 58703872 wanted 250 found 253 [ 3441.523123] BTRFS error (device sdc): did not find backref in send_root. inode=63211, offset=0, disk_byte=5222825984 found extent=5222825984 Other times, less often, we hit a BUG_ON() because an extent buffer that send is using used to be a node, and while send is still using it, it got COWed and got reused as a leaf while send is still using, producing the following trace: [ 3478.466280] ------------[ cut here ]------------ [ 3478.466282] kernel BUG at fs/btrfs/ctree.c:1806! [ 3478.466965] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI [ 3478.467635] CPU: 0 PID: 2165 Comm: btrfs Not tainted 5.0.0-btrfs-next-46 #1 [ 3478.468311] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 3478.469681] RIP: 0010:read_node_slot+0x122/0x130 [btrfs] (...) [ 3478.471758] RSP: 0018:ffffa437826bfaa0 EFLAGS: 00010246 [ 3478.472457] RAX: ffff961416ed7000 RBX: 000000000000003d RCX: 0000000000000002 [ 3478.473151] RDX: 000000000000003d RSI: ffff96141e387408 RDI: ffff961599b30000 [ 3478.473837] RBP: ffffa437826bfb8e R08: 0000000000000001 R09: ffffa437826bfb8e [ 3478.474515] R10: ffffa437826bfa70 R11: 0000000000000000 R12: ffff9614385c8708 [ 3478.475186] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 3478.475840] FS: 00007f8e0e9cc8c0(0000) GS:ffff9615b6a00000(0000) knlGS:0000000000000000 [ 3478.476489] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3478.477127] CR2: 00007f98b67a056e CR3: 0000000005df6005 CR4: 00000000003606f0 [ 3478.477762] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3478.478385] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3478.479003] Call Trace: [ 3478.479600] ? do_raw_spin_unlock+0x49/0xc0 [ 3478.480202] tree_advance+0x173/0x1d0 [btrfs] [ 3478.480810] btrfs_compare_trees+0x30c/0x690 [btrfs] [ 3478.481388] ? process_extent+0x1280/0x1280 [btrfs] [ 3478.481954] btrfs_ioctl_send+0x1037/0x1270 [btrfs] [ 3478.482510] _btrfs_ioctl_send+0x80/0x110 [btrfs] [ 3478.483062] btrfs_ioctl+0x13fe/0x3120 [btrfs] [ 3478.483581] ? rq_clock_task+0x2e/0x60 [ 3478.484086] ? wake_up_new_task+0x1f3/0x370 [ 3478.484582] ? do_vfs_ioctl+0xa2/0x6f0 [ 3478.485075] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 3478.485552] do_vfs_ioctl+0xa2/0x6f0 [ 3478.486016] ? 
__fget+0x113/0x200 [ 3478.486467] ksys_ioctl+0x70/0x80 [ 3478.486911] __x64_sys_ioctl+0x16/0x20 [ 3478.487337] do_syscall_64+0x60/0x1b0 [ 3478.487751] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 3478.488159] RIP: 0033:0x7f8e0d7d4dd7 (...) [ 3478.489349] RSP: 002b:00007ffcf6fb4908 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 3478.489742] RAX: ffffffffffffffda RBX: 0000000000000105 RCX: 00007f8e0d7d4dd7 [ 3478.490142] RDX: 00007ffcf6fb4990 RSI: 0000000040489426 RDI: 0000000000000005 [ 3478.490548] RBP: 0000000000000005 R08: 00007f8e0d6f3700 R09: 00007f8e0d6f3700 [ 3478.490953] R10: 00007f8e0d6f39d0 R11: 0000000000000202 R12: 0000000000000005 [ 3478.491343] R13: 00005624e0780020 R14: 0000000000000000 R15: 0000000000000001 (...) [ 3478.493352] ---[ end trace d5f537302be4f8c8 ]--- Another possibility, much less likely to happen, is that send will not fail but the contents of the stream it produces may not be correct. To avoid this, do not allow send and relocation (balance) to run in parallel. In the long term the goal is to allow for both to be able to run concurrently without any problems, but that will take a significant effort in development and testing. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 +++++++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/send.c | 14 ++++++++++++++ fs/btrfs/volumes.c | 8 ++++++++ 4 files changed, 31 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 31198499f175..02a29516dacf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -793,6 +793,7 @@ enum { /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. + * Set and cleared while holding fs_info::balance_mutex. */ BTRFS_FS_BALANCE_RUNNING, @@ -1175,6 +1176,12 @@ struct btrfs_fs_info { struct crypto_shash *csum_shash; + /* + * Number of send operations in progress. + * Updated while holding fs_info::balance_mutex. 
+ */ + int send_in_progress; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6c7dc24d4031..41a2bd2e0c56 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2809,6 +2809,8 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; + fs_info->send_in_progress = 0; + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 49edcc709a99..69b59bf75882 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6929,9 +6929,23 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; + mutex_lock(&fs_info->balance_mutex); + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + mutex_unlock(&fs_info->balance_mutex); + btrfs_warn_rl(fs_info, + "cannot run send because a balance operation is in progress"); + ret = -EAGAIN; + goto out; + } + fs_info->send_in_progress++; + mutex_unlock(&fs_info->balance_mutex); + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; + mutex_lock(&fs_info->balance_mutex); + fs_info->send_in_progress--; + mutex_unlock(&fs_info->balance_mutex); if (ret < 0) goto out; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9e5167a0e406..41813813f840 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4161,6 +4161,14 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, btrfs_bg_type_to_raid_name(data_target)); } + if (fs_info->send_in_progress) { + btrfs_warn_rl(fs_info, +"cannot run balance while send operations are in progress (%d in progress)", + fs_info->send_in_progress); + ret = -EAGAIN; + goto out; + } + ret = insert_balance_item(fs_info, bctl); if (ret && ret != -EEXIST) goto out; -- cgit v1.2.3 From 6c64460cdc8be5fa074aa8fe2ae8736d5792bdc5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Jun 2019 13:07:28 +0200 Subject: btrfs: shut up bogus -Wmaybe-uninitialized warning gcc sometimes can't determine whether a variable has been initialized when both the initialization and the use are conditional: fs/btrfs/props.c: In function 'inherit_props': fs/btrfs/props.c:389:4: error: 'num_bytes' may be used uninitialized in this function [-Werror=maybe-uninitialized] btrfs_block_rsv_release(fs_info, trans->block_rsv, This code is fine. Unfortunately, I cannot think of a good way to rephrase it in a way that makes gcc understand this, so I add a bogus initialization the way one should not. 
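A minimal standalone example of the pattern gcc mis-analyzes (only the shape of the code, not the actual btrfs function):

  int example(int cond)
  {
  	int num_bytes;

  	if (cond)
  		num_bytes = 4096;

  	/* ... unrelated work that gcc cannot see through ... */

  	if (cond)
  		return num_bytes;	/* never reached with num_bytes unset */
  	return 0;
  }

Both the store and the load are guarded by the same condition, so the variable is never read uninitialized, but the compiler's flow analysis cannot prove that across the intervening code. Initializing num_bytes to 0 silences the warning at the cost of a store that is never actually needed.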
Signed-off-by: Arnd Bergmann Reviewed-by: David Sterba [ gcc 8 and 9 don't emit the warning ] Signed-off-by: David Sterba --- fs/btrfs/props.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index af109c0ba720..e0469816c678 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -337,7 +337,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { const struct prop_handler *h = &prop_handlers[i]; const char *value; - u64 num_bytes; + u64 num_bytes = 0; if (!h->inheritable) continue; -- cgit v1.2.3 From c9d713d5b5e8c1fb97af8fa3e267300fe25b8921 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 13 Jun 2019 17:55:03 +0200 Subject: btrfs: improve messages when updating feature flags Currently the messages printed after setting an incompat feature are cryptis, we can easily make it better as the textual description is passed to the helpers. Old: setting 128 feature flag updated: setting incompat feature flag for RAID56 (0x80) Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 02a29516dacf..5e89e4d5b065 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3636,10 +3636,11 @@ do { \ /* compatibility and incompatibility defines */ #define btrfs_set_fs_incompat(__fs_info, opt) \ - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ + #opt) static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; @@ -3652,18 +3653,20 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, "setting %llu feature flag", - flag); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_incompat(__fs_info, opt) \ - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ + #opt) static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; @@ -3676,8 +3679,9 @@ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, if (features & flag) { features &= ~flag; btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, "clearing %llu feature flag", - flag); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } @@ -3694,10 +3698,11 @@ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) } #define btrfs_set_fs_compat_ro(__fs_info, opt) \ - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ + #opt) static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; @@ -3710,18 +3715,20 @@ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, if 
(!(features & flag)) { features |= flag; btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, "setting %llu ro feature flag", - flag); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ + #opt) static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; @@ -3734,8 +3741,9 @@ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, if (features & flag) { features &= ~flag; btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, "clearing %llu ro feature flag", - flag); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } -- cgit v1.2.3 From 5f1411265e16b257a945a5c9044aecbc7bb99443 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Jun 2019 12:05:03 +0300 Subject: btrfs: Introduce btrfs_io_geometry infrastructure Add a structure that holds various parameters for IO calculations and a helper that fills the values. This will help further refactoring and reduction of functions that in some way open-coded the calculations. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 17 ++++++++++ 2 files changed, 113 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 41813813f840..793494690e67 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5915,6 +5915,102 @@ static bool need_full_stripe(enum btrfs_map_op op) return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } +/* + * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) + * tuple. This information is used to calculate how big a + * particular bio can get before it straddles a stripe. + * + * @fs_info - the filesystem + * @logical - address that we want to figure out the geometry of + * @len - the length of IO we are going to perform, starting at @logical + * @op - type of operation - write or read + * @io_geom - pointer used to return values + * + * Returns < 0 in case a chunk for the given logical address cannot be found, + * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 
+ */ +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +{ + struct extent_map *em; + struct map_lookup *map; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + u64 stripe_len; + u64 raid56_full_stripe_start = (u64)-1; + int data_stripes; + + ASSERT(op != BTRFS_MAP_DISCARD); + + em = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* Offset of this logical address in the chunk */ + offset = logical - em->start; + /* Len of a stripe in a chunk */ + stripe_len = map->stripe_len; + /* Stripe wher this block falls in */ + stripe_nr = div64_u64(offset, stripe_len); + /* Offset of stripe in the chunk */ + stripe_offset = stripe_nr * stripe_len; + if (offset < stripe_offset) { + btrfs_crit(fs_info, +"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", + stripe_offset, offset, em->start, logical, stripe_len); + free_extent_map(em); + return -EINVAL; + } + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_offset; + data_stripes = nr_data_stripes(map); + + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len = stripe_len - stripe_offset; + + /* + * In case of raid56, we need to know the stripe aligned start + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * data_stripes; + raid56_full_stripe_start = offset; + + /* + * Allow a write of a full stripe, but make sure we + * don't allow straddling of stripes + */ + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + + /* + * For writes to RAID[56], allow a full stripeset across + * all disks. For other RAID types and for RAID[56] + * reads, just allow a single stripe (on a single disk). + */ + if (op == BTRFS_MAP_WRITE) { + max_len = stripe_len * data_stripes - + (offset - raid56_full_stripe_start); + } + } + len = min_t(u64, em->len - offset, max_len); + } else { + len = em->len - offset; + } + + io_geom->len = len; + io_geom->offset = offset; + io_geom->stripe_len = stripe_len; + io_geom->stripe_nr = stripe_nr; + io_geom->stripe_offset = stripe_offset; + io_geom->raid56_stripe_offset = raid56_full_stripe_start; + + return 0; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index fea7b65a712e..2a8e398a8742 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -23,6 +23,21 @@ struct btrfs_pending_bios { struct bio *tail; }; +struct btrfs_io_geometry { + /* remaining bytes before crossing a stripe */ + u64 len; + /* offset of logical address in chunk */ + u64 offset; + /* length of single IO stripe */ + u64 stripe_len; + /* number of stripe where address falls */ + u64 stripe_nr; + /* offset of address in stripe */ + u64 stripe_offset; + /* offset of raid56 stripe into the chunk */ + u64 raid56_stripe_offset; +}; + /* * Use sequence counter to get consistent device stat data on * 32-bit processors. 
@@ -413,6 +428,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 len, struct btrfs_io_geometry *io_geom); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); -- cgit v1.2.3 From 89b798ad1b42b1de10d64feda241e35e90c7b102 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Jun 2019 12:05:05 +0300 Subject: btrfs: Use btrfs_get_io_geometry appropriately Presently btrfs_map_block is used not only to do everything necessary to map a bio to the underlying allocation profile but it's also used to identify how much data could be written based on btrfs' stripe logic without actually submitting anything. This is achieved by passing NULL for 'bbio_ret' parameter. This patch refactors all callers that require just the mapping length by switching them to using btrfs_io_geometry instead of calling btrfs_map_block with a special NULL value for 'bbio_ret'. No functional change. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 28 ++++++++++---------- fs/btrfs/volumes.c | 78 +++++++++++------------------------------------------- fs/btrfs/volumes.h | 2 +- 3 files changed, 30 insertions(+), 78 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7126493d8d8c..89b9535cda19 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1932,17 +1932,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, u64 length = 0; u64 map_length; int ret; + struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - NULL, 0); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, + &geom); if (ret < 0) return ret; - if (map_length < length + size) + + if (geom.len < length + size) return 1; return 0; } @@ -8308,22 +8310,21 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) struct bio *orig_bio = dip->orig_bio; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; - u64 map_length; int async_submit = 0; u64 submit_len; int clone_offset = 0; int clone_len; int ret; blk_status_t status; + struct btrfs_io_geometry geom; - map_length = orig_bio->bi_iter.bi_size; - submit_len = map_length; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, - &map_length, NULL, 0); + submit_len = orig_bio->bi_iter.bi_size; + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), + start_sector << 9, submit_len, &geom); if (ret) return -EIO; - if (map_length >= submit_len) { + if (geom.len >= submit_len) { bio = orig_bio; dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; @@ -8336,10 +8337,10 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) async_submit = 1; /* bio split */ - ASSERT(map_length <= INT_MAX); + ASSERT(geom.len <= INT_MAX); atomic_inc(&dip->pending_bios); do { - clone_len = min_t(int, submit_len, map_length); + clone_len = min_t(int, submit_len, geom.len); /* * This will never fail as it's passing GPF_NOFS and @@ -8376,9 +8377,8 @@ static int 
btrfs_submit_direct_hook(struct btrfs_dio_private *dip) start_sector += clone_len >> 9; file_offset += clone_len; - map_length = submit_len; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, &map_length, NULL, 0); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), + start_sector << 9, submit_len, &geom); if (ret) goto out_err; } while (submit_len > 0); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 793494690e67..b9606501d33b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5930,7 +5930,7 @@ static bool need_full_stripe(enum btrfs_map_op op) * usually shouldn't happen unless @logical is corrupted, 0 otherwise. */ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom) + u64 logical, u64 len, struct btrfs_io_geometry *io_geom) { struct extent_map *em; struct map_lookup *map; @@ -6037,78 +6037,30 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; + struct btrfs_io_geometry geom; + + ASSERT(bbio_ret); if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, *length, bbio_ret); - em = btrfs_get_chunk_map(fs_info, logical, *length); - if (IS_ERR(em)) - return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + if (ret < 0) + return ret; + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(em); map = em->map_lookup; - offset = logical - em->start; - stripe_len = map->stripe_len; - stripe_nr = offset; - /* - * stripe_nr counts the total number of stripes we have to stride - * to get to this block - */ - stripe_nr = div64_u64(stripe_nr, stripe_len); + *length = geom.len; + offset = geom.offset; + stripe_len = geom.stripe_len; + stripe_nr = geom.stripe_nr; + stripe_offset = geom.stripe_offset; + raid56_full_stripe_start = geom.raid56_stripe_offset; data_stripes = nr_data_stripes(map); - stripe_offset = stripe_nr * stripe_len; - if (offset < stripe_offset) { - btrfs_crit(fs_info, - "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", - stripe_offset, offset, em->start, logical, - stripe_len); - free_extent_map(em); - return -EINVAL; - } - - /* stripe_offset is the offset of this block in its stripe*/ - stripe_offset = offset - stripe_offset; - - /* if we're here for raid56, we need to know the stripe aligned start */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * data_stripes; - raid56_full_stripe_start = offset; - - /* allow a write of a full stripe, but make sure we don't - * allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - } - - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - u64 max_len; - /* For writes to RAID[56], allow a full stripeset across all disks. - For other RAID types and for RAID[56] reads, just allow a single - stripe (on a single disk). 
*/ - if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - (op == BTRFS_MAP_WRITE)) { - max_len = stripe_len * data_stripes - - (offset - raid56_full_stripe_start); - } else { - /* we limit the length of each bio to what fits in a stripe */ - max_len = stripe_len - stripe_offset; - } - *length = min_t(u64, em->len - offset, max_len); - } else { - *length = em->len - offset; - } - - /* - * This is for when we're called from btrfs_bio_fits_in_stripe and all - * it cares about is the length - */ - if (!bbio_ret) - goto out; - down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); /* diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2a8e398a8742..7f6aa1816409 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -429,7 +429,7 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom); + u64 logical, u64 len, struct btrfs_io_geometry *io_geom); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 **logical, int *naddrs, int *stripe_len); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); -- cgit v1.2.3 From 803f0f64d17769071d7287d9e3e3b79a3e1ae937 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 19 Jun 2019 13:05:39 +0100 Subject: Btrfs: fix fsync not persisting dentry deletions due to inode evictions In order to avoid searches on a log tree when unlinking an inode, we check if the inode being unlinked was logged in the current transaction, as well as the inode of its parent directory. When any of the inodes are logged, we proceed to delete directory items and inode reference items from the log, to ensure that if a subsequent fsync of only the inode being unlinked or only of the parent directory when the other is not fsync'ed as well, does not result in the entry still existing after a power failure. That check however is not reliable when one of the inodes involved (the one being unlinked or its parent directory's inode) is evicted, since the logged_trans field is transient, that is, it is not stored on disk, so it is lost when the inode is evicted and loaded into memory again (which is set to zero on load). As a consequence the checks currently being done by btrfs_del_dir_entries_in_log() and btrfs_del_inode_ref_in_log() always return true if the inode was evicted before, regardless of the inode having been logged or not before (and in the current transaction), this results in the dentry being unlinked still existing after a log replay if after the unlink operation only one of the inodes involved is fsync'ed. Example: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir /mnt/dir $ touch /mnt/dir/foo $ xfs_io -c fsync /mnt/dir/foo # Keep an open file descriptor on our directory while we evict inodes. # We just want to evict the file's inode, the directory's inode must not # be evicted. $ ( cd /mnt/dir; while true; do :; done ) & $ pid=$! # Wait a bit to give time to background process to chdir to our test # directory. $ sleep 0.5 # Trigger eviction of the file's inode. $ echo 2 > /proc/sys/vm/drop_caches # Unlink our file and fsync the parent directory. After a power failure # we don't expect to see the file anymore, since we fsync'ed the parent # directory. 
$ rm -f $SCRATCH_MNT/dir/foo $ xfs_io -c fsync /mnt/dir $ mount /dev/sdb /mnt $ ls /mnt/dir foo $ --> file still there, unlink not persisted despite explicit fsync on dir Fix this by checking if the inode has the full_sync bit set in its runtime flags as well, since that bit is set everytime an inode is loaded from disk, or for other less common cases such as after a shrinking truncate or failure to allocate extent maps for holes, and gets cleared after the first fsync. Also consider the inode as possibly logged only if it was last modified in the current transaction (besides having the full_fsync flag set). Fixes: 3a5f1d458ad161 ("Btrfs: Optimize btree walking while logging inodes") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4a04659fded7..6c8297bcfeb7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3322,6 +3322,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, return 0; } +/* + * Check if an inode was logged in the current transaction. We can't always rely + * on an inode's logged_trans value, because it's an in-memory only field and + * therefore not persisted. This means that its value is lost if the inode gets + * evicted and loaded again from disk (in which case it has a value of 0, and + * certainly it is smaller then any possible transaction ID), when that happens + * the full_sync flag is set in the inode's runtime flags, so on that case we + * assume eviction happened and ignore the logged_trans value, assuming the + * worst case, that the inode was logged before in the current transaction. + */ +static bool inode_logged(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + if (inode->logged_trans == trans->transid) + return true; + + if (inode->last_trans == trans->transid && + test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) + return true; + + return false; +} + /* * If both a file and directory are logged, and unlinks or renames are * mixed in, we have a few interesting corners: @@ -3356,7 +3380,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); - if (dir->logged_trans < trans->transid) + if (!inode_logged(trans, dir)) return 0; ret = join_running_log_trans(root); @@ -3460,7 +3484,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, u64 index; int ret; - if (inode->logged_trans < trans->transid) + if (!inode_logged(trans, inode)) return 0; ret = join_running_log_trans(root); -- cgit v1.2.3 From 179006688a7e888cbff39577189f2e034786d06a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 19 Jun 2019 13:05:50 +0100 Subject: Btrfs: add missing inode version, ctime and mtime updates when punching hole If the range for which we are punching a hole covers only part of a page, we end up updating the inode item but we skip the update of the inode's iversion, mtime and ctime. Fix that by ensuring we update those properties of the inode. A patch for fstests test case generic/059 that tests this as been sent along with this fix. 
Fixes: 2aaa66558172b0 ("Btrfs: add hole punching") Fixes: e8c1c76e804b18 ("Btrfs: add missing inode update when punching hole") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5370152ea7e3..b455bdf46faa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2711,6 +2711,11 @@ out_only_mutex: * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ + struct timespec64 now = current_time(inode); + + inode_inc_iversion(inode); + inode->i_mtime = now; + inode->i_ctime = now; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); -- cgit v1.2.3 From 63611e738a62bfb186a61e4b16b9fb72657144e5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 10:59:18 -0400 Subject: btrfs: run delayed iput at unlink time We have been seeing issues in production where a cleaner script will end up unlinking a bunch of files that have pending iputs. This means they will get their final iput's run at btrfs-cleaner time and thus are not throttled, which impacts the workload. Since we are unlinking these files we can just drop the delayed iput at unlink time. We are already holding a reference to the inode so this will not be the final iput and thus is completely safe to do at this point. Doing this means we are more likely to be doing the final iput at unlink time, and thus will get the IO charged to the caller and get throttled appropriately without affecting the main workload. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 89b9535cda19..525790a38f6d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3295,6 +3295,28 @@ void btrfs_add_delayed_iput(struct inode *inode) wake_up_process(fs_info->cleaner_kthread); } +static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + list_del_init(&inode->delayed_iput); + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) + wake_up(&fs_info->delayed_iputs_wait); + spin_lock(&fs_info->delayed_iput_lock); +} + +static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + if (!list_empty(&inode->delayed_iput)) { + spin_lock(&fs_info->delayed_iput_lock); + if (!list_empty(&inode->delayed_iput)) + run_delayed_iput_locked(fs_info, inode); + spin_unlock(&fs_info->delayed_iput_lock); + } +} + void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { @@ -3304,12 +3326,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); - list_del_init(&inode->delayed_iput); - spin_unlock(&fs_info->delayed_iput_lock); - iput(&inode->vfs_inode); - if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) - wake_up(&fs_info->delayed_iputs_wait); - spin_lock(&fs_info->delayed_iput_lock); + run_delayed_iput_locked(fs_info, inode); } spin_unlock(&fs_info->delayed_iput_lock); } @@ -4014,6 +4031,17 @@ skip_backref: ret = 0; else if (ret) btrfs_abort_transaction(trans, ret); + + /* + * If we have a pending delayed iput we could end up with the final iput + * 
being run in btrfs-cleaner context. If we have enough of these built + * up we can end up burning a lot of time in btrfs-cleaner without any + * way to throttle the unlinks. Since we're currently holding a ref on + * the inode we can run the delayed iput here without any issues as the + * final iput won't be done until after we drop the ref we're currently + * holding. + */ + btrfs_run_delayed_iput(fs_info, inode); err: btrfs_free_path(path); if (ret) -- cgit v1.2.3 From 2792237d0c4c226c6a6059fefe7f712aa0c94003 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Jun 2019 20:00:05 +0200 Subject: btrfs: use common helpers for extent IO state insertion messages Print the error messages using the helpers that also print the filesystem identification. Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 464e6b761a9c..932d2e0be8d7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -522,9 +522,11 @@ static int insert_state(struct extent_io_tree *tree, { struct rb_node *node; - if (end < start) - WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", - end, start); + if (end < start) { + btrfs_err(tree->fs_info, + "insert state: end < start %llu %llu", end, start); + WARN_ON(1); + } state->start = start; state->end = end; @@ -534,7 +536,8 @@ static int insert_state(struct extent_io_tree *tree, if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", found->start, found->end, start, end); return -EEXIST; } -- cgit v1.2.3 From f262fa8de6a2510e13a59ad3ecdcb0f2d468bb98 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Jun 2019 20:00:08 +0200 Subject: btrfs: drop default value assignments in enums A few more instances where we don't need to specify the values as long as they are the same as what the enum assigns automatically. All of the enums are in-memory only and nothing relies on the exact values.
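The reason this is safe can be shown with a trivial standalone program (illustrative names only, not btrfs code): enumerators without an initializer start at 0 and each following one increments by 1, so dropping the explicit assignments leaves the values unchanged.

#include <stdio.h>

enum demo_loop_type {
	DEMO_CACHING_NOWAIT,	/* implicitly 0 */
	DEMO_CACHING_WAIT,	/* implicitly 1 */
	DEMO_ALLOC_CHUNK,	/* implicitly 2 */
	DEMO_NO_EMPTY_SIZE,	/* implicitly 3 */
};

int main(void)
{
	printf("%d %d %d %d\n", DEMO_CACHING_NOWAIT, DEMO_CACHING_WAIT,
	       DEMO_ALLOC_CHUNK, DEMO_NO_EMPTY_SIZE);
	return 0;
}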
Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 14 +++++++------- fs/btrfs/file.c | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99fae9e4ab71..8ec9d6fbad42 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -46,9 +46,9 @@ * */ enum { - CHUNK_ALLOC_NO_FORCE = 0, - CHUNK_ALLOC_LIMITED = 1, - CHUNK_ALLOC_FORCE = 2, + CHUNK_ALLOC_NO_FORCE, + CHUNK_ALLOC_LIMITED, + CHUNK_ALLOC_FORCE, }; /* @@ -7302,10 +7302,10 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_CACHING_NOWAIT = 0, - LOOP_CACHING_WAIT = 1, - LOOP_ALLOC_CHUNK = 2, - LOOP_NO_EMPTY_SIZE = 3, + LOOP_CACHING_NOWAIT, + LOOP_CACHING_WAIT, + LOOP_ALLOC_CHUNK, + LOOP_NO_EMPTY_SIZE, }; static inline void diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b455bdf46faa..a8e0fc2d1c86 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2796,9 +2796,9 @@ static int btrfs_fallocate_update_isize(struct inode *inode, } enum { - RANGE_BOUNDARY_WRITTEN_EXTENT = 0, - RANGE_BOUNDARY_PREALLOC_EXTENT = 1, - RANGE_BOUNDARY_HOLE = 2, + RANGE_BOUNDARY_WRITTEN_EXTENT, + RANGE_BOUNDARY_PREALLOC_EXTENT, + RANGE_BOUNDARY_HOLE, }; static int btrfs_zero_range_check_range_boundary(struct inode *inode, -- cgit v1.2.3 From 4f080f571192b0347f45b5709a291af473824484 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Jun 2019 20:00:11 +0200 Subject: btrfs: use raid_attr to adjust minimal stripe size in btrfs_calc_avail_data_space Special case for DUP can be replaced by lookup to the attribute table, where the dev_stripes is the right coefficient. Signed-off-by: David Sterba --- fs/btrfs/super.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6e196b8a0820..2b44223e98a9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1904,6 +1904,7 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 min_stripe_size; int min_stripes = 1, num_stripes = 1; int i = 0, nr_devices; + const struct btrfs_raid_attr *rattr; /* * We aren't under the device list lock, so this is racy-ish, but good @@ -1927,6 +1928,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, /* calc min stripe number for data space allocation */ type = btrfs_data_alloc_profile(fs_info); + rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; + if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; num_stripes = nr_devices; @@ -1938,10 +1941,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, num_stripes = 4; } - if (type & BTRFS_BLOCK_GROUP_DUP) - min_stripe_size = 2 * BTRFS_STRIPE_LEN; - else - min_stripe_size = BTRFS_STRIPE_LEN; + /* Adjust for more than 1 stripe per device */ + min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { -- cgit v1.2.3 From e1ea2beee284d8e434e51e60de9e157a18fe8b42 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Jun 2019 20:00:13 +0200 Subject: btrfs: use raid_attr for minimum stripe count in btrfs_calc_avail_data_space Minimum stripe count matches the minimum devices required for a given profile. The open coded assignments match the raid_attr table. What's changed here is the meaning for RAID5/6. Previously their min_stripes would be 1, while newly it's devs_min. 
This however should be the same as before because it's not possible to create a filesystem on fewer devices than the raid_attr table allows. There's no adjustment regarding the parity stripes (like calc_data_stripes does), because we're interested in overall space that would fit on the devices. Missing devices make no difference for the whole calculation, we have the size stored in the structures. Signed-off-by: David Sterba --- fs/btrfs/super.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2b44223e98a9..7d20856ae0c4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1902,7 +1902,7 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 type; u64 avail_space; u64 min_stripe_size; - int min_stripes = 1, num_stripes = 1; + int min_stripes, num_stripes = 1; int i = 0, nr_devices; const struct btrfs_raid_attr *rattr; @@ -1929,17 +1929,14 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, /* calc min stripe number for data space allocation */ type = btrfs_data_alloc_profile(fs_info); rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; + min_stripes = rattr->devs_min; - if (type & BTRFS_BLOCK_GROUP_RAID0) { - min_stripes = 2; + if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - } else if (type & BTRFS_BLOCK_GROUP_RAID1) { - min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) num_stripes = 2; - } else if (type & BTRFS_BLOCK_GROUP_RAID10) { - min_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; - } /* Adjust for more than 1 stripe per device */ min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; -- cgit v1.2.3 From e749af443fa8dac67a896d38f5eca450a5b9026a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Jun 2019 20:00:16 +0200 Subject: btrfs: lift bio_set_dev from bio allocation helpers The block device is passed around for the sole purpose of setting it in new bios. Move the assignment one level up. This is a preparatory patch for further bdev cleanups.
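The shape of the refactoring can be sketched with a small userspace analogue (all names below are made up, this is not kernel code): the allocator no longer knows about the device, and each caller stores it right after allocation.

#include <stdio.h>
#include <stdlib.h>

struct fake_bio {
	int dev;
	unsigned long long sector;
};

/* after the change: allocation knows nothing about the device */
static struct fake_bio *fake_bio_alloc(unsigned long long first_byte)
{
	struct fake_bio *bio = calloc(1, sizeof(*bio));

	if (bio)
		bio->sector = first_byte >> 9;
	return bio;
}

static void fake_bio_set_dev(struct fake_bio *bio, int dev)
{
	bio->dev = dev;
}

int main(void)
{
	/* the caller lifts the assignment that used to live in the allocator */
	struct fake_bio *bio = fake_bio_alloc(4096);

	if (!bio)
		return 1;
	fake_bio_set_dev(bio, 42);
	printf("dev=%d sector=%llu\n", bio->dev, bio->sector);
	free(bio);
	return 0;
}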
Signed-off-by: David Sterba --- fs/btrfs/compression.c | 12 ++++++++---- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index db41315f11eb..60c47b417a4b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -340,7 +340,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; - bio = btrfs_bio_alloc(bdev, first_byte); + bio = btrfs_bio_alloc(first_byte); + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -382,7 +383,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } - bio = btrfs_bio_alloc(bdev, first_byte); + bio = btrfs_bio_alloc(first_byte); + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -620,7 +622,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); + comp_bio = btrfs_bio_alloc(cur_disk_byte); + bio_set_dev(comp_bio, bdev); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -670,7 +673,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_endio(comp_bio); } - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); + comp_bio = btrfs_bio_alloc(cur_disk_byte); + bio_set_dev(comp_bio, bdev); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 932d2e0be8d7..1eb671c16ff1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2861,12 +2861,11 @@ static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) * never fail. 
We're returning a bio right now but you can call btrfs_io_bio * for the appropriate container_of magic */ -struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) +struct bio *btrfs_bio_alloc(u64 first_byte) { struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; @@ -2977,7 +2976,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, offset); + bio = btrfs_bio_alloc(offset); + bio_set_dev(bio, bdev); bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 844e595cde5b..6e13a62a2974 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -497,7 +497,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, u64 delalloc_end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); -struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); +struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); -- cgit v1.2.3 From 8719aaae8d696bf0c73f74e6d6cc75451b50d5df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:16 -0400 Subject: btrfs: move space_info to space-info.h Migrate the struct definition and the one helper that's in ctree.h into space-info.h Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 73 +----------------------------------------- fs/btrfs/extent-tree.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/space-info.h | 78 +++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/super.c | 1 + fs/btrfs/sysfs.c | 1 + fs/btrfs/volumes.c | 1 + 8 files changed, 85 insertions(+), 72 deletions(-) create mode 100644 fs/btrfs/space-info.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5e89e4d5b065..1d6a60f437a6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -37,6 +37,7 @@ struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; +struct btrfs_space_info; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; @@ -402,72 +403,6 @@ struct raid_kobject { struct list_head list; }; -struct btrfs_space_info { - spinlock_t lock; - - u64 total_bytes; /* total bytes in the space, - this doesn't take mirrors into account */ - u64 bytes_used; /* total bytes used, - this doesn't take mirrors into account */ - u64 bytes_pinned; /* total bytes pinned, will be freed when the - transaction finishes */ - u64 bytes_reserved; /* total bytes the allocator has reserved for - current allocations */ - u64 bytes_may_use; /* number of bytes that may be used for - delalloc/allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - - u64 max_extent_size; /* This will hold the maximum extent size of - the space info if we had an ENOSPC in the - allocator. 
*/ - - unsigned int full:1; /* indicates that we cannot allocate any more - chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - - unsigned int flush:1; /* set if we are trying to make space */ - - unsigned int force_alloc; /* set if we need to force a chunk - alloc for this space */ - - u64 disk_used; /* total bytes used on disk */ - u64 disk_total; /* total bytes on disk, takes mirrors into - account */ - - u64 flags; - - /* - * bytes_pinned is kept in line with what is actually pinned, as in - * we've called update_block_group and dropped the bytes_used counter - * and increased the bytes_pinned counter. However this means that - * bytes_pinned does not reflect the bytes that will be pinned once the - * delayed refs are flushed, so this counter is inc'ed every time we - * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zeroed every - * time the transaction commits. - */ - struct percpu_counter total_bytes_pinned; - - struct list_head list; - /* Protected by the spinlock 'lock'. */ - struct list_head ro_bgs; - struct list_head priority_tickets; - struct list_head tickets; - /* - * tickets_id just indicates the next ticket will be handled, so note - * it's not stored per ticket. - */ - u64 tickets_id; - - struct rw_semaphore groups_sem; - /* for block groups in our same type */ - struct list_head block_groups[BTRFS_NR_RAID_TYPES]; - wait_queue_head_t wait; - - struct kobject kobj; - struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; -}; - /* * Types of block reserves */ @@ -2693,12 +2628,6 @@ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, return (u64) crc32c(parent_objectid, name, len); } -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) -{ - return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && - (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); -} - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8ec9d6fbad42..11ee633b02dc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -28,6 +28,7 @@ #include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" +#include "space-info.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6814fa42eba3..390cd3d7d5ea 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,7 @@ #include "extent_io.h" #include "inode-map.h" #include "volumes.h" +#include "space-info.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5b4beebf138c..ca32b97b6b4b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,7 @@ #include "qgroup.h" #include "tree-log.h" #include "compression.h" +#include "space-info.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h new file mode 100644 index 000000000000..5f96333c3450 --- /dev/null +++ b/fs/btrfs/space-info.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SPACE_INFO_H +#define BTRFS_SPACE_INFO_H + +struct btrfs_space_info { + spinlock_t lock; + + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ + u64 bytes_used; /* total bytes used, + this doesn't 
take mirrors into account */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + + u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ + + u64 flags; + + /* + * bytes_pinned is kept in line with what is actually pinned, as in + * we've called update_block_group and dropped the bytes_used counter + * and increased the bytes_pinned counter. However this means that + * bytes_pinned does not reflect the bytes that will be pinned once the + * delayed refs are flushed, so this counter is inc'ed every time we + * call btrfs_free_extent so it is a realtime count of what will be + * freed once the transaction is committed. It will be zeroed every + * time the transaction commits. + */ + struct percpu_counter total_bytes_pinned; + + struct list_head list; + /* Protected by the spinlock 'lock'. */ + struct list_head ro_bgs; + struct list_head priority_tickets; + struct list_head tickets; + /* + * tickets_id just indicates the next ticket will be handled, so note + * it's not stored per ticket. 
+ */ + u64 tickets_id; + + struct rw_semaphore groups_sem; + /* for block groups in our same type */ + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; + wait_queue_head_t wait; + + struct kobject kobj; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; +}; + +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + +#endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7d20856ae0c4..78de9d5d80c6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -42,6 +42,7 @@ #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" +#include "space-info.h" #include "tests/btrfs-tests.h" #include "qgroup.h" diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 2f078b77fe14..e6493b068294 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -16,6 +16,7 @@ #include "transaction.h" #include "sysfs.h" #include "volumes.h" +#include "space-info.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b9606501d33b..a13ddba1ebc3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -28,6 +28,7 @@ #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" +#include "space-info.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { -- cgit v1.2.3 From fc471cb0c8f0016ac7ec5cc3e329c5e23d83d593 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:17 -0400 Subject: btrfs: rename do_chunk_alloc to btrfs_chunk_alloc Really we just need the enum, but as we break more things up it'll help to have this external to extent-tree.c. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 22 ++++++++++++++++++++++ fs/btrfs/extent-tree.c | 49 +++++++++++++------------------------------------ 2 files changed, 35 insertions(+), 36 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1d6a60f437a6..8fca40370cf1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2780,6 +2780,28 @@ enum btrfs_flush_state { COMMIT_TRANS = 9, }; +/* + * control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated. 
This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + */ +enum btrfs_chunk_alloc_enum { + CHUNK_ALLOC_NO_FORCE, + CHUNK_ALLOC_LIMITED, + CHUNK_ALLOC_FORCE, +}; + +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force); int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); int btrfs_check_data_free_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 11ee633b02dc..16e61acc0179 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -32,26 +32,6 @@ #undef SCRAMBLE_DELAYED_REFS -/* - * control flags for do_chunk_alloc's force field - * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk - * if we really need one. - * - * CHUNK_ALLOC_LIMITED means to only try and allocate one - * if we have very few chunks already allocated. This is - * used as part of the clustering code to help make sure - * we have a good pool of storage to cluster in, without - * filling the FS with empty chunks - * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * - */ -enum { - CHUNK_ALLOC_NO_FORCE, - CHUNK_ALLOC_LIMITED, - CHUNK_ALLOC_FORCE, -}; - /* * Declare a helper function to detect underflow of various space info members */ @@ -88,8 +68,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_fs_info *fs_info, @@ -4143,8 +4121,8 @@ again: if (IS_ERR(trans)) return PTR_ERR(trans); - ret = do_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); + ret = btrfs_chunk_alloc(trans, alloc_target, + CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans); if (ret < 0) { if (ret != -ENOSPC) @@ -4414,8 +4392,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. */ -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - int force) +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; @@ -4877,10 +4855,10 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = do_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), - (state == ALLOC_CHUNK) ? - CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, + btrfs_metadata_alloc_profile(fs_info), + (state == ALLOC_CHUNK) ? 
CHUNK_ALLOC_NO_FORCE : + CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); if (ret > 0 || ret == -ENOSPC) ret = 0; @@ -7672,8 +7650,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = do_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE); /* * If we can't allocate a new chunk we've already looped @@ -9687,8 +9665,7 @@ again: */ alloc_flags = update_block_group_flags(fs_info, cache->flags); if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, alloc_flags, - CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space * already allocated at the new raid level to @@ -9704,7 +9681,7 @@ again: if (!ret) goto out; alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); - ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; ret = inc_block_group_ro(cache, 0); @@ -9725,7 +9702,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) { u64 alloc_flags = get_alloc_profile(trans->fs_info, type); - return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } /* -- cgit v1.2.3 From d44b72aa12d0a74b67ffabdcab2f64653282dccd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:18 -0400 Subject: btrfs: export space_info_add_*_bytes Prep work for consolidating all of the space_info code into one file. We need to export these so multiple files can use them. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 40 ++++++++++++++++++---------------------- fs/btrfs/space-info.h | 7 +++++++ 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 16e61acc0179..34d08fc8ba76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -75,12 +75,6 @@ static void dump_space_info(struct btrfs_fs_info *fs_info, int dump_block_groups); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); -static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -3908,8 +3902,8 @@ static void update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly += bytes_readonly; if (total_bytes > 0) found->full = 0; - space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); + btrfs_space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); spin_unlock(&found->lock); *space_info = found; } @@ -5108,7 +5102,8 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); if (reclaim_bytes) - space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); + btrfs_space_info_add_old_bytes(fs_info, space_info, + reclaim_bytes); return ret; } @@ -5225,7 +5220,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); if (reclaim_bytes) - space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); + 
btrfs_space_info_add_old_bytes(fs_info, space_info, + reclaim_bytes); ASSERT(list_empty(&ticket.list)); return ret; } @@ -5391,8 +5387,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, num_bytes, 1); if (to_free) - space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, - to_free); + btrfs_space_info_add_old_bytes(fs_info, + delayed_refs_rsv->space_info, to_free); } /** @@ -5435,9 +5431,9 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. */ -static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) +void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) { struct reserve_ticket *ticket; struct list_head *head; @@ -5495,9 +5491,9 @@ again: * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent * we use this helper. */ -static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) +void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) { struct reserve_ticket *ticket; struct list_head *head = &space_info->priority_tickets; @@ -5581,8 +5577,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_unlock(&dest->lock); } if (num_bytes) - space_info_add_old_bytes(fs_info, space_info, - num_bytes); + btrfs_space_info_add_old_bytes(fs_info, space_info, + num_bytes); } if (qgroup_to_release_ret) *qgroup_to_release_ret = qgroup_to_release; @@ -6758,8 +6754,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_unlock(&global_rsv->lock); /* Add to any tickets we may have */ if (len) - space_info_add_new_bytes(fs_info, space_info, - len); + btrfs_space_info_add_new_bytes(fs_info, + space_info, len); } spin_unlock(&space_info->lock); } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 5f96333c3450..46f15b8c9ffc 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -75,4 +75,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); } +void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); + #endif /* BTRFS_SPACE_INFO_H */ -- cgit v1.2.3 From 280c290881bc048bd5dbe02796173f55ca86e697 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:19 -0400 Subject: btrfs: move the space_info handling code to space-info.c These are the basic init and lookup functions and some helper functions, fairly straightforward before the bad stuff starts. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/extent-tree.c | 205 +++++-------------------------------------------- fs/btrfs/space-info.c | 174 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 10 +++ 4 files changed, 204 insertions(+), 187 deletions(-) create mode 100644 fs/btrfs/space-info.c (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ca693dd554e9..ae5fad57bc9c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o free-space-tree.o tree-checker.o + uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 34d08fc8ba76..2ec0409fa2b5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -713,25 +713,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return block_group_cache_tree_search(info, bytenr, 1); } -static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, - u64 flags) -{ - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); - return found; - } - } - rcu_read_unlock(); - return NULL; -} - static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) { if (ref->type == BTRFS_REF_METADATA) { @@ -749,7 +730,7 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info; u64 flags = generic_ref_to_space_flags(ref); - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, BTRFS_TOTAL_BYTES_PINNED_BATCH); @@ -761,27 +742,12 @@ static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info; u64 flags = generic_ref_to_space_flags(ref); - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, BTRFS_TOTAL_BYTES_PINNED_BATCH); } -/* - * after adding space to the filesystem, we need to clear the full flags - * on all the space infos. 
- */ -void btrfs_clear_space_info_full(struct btrfs_fs_info *info) -{ - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) - found->full = 0; - rcu_read_unlock(); -} - /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -2449,7 +2415,7 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, flags = BTRFS_BLOCK_GROUP_SYSTEM; else flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); percpu_counter_add_batch(&space_info->total_bytes_pinned, -head->num_bytes, @@ -3821,93 +3787,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); } -static const char *alloc_name(u64 flags) -{ - switch (flags) { - case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: - return "mixed"; - case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; - case BTRFS_BLOCK_GROUP_DATA: - return "data"; - case BTRFS_BLOCK_GROUP_SYSTEM: - return "system"; - default: - WARN_ON(1); - return "invalid-combination"; - }; -} - -static int create_space_info(struct btrfs_fs_info *info, u64 flags) -{ - - struct btrfs_space_info *space_info; - int i; - int ret; - - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); - if (!space_info) - return -ENOMEM; - - ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, - GFP_KERNEL); - if (ret) { - kfree(space_info); - return ret; - } - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&space_info->block_groups[i]); - init_rwsem(&space_info->groups_sem); - spin_lock_init(&space_info->lock); - space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; - init_waitqueue_head(&space_info->wait); - INIT_LIST_HEAD(&space_info->ro_bgs); - INIT_LIST_HEAD(&space_info->tickets); - INIT_LIST_HEAD(&space_info->priority_tickets); - - ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, - info->space_info_kobj, "%s", - alloc_name(space_info->flags)); - if (ret) { - kobject_put(&space_info->kobj); - return ret; - } - - list_add_rcu(&space_info->list, &info->space_info); - if (flags & BTRFS_BLOCK_GROUP_DATA) - info->data_sinfo = space_info; - - return ret; -} - -static void update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, - struct btrfs_space_info **space_info) -{ - struct btrfs_space_info *found; - int factor; - - factor = btrfs_bg_type_to_factor(flags); - - found = __find_space_info(info, flags); - ASSERT(found); - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - if (total_bytes > 0) - found->full = 0; - btrfs_space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); - spin_unlock(&found->lock); - *space_info = found; -} - static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = chunk_to_extended(flags) & @@ -4055,15 +3934,6 @@ u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, - bool 
may_use_included) -{ - ASSERT(s_info); - return s_info->bytes_used + s_info->bytes_reserved + - s_info->bytes_pinned + s_info->bytes_readonly + - (may_use_included ? s_info->bytes_may_use : 0); -} - int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; @@ -4339,7 +4209,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) */ lockdep_assert_held(&fs_info->chunk_mutex); - info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); @@ -4399,7 +4269,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (trans->allocating_chunk) return -ENOSPC; - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); do { @@ -4627,7 +4497,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -4965,7 +4835,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, @@ -5611,7 +5481,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type) { btrfs_init_block_rsv(rsv, type); - rsv->space_info = __find_space_info(fs_info, + rsv->space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); } @@ -5836,10 +5706,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; @@ -5948,7 +5818,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, } num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); - rsv->space_info = __find_space_info(fs_info, + rsv->space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); @@ -7743,7 +7613,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, trace_find_free_extent(fs_info, num_bytes, empty_size, flags); - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { btrfs_err(fs_info, "No space info for %llu", flags); return -ENOSPC; @@ -8097,7 +7967,7 @@ again: } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; - sinfo = __find_space_info(fs_info, flags); + sinfo 
= btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, "allocation failed flags %llu, wanted %llu", flags, num_bytes); @@ -10130,7 +10000,7 @@ void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->pending_raid_kobjs_lock); list_for_each_entry(rkobj, &list, list) { - space_info = __find_space_info(fs_info, rkobj->flags); + space_info = btrfs_find_space_info(fs_info, rkobj->flags); ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); @@ -10397,9 +10267,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } trace_btrfs_add_block_group(info, cache, 0); - update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); + btrfs_update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); cache->space_info = space_info; @@ -10533,7 +10403,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, * assigned to our block group. We want our bg to be added to the rbtree * with its ->space_info set. */ - cache->space_info = __find_space_info(fs_info, cache->flags); + cache->space_info = btrfs_find_space_info(fs_info, cache->flags); ASSERT(cache->space_info); ret = btrfs_add_block_group_cache(fs_info, cache); @@ -10548,7 +10418,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, * the rbtree, update the space info's counters. */ trace_btrfs_add_block_group(fs_info, cache, 1); - update_space_info(fs_info, cache->flags, size, bytes_used, + btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); update_global_block_rsv(fs_info); @@ -11085,43 +10955,6 @@ next: spin_unlock(&fs_info->unused_bgs_lock); } -int btrfs_init_space_info(struct btrfs_fs_info *fs_info) -{ - struct btrfs_super_block *disk_super; - u64 features; - u64 flags; - int mixed = 0; - int ret; - - disk_super = fs_info->super_copy; - if (!btrfs_super_root(disk_super)) - return -EINVAL; - - features = btrfs_super_incompat_flags(disk_super); - if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = 1; - - flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = create_space_info(fs_info, flags); - if (ret) - goto out; - - if (mixed) { - flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags); - } else { - flags = BTRFS_BLOCK_GROUP_METADATA; - ret = create_space_info(fs_info, flags); - if (ret) - goto out; - - flags = BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags); - } -out: - return ret; -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c new file mode 100644 index 000000000000..6edcd9b7cab2 --- /dev/null +++ b/fs/btrfs/space-info.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "space-info.h" +#include "sysfs.h" +#include "volumes.h" + +u64 btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included) +{ + ASSERT(s_info); + return s_info->bytes_used + s_info->bytes_reserved + + s_info->bytes_pinned + s_info->bytes_readonly + + (may_use_included ? s_info->bytes_may_use : 0); +} + +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. 
+ */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int i; + int ret; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, + GFP_KERNEL); + if (ret) { + kfree(space_info); + return ret; + } + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + init_waitqueue_head(&space_info->wait); + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(space_info->flags)); + if (ret) { + kobject_put(&space_info->kobj); + return ret; + } + + list_add_rcu(&space_info->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = space_info; + + return ret; +} + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; + int ret; + + disk_super = fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return -EINVAL; + + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } +out: + return ret; +} + +void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int factor; + + factor = btrfs_bg_type_to_factor(flags); + + found = btrfs_find_space_info(info, flags); + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; + if (total_bytes > 0) + found->full = 0; + btrfs_space_info_add_new_bytes(info, found, + total_bytes - bytes_used - + bytes_readonly); + spin_unlock(&found->lock); + *space_info = found; +} + +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + flags &= 
BTRFS_BLOCK_GROUP_TYPE_MASK; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & flags) { + rcu_read_unlock(); + return found; + } + } + rcu_read_unlock(); + return NULL; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 46f15b8c9ffc..d1264d9223f2 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -81,5 +81,15 @@ void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info); +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags); +u64 btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); #endif /* BTRFS_SPACE_INFO_H */ -- cgit v1.2.3 From 41783ef24d56ce45452db7c413df9a3bdcc8d651 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:20 -0400 Subject: btrfs: move and export can_overcommit This is the first piece of moving the space reservation code to space-info.c Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 82 +++++--------------------------------------------- fs/btrfs/space-info.c | 68 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 4 +++ 3 files changed, 79 insertions(+), 75 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2ec0409fa2b5..d115c78d2917 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4147,11 +4147,6 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - static int should_alloc_chunk(struct btrfs_fs_info *fs_info, struct btrfs_space_info *sinfo, int force) { @@ -4376,69 +4371,6 @@ out: return ret; } -static int can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 profile; - u64 space_size; - u64 avail; - u64 used; - int factor; - - /* Don't overcommit when in mixed mode. */ - if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; - - if (system_chunk) - profile = btrfs_system_alloc_profile(fs_info); - else - profile = btrfs_metadata_alloc_profile(fs_info); - - used = btrfs_space_info_used(space_info, false); - - /* - * We only want to allow over committing if we have lots of actual space - * free, but if we don't have enough space to handle the global reserve - * space then we could end up having a real enospc problem when trying - * to allocate a chunk or some other such important allocation. - */ - spin_lock(&global_rsv->lock); - space_size = calc_global_rsv_need_space(global_rsv); - spin_unlock(&global_rsv->lock); - if (used + space_size >= space_info->total_bytes) - return 0; - - used += space_info->bytes_may_use; - - avail = atomic64_read(&fs_info->free_chunk_space); - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually usable. 
For raid56, the space info used - * doesn't include the parity drive, so we don't have to - * change the math - */ - factor = btrfs_bg_type_to_factor(profile); - avail = div_u64(avail, factor); - - /* - * If we aren't flushing all things, let us overcommit up to - * 1/2th of the space. If we can flush, don't let us overcommit - * too much, let it overcommit up to 1/8 of the space. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL) - avail >>= 3; - else - avail >>= 1; - - if (used + bytes < space_info->total_bytes + avail) - return 1; - return 0; -} - static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, unsigned long nr_pages, int nr_items) { @@ -4766,14 +4698,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) return 0; used = btrfs_space_info_used(space_info, true); - if (can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -5019,8 +4951,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; - } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk)) { + } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { update_bytes_may_use(fs_info, space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); @@ -5331,7 +5263,7 @@ again: * adding the ticket space would be a double count. */ if (check_overcommit && - !can_overcommit(fs_info, space_info, 0, flush, false)) + !btrfs_can_overcommit(fs_info, space_info, 0, flush, false)) break; if (num_bytes >= ticket->bytes) { list_del_init(&ticket->list); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 6edcd9b7cab2..9dc30081c2db 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -172,3 +172,71 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, rcu_read_unlock(); return NULL; } + +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global->size << 1); +} + +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 profile; + u64 space_size; + u64 avail; + u64 used; + int factor; + + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + if (system_chunk) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + + used = btrfs_space_info_used(space_info, false); + + /* + * We only want to allow over committing if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. 
+ */ + spin_lock(&global_rsv->lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(&global_rsv->lock); + if (used + space_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; + + avail = atomic64_read(&fs_info->free_chunk_space); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually usable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math + */ + factor = btrfs_bg_type_to_factor(profile); + avail = div_u64(avail, factor); + + /* + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + avail >>= 3; + else + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index d1264d9223f2..fcec5286e0cb 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -91,5 +91,9 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk); #endif /* BTRFS_SPACE_INFO_H */ -- cgit v1.2.3 From bb96c4e5742022b79b5bda5f52218b3d2785accd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:21 -0400 Subject: btrfs: move the space info update macro to space-info.h Also rename it to btrfs_space_info_update_* so it's clear what we're updating. 
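For readers unfamiliar with the construct, the pattern boils down to a macro that stamps out one underflow-clamping helper per member. A rough standalone sketch follows (simplified, with illustrative names, and without the lockdep assertion and tracepoint of the real macro):

#include <stdio.h>

struct demo_space_info {
	long long bytes_may_use;
	long long bytes_pinned;
};

#define DECLARE_DEMO_SPACE_INFO_UPDATE(name)				\
static void demo_space_info_update_##name(struct demo_space_info *sinfo, \
					   long long bytes)		\
{									\
	if (bytes < 0 && sinfo->name < -bytes) {			\
		fprintf(stderr, "underflow of " #name ", clamping to 0\n"); \
		sinfo->name = 0;					\
		return;							\
	}								\
	sinfo->name += bytes;						\
}

DECLARE_DEMO_SPACE_INFO_UPDATE(bytes_may_use);
DECLARE_DEMO_SPACE_INFO_UPDATE(bytes_pinned);

int main(void)
{
	struct demo_space_info si = { .bytes_may_use = 4096 };

	demo_space_info_update_bytes_pinned(&si, 512);
	demo_space_info_update_bytes_may_use(&si, -8192);	/* would underflow */
	printf("bytes_may_use=%lld bytes_pinned=%lld\n",
	       si.bytes_may_use, si.bytes_pinned);
	return 0;
}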
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 63 +++++++++++++++++++++----------------------------- fs/btrfs/space-info.h | 23 ++++++++++++++++++ 2 files changed, 49 insertions(+), 37 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d115c78d2917..c55d1bcbb196 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -32,26 +32,6 @@ #undef SCRAMBLE_DELAYED_REFS -/* - * Declare a helper function to detect underflow of various space info members - */ -#define DECLARE_SPACE_INFO_UPDATE(name) \ -static inline void update_##name(struct btrfs_fs_info *fs_info, \ - struct btrfs_space_info *sinfo, \ - s64 bytes) \ -{ \ - lockdep_assert_held(&sinfo->lock); \ - trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ - if (bytes < 0 && sinfo->name < -bytes) { \ - WARN_ON(1); \ - sinfo->name = 0; \ - return; \ - } \ - sinfo->name += bytes; \ -} - -DECLARE_SPACE_INFO_UPDATE(bytes_may_use); -DECLARE_SPACE_INFO_UPDATE(bytes_pinned); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, @@ -4054,7 +4034,7 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - update_bytes_may_use(fs_info, data_sinfo, bytes); + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); @@ -4107,7 +4087,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - update_bytes_may_use(fs_info, data_sinfo, -len); + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); @@ -4947,13 +4927,15 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * If not things get more complicated. 
*/ if (used + orig_bytes <= space_info->total_bytes) { - update_bytes_may_use(fs_info, space_info, orig_bytes); + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush, system_chunk)) { - update_bytes_may_use(fs_info, space_info, orig_bytes); + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; @@ -5282,7 +5264,7 @@ again: flush = BTRFS_RESERVE_FLUSH_ALL; goto again; } - update_bytes_may_use(fs_info, space_info, -num_bytes); + btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); spin_unlock(&space_info->lock); @@ -5310,8 +5292,8 @@ again: ticket->bytes, 1); list_del_init(&ticket->list); num_bytes -= ticket->bytes; - update_bytes_may_use(fs_info, space_info, - ticket->bytes); + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, ticket->bytes); ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); @@ -5319,7 +5301,8 @@ again: trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 1); - update_bytes_may_use(fs_info, space_info, num_bytes); + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, num_bytes); ticket->bytes -= num_bytes; num_bytes = 0; } @@ -5612,14 +5595,16 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = min(num_bytes, block_rsv->size - block_rsv->reserved); block_rsv->reserved += num_bytes; - update_bytes_may_use(fs_info, sinfo, num_bytes); + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 1); } } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - update_bytes_may_use(fs_info, sinfo, -num_bytes); + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; @@ -6082,7 +6067,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; - update_bytes_pinned(info, cache->space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(info, + cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); @@ -6157,7 +6143,8 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - update_bytes_pinned(fs_info, cache->space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, + num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6366,7 +6353,8 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - update_bytes_may_use(cache->fs_info, space_info, -ram_bytes); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; } @@ -6522,7 +6510,7 @@ static int 
unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - update_bytes_pinned(fs_info, space_info, -len); + btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, len, 0); @@ -6543,8 +6531,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - update_bytes_may_use(fs_info, space_info, - to_add); + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; trace_btrfs_space_reservation(fs_info, @@ -10831,7 +10819,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - update_bytes_pinned(fs_info, space_info, -block_group->pinned); + btrfs_space_info_update_bytes_pinned(fs_info, space_info, + -block_group->pinned); space_info->bytes_readonly += block_group->pinned; percpu_counter_add_batch(&space_info->total_bytes_pinned, -block_group->pinned, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index fcec5286e0cb..702e431580c6 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -75,6 +75,29 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); } +/* + * + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name) \ +static inline void \ +btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ + struct btrfs_space_info *sinfo, \ + s64 bytes) \ +{ \ + lockdep_assert_held(&sinfo->lock); \ + trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ + if (bytes < 0 && sinfo->name < -bytes) { \ + WARN_ON(1); \ + sinfo->name = 0; \ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned); + void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); -- cgit v1.2.3 From b338b013e18a28341aaf9e665ac1edc9fae518d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:22 -0400 Subject: btrfs: move btrfs_space_info_add_*_bytes to space-info.c Now that we've moved all the pre-requisite stuff, move these two functions. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 111 ------------------------------------------------- fs/btrfs/space-info.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 8 ++++ 3 files changed, 114 insertions(+), 111 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c55d1bcbb196..7b4a287e0c5c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4485,14 +4485,6 @@ skip_async: } } -struct reserve_ticket { - u64 orig_bytes; - u64 bytes; - int error; - struct list_head list; - wait_queue_head_t wait; -}; - /** * maybe_commit_transaction - possibly commit the transaction if its ok to * @root - the root we're allocating for @@ -5211,109 +5203,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, return 0; } -/* - * This is for space we already have accounted in space_info->bytes_may_use, so - * basically when we're returning space from block_rsv's. 
- */ -void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) -{ - struct reserve_ticket *ticket; - struct list_head *head; - u64 used; - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; - bool check_overcommit = false; - - spin_lock(&space_info->lock); - head = &space_info->priority_tickets; - - /* - * If we are over our limit then we need to check and see if we can - * overcommit, and if we can't then we just need to free up our space - * and not satisfy any requests. - */ - used = btrfs_space_info_used(space_info, true); - if (used - num_bytes >= space_info->total_bytes) - check_overcommit = true; -again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - /* - * We use 0 bytes because this space is already reserved, so - * adding the ticket space would be a double count. - */ - if (check_overcommit && - !btrfs_can_overcommit(fs_info, space_info, 0, flush, false)) - break; - if (num_bytes >= ticket->bytes) { - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; - ticket->bytes = 0; - space_info->tickets_id++; - wake_up(&ticket->wait); - } else { - ticket->bytes -= num_bytes; - num_bytes = 0; - } - } - - if (num_bytes && head == &space_info->priority_tickets) { - head = &space_info->tickets; - flush = BTRFS_RESERVE_FLUSH_ALL; - goto again; - } - btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); -} - -/* - * This is for newly allocated space that isn't accounted in - * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent - * we use this helper. - */ -void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) -{ - struct reserve_ticket *ticket; - struct list_head *head = &space_info->priority_tickets; - -again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - if (num_bytes >= ticket->bytes) { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - ticket->bytes, 1); - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, ticket->bytes); - ticket->bytes = 0; - space_info->tickets_id++; - wake_up(&ticket->wait); - } else { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - num_bytes, 1); - btrfs_space_info_update_bytes_may_use(fs_info, - space_info, num_bytes); - ticket->bytes -= num_bytes; - num_bytes = 0; - } - } - - if (num_bytes && head == &space_info->priority_tickets) { - head = &space_info->tickets; - goto again; - } -} - static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9dc30081c2db..50046e8d7c71 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -240,3 +240,109 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 1; return 0; } + +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. 
+ */ +void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head; + u64 used; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + bool check_overcommit = false; + + spin_lock(&space_info->lock); + head = &space_info->priority_tickets; + + /* + * If we are over our limit then we need to check and see if we can + * overcommit, and if we can't then we just need to free up our space + * and not satisfy any requests. + */ + used = btrfs_space_info_used(space_info, true); + if (used - num_bytes >= space_info->total_bytes) + check_overcommit = true; +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + /* + * We use 0 bytes because this space is already reserved, so + * adding the ticket space would be a double count. + */ + if (check_overcommit && + !btrfs_can_overcommit(fs_info, space_info, 0, flush, + false)) + break; + if (num_bytes >= ticket->bytes) { + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } + btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); +} + +/* + * This is for newly allocated space that isn't accounted in + * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent + * we use this helper. 
+ */ +void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head = &space_info->priority_tickets; + +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + if (num_bytes >= ticket->bytes) { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + ticket->bytes, 1); + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, + ticket->bytes); + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + num_bytes, 1); + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, + num_bytes); + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + goto again; + } +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 702e431580c6..7df09731a3fa 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -69,6 +69,14 @@ struct btrfs_space_info { struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; +struct reserve_ticket { + u64 orig_bytes; + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; +}; + static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && -- cgit v1.2.3 From c2a67a76ec87579a46a16c49fc9997737b7fa844 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:23 -0400 Subject: btrfs: export block_rsv_use_bytes We are going to need this to move the metadata reservation stuff to space_info.c. 
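A minimal sketch of the calling convention for the newly exported helper, mirroring the existing fallback to the global reserve in reserve_metadata_bytes (the names global_rsv, block_rsv, orig_bytes and ret are taken from that caller; this snippet is illustrative and not part of the patch):

	/*
	 * btrfs_block_rsv_use_bytes() returns 0 if num_bytes could be taken
	 * from the rsv's reserved space, -ENOSPC otherwise, so callers can
	 * fall back to the global reserve like this.
	 */
	if (block_rsv != global_rsv &&
	    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
		ret = 0;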
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 13 +++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8fca40370cf1..b78470fdc226 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2844,6 +2844,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, bool update_size); +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7b4a287e0c5c..b0cd367994a8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -53,8 +53,6 @@ static int find_next_key(struct btrfs_path *path, int level, static void dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -5031,7 +5029,7 @@ static int reserve_metadata_bytes(struct btrfs_root *root, if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && - !block_rsv_use_bytes(global_rsv, orig_bytes)) + !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) ret = 0; } if (ret == -ENOSPC) { @@ -5067,8 +5065,7 @@ static struct btrfs_block_rsv *get_block_rsv( return block_rsv; } -static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) { int ret = -ENOSPC; spin_lock(&block_rsv->lock); @@ -5265,7 +5262,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, { int ret; - ret = block_rsv_use_bytes(src, num_bytes); + ret = btrfs_block_rsv_use_bytes(src, num_bytes); if (ret) return ret; @@ -8134,7 +8131,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (unlikely(block_rsv->size == 0)) goto try_reserve; again: - ret = block_rsv_use_bytes(block_rsv, blocksize); + ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; @@ -8172,7 +8169,7 @@ try_reserve: */ if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && block_rsv->space_info == global_rsv->space_info) { - ret = block_rsv_use_bytes(global_rsv, blocksize); + ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize); if (!ret) return global_rsv; } -- cgit v1.2.3 From 5da6afeb32e97f956aa3d599b7f94ceb36fcf854 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:24 -0400 Subject: btrfs: move dump_space_info to space-info.c We'll need this exported so we can use it in all the various ways we need to use it. This is prep work to move reserve_metadata_bytes.
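The exported btrfs_dump_space_info() keeps the old arguments: the space info to dump, a byte count passed on to the free-space dump, and a flag selecting whether per-block-group details are printed as well. A typical ENOSPC-debug call site, as converted by this patch (the snippet below simply restates the caller from the diff for context):

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
		btrfs_dump_space_info(fs_info, block_rsv->space_info,
				      orig_bytes, 0);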
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 70 +++++--------------------------------------------- fs/btrfs/space-info.c | 55 +++++++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 3 +++ 3 files changed, 65 insertions(+), 63 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b0cd367994a8..e0f1ec0ca4a4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -50,9 +50,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); -static void dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, - int dump_block_groups); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -4196,7 +4193,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); - dump_space_info(fs_info, info, 0, 0); + btrfs_dump_space_info(fs_info, info, 0, 0); } if (left < thresh) { @@ -5038,8 +5035,8 @@ static int reserve_metadata_bytes(struct btrfs_root *root, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); + btrfs_dump_space_info(fs_info, block_rsv->space_info, + orig_bytes, 0); } return ret; } @@ -7644,60 +7641,6 @@ loop: return ret; } -#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ -do { \ - struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ - spin_lock(&__rsv->lock); \ - btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ - __rsv->size, __rsv->reserved); \ - spin_unlock(&__rsv->lock); \ -} while (0) - -static void dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, - int dump_block_groups) -{ - struct btrfs_block_group_cache *cache; - int index = 0; - - spin_lock(&info->lock); - btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", - info->flags, - info->total_bytes - btrfs_space_info_used(info, true), - info->full ? "" : "not "); - btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", - info->total_bytes, info->bytes_used, info->bytes_pinned, - info->bytes_reserved, info->bytes_may_use, - info->bytes_readonly); - spin_unlock(&info->lock); - - DUMP_BLOCK_RSV(fs_info, global_block_rsv); - DUMP_BLOCK_RSV(fs_info, trans_block_rsv); - DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); - - if (!dump_block_groups) - return; - - down_read(&info->groups_sem); -again: - list_for_each_entry(cache, &info->block_groups[index], list) { - spin_lock(&cache->lock); - btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", - cache->key.objectid, cache->key.offset, - btrfs_block_group_used(&cache->item), cache->pinned, - cache->reserved, cache->ro ? "[readonly]" : ""); - btrfs_dump_free_space(cache, bytes); - spin_unlock(&cache->lock); - } - if (++index < BTRFS_NR_RAID_TYPES) - goto again; - up_read(&info->groups_sem); -} - /* * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a * hole that is at least as big as @num_bytes. 
@@ -7778,7 +7721,8 @@ again: "allocation failed flags %llu, wanted %llu", flags, num_bytes); if (sinfo) - dump_space_info(fs_info, sinfo, num_bytes, 1); + btrfs_dump_space_info(fs_info, sinfo, + num_bytes, 1); } } @@ -9295,7 +9239,7 @@ out: btrfs_info(cache->fs_info, "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", sinfo_used, num_bytes, min_allocable_bytes); - dump_space_info(cache->fs_info, cache->space_info, 0, 0); + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); } return ret; } @@ -9776,7 +9720,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0)) - dump_space_info(info, space_info, 0, 0); + btrfs_dump_space_info(info, space_info, 0, 0); list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 50046e8d7c71..41dfb1d4ea86 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -4,6 +4,7 @@ #include "space-info.h" #include "sysfs.h" #include "volumes.h" +#include "free-space-cache.h" u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) @@ -346,3 +347,57 @@ again: goto again; } } + +#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ +do { \ + struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ + spin_lock(&__rsv->lock); \ + btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ + __rsv->size, __rsv->reserved); \ + spin_unlock(&__rsv->lock); \ +} while (0) + +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group_cache *cache; + int index = 0; + + spin_lock(&info->lock); + btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", + info->flags, + info->total_bytes - btrfs_space_info_used(info, true), + info->full ? "" : "not "); + btrfs_info(fs_info, + "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); + spin_unlock(&info->lock); + + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); + + if (!dump_block_groups) + return; + + down_read(&info->groups_sem); +again: + list_for_each_entry(cache, &info->block_groups[index], list) { + spin_lock(&cache->lock); + btrfs_info(fs_info, + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned, + cache->reserved, cache->ro ? 
"[readonly]" : ""); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; + up_read(&info->groups_sem); +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 7df09731a3fa..d758959d19d5 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -126,5 +126,8 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush, bool system_chunk); +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); #endif /* BTRFS_SPACE_INFO_H */ -- cgit v1.2.3 From 0d9764f6d0fb9dd4d4b773b481f259c0567870c2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:25 -0400 Subject: btrfs: move reserve_metadata_bytes and supporting code to space-info.c This moves all of the metadata reservation code into space-info.c. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 709 +------------------------------------------------ fs/btrfs/space-info.c | 698 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 4 + 3 files changed, 709 insertions(+), 702 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e0f1ec0ca4a4..c887f3352341 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4346,701 +4346,6 @@ out: return ret; } -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry the filesystem going from r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly(all dirty pages are written to - * the disk). 
- */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - -static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, - u64 to_reclaim) -{ - u64 bytes; - u64 nr; - - bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - nr = div64_u64(to_reclaim, bytes); - if (!nr) - nr = 1; - return nr; -} - -#define EXTENT_SIZE_PER_ITEM SZ_256K - -/* - * shrink metadata reservation for delalloc - */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) -{ - struct btrfs_space_info *space_info; - struct btrfs_trans_handle *trans; - u64 delalloc_bytes; - u64 dio_bytes; - u64 async_pages; - u64 items; - long time_left; - unsigned long nr_pages; - int loops; - - /* Calc the number of the pages we need flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; - - trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - if (delalloc_bytes == 0 && dio_bytes == 0) { - if (trans) - return; - if (wait_ordered) - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - return; - } - - /* - * If we are doing more ordered than delalloc we need to just wait on - * ordered extents, otherwise we'll waste time trying to flush delalloc - * that likely won't give us the space back we need. - */ - if (dio_bytes > delalloc_bytes) - wait_ordered = true; - - loops = 0; - while ((delalloc_bytes || dio_bytes) && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; - - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); - - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e if there are more async pages than we - * require wait_event will wait until nr_pages are written. - */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; - - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets) && - list_empty(&space_info->priority_tickets)) { - spin_unlock(&space_info->lock); - break; - } - spin_unlock(&space_info->lock); - - loops++; - if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - } else { - time_left = schedule_timeout_killable(1); - if (time_left) - break; - } - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - } -} - -/** - * maybe_commit_transaction - possibly commit the transaction if its ok to - * @root - the root we're allocating for - * @bytes - the number of bytes we want to reserve - * @force - force the commit - * - * This will check to make sure that committing the transaction will actually - * get us somewhere and then commit the transaction if it does. 
Otherwise it - * will return -ENOSPC. - */ -static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) -{ - struct reserve_ticket *ticket = NULL; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_trans_handle *trans; - u64 bytes_needed; - u64 reclaim_bytes = 0; - - trans = (struct btrfs_trans_handle *)current->journal_info; - if (trans) - return -EAGAIN; - - spin_lock(&space_info->lock); - if (!list_empty(&space_info->priority_tickets)) - ticket = list_first_entry(&space_info->priority_tickets, - struct reserve_ticket, list); - else if (!list_empty(&space_info->tickets)) - ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); - bytes_needed = (ticket) ? ticket->bytes : 0; - spin_unlock(&space_info->lock); - - if (!bytes_needed) - return 0; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * See if there is enough pinned space to make this reservation, or if - * we have block groups that are going to be freed, allowing us to - * possibly do a chunk allocation the next loop through. - */ - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || - __percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) - goto commit; - - /* - * See if there is some space in the delayed insertion reservation for - * this reservation. - */ - if (space_info != delayed_rsv->space_info) - goto enospc; - - spin_lock(&delayed_rsv->lock); - reclaim_bytes += delayed_rsv->reserved; - spin_unlock(&delayed_rsv->lock); - - spin_lock(&delayed_refs_rsv->lock); - reclaim_bytes += delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - if (reclaim_bytes >= bytes_needed) - goto commit; - bytes_needed -= reclaim_bytes; - - if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) - goto enospc; - -commit: - return btrfs_commit_transaction(trans); -enospc: - btrfs_end_transaction(trans); - return -ENOSPC; -} - -/* - * Try to flush some data based on policy set by @state. This is only advisory - * and may fail for various reasons. The caller is supposed to examine the - * state of @space_info to detect the outcome. 
- */ -static void flush_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 num_bytes, - int state) -{ - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_trans_handle *trans; - int nr; - int ret = 0; - - switch (state) { - case FLUSH_DELAYED_ITEMS_NR: - case FLUSH_DELAYED_ITEMS: - if (state == FLUSH_DELAYED_ITEMS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; - else - nr = -1; - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - ret = btrfs_run_delayed_items_nr(trans, nr); - btrfs_end_transaction(trans); - break; - case FLUSH_DELALLOC: - case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, - state == FLUSH_DELALLOC_WAIT); - break; - case FLUSH_DELAYED_REFS_NR: - case FLUSH_DELAYED_REFS: - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes); - else - nr = 0; - btrfs_run_delayed_refs(trans, nr); - btrfs_end_transaction(trans); - break; - case ALLOC_CHUNK: - case ALLOC_CHUNK_FORCE: - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - ret = btrfs_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), - (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : - CHUNK_ALLOC_FORCE); - btrfs_end_transaction(trans); - if (ret > 0 || ret == -ENOSPC) - ret = 0; - break; - case COMMIT_TRANS: - /* - * If we have pending delayed iputs then we could free up a - * bunch of pinned space, so make sure we run the iputs before - * we do our pinned bytes check below. - */ - btrfs_run_delayed_iputs(fs_info); - btrfs_wait_on_delayed_iputs(fs_info); - - ret = may_commit_transaction(fs_info, space_info); - break; - default: - ret = -ENOSPC; - break; - } - - trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, - ret); - return; -} - -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - bool system_chunk) -{ - struct reserve_ticket *ticket; - u64 used; - u64 expected; - u64 to_reclaim = 0; - - list_for_each_entry(ticket, &space_info->tickets, list) - to_reclaim += ticket->bytes; - list_for_each_entry(ticket, &space_info->priority_tickets, list) - to_reclaim += ticket->bytes; - if (to_reclaim) - return to_reclaim; - - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) - return 0; - - used = btrfs_space_info_used(space_info, true); - - if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); - - if (used > expected) - to_reclaim = used - expected; - else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); - return to_reclaim; -} - -static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used, bool system_chunk) -{ - u64 thresh = div_factor_fine(space_info->total_bytes, 98); - - /* If we're just plain full then async reclaim just slows us down. 
*/ - if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) - return 0; - - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, - system_chunk)) - return 0; - - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); -} - -static bool wake_all_tickets(struct list_head *head) -{ - struct reserve_ticket *ticket; - - while (!list_empty(head)) { - ticket = list_first_entry(head, struct reserve_ticket, list); - list_del_init(&ticket->list); - ticket->error = -ENOSPC; - wake_up(&ticket->wait); - if (ticket->bytes != ticket->orig_bytes) - return true; - } - return false; -} - -/* - * This is for normal flushers, we can wait all goddamned day if we want to. We - * will loop and continuously try to flush as long as we are making progress. - * We count progress as clearing off tickets each time we have to loop. - */ -static void btrfs_async_reclaim_metadata_space(struct work_struct *work) -{ - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; - u64 to_reclaim; - int flush_state; - int commit_cycles = 0; - u64 last_tickets_id; - - fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); - if (!to_reclaim) { - space_info->flush = 0; - spin_unlock(&space_info->lock); - return; - } - last_tickets_id = space_info->tickets_id; - spin_unlock(&space_info->lock); - - flush_state = FLUSH_DELAYED_ITEMS_NR; - do { - flush_space(fs_info, space_info, to_reclaim, flush_state); - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets)) { - space_info->flush = 0; - spin_unlock(&space_info->lock); - return; - } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info, - false); - if (last_tickets_id == space_info->tickets_id) { - flush_state++; - } else { - last_tickets_id = space_info->tickets_id; - flush_state = FLUSH_DELAYED_ITEMS_NR; - if (commit_cycles) - commit_cycles--; - } - - /* - * We don't want to force a chunk allocation until we've tried - * pretty hard to reclaim space. Think of the case where we - * freed up a bunch of space and so have a lot of pinned space - * to reclaim. We would rather use that than possibly create a - * underutilized metadata chunk. So if this is our first run - * through the flushing state machine skip ALLOC_CHUNK_FORCE and - * commit the transaction. If nothing has changed the next go - * around then we can force a chunk allocation. 
- */ - if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) - flush_state++; - - if (flush_state > COMMIT_TRANS) { - commit_cycles++; - if (commit_cycles > 2) { - if (wake_all_tickets(&space_info->tickets)) { - flush_state = FLUSH_DELAYED_ITEMS_NR; - commit_cycles--; - } else { - space_info->flush = 0; - } - } else { - flush_state = FLUSH_DELAYED_ITEMS_NR; - } - } - spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); -} - -void btrfs_init_async_reclaim_work(struct work_struct *work) -{ - INIT_WORK(work, btrfs_async_reclaim_metadata_space); -} - -static const enum btrfs_flush_state priority_flush_states[] = { - FLUSH_DELAYED_ITEMS_NR, - FLUSH_DELAYED_ITEMS, - ALLOC_CHUNK, -}; - -static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) -{ - u64 to_reclaim; - int flush_state; - - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); - if (!to_reclaim) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - - flush_state = 0; - do { - flush_space(fs_info, space_info, to_reclaim, - priority_flush_states[flush_state]); - flush_state++; - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - } while (flush_state < ARRAY_SIZE(priority_flush_states)); -} - -static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) - -{ - DEFINE_WAIT(wait); - u64 reclaim_bytes = 0; - int ret = 0; - - spin_lock(&space_info->lock); - while (ticket->bytes > 0 && ticket->error == 0) { - ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); - if (ret) { - ret = -EINTR; - break; - } - spin_unlock(&space_info->lock); - - schedule(); - - finish_wait(&ticket->wait, &wait); - spin_lock(&space_info->lock); - } - if (!ret) - ret = ticket->error; - if (!list_empty(&ticket->list)) - list_del_init(&ticket->list); - if (ticket->bytes && ticket->bytes < ticket->orig_bytes) - reclaim_bytes = ticket->orig_bytes - ticket->bytes; - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - btrfs_space_info_add_old_bytes(fs_info, space_info, - reclaim_bytes); - return ret; -} - -/** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @space_info - the space info we want to allocate from - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation - * - * This will reserve orig_bytes number of bytes from the space info associated - * with the block_rsv. If there is not enough space it will make an attempt to - * flush out space to make room. It will do this by flushing delalloc if - * possible or committing the transaction. If flush is 0 then no attempts to - * regain reservations will be made and this will fail if there is not enough - * space already. 
- */ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) -{ - struct reserve_ticket ticket; - u64 used; - u64 reclaim_bytes = 0; - int ret = 0; - - ASSERT(orig_bytes); - ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); - - spin_lock(&space_info->lock); - ret = -ENOSPC; - used = btrfs_space_info_used(space_info, true); - - /* - * If we have enough space then hooray, make our reservation and carry - * on. If not see if we can overcommit, and if we can, hooray carry on. - * If not things get more complicated. - */ - if (used + orig_bytes <= space_info->total_bytes) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); - ret = 0; - } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk)) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); - ret = 0; - } - - /* - * If we couldn't make a reservation then setup our reservation ticket - * and kick the async worker if it's not already running. - * - * If we are a priority flusher then we just need to add our ticket to - * the list and we will do our own flushing further down. - */ - if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - ticket.orig_bytes = orig_bytes; - ticket.bytes = orig_bytes; - ticket.error = 0; - init_waitqueue_head(&ticket.wait); - if (flush == BTRFS_RESERVE_FLUSH_ALL) { - list_add_tail(&ticket.list, &space_info->tickets); - if (!space_info->flush) { - space_info->flush = 1; - trace_btrfs_trigger_flush(fs_info, - space_info->flags, - orig_bytes, flush, - "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); - } - } else { - list_add_tail(&ticket.list, - &space_info->priority_tickets); - } - } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; - /* - * We will do the space reservation dance during log replay, - * which means we won't have fs_info->fs_root set, so don't do - * the async reclaim as we will panic. 
- */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, - used, system_chunk) && - !work_busy(&fs_info->async_reclaim_work)) { - trace_btrfs_trigger_flush(fs_info, space_info->flags, - orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); - } - } - spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) - return ret; - - if (flush == BTRFS_RESERVE_FLUSH_ALL) - return wait_reserve_ticket(fs_info, space_info, &ticket); - - ret = 0; - priority_reclaim_metadata_space(fs_info, space_info, &ticket); - spin_lock(&space_info->lock); - if (ticket.bytes) { - if (ticket.bytes < orig_bytes) - reclaim_bytes = orig_bytes - ticket.bytes; - list_del_init(&ticket.list); - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - btrfs_space_info_add_old_bytes(fs_info, space_info, - reclaim_bytes); - ASSERT(list_empty(&ticket.list)); - return ret; -} - -/** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation - * - * This will reserve orig_bytes number of bytes from the space info associated - * with the block_rsv. If there is not enough space it will make an attempt to - * flush out space to make room. It will do this by flushing delalloc if - * possible or committing the transaction. If flush is 0 then no attempts to - * regain reservations will be made and this will fail if there is not enough - * space already. - */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - int ret; - bool system_chunk = (root == fs_info->chunk_root); - - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush, system_chunk); - if (ret == -ENOSPC && - unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { - if (block_rsv != global_rsv && - !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) - ret = 0; - } - if (ret == -ENOSPC) { - trace_btrfs_space_reservation(fs_info, "space_info:enospc", - block_rsv->space_info->flags, - orig_bytes, 1); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); - } - return ret; -} - static struct btrfs_block_rsv *get_block_rsv( const struct btrfs_trans_handle *trans, const struct btrfs_root *root) @@ -5187,8 +4492,8 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); if (ret) return ret; block_rsv_add_bytes(block_rsv, num_bytes, 0); @@ -5314,7 +4619,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -5359,7 +4664,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + ret = 
btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -5733,7 +5038,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); if (ret) goto out_fail; - ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); if (ret) goto out_qgroup; @@ -8102,8 +7407,8 @@ again: "BTRFS: block rsv returned %d\n", ret); } try_reserve: - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; /* diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 41dfb1d4ea86..1ac58d7e7790 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -5,6 +5,9 @@ #include "sysfs.h" #include "volumes.h" #include "free-space-cache.h" +#include "ordered-data.h" +#include "transaction.h" +#include "math.h" u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included) @@ -401,3 +404,698 @@ again: goto again; up_read(&info->groups_sem); } + +static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, + unsigned long nr_pages, int nr_items) +{ + struct super_block *sb = fs_info->sb; + + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_roots(fs_info, nr_items); + if (!current->journal_info) + btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); + } +} + +static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, + u64 to_reclaim) +{ + u64 bytes; + u64 nr; + + bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + nr = div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM SZ_256K + +/* + * shrink metadata reservation for delalloc + */ +static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, + u64 orig, bool wait_ordered) +{ + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; + u64 delalloc_bytes; + u64 dio_bytes; + u64 async_pages; + u64 items; + long time_left; + unsigned long nr_pages; + int loops; + + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(fs_info, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + + trans = (struct btrfs_trans_handle *)current->journal_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + delalloc_bytes = percpu_counter_sum_positive( + &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + if (delalloc_bytes == 0 && dio_bytes == 0) { + if (trans) + return; + if (wait_ordered) + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + return; + } + + /* + * If we are doing more ordered than delalloc we need to just wait on + * ordered extents, otherwise we'll waste time trying to flush delalloc + * that likely won't give us the space back we need. 
+ */ + if (dio_bytes > delalloc_bytes) + wait_ordered = true; + + loops = 0; + while ((delalloc_bytes || dio_bytes) && loops < 3) { + nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + + /* + * Triggers inode writeback for up to nr_pages. This will invoke + * ->writepages callback and trigger delalloc filling + * (btrfs_run_delalloc_range()). + */ + btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); + + /* + * We need to wait for the compressed pages to start before + * we continue. + */ + async_pages = atomic_read(&fs_info->async_delalloc_pages); + if (!async_pages) + goto skip_async; + + /* + * Calculate how many compressed pages we want to be written + * before we continue. I.e if there are more async pages than we + * require wait_event will wait until nr_pages are written. + */ + if (async_pages <= nr_pages) + async_pages = 0; + else + async_pages -= nr_pages; + + wait_event(fs_info->async_submit_wait, + atomic_read(&fs_info->async_delalloc_pages) <= + (int)async_pages); +skip_async: + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } + delalloc_bytes = percpu_counter_sum_positive( + &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + } +} + +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. + */ +static int may_commit_transaction(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + struct reserve_ticket *ticket = NULL; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_trans_handle *trans; + u64 bytes_needed; + u64 reclaim_bytes = 0; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + spin_lock(&space_info->lock); + if (!list_empty(&space_info->priority_tickets)) + ticket = list_first_entry(&space_info->priority_tickets, + struct reserve_ticket, list); + else if (!list_empty(&space_info->tickets)) + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + bytes_needed = (ticket) ? ticket->bytes : 0; + spin_unlock(&space_info->lock); + + if (!bytes_needed) + return 0; + + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * See if there is enough pinned space to make this reservation, or if + * we have block groups that are going to be freed, allowing us to + * possibly do a chunk allocation the next loop through. + */ + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + __percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) + goto commit; + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. 
+ */ + if (space_info != delayed_rsv->space_info) + goto enospc; + + spin_lock(&delayed_rsv->lock); + reclaim_bytes += delayed_rsv->reserved; + spin_unlock(&delayed_rsv->lock); + + spin_lock(&delayed_refs_rsv->lock); + reclaim_bytes += delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + if (reclaim_bytes >= bytes_needed) + goto commit; + bytes_needed -= reclaim_bytes; + + if (__percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) + goto enospc; + +commit: + return btrfs_commit_transaction(trans); +enospc: + btrfs_end_transaction(trans); + return -ENOSPC; +} + +/* + * Try to flush some data based on policy set by @state. This is only advisory + * and may fail for various reasons. The caller is supposed to examine the + * state of @space_info to detect the outcome. + */ +static void flush_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 num_bytes, + int state) +{ + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int nr; + int ret = 0; + + switch (state) { + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; + else + nr = -1; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_run_delayed_items_nr(trans, nr); + btrfs_end_transaction(trans); + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + state == FLUSH_DELALLOC_WAIT); + break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; + case ALLOC_CHUNK: + case ALLOC_CHUNK_FORCE: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_chunk_alloc(trans, + btrfs_metadata_alloc_profile(fs_info), + (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : + CHUNK_ALLOC_FORCE); + btrfs_end_transaction(trans); + if (ret > 0 || ret == -ENOSPC) + ret = 0; + break; + case COMMIT_TRANS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. 
+ */ + btrfs_run_delayed_iputs(fs_info); + btrfs_wait_on_delayed_iputs(fs_info); + + ret = may_commit_transaction(fs_info, space_info); + break; + default: + ret = -ENOSPC; + break; + } + + trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, + ret); + return; +} + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool system_chunk) +{ + struct reserve_ticket *ticket; + u64 used; + u64 expected; + u64 to_reclaim = 0; + + list_for_each_entry(ticket, &space_info->tickets, list) + to_reclaim += ticket->bytes; + list_for_each_entry(ticket, &space_info->priority_tickets, list) + to_reclaim += ticket->bytes; + if (to_reclaim) + return to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + return 0; + + used = btrfs_space_info_used(space_info, true); + + if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used, bool system_chunk) +{ + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. */ + if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) + return 0; + + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, + system_chunk)) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static bool wake_all_tickets(struct list_head *head) +{ + struct reserve_ticket *ticket; + + while (!list_empty(head)) { + ticket = list_first_entry(head, struct reserve_ticket, list); + list_del_init(&ticket->list); + ticket->error = -ENOSPC; + wake_up(&ticket->wait); + if (ticket->bytes != ticket->orig_bytes) + return true; + } + return false; +} + +/* + * This is for normal flushers, we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. 
+ */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + int commit_cycles = 0; + u64 last_tickets_id; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info, space_info, to_reclaim, flush_state); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info, + false); + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + /* + * We don't want to force a chunk allocation until we've tried + * pretty hard to reclaim space. Think of the case where we + * freed up a bunch of space and so have a lot of pinned space + * to reclaim. We would rather use that than possibly create a + * underutilized metadata chunk. So if this is our first run + * through the flushing state machine skip ALLOC_CHUNK_FORCE and + * commit the transaction. If nothing has changed the next go + * around then we can force a chunk allocation. + */ + if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) + flush_state++; + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + if (wake_all_tickets(&space_info->tickets)) { + flush_state = FLUSH_DELAYED_ITEMS_NR; + commit_cycles--; + } else { + space_info->flush = 0; + } + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +static const enum btrfs_flush_state priority_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + ALLOC_CHUNK, +}; + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + u64 to_reclaim; + int flush_state; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); + if (!to_reclaim) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + + flush_state = 0; + do { + flush_space(fs_info, space_info, to_reclaim, + priority_flush_states[flush_state]); + flush_state++; + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } while (flush_state < ARRAY_SIZE(priority_flush_states)); +} + +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) + +{ + DEFINE_WAIT(wait); + u64 reclaim_bytes = 0; + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + ret = 
-EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + if (!ret) + ret = ticket->error; + if (!list_empty(&ticket->list)) + list_del_init(&ticket->list); + if (ticket->bytes && ticket->bytes < ticket->orig_bytes) + reclaim_bytes = ticket->orig_bytes - ticket->bytes; + spin_unlock(&space_info->lock); + + if (reclaim_bytes) + btrfs_space_info_add_old_bytes(fs_info, space_info, + reclaim_bytes); + return ret; +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @space_info - the space info we want to allocate from + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk) +{ + struct reserve_ticket ticket; + u64 used; + u64 reclaim_bytes = 0; + int ret = 0; + + ASSERT(orig_bytes); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + + spin_lock(&space_info->lock); + ret = -ENOSPC; + used = btrfs_space_info_used(space_info, true); + + /* + * If we have enough space then hooray, make our reservation and carry + * on. If not see if we can overcommit, and if we can, hooray carry on. + * If not things get more complicated. + */ + if (used + orig_bytes <= space_info->total_bytes) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, orig_bytes, 1); + ret = 0; + } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, orig_bytes, 1); + ret = 0; + } + + /* + * If we couldn't make a reservation then setup our reservation ticket + * and kick the async worker if it's not already running. + * + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. + */ + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + ticket.orig_bytes = orig_bytes; + ticket.bytes = orig_bytes; + ticket.error = 0; + init_waitqueue_head(&ticket.wait); + if (flush == BTRFS_RESERVE_FLUSH_ALL) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + space_info->flush = 1; + trace_btrfs_trigger_flush(fs_info, + space_info->flags, + orig_bytes, flush, + "enospc"); + queue_work(system_unbound_wq, + &fs_info->async_reclaim_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. 
+ */ + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && + need_do_async_reclaim(fs_info, space_info, + used, system_chunk) && + !work_busy(&fs_info->async_reclaim_work)) { + trace_btrfs_trigger_flush(fs_info, space_info->flags, + orig_bytes, flush, "preempt"); + queue_work(system_unbound_wq, + &fs_info->async_reclaim_work); + } + } + spin_unlock(&space_info->lock); + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + return ret; + + if (flush == BTRFS_RESERVE_FLUSH_ALL) + return wait_reserve_ticket(fs_info, space_info, &ticket); + + ret = 0; + priority_reclaim_metadata_space(fs_info, space_info, &ticket); + spin_lock(&space_info->lock); + if (ticket.bytes) { + if (ticket.bytes < orig_bytes) + reclaim_bytes = orig_bytes - ticket.bytes; + list_del_init(&ticket.list); + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + + if (reclaim_bytes) + btrfs_space_info_add_old_bytes(fs_info, space_info, + reclaim_bytes); + ASSERT(list_empty(&ticket.list)); + return ret; +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +int btrfs_reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int ret; + bool system_chunk = (root == fs_info->chunk_root); + + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, + orig_bytes, flush, system_chunk); + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + if (block_rsv != global_rsv && + !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + block_rsv->space_info->flags, + orig_bytes, 1); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, block_rsv->space_info, + orig_bytes, 0); + } + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index d758959d19d5..620d390cf6d2 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -129,5 +129,9 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); +int btrfs_reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); #endif /* BTRFS_SPACE_INFO_H */ -- cgit v1.2.3 From 83d731a5b228454d7c29ec64fdd93426698f97f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Jun 2019 16:09:26 -0400 Subject: btrfs: unexport can_overcommit Now that we've moved all of the users to space-info.c, unexport it and name it back to can_overcommit. 
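The mechanics are plain C linkage: once every caller lives in space-info.c, the prototype comes out of the header and the definition becomes static. A stand-alone sketch of the same pattern, with hypothetical names rather than the btrfs sources:

  /* helper.c: the only translation unit that needs the helper */
  #include <stdio.h>

  /* Internal linkage: no other file can call this, so no prototype is
   * needed in a shared header and the compiler may inline it freely. */
  static int can_satisfy(int have, int want)
  {
          return have >= want;
  }

  int main(void)
  {
          printf("%d\n", can_satisfy(4, 3));      /* prints 1 */
          return 0;
  }

With internal linkage the compiler can also warn when the helper loses its last caller, which is harder to notice for an exported symbol.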
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 23 +++++++++++------------ fs/btrfs/space-info.h | 4 ---- 2 files changed, 11 insertions(+), 16 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 1ac58d7e7790..ce33b1addf55 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -182,10 +182,10 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) return (global->size << 1); } -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) +static int can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk) { struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 profile; @@ -279,8 +279,7 @@ again: * adding the ticket space would be a double count. */ if (check_overcommit && - !btrfs_can_overcommit(fs_info, space_info, 0, flush, - false)) + !can_overcommit(fs_info, space_info, 0, flush, false)) break; if (num_bytes >= ticket->bytes) { list_del_init(&ticket->list); @@ -724,14 +723,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + if (can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) return 0; used = btrfs_space_info_used(space_info, true); - if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + if (can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -978,8 +977,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; - } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk)) { + } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 620d390cf6d2..c2b54b8e1a14 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -122,10 +122,6 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 btrfs_space_info_used(struct btrfs_space_info *s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk); void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -- cgit v1.2.3 From 9b4851bc48b9346a957a71ca355ecdb7f2759fb8 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 25 Jun 2019 20:11:31 +0200 Subject: btrfs: Simplify update of space_info in __reserve_metadata_bytes() We don't need an if-else-if chain where we can use a simple OR since both conditions are performing the same action. 
The short-circuit for OR will ensure that if the first condition is true, can_overcommit() is not called. Reviewed-by: Nikolay Borisov Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ce33b1addf55..ab7b9ec4c240 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -967,18 +967,12 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, used = btrfs_space_info_used(space_info, true); /* - * If we have enough space then hooray, make our reservation and carry - * on. If not see if we can overcommit, and if we can, hooray carry on. - * If not things get more complicated. + * Carry on if we have enough space (short-circuit) OR call + * can_overcommit() to ensure we can overcommit to continue. */ - if (used + orig_bytes <= space_info->total_bytes) { - btrfs_space_info_update_bytes_may_use(fs_info, space_info, - orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); - ret = 0; - } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk)) { + if ((used + orig_bytes <= space_info->total_bytes) || + can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", -- cgit v1.2.3 From d12ffdd1aa4c165774748265c67af3aa1edab1a0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 13:47:17 -0400 Subject: btrfs: move btrfs_block_rsv definitions into it's own header Prep work for separating out all of the block_rsv related code into its own file. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-rsv.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ctree.h | 69 +------------------------------------------- fs/btrfs/extent-tree.c | 1 + 3 files changed, 79 insertions(+), 68 deletions(-) create mode 100644 fs/btrfs/block-rsv.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h new file mode 100644 index 000000000000..359e73d329e7 --- /dev/null +++ b/fs/btrfs/block-rsv.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_BLOCK_RSV_H +#define BTRFS_BLOCK_RSV_H + +enum btrfs_reserve_flush_enum; + +/* + * Types of block reserves + */ +enum { + BTRFS_BLOCK_RSV_GLOBAL, + BTRFS_BLOCK_RSV_DELALLOC, + BTRFS_BLOCK_RSV_TRANS, + BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_DELOPS, + BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_EMPTY, + BTRFS_BLOCK_RSV_TEMP, +}; + +struct btrfs_block_rsv { + u64 size; + u64 reserved; + struct btrfs_space_info *space_info; + spinlock_t lock; + unsigned short full; + unsigned short type; + unsigned short failfast; + + /* + * Qgroup equivalent for @size @reserved + * + * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care + * about things like csum size nor how many tree blocks it will need to + * reserve. + * + * Qgroup cares more about net change of the extent usage. + * + * So for one newly inserted file extent, in worst case it will cause + * leaf split and level increase, nodesize for each file extent is + * already too much. + * + * In short, qgroup_size/reserved is the upper limit of possible needed + * qgroup metadata reservation. 
+ */ + u64 qgroup_rsv_size; + u64 qgroup_rsv_reserved; +}; + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, + unsigned short type); +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type); +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, + bool update_size); +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor); +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); + +#endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b78470fdc226..adb6d2747e0c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -32,6 +32,7 @@ #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" +#include "block-rsv.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -403,49 +404,6 @@ struct raid_kobject { struct list_head list; }; -/* - * Types of block reserves - */ -enum { - BTRFS_BLOCK_RSV_GLOBAL, - BTRFS_BLOCK_RSV_DELALLOC, - BTRFS_BLOCK_RSV_TRANS, - BTRFS_BLOCK_RSV_CHUNK, - BTRFS_BLOCK_RSV_DELOPS, - BTRFS_BLOCK_RSV_DELREFS, - BTRFS_BLOCK_RSV_EMPTY, - BTRFS_BLOCK_RSV_TEMP, -}; - -struct btrfs_block_rsv { - u64 size; - u64 reserved; - struct btrfs_space_info *space_info; - spinlock_t lock; - unsigned short full; - unsigned short type; - unsigned short failfast; - - /* - * Qgroup equivalent for @size @reserved - * - * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care - * about things like csum size nor how many tree blocks it will need to - * reserve. - * - * Qgroup cares more about net change of the extent usage. - * - * So for one newly inserted file extent, in worst case it will cause - * leaf split and level increase, nodesize for each file extent is - * already too much. - * - * In short, qgroup_size/reserved is the upper limit of possible needed - * qgroup metadata reservation. - */ - u64 qgroup_rsv_size; - u64 qgroup_rsv_reserved; -}; - /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. 
They are used for all metadata @@ -2826,31 +2784,6 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type); -void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - unsigned short type); -void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 num_bytes, - enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, - enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, u64 num_bytes, - bool update_size); -int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor); -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c887f3352341..3f2ccfed3b0e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -29,6 +29,7 @@ #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" +#include "block-rsv.h" #undef SCRAMBLE_DELAYED_REFS -- cgit v1.2.3 From 0b50174ad5e9fb32abcfdb10b1a14e76a72b20e6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 13:47:18 -0400 Subject: btrfs: export btrfs_block_rsv_add_bytes This is used in a few places, we need to make sure it's exported so we can move it around. 
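The helper itself is small: it bumps the reserved count and either grows the rsv's size or marks it full, depending on the update_size flag. A simplified stand-alone model of that accounting, with the locking left out and demo values made up:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  struct rsv {
          uint64_t size;          /* how much we want reserved */
          uint64_t reserved;      /* how much we currently hold */
          bool full;
  };

  /* Models btrfs_block_rsv_add_bytes(); the real one takes block_rsv->lock. */
  static void rsv_add_bytes(struct rsv *r, uint64_t num_bytes, bool update_size)
  {
          r->reserved += num_bytes;
          if (update_size)
                  r->size += num_bytes;           /* grow the target too */
          else if (r->reserved >= r->size)
                  r->full = true;                 /* the refill caught up */
  }

  int main(void)
  {
          struct rsv r = { .size = 100, .reserved = 40 };

          rsv_add_bytes(&r, 30, false);   /* refill: reserved 70, not full */
          rsv_add_bytes(&r, 40, false);   /* refill: reserved 110 >= size, full */
          rsv_add_bytes(&r, 10, true);    /* grow: size 110, reserved 120 */
          printf("size=%llu reserved=%llu full=%d\n",
                 (unsigned long long)r.size,
                 (unsigned long long)r.reserved, r.full);
          return 0;
  }

The update_size flag is what separates growing a reservation from merely refilling one that was already sized.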
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-rsv.h | 2 ++ fs/btrfs/extent-tree.c | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 359e73d329e7..f4af1c0766a7 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -73,5 +73,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, bool update_size); #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3f2ccfed3b0e..2f6e74803c9f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4382,8 +4382,8 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) return ret; } -static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes, bool update_size) +void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, bool update_size) { spin_lock(&block_rsv->lock); block_rsv->reserved += num_bytes; @@ -4415,7 +4415,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, global_rsv->full = 0; spin_unlock(&global_rsv->lock); - block_rsv_add_bytes(dest, num_bytes, true); + btrfs_block_rsv_add_bytes(dest, num_bytes, true); return 0; } @@ -4497,7 +4497,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, num_bytes, flush); if (ret) return ret; - block_rsv_add_bytes(block_rsv, num_bytes, 0); + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, num_bytes, 1); return 0; @@ -4569,7 +4569,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, if (ret) return ret; - block_rsv_add_bytes(dst, num_bytes, update_size); + btrfs_block_rsv_add_bytes(dst, num_bytes, update_size); return 0; } @@ -4622,7 +4622,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root, ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) - block_rsv_add_bytes(block_rsv, num_bytes, true); + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); return ret; } @@ -4667,7 +4667,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, false); + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; } @@ -5057,7 +5057,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) spin_unlock(&inode->lock); /* Now we can safely add our space to our block rsv */ - block_rsv_add_bytes(block_rsv, meta_reserve, false); + btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false); trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), meta_reserve, 1); @@ -7429,7 +7429,7 @@ try_reserve: static void unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { - block_rsv_add_bytes(block_rsv, blocksize, false); + btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); } -- cgit v1.2.3 From fed14b323db868c3ac0071925763964a6ca58883 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 13:47:19 -0400 Subject: btrfs: export __btrfs_block_rsv_release The delalloc reserve stuff calls this directly because it cares about the qgroup 
accounting stuff, so export it so we can move it around. Fix btrfs_block_rsv_release() to just be a static inline since it just calls __btrfs_block_rsv_release() with NULL for the qgroup stuff. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-rsv.h | 13 ++++++++++--- fs/btrfs/extent-tree.c | 13 +++---------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index f4af1c0766a7..feb0a4d4b1c9 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -70,10 +70,17 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes, bool update_size); +u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release); + +static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); +} #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2f6e74803c9f..486ddc4c436d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4674,9 +4674,9 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, u64 *qgroup_to_release) +u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release) { struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; @@ -4692,13 +4692,6 @@ static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, qgroup_to_release); } -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); -} - /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. -- cgit v1.2.3 From fcec36224fc674953eadfca05c69225e6595641b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 13:47:20 -0400 Subject: btrfs: cleanup the target logic in __btrfs_block_rsv_release This works for all callers already, but if we wanted to use the helper for the global_block_rsv it would end up trying to refill itself. Fix the logic to be able to be used no matter which block rsv is passed in to this helper. 
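To make the refill-itself problem concrete, here is a stand-alone sketch of the selection rule after this change. The structures are simplified stand-ins for the kernel types and the additional space_info check is left out:

  #include <stdbool.h>
  #include <stdio.h>

  struct rsv {
          const char *name;
          bool full;
  };

  static struct rsv global_rsv  = { "global",       false };
  static struct rsv delayed_rsv = { "delayed_refs", false };
  static struct rsv trans_rsv   = { "trans",        false };

  /* Pick where excess bytes released from @rsv should spill to. */
  static struct rsv *select_target(struct rsv *rsv)
  {
          struct rsv *target = NULL;

          if (rsv == &delayed_rsv)
                  target = &global_rsv;           /* delayed refs tops up global */
          else if (rsv != &global_rsv && !delayed_rsv.full)
                  target = &delayed_rsv;          /* others top up delayed refs */

          /* global never picks a target, so it cannot try to refill itself */
          return target;
  }

  int main(void)
  {
          struct rsv *rsvs[] = { &global_rsv, &delayed_rsv, &trans_rsv };

          for (int i = 0; i < 3; i++) {
                  struct rsv *t = select_target(rsvs[i]);
                  printf("%-12s -> %s\n", rsvs[i]->name, t ? t->name : "(none)");
          }
          return 0;
  }

Releasing from the global rsv now simply yields no target, while everything else keeps topping up the delayed refs rsv when it has room.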
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 486ddc4c436d..a457776c3e41 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4680,12 +4680,18 @@ u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, { struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *target = delayed_rsv; + struct btrfs_block_rsv *target = NULL; - if (target->full || target == block_rsv) + /* + * If we are the delayed_rsv then push to the global rsv, otherwise dump + * into the delayed rsv if it is not full. + */ + if (block_rsv == delayed_rsv) target = global_rsv; + else if (block_rsv != global_rsv && !delayed_rsv->full) + target = delayed_rsv; - if (block_rsv->space_info != target->space_info) + if (target && block_rsv->space_info != target->space_info) target = NULL; return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, -- cgit v1.2.3 From 424a47805a819ff59ada9549c4a21fe0ba95c446 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 13:47:21 -0400 Subject: btrfs: stop using block_rsv_release_bytes everywhere block_rsv_release_bytes() is the internal to the block_rsv code, and shouldn't be called directly by anything else. Switch all users to the exported helpers. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a457776c3e41..eefdad883be8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4744,12 +4744,11 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); u64 released = 0; - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, - num_bytes, NULL); + released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, + NULL); if (released) trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, released, 0); @@ -4834,8 +4833,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { - block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1, NULL); + btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); @@ -4884,8 +4882,8 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) WARN_ON_ONCE(!list_empty(&trans->new_bgs)); - block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, - trans->chunk_bytes_reserved, NULL); + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved); trans->chunk_bytes_reserved = 0; } @@ -7429,7 +7427,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) { btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); - block_rsv_release_bytes(fs_info, block_rsv, NULL, 
0, NULL); + btrfs_block_rsv_release(fs_info, block_rsv, 0); } /* -- cgit v1.2.3 From 550fa228ee7ebea3c3499897a7aef6536de73fff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 13:47:22 -0400 Subject: btrfs: migrate the block-rsv code to block-rsv.c This moves everything out of extent-tree.c to block-rsv.c. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/Makefile | 3 +- fs/btrfs/block-rsv.c | 253 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/extent-tree.c | 246 ----------------------------------------------- 3 files changed, 255 insertions(+), 247 deletions(-) create mode 100644 fs/btrfs/block-rsv.c (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ae5fad57bc9c..032700ae7704 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -10,7 +10,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o + uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ + block-rsv.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c new file mode 100644 index 000000000000..250c383308f9 --- /dev/null +++ b/fs/btrfs/block-rsv.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "block-rsv.h" +#include "space-info.h" +#include "math.h" + +static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes, + u64 *qgroup_to_release_ret) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 qgroup_to_release = 0; + u64 ret; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) { + num_bytes = block_rsv->size; + qgroup_to_release = block_rsv->qgroup_rsv_size; + } + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + qgroup_to_release = block_rsv->qgroup_rsv_reserved - + block_rsv->qgroup_rsv_size; + block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; + } else { + qgroup_to_release = 0; + } + spin_unlock(&block_rsv->lock); + + ret = num_bytes; + if (num_bytes > 0) { + if (dest) { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) + btrfs_space_info_add_old_bytes(fs_info, space_info, + num_bytes); + } + if (qgroup_to_release_ret) + *qgroup_to_release_ret = qgroup_to_release; + return ret; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes, + bool update_size) +{ + int ret; + + ret = btrfs_block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + btrfs_block_rsv_add_bytes(dst, num_bytes, update_size); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) +{ + 
memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + rsv->type = type; +} + +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type) +{ + btrfs_init_block_rsv(rsv, type); + rsv->space_info = btrfs_find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, + unsigned short type) +{ + struct btrfs_block_rsv *block_rsv; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + if (!rsv) + return; + btrfs_block_rsv_release(fs_info, rsv, (u64)-1); + kfree(rsv); +} + +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); + + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) + ret = 0; + else + num_bytes -= block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (!ret) + return 0; + + ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); + return 0; + } + + return ret; +} + +u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *target = NULL; + + /* + * If we are the delayed_rsv then push to the global rsv, otherwise dump + * into the delayed rsv if it is not full. 
+ */ + if (block_rsv == delayed_rsv) + target = global_rsv; + else if (block_rsv != global_rsv && !delayed_rsv->full) + target = delayed_rsv; + + if (target && block_rsv->space_info != target->space_info) + target = NULL; + + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, + qgroup_to_release); +} + +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) +{ + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, bool update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} + +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); + return -ENOSPC; + } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + btrfs_block_rsv_add_bytes(dest, num_bytes, true); + return 0; +} diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index eefdad883be8..668a61724e89 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4368,57 +4368,6 @@ static struct btrfs_block_rsv *get_block_rsv( return block_rsv; } -int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) -{ - int ret = -ENOSPC; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved >= num_bytes) { - block_rsv->reserved -= num_bytes; - if (block_rsv->reserved < block_rsv->size) - block_rsv->full = 0; - ret = 0; - } - spin_unlock(&block_rsv->lock); - return ret; -} - -void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes, bool update_size) -{ - spin_lock(&block_rsv->lock); - block_rsv->reserved += num_bytes; - if (update_size) - block_rsv->size += num_bytes; - else if (block_rsv->reserved >= block_rsv->size) - block_rsv->full = 1; - spin_unlock(&block_rsv->lock); -} - -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 min_bytes; - - if (global_rsv->space_info != dest->space_info) - return -ENOSPC; - - spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, min_factor); - if (global_rsv->reserved < min_bytes + num_bytes) { - spin_unlock(&global_rsv->lock); - return -ENOSPC; - } - global_rsv->reserved -= num_bytes; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; - spin_unlock(&global_rsv->lock); - - btrfs_block_rsv_add_bytes(dest, num_bytes, true); - return 0; -} - /** * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. * @fs_info - the fs info for our fs. 
@@ -4503,201 +4452,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, return 0; } -static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - struct btrfs_block_rsv *dest, u64 num_bytes, - u64 *qgroup_to_release_ret) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 qgroup_to_release = 0; - u64 ret; - - spin_lock(&block_rsv->lock); - if (num_bytes == (u64)-1) { - num_bytes = block_rsv->size; - qgroup_to_release = block_rsv->qgroup_rsv_size; - } - block_rsv->size -= num_bytes; - if (block_rsv->reserved >= block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; - } else { - num_bytes = 0; - } - if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { - qgroup_to_release = block_rsv->qgroup_rsv_reserved - - block_rsv->qgroup_rsv_size; - block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; - } else { - qgroup_to_release = 0; - } - spin_unlock(&block_rsv->lock); - - ret = num_bytes; - if (num_bytes > 0) { - if (dest) { - spin_lock(&dest->lock); - if (!dest->full) { - u64 bytes_to_add; - - bytes_to_add = dest->size - dest->reserved; - bytes_to_add = min(num_bytes, bytes_to_add); - dest->reserved += bytes_to_add; - if (dest->reserved >= dest->size) - dest->full = 1; - num_bytes -= bytes_to_add; - } - spin_unlock(&dest->lock); - } - if (num_bytes) - btrfs_space_info_add_old_bytes(fs_info, space_info, - num_bytes); - } - if (qgroup_to_release_ret) - *qgroup_to_release_ret = qgroup_to_release; - return ret; -} - -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, - struct btrfs_block_rsv *dst, u64 num_bytes, - bool update_size) -{ - int ret; - - ret = btrfs_block_rsv_use_bytes(src, num_bytes); - if (ret) - return ret; - - btrfs_block_rsv_add_bytes(dst, num_bytes, update_size); - return 0; -} - -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) -{ - memset(rsv, 0, sizeof(*rsv)); - spin_lock_init(&rsv->lock); - rsv->type = type; -} - -void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - unsigned short type) -{ - btrfs_init_block_rsv(rsv, type); - rsv->space_info = btrfs_find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); -} - -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type) -{ - struct btrfs_block_rsv *block_rsv; - - block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); - if (!block_rsv) - return NULL; - - btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); - return block_rsv; -} - -void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) -{ - if (!rsv) - return; - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); - kfree(rsv); -} - -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 num_bytes, - enum btrfs_reserve_flush_enum flush) -{ - int ret; - - if (num_bytes == 0) - return 0; - - ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); - - return ret; -} - -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = div_factor(block_rsv->size, min_factor); - if (block_rsv->reserved >= num_bytes) - ret = 0; - spin_unlock(&block_rsv->lock); - - return ret; -} - -int btrfs_block_rsv_refill(struct 
btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, - enum btrfs_reserve_flush_enum flush) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = min_reserved; - if (block_rsv->reserved >= num_bytes) - ret = 0; - else - num_bytes -= block_rsv->reserved; - spin_unlock(&block_rsv->lock); - - if (!ret) - return 0; - - ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) { - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); - return 0; - } - - return ret; -} - -u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, u64 *qgroup_to_release) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *target = NULL; - - /* - * If we are the delayed_rsv then push to the global rsv, otherwise dump - * into the delayed rsv if it is not full. - */ - if (block_rsv == delayed_rsv) - target = global_rsv; - else if (block_rsv != global_rsv && !delayed_rsv->full) - target = delayed_rsv; - - if (target && block_rsv->space_info != target->space_info) - target = NULL; - - return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, - qgroup_to_release); -} - /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. -- cgit v1.2.3 From 67f9c2209e885c8f97215655739155c94ca9cb9c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 13:47:23 -0400 Subject: btrfs: migrate the global_block_rsv helpers to block-rsv.c These helpers belong in block-rsv.c Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 172 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/block-rsv.h | 15 ++++ fs/btrfs/extent-tree.c | 189 ++----------------------------------------------- 3 files changed, 193 insertions(+), 183 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 250c383308f9..698470b9f32d 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -4,6 +4,7 @@ #include "block-rsv.h" #include "space-info.h" #include "math.h" +#include "transaction.h" static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, @@ -251,3 +252,174 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, btrfs_block_rsv_add_bytes(dest, num_bytes, true); return 0; } + +void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; + + /* + * The global block rsv is based on the size of the extent tree, the + * checksum tree and the root tree. If the fs is empty we want to set + * it to a minimal amount for safety. 
+ */ + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + + btrfs_root_used(&fs_info->csum_root->root_item) + + btrfs_root_used(&fs_info->tree_root->root_item); + num_bytes = max_t(u64, num_bytes, SZ_16M); + + spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); + + block_rsv->size = min_t(u64, num_bytes, SZ_512M); + + if (block_rsv->reserved < block_rsv->size) { + num_bytes = btrfs_space_info_used(sinfo, true); + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + num_bytes = min(num_bytes, + block_rsv->size - block_rsv->reserved); + block_rsv->reserved += num_bytes; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + num_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + sinfo->flags, num_bytes, + 1); + } + } else if (block_rsv->reserved > block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + -num_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + sinfo->flags, num_bytes, 0); + block_rsv->reserved = block_rsv->size; + } + + if (block_rsv->reserved == block_rsv->size) + block_rsv->full = 1; + else + block_rsv->full = 0; + + spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); +} + +void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; + fs_info->delayed_refs_rsv.space_info = space_info; + + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + btrfs_update_global_block_rsv(fs_info); +} + +void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.size > 0); +} + +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = NULL; + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == fs_info->csum_root && trans->adding_csums) || + (root == fs_info->uuid_root)) + block_rsv = trans->block_rsv; + + if (!block_rsv) + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &fs_info->empty_block_rsv; + + return block_rsv; +} + +struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct 
btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int ret; + bool global_updated = false; + + block_rsv = get_block_rsv(trans, root); + + if (unlikely(block_rsv->size == 0)) + goto try_reserve; +again: + ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + btrfs_update_global_block_rsv(fs_info); + goto again; + } + + /* + * The global reserve still exists to save us from ourselves, so don't + * warn_on if we are short on our delayed refs reserve. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "BTRFS: block rsv returned %d\n", ret); + } +try_reserve: + ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve if its space type is the same as the global + * reservation. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { + ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + return ERR_PTR(ret); +} diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index feb0a4d4b1c9..d1428bb73fc5 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -3,6 +3,7 @@ #ifndef BTRFS_BLOCK_RSV_H #define BTRFS_BLOCK_RSV_H +struct btrfs_trans_handle; enum btrfs_reserve_flush_enum; /* @@ -75,6 +76,12 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes, u64 *qgroup_to_release); +void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info); +void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info); +void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); +struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize); static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, @@ -83,4 +90,12 @@ static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); } +static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u32 blocksize) +{ + btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); + btrfs_block_rsv_release(fs_info, block_rsv, 0); +} + #endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 668a61724e89..ad05d8011dcc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4347,27 +4347,6 @@ out: return ret; } -static struct btrfs_block_rsv *get_block_rsv( - const struct btrfs_trans_handle *trans, - const struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv = NULL; - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || - (root == fs_info->csum_root && trans->adding_csums) || - (root == fs_info->uuid_root)) - block_rsv = trans->block_rsv; - - if (!block_rsv) - block_rsv = root->block_rsv; - - if (!block_rsv) - block_rsv = 
&fs_info->empty_block_rsv; - - return block_rsv; -} - /** * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. * @fs_info - the fs info for our fs. @@ -4508,95 +4487,6 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) 0, released, 0); } -static void update_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; - struct btrfs_space_info *sinfo = block_rsv->space_info; - u64 num_bytes; - - /* - * The global block rsv is based on the size of the extent tree, the - * checksum tree and the root tree. If the fs is empty we want to set - * it to a minimal amount for safety. - */ - num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + - btrfs_root_used(&fs_info->csum_root->root_item) + - btrfs_root_used(&fs_info->tree_root->root_item); - num_bytes = max_t(u64, num_bytes, SZ_16M); - - spin_lock(&sinfo->lock); - spin_lock(&block_rsv->lock); - - block_rsv->size = min_t(u64, num_bytes, SZ_512M); - - if (block_rsv->reserved < block_rsv->size) { - num_bytes = btrfs_space_info_used(sinfo, true); - if (sinfo->total_bytes > num_bytes) { - num_bytes = sinfo->total_bytes - num_bytes; - num_bytes = min(num_bytes, - block_rsv->size - block_rsv->reserved); - block_rsv->reserved += num_bytes; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, - 1); - } - } else if (block_rsv->reserved > block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, - -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, 0); - block_rsv->reserved = block_rsv->size; - } - - if (block_rsv->reserved == block_rsv->size) - block_rsv->full = 1; - else - block_rsv->full = 0; - - spin_unlock(&block_rsv->lock); - spin_unlock(&sinfo->lock); -} - -static void init_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *space_info; - - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); - fs_info->chunk_block_rsv.space_info = space_info; - - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - fs_info->global_block_rsv.space_info = space_info; - fs_info->trans_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.space_info = space_info; - fs_info->delayed_block_rsv.space_info = space_info; - fs_info->delayed_refs_rsv.space_info = space_info; - - fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; - fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; - if (fs_info->quota_root) - fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; - fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - - update_global_block_rsv(fs_info); -} - -static void release_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); - WARN_ON(fs_info->trans_block_rsv.size > 0); - WARN_ON(fs_info->trans_block_rsv.reserved > 0); - WARN_ON(fs_info->chunk_block_rsv.size > 0); - WARN_ON(fs_info->chunk_block_rsv.reserved > 0); - WARN_ON(fs_info->delayed_block_rsv.size > 0); - WARN_ON(fs_info->delayed_block_rsv.reserved > 0); - WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); - WARN_ON(fs_info->delayed_refs_rsv.size > 0); -} /* * btrfs_update_delayed_refs_rsv - 
adjust the size of the delayed refs rsv @@ -5360,7 +5250,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) up_write(&fs_info->commit_root_sem); - update_global_block_rsv(fs_info); + btrfs_update_global_block_rsv(fs_info); } /* @@ -7117,73 +7007,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return buf; } -static struct btrfs_block_rsv * -use_block_rsv(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - int ret; - bool global_updated = false; - - block_rsv = get_block_rsv(trans, root); - - if (unlikely(block_rsv->size == 0)) - goto try_reserve; -again: - ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); - if (!ret) - return block_rsv; - - if (block_rsv->failfast) - return ERR_PTR(ret); - - if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { - global_updated = true; - update_global_block_rsv(fs_info); - goto again; - } - - /* - * The global reserve still exists to save us from ourselves, so don't - * warn_on if we are short on our delayed refs reserve. - */ - if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && - btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG - "BTRFS: block rsv returned %d\n", ret); - } -try_reserve: - ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) - return block_rsv; - /* - * If we couldn't reserve metadata bytes try and use some from - * the global reserve if its space type is the same as the global - * reservation. - */ - if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && - block_rsv->space_info == global_rsv->space_info) { - ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } - return ERR_PTR(ret); -} - -static void unuse_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u32 blocksize) -{ - btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); - btrfs_block_rsv_release(fs_info, block_rsv, 0); -} - /* * finds a free extent and does all the dirty work required for allocation * returns the tree buffer or an ERR_PTR on error. 
@@ -7216,7 +7039,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, } #endif - block_rsv = use_block_rsv(trans, root, blocksize); + block_rsv = btrfs_use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); @@ -7274,7 +7097,7 @@ out_free_buf: out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); out_unuse: - unuse_block_rsv(fs_info, block_rsv, blocksize); + btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); } @@ -8761,7 +8584,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) */ synchronize_rcu(); - release_global_block_rsv(info); + btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { int i; @@ -9113,7 +8936,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } btrfs_add_raid_kobjects(info); - init_global_block_rsv(info); + btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); @@ -9227,7 +9050,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); - update_global_block_rsv(fs_info); + btrfs_update_global_block_rsv(fs_info); link_block_group(cache); -- cgit v1.2.3 From 9978059be8a1afd68bc0f7ab4c1883633ddd0312 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 21 Jun 2019 10:02:54 -0500 Subject: btrfs: Evaluate io_tree in find_lock_delalloc_range() Simplification. No point passing the tree variable when it can be evaluated from inode. The tests now use the io_tree from btrfs_inode as opposed to creating one. Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 ++---- fs/btrfs/extent_io.h | 2 +- fs/btrfs/tests/extent-io-tests.c | 30 ++++++++++++++++-------------- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1eb671c16ff1..6b154bce5687 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1780,10 +1780,10 @@ static noinline int lock_delalloc_pages(struct inode *inode, */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; @@ -3330,7 +3330,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, struct page *page, struct writeback_control *wbc, u64 delalloc_start, unsigned long *nr_written) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; bool found; u64 delalloc_to_write = 0; @@ -3340,8 +3339,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, while (delalloc_end < page_end) { - found = find_lock_delalloc_range(inode, tree, - page, + found = find_lock_delalloc_range(inode, page, &delalloc_start, &delalloc_end); if (!found) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6e13a62a2974..401423b16976 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -bool 
find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, +bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end); #endif diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index f0ccfb6449d2..1bf6b5a79191 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -10,6 +10,7 @@ #include "btrfs-tests.h" #include "../ctree.h" #include "../extent_io.h" +#include "../btrfs_inode.h" #define PROCESS_UNLOCK (1 << 0) #define PROCESS_RELEASE (1 << 1) @@ -58,7 +59,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, static int test_find_delalloc(u32 sectorsize) { struct inode *inode; - struct extent_io_tree tmp; + struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; unsigned long index = 0; @@ -76,12 +77,13 @@ static int test_find_delalloc(u32 sectorsize) test_std_err(TEST_ALLOC_INODE); return -ENOMEM; } + tmp = &BTRFS_I(inode)->io_tree; /* * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, &tmp, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); /* * First go through and create and mark all of our pages dirty, we pin @@ -108,10 +110,10 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); + set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -122,7 +124,7 @@ static int test_find_delalloc(u32 sectorsize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); unlock_page(locked_page); put_page(locked_page); @@ -139,10 +141,10 @@ static int test_find_delalloc(u32 sectorsize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); + set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -158,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); /* locked_page was unlocked above */ put_page(locked_page); @@ -176,7 +178,7 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -194,10 +196,10 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. 
*/ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); + set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("didn't find our range"); @@ -213,7 +215,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); /* * Now to test where we run into a page that is no longer dirty in the @@ -238,7 +240,7 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("didn't find our range"); @@ -256,7 +258,7 @@ static int test_find_delalloc(u32 sectorsize) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); + clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); -- cgit v1.2.3 From 6ef03debdb3d82d7deec65f96e143b9adcfb2cd4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 15:11:58 -0400 Subject: btrfs: migrate the delayed refs rsv code These belong with the delayed refs related code, not in extent-tree.c. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 --- fs/btrfs/delayed-ref.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/delayed-ref.h | 10 +++ fs/btrfs/extent-tree.c | 174 ------------------------------------------------- 4 files changed, 184 insertions(+), 183 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index adb6d2747e0c..45d6fc7ed21f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2622,8 +2622,6 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); -bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2784,13 +2782,6 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); -void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); -int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, - enum btrfs_reserve_flush_enum flush); -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, - u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a94fae897b3f..9a91d1eb0af4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -10,6 +10,7 @@ #include "delayed-ref.h" #include "transaction.h" #include "qgroup.h" +#include "space-info.h" struct kmem_cache 
*btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -24,6 +25,179 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; * of hammering updates on the extent allocation tree. */ +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + bool ret = false; + u64 reserved; + + spin_lock(&global_rsv->lock); + reserved = global_rsv->reserved; + spin_unlock(&global_rsv->lock); + + /* + * Since the global reserve is just kind of magic we don't really want + * to rely on it to save our bacon, so if our size is more than the + * delayed_refs_rsv and the global rsv then it's time to think about + * bailing. + */ + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); + return ret; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) +{ + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + u64 val; + + smp_mb(); + avg_runtime = trans->fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; + if (val >= NSEC_PER_SEC) + return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; + + return btrfs_check_space_for_delayed_refs(trans->fs_info); +} + +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and frees + * any excess reservation we had. + */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 released = 0; + + released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, + NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); +} + +/* + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, + trans->delayed_ref_updates); + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = 0; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} + +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. + * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. 
+ */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&src->lock); + src->reserved -= num_bytes; + src->size -= num_bytes; + spin_unlock(&src->lock); + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = 1; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + btrfs_space_info_add_old_bytes(fs_info, + delayed_refs_rsv->space_info, to_free); +} + +/** + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. + */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); + if (ret) + return ret; + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +} + /* * compare two delayed tree backrefs with same bytenr and type */ diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c18f93ea88ed..1c977e6d45dc 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -364,6 +364,16 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); + /* * helper functions to cast a node into its container */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ad05d8011dcc..413a80904673 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2753,49 +2753,6 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv 
*global_rsv = &fs_info->global_block_rsv; - bool ret = false; - u64 reserved; - - spin_lock(&global_rsv->lock); - reserved = global_rsv->reserved; - spin_unlock(&global_rsv->lock); - - /* - * Since the global reserve is just kind of magic we don't really want - * to rely on it to save our bacon, so if our size is more than the - * delayed_refs_rsv and the global rsv then it's time to think about - * bailing. - */ - spin_lock(&delayed_refs_rsv->lock); - reserved += delayed_refs_rsv->reserved; - if (delayed_refs_rsv->size >= reserved) - ret = true; - spin_unlock(&delayed_refs_rsv->lock); - return ret; -} - -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) -{ - u64 num_entries = - atomic_read(&trans->transaction->delayed_refs.num_entries); - u64 avg_runtime; - u64 val; - - smp_mb(); - avg_runtime = trans->fs_info->avg_delayed_ref_runtime; - val = num_entries * avg_runtime; - if (val >= NSEC_PER_SEC) - return 1; - if (val >= NSEC_PER_SEC / 2) - return 2; - - return btrfs_check_space_for_delayed_refs(trans->fs_info); -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -4347,90 +4304,6 @@ out: return ret; } -/** - * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. - * @fs_info - the fs info for our fs. - * @src - the source block rsv to transfer from. - * @num_bytes - the number of bytes to transfer. - * - * This transfers up to the num_bytes amount from the src rsv to the - * delayed_refs_rsv. Any extra bytes are returned to the space info. - */ -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, - u64 num_bytes) -{ - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - u64 to_free = 0; - - spin_lock(&src->lock); - src->reserved -= num_bytes; - src->size -= num_bytes; - spin_unlock(&src->lock); - - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { - u64 delta = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - if (num_bytes > delta) { - to_free = num_bytes - delta; - num_bytes = delta; - } - } else { - to_free = num_bytes; - num_bytes = 0; - } - - if (num_bytes) - delayed_refs_rsv->reserved += num_bytes; - if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = 1; - spin_unlock(&delayed_refs_rsv->lock); - - if (num_bytes) - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - if (to_free) - btrfs_space_info_add_old_bytes(fs_info, - delayed_refs_rsv->space_info, to_free); -} - -/** - * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. - * @fs_info - the fs_info for our fs. - * @flush - control how we can flush for this reservation. - * - * This will refill the delayed block_rsv up to 1 items size worth of space and - * will return -ENOSPC if we can't make the reservation. 
- */ -int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); - u64 num_bytes = 0; - int ret = -ENOSPC; - - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - num_bytes = block_rsv->size - block_rsv->reserved; - num_bytes = min(num_bytes, limit); - } - spin_unlock(&block_rsv->lock); - - if (!num_bytes) - return 0; - - ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, - num_bytes, flush); - if (ret) - return ret; - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - return 0; -} - /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. @@ -4466,53 +4339,6 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) qgroup_to_release); } -/** - * btrfs_delayed_refs_rsv_release - release a ref head's reservation. - * @fs_info - the fs_info for our fs. - * @nr - the number of items to drop. - * - * This drops the delayed ref head's count from the delayed refs rsv and frees - * any excess reservation we had. - */ -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); - u64 released = 0; - - released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, - NULL); - if (released) - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, released, 0); -} - - -/* - * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv - * @trans - the trans that may have generated delayed refs - * - * This is to be called anytime we may have adjusted trans->delayed_ref_updates, - * it'll calculate the additional size and add it to the delayed_refs_rsv. - */ -void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - u64 num_bytes; - - if (!trans->delayed_ref_updates) - return; - - num_bytes = btrfs_calc_trans_metadata_size(fs_info, - trans->delayed_ref_updates); - spin_lock(&delayed_rsv->lock); - delayed_rsv->size += num_bytes; - delayed_rsv->full = 0; - spin_unlock(&delayed_rsv->lock); - trans->delayed_ref_updates = 0; -} - /* * To be called after all the new block groups attached to the transaction * handle have been created (btrfs_create_pending_block_groups()). -- cgit v1.2.3 From fb6dea26601b60e41d70c310537dd1e2617b25b6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 15:11:59 -0400 Subject: btrfs: migrate btrfs_trans_release_chunk_metadata Move this into transaction.c with the rest of the transaction related code. 
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/extent-tree.c | 18 ------------------ fs/btrfs/transaction.c | 18 ++++++++++++++++++ fs/btrfs/transaction.h | 1 + 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 45d6fc7ed21f..56e6746010fc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2768,7 +2768,6 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len, bool qgroup_free); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); -void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 413a80904673..1987daf8e439 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4339,24 +4339,6 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) qgroup_to_release); } -/* - * To be called after all the new block groups attached to the transaction - * handle have been created (btrfs_create_pending_block_groups()). - */ -void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - if (!trans->chunk_bytes_reserved) - return; - - WARN_ON_ONCE(!list_empty(&trans->new_bgs)); - - btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, - trans->chunk_bytes_reserved); - trans->chunk_bytes_reserved = 0; -} - /* * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation * root: the root of the parent directory diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3f6811cdf803..3b8ae1a8f02d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -128,6 +128,24 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans) return atomic_read(&trans->num_extwriters); } +/* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + /* * either allocate a new transaction or hop into the existing one */ diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 78c446c222b7..527ea94b57d9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -224,5 +224,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); #endif -- cgit v1.2.3 From 867363429d706984915cb4b1f299ce05f8413e23 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 15:12:00 -0400 Subject: btrfs: migrate the delalloc space stuff to its own home We have code for data and metadata reservations for delalloc. There's quite a bit of code here, and it's used in a lot of places so I've separated it out to its own file. 
inode.c and file.c are already pretty large, and this code is complicated enough to live in its own space. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/ctree.h | 14 -- fs/btrfs/delalloc-space.c | 494 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/delalloc-space.h | 23 +++ fs/btrfs/extent-tree.c | 485 +------------------------------------------ fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/inode-map.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/ordered-data.c | 1 + fs/btrfs/relocation.c | 1 + 12 files changed, 526 insertions(+), 499 deletions(-) create mode 100644 fs/btrfs/delalloc-space.c create mode 100644 fs/btrfs/delalloc-space.h (limited to 'fs/btrfs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 032700ae7704..76a843198bcb 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o + block-rsv.o delalloc-space.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 56e6746010fc..299e11e6c554 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2758,16 +2758,6 @@ enum btrfs_chunk_alloc_enum { int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, enum btrfs_chunk_alloc_enum force); -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -int btrfs_check_data_free_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_free_reserved_data_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, - u64 len); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -2777,10 +2767,6 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); -int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c new file mode 100644 index 000000000000..17f7c0d38768 --- /dev/null +++ b/fs/btrfs/delalloc-space.c @@ -0,0 +1,494 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "delalloc-space.h" +#include "block-rsv.h" +#include "btrfs_inode.h" +#include "space-info.h" +#include "transaction.h" +#include "qgroup.h" + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + u64 used; + int ret = 
0; + int need_commit = 2; + int have_pinned_space; + + /* Make sure bytes are sectorsize aligned */ + bytes = ALIGN(bytes, fs_info->sectorsize); + + if (btrfs_is_free_space_inode(inode)) { + need_commit = 0; + ASSERT(current->journal_info); + } + +again: + /* Make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + used = btrfs_space_info_used(data_sinfo, true); + + if (used + bytes > data_sinfo->total_bytes) { + struct btrfs_trans_handle *trans; + + /* + * If we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&data_sinfo->lock); + + alloc_target = btrfs_data_alloc_profile(fs_info); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_chunk_alloc(trans, alloc_target, + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans); + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else { + have_pinned_space = 1; + goto commit_trans; + } + } + + goto again; + } + + /* + * If we don't have enough pinned space to deal with this + * allocation, and no removed chunk in current transaction, + * don't bother committing the transaction. + */ + have_pinned_space = __percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + spin_unlock(&data_sinfo->lock); + + /* Commit the current transaction and try again */ +commit_trans: + if (need_commit) { + need_commit--; + + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, + (u64)-1); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + if (have_pinned_space >= 0 || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || + need_commit > 0) { + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + /* + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. We don't need to + * explicitly run the delayed iputs here because + * the commit_transaction would have woken up + * the cleaner. 
+ */ + ret = btrfs_wait_on_delayed_iputs(fs_info); + if (ret) + return ret; + goto again; + } else { + btrfs_end_transaction(trans); + } + } + + trace_btrfs_space_reservation(fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); + return -ENOSPC; + } + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + data_sinfo->flags, bytes, 1); + spin_unlock(&data_sinfo->lock); + + return 0; +} + +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret; + + /* align the range */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); + if (ret < 0) + return ret; + + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) + btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). + */ +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_space_info *data_sinfo; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + data_sinfo = fs_info->data_sinfo; + spin_lock(&data_sinfo->lock); + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); + trace_btrfs_space_reservation(fs_info, "space_info", + data_sinfo->flags, len, 0); + spin_unlock(&data_sinfo->lock); +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-inode data rsv map for accurate reserved + * space framework. + */ +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->fs_info->sectorsize) - + round_down(start, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); +} + +/** + * btrfs_inode_rsv_release - release any excessive reservation. + * @inode - the inode we need to release from. + * @qgroup_free - free or convert qgroup meta. + * Unlike normal operation, qgroup meta reservation needs to know if we are + * freeing qgroup reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal release. + * + * This is the same as btrfs_block_rsv_release, except that it handles the + * tracepoint for the reservation. 
+ */ +static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 released = 0; + u64 qgroup_to_release = 0; + + /* + * Since we statically set the block_rsv->size we just want to say we + * are releasing 0 bytes, and then we'll just get the reservation over + * the size free'd. + */ + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delalloc", + btrfs_ino(inode), released, 0); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); + else + btrfs_qgroup_convert_reserved_meta(inode->root, + qgroup_to_release); +} + +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 qgroup_rsv_size = 0; + u64 csum_leaves; + unsigned outstanding_extents; + + lockdep_assert_held(&inode->lock); + outstanding_extents = inode->outstanding_extents; + if (outstanding_extents) + reserve_size = btrfs_calc_trans_metadata_size(fs_info, + outstanding_extents + 1); + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, + inode->csum_bytes); + reserve_size += btrfs_calc_trans_metadata_size(fs_info, + csum_leaves); + /* + * For qgroup rsv, the calculation is very simple: + * account one nodesize for each outstanding extent + * + * This is overestimating in most cases. + */ + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; + + spin_lock(&block_rsv->lock); + block_rsv->size = reserve_size; + block_rsv->qgroup_rsv_size = qgroup_rsv_size; + spin_unlock(&block_rsv->lock); +} + +static void calc_inode_reservations(struct btrfs_fs_info *fs_info, + u64 num_bytes, u64 *meta_reserve, + u64 *qgroup_reserve) +{ + u64 nr_extents = count_max_extents(num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + + /* We add one for the inode update at finish ordered time */ + *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, + nr_extents + csum_leaves + 1); + *qgroup_reserve = nr_extents * fs_info->nodesize; +} + +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 meta_reserve, qgroup_reserve; + unsigned nr_extents; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; + int ret = 0; + bool delalloc_lock = true; + + /* + * If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + * + * If we have a transaction open (can happen if we call truncate_block + * from truncate), then we need FLUSH_LIMIT so we don't deadlock. + */ + if (btrfs_is_free_space_inode(inode)) { + flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } else { + if (current->journal_info) + flush = BTRFS_RESERVE_FLUSH_LIMIT; + + if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); + } + + if (delalloc_lock) + mutex_lock(&inode->delalloc_mutex); + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + + /* + * We always want to do it this way, every other way is wrong and ends + * in tears. 
Pre-reserving the amount we are going to add will always + * be the right way, because otherwise if we have enough parallelism we + * could end up with thousands of inodes all holding little bits of + * reservations they were able to make previously and the only way to + * reclaim that space is to ENOSPC out the operations and clear + * everything out and try again, which is bad. This way we just + * over-reserve slightly, and clean up the mess when we are done. + */ + calc_inode_reservations(fs_info, num_bytes, &meta_reserve, + &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + if (ret) + goto out_fail; + ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + if (ret) + goto out_qgroup; + + /* + * Now we need to update our outstanding extents and csum bytes _first_ + * and then add the reservation to the block_rsv. This keeps us from + * racing with an ordered completion or some such that would think it + * needs to free the reservation we just made. + */ + spin_lock(&inode->lock); + nr_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + /* Now we can safely add our space to our block rsv */ + btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), meta_reserve, 1); + + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_reserve; + spin_unlock(&block_rsv->lock); + + if (delalloc_lock) + mutex_unlock(&inode->delalloc_mutex); + return 0; +out_qgroup: + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); +out_fail: + btrfs_inode_rsv_release(inode, true); + if (delalloc_lock) + mutex_unlock(&inode->delalloc_mutex); + return ret; +} + +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations, or on error for the same reason. + */ +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + spin_lock(&inode->lock); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, qgroup_free); +} + +/** + * btrfs_delalloc_release_extents - release our outstanding_extents + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with + * @qgroup_free: do we need to free qgroup meta reservation or convert them. + * + * When we reserve space we increase outstanding_extents for the extents we may + * add. Once we've set the range as delalloc or created our ordered extents we + * have outstanding_extents to track the real usage, so we use this to free our + * temporarily tracked outstanding_extents. This _must_ be used in conjunction + * with btrfs_delalloc_reserve_metadata. 
+ */ +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned num_extents; + + spin_lock(&inode->lock); + num_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, qgroup_free); +} + +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. + * + * This will do the following things + * + * - reserve space in data space info for num bytes + * and reserve precious corresponding qgroup space + * (Done in check_data_free_space) + * + * - reserve space for metadata space, based on the number of outstanding + * extents and how much csums will be needed + * also reserve metadata space in a per root over-reserve method. + * - add to the inodes->delalloc_bytes + * - add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) + * + * Return 0 for success + * Return <0 for error(-ENOSPC or -EQUOT) + */ +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, reserved, start, len); + if (ret < 0) + return ret; + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, *reserved, start, len); + return ret; +} + +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @start: start position of the space already reserved + * @len: the len of the space already reserved + * @release_bytes: the len of the space we consumed or didn't use + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. 
+ */ +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free) +{ + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); + btrfs_free_reserved_data_space(inode, reserved, start, len); +} diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h new file mode 100644 index 000000000000..54466fbd7075 --- /dev/null +++ b/fs/btrfs/delalloc-space.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DELALLOC_SPACE_H +#define BTRFS_DELALLOC_SPACE_H + +struct extent_changeset; + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free); +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len); +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); + +#endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1987daf8e439..9870310a861b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -30,6 +30,7 @@ #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" +#include "delalloc-space.h" #undef SCRAMBLE_DELAYED_REFS @@ -3867,206 +3868,6 @@ u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; - - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, fs_info->sectorsize); - - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } - -again: - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * if we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. 
- */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. - */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. - */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, bytes, 1); - spin_unlock(&data_sinfo->lock); - - return 0; -} - -int btrfs_check_data_free_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; - - /* align the range */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); - - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); - if (ret < 0) - return ret; - - /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); - if (ret < 0) - btrfs_free_reserved_data_space_noquota(inode, start, len); - else - ret = 0; - return ret; -} - -/* - * Called if we need to clear a data reservation for this inode - * Normally in a error case. - * - * This one will *NOT* use accurate qgroup reserved space API, just for case - * which we can't sleep and is sure it won't affect qgroup reserved space. - * Like clear_bit_hook(). 
- */ -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, - u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_space_info *data_sinfo; - - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); - - data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, len, 0); - spin_unlock(&data_sinfo->lock); -} - -/* - * Called if we need to clear a data reservation for this inode - * Normally in a error case. - * - * This one will handle the per-inode data rsv map for accurate reserved - * space framework. - */ -void btrfs_free_reserved_data_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, root->fs_info->sectorsize) - - round_down(start, root->fs_info->sectorsize); - start = round_down(start, root->fs_info->sectorsize); - - btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, reserved, start, len); -} - static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -4304,41 +4105,6 @@ out: return ret; } -/** - * btrfs_inode_rsv_release - release any excessive reservation. - * @inode - the inode we need to release from. - * @qgroup_free - free or convert qgroup meta. - * Unlike normal operation, qgroup meta reservation needs to know if we are - * freeing qgroup reservation or just converting it into per-trans. Normally - * @qgroup_free is true for error handling, and false for normal release. - * - * This is the same as btrfs_block_rsv_release, except that it handles the - * tracepoint for the reservation. - */ -static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 released = 0; - u64 qgroup_to_release = 0; - - /* - * Since we statically set the block_rsv->size we just want to say we - * are releasing 0 bytes, and then we'll just get the reservation over - * the size free'd. 
- */ - released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, - &qgroup_to_release); - if (released > 0) - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), released, 0); - if (qgroup_free) - btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); - else - btrfs_qgroup_convert_reserved_meta(inode->root, - qgroup_to_release); -} - /* * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation * root: the root of the parent directory @@ -4393,255 +4159,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, btrfs_block_rsv_release(fs_info, rsv, (u64)-1); } -static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, - struct btrfs_inode *inode) -{ - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 reserve_size = 0; - u64 qgroup_rsv_size = 0; - u64 csum_leaves; - unsigned outstanding_extents; - - lockdep_assert_held(&inode->lock); - outstanding_extents = inode->outstanding_extents; - if (outstanding_extents) - reserve_size = btrfs_calc_trans_metadata_size(fs_info, - outstanding_extents + 1); - csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, - inode->csum_bytes); - reserve_size += btrfs_calc_trans_metadata_size(fs_info, - csum_leaves); - /* - * For qgroup rsv, the calculation is very simple: - * account one nodesize for each outstanding extent - * - * This is overestimating in most cases. - */ - qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; - - spin_lock(&block_rsv->lock); - block_rsv->size = reserve_size; - block_rsv->qgroup_rsv_size = qgroup_rsv_size; - spin_unlock(&block_rsv->lock); -} - -static void calc_inode_reservations(struct btrfs_fs_info *fs_info, - u64 num_bytes, u64 *meta_reserve, - u64 *qgroup_reserve) -{ - u64 nr_extents = count_max_extents(num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); - - /* We add one for the inode update at finish ordered time */ - *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, - nr_extents + csum_leaves + 1); - *qgroup_reserve = nr_extents * fs_info->nodesize; -} - -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 meta_reserve, qgroup_reserve; - unsigned nr_extents; - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; - int ret = 0; - bool delalloc_lock = true; - - /* If we are a free space inode we need to not flush since we will be in - * the middle of a transaction commit. We also don't need the delalloc - * mutex since we won't race with anybody. We need this mostly to make - * lockdep shut its filthy mouth. - * - * If we have a transaction open (can happen if we call truncate_block - * from truncate), then we need FLUSH_LIMIT so we don't deadlock. - */ - if (btrfs_is_free_space_inode(inode)) { - flush = BTRFS_RESERVE_NO_FLUSH; - delalloc_lock = false; - } else { - if (current->journal_info) - flush = BTRFS_RESERVE_FLUSH_LIMIT; - - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); - } - - if (delalloc_lock) - mutex_lock(&inode->delalloc_mutex); - - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - - /* - * We always want to do it this way, every other way is wrong and ends - * in tears. 
Pre-reserving the amount we are going to add will always - * be the right way, because otherwise if we have enough parallelism we - * could end up with thousands of inodes all holding little bits of - * reservations they were able to make previously and the only way to - * reclaim that space is to ENOSPC out the operations and clear - * everything out and try again, which is bad. This way we just - * over-reserve slightly, and clean up the mess when we are done. - */ - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, - &qgroup_reserve); - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); - if (ret) - goto out_fail; - ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); - if (ret) - goto out_qgroup; - - /* - * Now we need to update our outstanding extents and csum bytes _first_ - * and then add the reservation to the block_rsv. This keeps us from - * racing with an ordered completion or some such that would think it - * needs to free the reservation we just made. - */ - spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - /* Now we can safely add our space to our block rsv */ - btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false); - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), meta_reserve, 1); - - spin_lock(&block_rsv->lock); - block_rsv->qgroup_rsv_reserved += qgroup_reserve; - spin_unlock(&block_rsv->lock); - - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); - return 0; -out_qgroup: - btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); -out_fail: - btrfs_inode_rsv_release(inode, true); - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); - return ret; -} - -/** - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode - * @inode: the inode to release the reservation for. - * @num_bytes: the number of bytes we are releasing. - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation - * - * This will release the metadata reservation for an inode. This can be called - * once we complete IO for a given set of bytes to release their metadata - * reservations, or on error for the same reason. - */ -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - spin_lock(&inode->lock); - inode->csum_bytes -= num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - if (btrfs_is_testing(fs_info)) - return; - - btrfs_inode_rsv_release(inode, qgroup_free); -} - -/** - * btrfs_delalloc_release_extents - release our outstanding_extents - * @inode: the inode to balance the reservation for. - * @num_bytes: the number of bytes we originally reserved with - * @qgroup_free: do we need to free qgroup meta reservation or convert them. - * - * When we reserve space we increase outstanding_extents for the extents we may - * add. Once we've set the range as delalloc or created our ordered extents we - * have outstanding_extents to track the real usage, so we use this to free our - * temporarily tracked outstanding_extents. This _must_ be used in conjunction - * with btrfs_delalloc_reserve_metadata. 
- */ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned num_extents; - - spin_lock(&inode->lock); - num_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, -num_extents); - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - if (btrfs_is_testing(fs_info)) - return; - - btrfs_inode_rsv_release(inode, qgroup_free); -} - -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for - * delalloc - * @inode: inode we're writing to - * @start: start range we are writing to - * @len: how long the range we are writing to - * @reserved: mandatory parameter, record actually reserved qgroup ranges of - * current reservation. - * - * This will do the following things - * - * o reserve space in data space info for num bytes - * and reserve precious corresponding qgroup space - * (Done in check_data_free_space) - * - * o reserve space for metadata space, based on the number of outstanding - * extents and how much csums will be needed - * also reserve metadata space in a per root over-reserve method. - * o add to the inodes->delalloc_bytes - * o add it to the fs_info's delalloc inodes list. - * (Above 3 all done in delalloc_reserve_metadata) - * - * Return 0 for success - * Return <0 for error(-ENOSPC or -EQUOT) - */ -int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) -{ - int ret; - - ret = btrfs_check_data_free_space(inode, reserved, start, len); - if (ret < 0) - return ret; - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); - if (ret < 0) - btrfs_free_reserved_data_space(inode, *reserved, start, len); - return ret; -} - -/** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @start: start position of the space already reserved - * @len: the len of the space already reserved - * @release_bytes: the len of the space we consumed or didn't use - * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - * Also it will handle the qgroup reserved space. 
- */ -void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free) -{ - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); - btrfs_free_reserved_data_space(inode, reserved, start, len); -} - static int update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a8e0fc2d1c86..58a18ed11546 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -26,6 +26,7 @@ #include "volumes.h" #include "qgroup.h" #include "compression.h" +#include "delalloc-space.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 390cd3d7d5ea..062be9dde4c6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -19,6 +19,7 @@ #include "inode-map.h" #include "volumes.h" #include "space-info.h" +#include "delalloc-space.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ffca2abf13d0..2e8bb402050b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -11,6 +11,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "transaction.h" +#include "delalloc-space.h" static int caching_kthread(void *data) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 525790a38f6d..1af069a9a0c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -47,6 +47,7 @@ #include "props.h" #include "qgroup.h" #include "dedupe.h" +#include "delalloc-space.h" struct btrfs_iget_args { struct btrfs_key *location; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ca32b97b6b4b..d0743ec1231d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -44,6 +44,7 @@ #include "tree-log.h" #include "compression.h" #include "space-info.h" +#include "delalloc-space.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 3f67b7827393..1744ba8b2754 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -13,6 +13,7 @@ #include "extent_io.h" #include "disk-io.h" #include "compression.h" +#include "delalloc-space.h" static struct kmem_cache *btrfs_ordered_extent_cache; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 22a3c69864fa..7f219851fa23 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -20,6 +20,7 @@ #include "inode-map.h" #include "qgroup.h" #include "print-tree.h" +#include "delalloc-space.h" /* * backref_node, mapping_node and tree_block start with this -- cgit v1.2.3 From 28a32d2b1a6d7860e0b364c34a6b4205dce85537 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Jun 2019 15:12:01 -0400 Subject: btrfs: move the subvolume reservation stuff out of extent-tree.c This is just two functions, put it in root-tree.c since it involves root items. 
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 54 ------------------------------------------------ fs/btrfs/root-tree.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 54 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9870310a861b..d3b58e388535 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4105,60 +4105,6 @@ out: return ret; } -/* - * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation - * root: the root of the parent directory - * rsv: block reservation - * items: the number of items that we need do reservation - * use_global_rsv: allow fallback to the global block reservation - * - * This function is used to reserve the space for snapshot/subvolume - * creation and deletion. Those operations are different with the - * common file/directory operations, they change two fs/file trees - * and root tree, the number of items that the qgroup reserves is - * different with the free space reservation. So we can not use - * the space reservation mechanism in start_transaction(). - */ -int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, int items, - bool use_global_rsv) -{ - u64 qgroup_num_bytes = 0; - u64 num_bytes; - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - /* One for parent inode, two for dir entries */ - qgroup_num_bytes = 3 * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_prealloc(root, - qgroup_num_bytes, true); - if (ret) - return ret; - } - - num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); - rsv->space_info = btrfs_find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); - ret = btrfs_block_rsv_add(root, rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); - - if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); - - if (ret && qgroup_num_bytes) - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); - - return ret; -} - -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) -{ - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); -} - static int update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 22124122728c..47733fb55df7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -9,6 +9,8 @@ #include "transaction.h" #include "disk-io.h" #include "print-tree.h" +#include "qgroup.h" +#include "space-info.h" /* * Read a root item from the tree. In case we detect a root item smaller then @@ -497,3 +499,57 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); spin_unlock(&root->root_item_lock); } + +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * use_global_rsv: allow fallback to the global block reservation + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. 
Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reservation mechanism in start_transaction(). + */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, int items, + bool use_global_rsv) +{ + u64 qgroup_num_bytes = 0; + u64 num_bytes; + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* One for parent inode, two for dir entries */ + qgroup_num_bytes = 3 * fs_info->nodesize; + ret = btrfs_qgroup_reserve_meta_prealloc(root, + qgroup_num_bytes, true); + if (ret) + return ret; + } + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); + rsv->space_info = btrfs_find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); + + if (ret && qgroup_num_bytes) + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + btrfs_block_rsv_release(fs_info, rsv, (u64)-1); +} -- cgit v1.2.3 From e02d48eaaed77f6c36916a7aa65c451e1f9d9aab Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 5 Jul 2019 08:26:24 +0100 Subject: btrfs: fix memory leak of path on error return path Currently if the allocation of roots or tmp_ulist fails the error handling does not free up the allocation of path causing a memory leak. Fix this and other similar leaks by moving the call of btrfs_free_path from label out to label out_free_ulist. Kudos to David Sterba for spotting the issue in my original fix and suggesting the correct way to fix the leak and Anand Jain for spotting a double free issue. Addresses-Coverity: ("Resource leak") Fixes: 5911c8fe05c5 ("btrfs: fiemap: preallocate ulists for btrfs_check_shared") Reviewed-by: Nikolay Borisov Signed-off-by: Colin Ian King Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6b154bce5687..aea990473392 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4611,7 +4611,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), -1, 0); if (ret < 0) { - btrfs_free_path(path); goto out_free_ulist; } else { WARN_ON(!ret); @@ -4764,11 +4763,11 @@ out_free: ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - btrfs_free_path(path); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); out_free_ulist: + btrfs_free_path(path); ulist_free(roots); ulist_free(tmp_ulist); return ret; -- cgit v1.2.3
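The leak fix above boils down to one rule: a single label owns the teardown of everything allocated before the first failure point, and the free helpers it calls (btrfs_free_path() and ulist_free() both return immediately when passed NULL) make that label safe to reach from any point without leaks or double frees. Below is a minimal user-space sketch of that pattern, not btrfs code: the struct and function names are invented for illustration, and plain malloc()/free() stand in for the NULL-tolerant btrfs helpers, with free(NULL) playing the same role.

	#include <errno.h>
	#include <stdlib.h>
	#include <stdio.h>

	/* Stand-ins for btrfs_path and ulist, for illustration only. */
	struct path  { char buf[64]; };
	struct ulist { char buf[64]; };

	static int fiemap_like_setup(void)
	{
		struct path *path = NULL;
		struct ulist *roots = NULL;
		struct ulist *tmp_ulist = NULL;
		int ret = 0;

		path = malloc(sizeof(*path));
		roots = malloc(sizeof(*roots));
		tmp_ulist = malloc(sizeof(*tmp_ulist));
		if (!path || !roots || !tmp_ulist) {
			ret = -ENOMEM;
			goto out_free_ulist;	/* frees whatever was allocated */
		}

		/* ... real work; any later failure also jumps to out_free_ulist ... */

	out_free_ulist:
		/*
		 * Single owner of the cleanup: each pointer is freed exactly
		 * once, and free(NULL) is a no-op for any that were never set.
		 */
		free(path);
		free(roots);
		free(tmp_ulist);
		return ret;
	}

	int main(void)
	{
		printf("ret = %d\n", fiemap_like_setup());
		return 0;
	}

With every pointer initialised to NULL and freed only at the final label, adding or reordering allocations never requires touching the error paths, which is exactly what moving btrfs_free_path() from "out" to "out_free_ulist" restores for the path in extent_fiemap().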