From 2f96e40212d435b328459ba6b3956395eed8fa9f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Jan 2021 16:26:17 -0500 Subject: btrfs: fix possible free space tree corruption with online conversion While running btrfs/011 in a loop I would often ASSERT() while trying to add a new free space entry that already existed, or get an EEXIST while adding a new block to the extent tree, which is another indication of double allocation. This occurs because when we do the free space tree population, we create the new root and then populate the tree and commit the transaction. The problem is when you create a new root, the root node and commit root node are the same. During this initial transaction commit we will run all of the delayed refs that were paused during the free space tree generation, and thus begin to cache block groups. While caching block groups the caching thread will be reading from the main root for the free space tree, so as we make allocations we'll be changing the free space tree, which can cause us to add the same range twice which results in either the ASSERT(ret != -EEXIST); in __btrfs_add_free_space, or in a variety of different errors when running delayed refs because of a double allocation. Fix this by marking the fs_info as unsafe to load the free space tree, and fall back on the old slow method. We could be smarter than this, for example caching the block group while we're populating the free space tree, but since this is a serious problem I've opted for the simplest solution. CC: stable@vger.kernel.org # 4.9+ Fixes: a5ed91828518 ("Btrfs: implement the free space B-tree") Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 10 +++++++++- fs/btrfs/ctree.h | 3 +++ fs/btrfs/free-space-tree.c | 10 +++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 0886e81e5540..48ebc106a606 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -673,7 +673,15 @@ static noinline void caching_thread(struct btrfs_work *work) wake_up(&caching_ctl->wait); } - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + /* + * If we are in the transaction that populated the free space tree we + * can't actually cache from the free space tree as our commit root and + * real root are the same, so we could change the contents of the blocks + * while caching. Instead do the slow caching in this case, and after + * the transaction has committed we will be safe. + */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) ret = load_free_space_tree(caching_ctl); else ret = load_extent_tree_free(caching_ctl); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0225c5208f44..47ca8edafb5e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -564,6 +564,9 @@ enum { /* Indicate that we need to cleanup space cache v1 */ BTRFS_FS_CLEANUP_SPACE_CACHE_V1, + + /* Indicate that we can't trust the free space tree for caching yet */ + BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, }; /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index e33a65bd9a0c..a33bca94d133 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1150,6 +1150,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) return PTR_ERR(trans); set_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + set_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); free_space_root = btrfs_create_tree(trans, BTRFS_FREE_SPACE_TREE_OBJECTID); if (IS_ERR(free_space_root)) { @@ -1171,11 +1172,18 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + ret = btrfs_commit_transaction(trans); - return btrfs_commit_transaction(trans); + /* + * Now that we've committed the transaction any reading of our commit + * root will be safe, so we can cache from the free space tree now. + */ + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); + return ret; abort: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); + clear_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags); btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; -- cgit v1.2.3 From c41ec4529d3448df8998950d7bada757a1b321cf Mon Sep 17 00:00:00 2001 From: Su Yue Date: Thu, 21 Jan 2021 19:39:10 +0800 Subject: btrfs: fix lockdep warning due to seqcount_mutex on 32bit arch This effectively reverts commit d5c8238849e7 ("btrfs: convert data_seqcount to seqcount_mutex_t"). While running fstests on 32 bits test box, many tests failed because of warnings in dmesg. One of those warnings (btrfs/003): [66.441317] WARNING: CPU: 6 PID: 9251 at include/linux/seqlock.h:279 btrfs_remove_chunk+0x58b/0x7b0 [btrfs] [66.441446] CPU: 6 PID: 9251 Comm: btrfs Tainted: G O 5.11.0-rc4-custom+ #5 [66.441449] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ArchLinux 1.14.0-1 04/01/2014 [66.441451] EIP: btrfs_remove_chunk+0x58b/0x7b0 [btrfs] [66.441472] EAX: 00000000 EBX: 00000001 ECX: c576070c EDX: c6b15803 [66.441475] ESI: 10000000 EDI: 00000000 EBP: c56fbcfc ESP: c56fbc70 [66.441477] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010246 [66.441481] CR0: 80050033 CR2: 05c8da20 CR3: 04b20000 CR4: 00350ed0 [66.441485] Call Trace: [66.441510] btrfs_relocate_chunk+0xb1/0x100 [btrfs] [66.441529] ? btrfs_lookup_block_group+0x17/0x20 [btrfs] [66.441562] btrfs_balance+0x8ed/0x13b0 [btrfs] [66.441586] ? btrfs_ioctl_balance+0x333/0x3c0 [btrfs] [66.441619] ? __this_cpu_preempt_check+0xf/0x11 [66.441643] btrfs_ioctl_balance+0x333/0x3c0 [btrfs] [66.441664] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [66.441683] btrfs_ioctl+0x414/0x2ae0 [btrfs] [66.441700] ? __lock_acquire+0x35f/0x2650 [66.441717] ? lockdep_hardirqs_on+0x87/0x120 [66.441720] ? lockdep_hardirqs_on_prepare+0xd0/0x1e0 [66.441724] ? call_rcu+0x2d3/0x530 [66.441731] ? __might_fault+0x41/0x90 [66.441736] ? kvm_sched_clock_read+0x15/0x50 [66.441740] ? sched_clock+0x8/0x10 [66.441745] ? sched_clock_cpu+0x13/0x180 [66.441750] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [66.441750] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [66.441768] __ia32_sys_ioctl+0x165/0x8a0 [66.441773] ? __this_cpu_preempt_check+0xf/0x11 [66.441785] ? __might_fault+0x89/0x90 [66.441791] __do_fast_syscall_32+0x54/0x80 [66.441796] do_fast_syscall_32+0x32/0x70 [66.441801] do_SYSENTER_32+0x15/0x20 [66.441805] entry_SYSENTER_32+0x9f/0xf2 [66.441808] EIP: 0xab7b5549 [66.441814] EAX: ffffffda EBX: 00000003 ECX: c4009420 EDX: bfa91f5c [66.441816] ESI: 00000003 EDI: 00000001 EBP: 00000000 ESP: bfa91e98 [66.441818] DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000292 [66.441833] irq event stamp: 42579 [66.441835] hardirqs last enabled at (42585): [] console_unlock+0x495/0x590 [66.441838] hardirqs last disabled at (42590): [] console_unlock+0x405/0x590 [66.441840] softirqs last enabled at (41698): [] call_on_stack+0x1c/0x60 [66.441843] softirqs last disabled at (41681): [] call_on_stack+0x1c/0x60 ======================================================================== btrfs_remove_chunk+0x58b/0x7b0: __seqprop_mutex_assert at linux/./include/linux/seqlock.h:279 (inlined by) btrfs_device_set_bytes_used at linux/fs/btrfs/volumes.h:212 (inlined by) btrfs_remove_chunk at linux/fs/btrfs/volumes.c:2994 ======================================================================== The warning is produced by lockdep_assert_held() in __seqprop_mutex_assert() if CONFIG_LOCKDEP is enabled. And "olumes.c:2994 is btrfs_device_set_bytes_used() with mutex lock fs_info->chunk_mutex held already. After adding some debug prints, the cause was found that many __alloc_device() are called with NULL @fs_info (during scanning ioctl). Inside the function, btrfs_device_data_ordered_init() is expanded to seqcount_mutex_init(). In this scenario, its second parameter info->chunk_mutex is &NULL->chunk_mutex which equals to offsetof(struct btrfs_fs_info, chunk_mutex) unexpectedly. Thus, seqcount_mutex_init() is called in wrong way. And later btrfs_device_get/set helpers trigger lockdep warnings. The device and filesystem object lifetimes are different and we'd have to synchronize initialization of the btrfs_device::data_seqcount with the fs_info, possibly using some additional synchronization. It would still not prevent concurrent access to the seqcount lock when it's used for read and initialization. Commit d5c8238849e7 ("btrfs: convert data_seqcount to seqcount_mutex_t") does not mention a particular problem being fixed so revert should not cause any harm and we'll get the lockdep warning fixed. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=210139 Reported-by: Erhard F Fixes: d5c8238849e7 ("btrfs: convert data_seqcount to seqcount_mutex_t") CC: stable@vger.kernel.org # 5.10 CC: Davidlohr Bueso Signed-off-by: Su Yue Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0c7f4f6237e8..b900cc7849b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -433,7 +433,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); - btrfs_device_data_ordered_init(dev, fs_info); + btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); extent_io_tree_init(fs_info, &dev->alloc_state, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1997a4649a66..c43663d9c22e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -39,10 +39,10 @@ struct btrfs_io_geometry { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include #define __BTRFS_NEED_DEVICE_DATA_ORDERED -#define btrfs_device_data_ordered_init(device, info) \ - seqcount_mutex_init(&device->data_seqcount, &info->chunk_mutex) +#define btrfs_device_data_ordered_init(device) \ + seqcount_init(&device->data_seqcount) #else -#define btrfs_device_data_ordered_init(device, info) do { } while (0) +#define btrfs_device_data_ordered_init(device) do { } while (0) #endif #define BTRFS_DEV_STATE_WRITEABLE (0) @@ -76,8 +76,7 @@ struct btrfs_device { blk_status_t last_flush_error; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED - /* A seqcount_t with associated chunk_mutex (for lockdep) */ - seqcount_mutex_t data_seqcount; + seqcount_t data_seqcount; #endif /* the internal btrfs device id */ @@ -168,9 +167,11 @@ btrfs_device_get_##name(const struct btrfs_device *dev) \ static inline void \ btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ { \ + preempt_disable(); \ write_seqcount_begin(&dev->data_seqcount); \ dev->name = size; \ write_seqcount_end(&dev->data_seqcount); \ + preempt_enable(); \ } #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) #define BTRFS_DEVICE_GETSET_FUNCS(name) \ -- cgit v1.2.3 From 9ad6d91f056b99dbe59a262810cb342519ea8d39 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 22 Jan 2021 19:07:45 +0000 Subject: btrfs: fix log replay failure due to race with space cache rebuild After a sudden power failure we may end up with a space cache on disk that is not valid and needs to be rebuilt from scratch. If that happens, during log replay when we attempt to pin an extent buffer from a log tree, at btrfs_pin_extent_for_log_replay(), we do not wait for the space cache to be rebuilt through the call to: btrfs_cache_block_group(cache, 1); That is because that only waits for the task (work queue job) that loads the space cache to change the cache state from BTRFS_CACHE_FAST to any other value. That is ok when the space cache on disk exists and is valid, but when the cache is not valid and needs to be rebuilt, it ends up returning as soon as the cache state changes to BTRFS_CACHE_STARTED (done at caching_thread()). So this means that we can end up trying to unpin a range which is not yet marked as free in the block group. This results in the call to btrfs_remove_free_space() to return -EINVAL to btrfs_pin_extent_for_log_replay(), which in turn makes the log replay fail as well as mounting the filesystem. More specifically the -EINVAL comes from free_space_cache.c:remove_from_bitmap(), because the requested range is not marked as free space (ones in the bitmap), we have the following condition triggered: static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl, (...) if (ret < 0 || search_start != *offset) return -EINVAL; (...) It's the "search_start != *offset" that results in the condition being evaluated to true. When this happens we got the following in dmesg/syslog: [72383.415114] BTRFS: device fsid 32b95b69-0ea9-496a-9f02-3f5a56dc9322 devid 1 transid 1432 /dev/sdb scanned by mount (3816007) [72383.417837] BTRFS info (device sdb): disk space caching is enabled [72383.418536] BTRFS info (device sdb): has skinny extents [72383.423846] BTRFS info (device sdb): start tree-log replay [72383.426416] BTRFS warning (device sdb): block group 30408704 has wrong amount of free space [72383.427686] BTRFS warning (device sdb): failed to load free space cache for block group 30408704, rebuilding it now [72383.454291] BTRFS: error (device sdb) in btrfs_recover_log_trees:6203: errno=-22 unknown (Failed to pin buffers while recovering log root tree.) [72383.456725] BTRFS: error (device sdb) in btrfs_replay_log:2253: errno=-22 unknown (Failed to recover log tree) [72383.460241] BTRFS error (device sdb): open_ctree failed We also mark the range for the extent buffer in the excluded extents io tree. That is fine when the space cache is valid on disk and we can load it, in which case it causes no problems. However, for the case where we need to rebuild the space cache, because it is either invalid or it is missing, having the extent buffer range marked in the excluded extents io tree leads to a -EINVAL failure from the call to btrfs_remove_free_space(), resulting in the log replay and mount to fail. This is because by having the range marked in the excluded extents io tree, the caching thread ends up never adding the range of the extent buffer as free space in the block group since the calls to add_new_free_space(), called from load_extent_tree_free(), filter out any ranges that are marked as excluded extents. So fix this by making sure that during log replay we wait for the caching task to finish completely when we need to rebuild a space cache, and also drop the need to mark the extent buffer range in the excluded extents io tree, as well as clearing ranges from that tree at btrfs_finish_extent_commit(). This started to happen with some frequency on large filesystems having block groups with a lot of fragmentation since the recent commit e747853cae3ae3 ("btrfs: load free space cache asynchronously"), but in fact the issue has been there for years, it was just much less likely to happen. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 61 +++++++++++++++----------------------------------- 1 file changed, 18 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 30b1a630dc2f..0c335dae5af7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2602,8 +2602,6 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache; int ret; - btrfs_add_excluded_extent(trans->fs_info, bytenr, num_bytes); - cache = btrfs_lookup_block_group(trans->fs_info, bytenr); if (!cache) return -EINVAL; @@ -2615,11 +2613,19 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, * the pinned extents. */ btrfs_cache_block_group(cache, 1); + /* + * Make sure we wait until the cache is completely built in case it is + * missing or is invalid and therefore needs to be rebuilt. + */ + ret = btrfs_wait_block_group_cache_done(cache); + if (ret) + goto out; pin_down_extent(trans, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); +out: btrfs_put_block_group(cache); return ret; } @@ -2629,50 +2635,22 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, { int ret; struct btrfs_block_group *block_group; - struct btrfs_caching_control *caching_ctl; block_group = btrfs_lookup_block_group(fs_info, start); if (!block_group) return -EINVAL; - btrfs_cache_block_group(block_group, 0); - caching_ctl = btrfs_get_caching_control(block_group); - - if (!caching_ctl) { - /* Logic error */ - BUG_ON(!btrfs_block_group_done(block_group)); - ret = btrfs_remove_free_space(block_group, start, num_bytes); - } else { - /* - * We must wait for v1 caching to finish, otherwise we may not - * remove our space. - */ - btrfs_wait_space_cache_v1_finished(block_group, caching_ctl); - mutex_lock(&caching_ctl->mutex); - - if (start >= caching_ctl->progress) { - ret = btrfs_add_excluded_extent(fs_info, start, - num_bytes); - } else if (start + num_bytes <= caching_ctl->progress) { - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - } else { - num_bytes = caching_ctl->progress - start; - ret = btrfs_remove_free_space(block_group, - start, num_bytes); - if (ret) - goto out_lock; + btrfs_cache_block_group(block_group, 1); + /* + * Make sure we wait until the cache is completely built in case it is + * missing or is invalid and therefore needs to be rebuilt. + */ + ret = btrfs_wait_block_group_cache_done(block_group); + if (ret) + goto out; - num_bytes = (start + num_bytes) - - caching_ctl->progress; - start = caching_ctl->progress; - ret = btrfs_add_excluded_extent(fs_info, start, - num_bytes); - } -out_lock: - mutex_unlock(&caching_ctl->mutex); - btrfs_put_caching_control(caching_ctl); - } + ret = btrfs_remove_free_space(block_group, start, num_bytes); +out: btrfs_put_block_group(block_group); return ret; } @@ -2863,9 +2841,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - clear_extent_bits(&fs_info->excluded_extents, start, - end, EXTENT_UPTODATE); if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, -- cgit v1.2.3