diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-04 13:03:38 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-04-04 13:03:38 -0700 |
commit | 94514bbe9e5c402c4232af158a295a8fdfd72a2c (patch) | |
tree | c990c722cbac5abe8a3b28e0564effa722b7c80e /fs/btrfs/extent-tree.c | |
parent | 547c43d777968228b1060b6f1b152b96215eb7b2 (diff) | |
parent | 57599c7e7722daf5f8c2dba4b0e4628f5c500771 (diff) | |
download | linux-94514bbe9e5c402c4232af158a295a8fdfd72a2c.tar.bz2 |
Merge tag 'for-4.17-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"There are several user visible changes, the rest is mostly invisible
and continues to clean up the whole code base.
User visible changes:
- new mount option nossd_spread (pair for ssd_spread)
- mount option subvolid will detect junk after the number and fail
the mount
- add message after cancelled device replace
- direct module dependency on libcrc32, removed own crc wrappers
- removed user space transaction ioctls
- use lighter locking when reading /proc/self/mounts, RCU instead of
mutex to avoid unnecessary contention
Enhancements:
- skip writeback of last page when truncating file to same size
- send: do not issue unnecessary truncate operations
- mount option token specifiers: use %u for unsigned values, more
validation
- selftests: more tree block validations
qgroups:
- preparatory work for splitting reservation types for data and
metadata, this should allow for more accurate tracking and fix some
issues with underflows or do further enhancements
- split metadata reservations for started and joined transaction so
they do not get mixed up and are accounted correctly at commit time
- with the above, it's possible to revert patch that potentially
deadlocks when trying to make more space by explicitly committing
when the quota limit is hit
- fix root item corruption when multiple same source snapshots are
created with quota enabled
RAID56:
- make sure target is identical to source when raid56 rebuild fails
after dev-replace
- faster rebuild during scrub, batch by stripes and not
block-by-block
- make more use of cached data when rebuilding from a missing device
Fixes:
- null pointer deref when device replace target is missing
- fix fsync after hole punching when using no-holes feature
- fix lockdep splat when allocating percpu data with wrong GFP flags
Cleanups, refactoring, core changes:
- drop redundant parameters from various functions
- kill and opencode trivial helpers
- __cold/__exit function annotations
- dead code removal
- continued audit and documentation of memory barriers
- error handling: handle removal from uuid tree
- error handling: remove handling of impossible conditions
- more debugging or error messages
- updated tracepoints
- one VLA use removal (and one still left)"
* tag 'for-4.17-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (164 commits)
btrfs: lift errors from add_extent_changeset to the callers
Btrfs: print error messages when failing to read trees
btrfs: user proper type for btrfs_mask_flags flags
btrfs: split dev-replace locking helpers for read and write
btrfs: remove stale comments about fs_mutex
btrfs: use RCU in btrfs_show_devname for device list traversal
btrfs: update barrier in should_cow_block
btrfs: use lockdep_assert_held for mutexes
btrfs: use lockdep_assert_held for spinlocks
btrfs: Validate child tree block's level and first key
btrfs: tests/qgroup: Fix wrong tree backref level
Btrfs: fix copy_items() return value when logging an inode
Btrfs: fix fsync after hole punching when using no-holes feature
btrfs: use helper to set ulist aux from a qgroup
Revert "btrfs: qgroups: Retry after commit on getting EDQUOT"
btrfs: qgroup: Update trace events for metadata reservation
btrfs: qgroup: Use root::qgroup_meta_rsv_* to record qgroup meta reserved space
btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item
btrfs: qgroup: Use separate meta reservation type for delalloc
btrfs: qgroup: Introduce function to convert META_PREALLOC into META_PERTRANS
...
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- | fs/btrfs/extent-tree.c | 317 |
1 files changed, 173 insertions, 144 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e0460d7b5622..e08d0d45af4f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -27,7 +27,7 @@ #include <linux/ratelimit.h> #include <linux/percpu_counter.h> #include <linux/lockdep.h> -#include "hash.h" +#include <linux/crc32c.h> #include "tree-log.h" #include "disk-io.h" #include "print-tree.h" @@ -535,13 +535,11 @@ static noinline void caching_thread(struct btrfs_work *work) struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; struct btrfs_caching_control *caching_ctl; - struct btrfs_root *extent_root; int ret; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; fs_info = block_group->fs_info; - extent_root = fs_info->extent_root; mutex_lock(&caching_ctl->mutex); down_read(&fs_info->commit_root_sem); @@ -1203,11 +1201,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -2652,9 +2650,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, * Returns -ENOMEM or -EIO on failure and will abort the transaction. 
*/ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, unsigned long nr) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; @@ -2994,7 +2992,7 @@ static void delayed_ref_async_start(struct btrfs_work *work) if (trans->transid > async->transid) goto end; - ret = btrfs_run_delayed_refs(trans, fs_info, async->count); + ret = btrfs_run_delayed_refs(trans, async->count); if (ret) async->error = ret; end: @@ -3053,8 +3051,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, unsigned long count) + unsigned long count) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; @@ -3078,7 +3077,7 @@ again: delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif trans->can_flush_pending_bgs = false; - ret = __btrfs_run_delayed_refs(trans, fs_info, count); + ret = __btrfs_run_delayed_refs(trans, count); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; @@ -3086,7 +3085,7 @@ again: if (run_all) { if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); node = rb_first(&delayed_refs->href_root); @@ -3660,9 +3659,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, * the commit latency by getting rid of the easy block groups while * we're still allowing others to join the commit. 
*/ -int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; @@ -3686,7 +3685,7 @@ again: * make sure all the block groups on our dirty list actually * exist */ - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); if (!path) { path = btrfs_alloc_path(); @@ -3741,8 +3740,9 @@ again: should_put = 0; /* - * the cache_write_mutex is protecting - * the io_list + * The cache_write_mutex is protecting the + * io_list, also refer to the definition of + * btrfs_transaction::io_bgs for more details */ list_add_tail(&cache->io_list, io); } else { @@ -3800,7 +3800,7 @@ again: * go through delayed refs for all the stuff we've just kicked off * and then loop back (just once) */ - ret = btrfs_run_delayed_refs(trans, fs_info, 0); + ret = btrfs_run_delayed_refs(trans, 0); if (!ret && loops == 0) { loops++; spin_lock(&cur_trans->dirty_bgs_lock); @@ -3882,7 +3882,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache_save_setup(cache, trans, path); if (!ret) - ret = btrfs_run_delayed_refs(trans, fs_info, + ret = btrfs_run_delayed_refs(trans, (unsigned long) -1); if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { @@ -3934,6 +3934,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } spin_unlock(&cur_trans->dirty_bgs_lock); + /* + * Refer to the definition of io_bgs member for details why it's safe + * to use it without any locking + */ while (!list_empty(io)) { cache = list_first_entry(io, struct btrfs_block_group_cache, io_list); @@ -4332,8 +4336,7 @@ again: /* commit the current transaction and try again */ commit_trans: - if (need_commit && - !atomic_read(&fs_info->open_ioctl_trans)) { + if (need_commit) { 
need_commit--; if (need_commit > 0) { @@ -4541,7 +4544,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, * Needed because we can end up allocating a system chunk and for an * atomic and race free space reservation in the chunk block reserve. */ - ASSERT(mutex_is_locked(&fs_info->chunk_mutex)); + lockdep_assert_held(&fs_info->chunk_mutex); info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); @@ -4602,11 +4605,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, return -ENOSPC; space_info = __find_space_info(fs_info, flags); - if (!space_info) { - ret = create_space_info(fs_info, flags, &space_info); - if (ret) - return ret; - } + ASSERT(space_info); again: spin_lock(&space_info->lock); @@ -4705,7 +4704,7 @@ out: */ if (trans->can_flush_pending_bgs && trans->chunk_bytes_reserved >= (u64)SZ_2M) { - btrfs_create_pending_block_groups(trans, fs_info); + btrfs_create_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); } return ret; @@ -4826,7 +4825,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, long time_left; unsigned long nr_pages; int loops; - enum btrfs_reserve_flush_enum flush; /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(fs_info, to_reclaim); @@ -4867,10 +4865,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, atomic_read(&fs_info->async_delalloc_pages) <= (int)max_reclaim); skip_async: - if (!trans) - flush = BTRFS_RESERVE_FLUSH_ALL; - else - flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { @@ -4993,7 +4987,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = btrfs_run_delayed_items_nr(trans, fs_info, nr); + ret = btrfs_run_delayed_items_nr(trans, nr); btrfs_end_transaction(trans); break; case FLUSH_DELALLOC: @@ -5388,10 +5382,15 @@ 
static int reserve_metadata_bytes(struct btrfs_root *root, !block_rsv_use_bytes(global_rsv, orig_bytes)) ret = 0; } - if (ret == -ENOSPC) + if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", block_rsv->space_info->flags, orig_bytes, 1); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + dump_space_info(fs_info, block_rsv->space_info, + orig_bytes, 0); + } return ret; } @@ -5760,6 +5759,9 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, if (num_bytes == 0) return 0; + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + if (ret) + return ret; ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); @@ -5772,11 +5774,15 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. + * @qgroup_free - free or convert qgroup meta. + * Unlike normal operation, qgroup meta reservation needs to know if we are + * freeing qgroup reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal release. * * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. 
*/ -static void btrfs_inode_rsv_release(struct btrfs_inode *inode) +static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; @@ -5792,6 +5798,10 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode) if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(inode->root, released); + else + btrfs_qgroup_convert_reserved_meta(inode->root, released); } void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, @@ -5892,24 +5902,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } -void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - if (!trans->block_rsv) { - ASSERT(!trans->bytes_reserved); - return; - } - - if (!trans->bytes_reserved) - return; - - ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, trans->bytes_reserved, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, - trans->bytes_reserved); - trans->bytes_reserved = 0; -} /* * To be called after all the new block groups attached to the transaction @@ -5951,7 +5943,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, */ u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), + trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode), num_bytes, 1); return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); } @@ -5995,7 +5987,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { /* One for parent inode, two for dir entries */ num_bytes = 3 * fs_info->nodesize; - ret = 
btrfs_qgroup_reserve_meta(root, num_bytes, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); if (ret) return ret; } else { @@ -6014,7 +6006,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); if (ret && *qgroup_reserved) - btrfs_qgroup_free_meta(root, *qgroup_reserved); + btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved); return ret; } @@ -6051,7 +6043,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - struct btrfs_root *root = inode->root; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; @@ -6068,13 +6059,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; delalloc_lock = false; - } else if (current->journal_info) { - flush = BTRFS_RESERVE_FLUSH_LIMIT; - } + } else { + if (current->journal_info) + flush = BTRFS_RESERVE_FLUSH_LIMIT; - if (flush != BTRFS_RESERVE_NO_FLUSH && - btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); + if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); + } if (delalloc_lock) mutex_lock(&inode->delalloc_mutex); @@ -6089,19 +6080,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - ret = btrfs_qgroup_reserve_meta(root, - nr_extents * fs_info->nodesize, true); - if (ret) - goto out_fail; - } - ret = btrfs_inode_rsv_refill(inode, flush); - if (unlikely(ret)) { - btrfs_qgroup_free_meta(root, - nr_extents * fs_info->nodesize); + if (unlikely(ret)) goto out_fail; - } if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); 
@@ -6115,7 +6096,7 @@ out_fail: btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - btrfs_inode_rsv_release(inode); + btrfs_inode_rsv_release(inode, true); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -6125,12 +6106,14 @@ out_fail: * btrfs_delalloc_release_metadata - release a metadata reservation for an inode * @inode: the inode to release the reservation for. * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata * reservations, or on error for the same reason. */ -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); @@ -6143,13 +6126,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) if (btrfs_is_testing(fs_info)) return; - btrfs_inode_rsv_release(inode); + btrfs_inode_rsv_release(inode, qgroup_free); } /** * btrfs_delalloc_release_extents - release our outstanding_extents * @inode: the inode to balance the reservation for. * @num_bytes: the number of bytes we originally reserved with + * @qgroup_free: do we need to free qgroup meta reservation or convert them. * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -6157,7 +6141,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) * temporarily tracked outstanding_extents. This _must_ be used in conjunction * with btrfs_delalloc_reserve_metadata. 
*/ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); unsigned num_extents; @@ -6171,7 +6156,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) if (btrfs_is_testing(fs_info)) return; - btrfs_inode_rsv_release(inode); + btrfs_inode_rsv_release(inode, qgroup_free); } /** @@ -6227,9 +6212,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode, */ void btrfs_delalloc_release_space(struct inode *inode, struct extent_changeset *reserved, - u64 start, u64 len) + u64 start, u64 len, bool qgroup_free) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), len); + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); btrfs_free_reserved_data_space(inode, reserved, start, len); } @@ -6783,9 +6768,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *block_group, *tmp; struct list_head *deleted_bgs; struct extent_io_tree *unpin; @@ -7351,29 +7336,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return ret; } -int __get_raid_index(u64 flags) -{ - if (flags & BTRFS_BLOCK_GROUP_RAID10) - return BTRFS_RAID_RAID10; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - return BTRFS_RAID_RAID1; - else if (flags & BTRFS_BLOCK_GROUP_DUP) - return BTRFS_RAID_DUP; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - return BTRFS_RAID_RAID0; - else if (flags & BTRFS_BLOCK_GROUP_RAID5) - return BTRFS_RAID_RAID5; - else if (flags & BTRFS_BLOCK_GROUP_RAID6) - return BTRFS_RAID_RAID6; - - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ -} - -int get_block_group_index(struct 
btrfs_block_group_cache *cache) -{ - return __get_raid_index(cache->flags); -} - static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = "raid10", [BTRFS_RAID_RAID1] = "raid1", @@ -7488,7 +7450,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; - int index = __get_raid_index(flags); + int index = btrfs_bg_flags_to_raid_index(flags); bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -7574,7 +7536,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - index = get_block_group_index(block_group); + index = btrfs_bg_flags_to_raid_index( + block_group->flags); btrfs_lock_block_group(block_group, delalloc); goto have_block_group; } @@ -7584,7 +7547,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, } search: have_caching_bg = false; - if (index == 0 || index == __get_raid_index(flags)) + if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags)) full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], @@ -7842,7 +7805,8 @@ checks: loop: failed_cluster_refill = false; failed_alloc = false; - BUG_ON(index != get_block_group_index(block_group)); + BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != + index); btrfs_release_block_group(block_group, delalloc); cond_resched(); } @@ -7996,6 +7960,51 @@ again: up_read(&info->groups_sem); } +/* + * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a + * hole that is at least as big as @num_bytes. + * + * @root - The root that will contain this extent + * + * @ram_bytes - The amount of space in ram that @num_bytes take. This + * is used for accounting purposes. This value differs + * from @num_bytes only in the case of compressed extents. 
+ * + * @num_bytes - Number of bytes to allocate on-disk. + * + * @min_alloc_size - Indicates the minimum amount of space that the + * allocator should try to satisfy. In some cases + * @num_bytes may be larger than what is required and if + * the filesystem is fragmented then allocation fails. + * However, the presence of @min_alloc_size gives a + * chance to try and satisfy the smaller allocation. + * + * @empty_size - A hint that you plan on doing more COW. This is the + * size in bytes the allocator should try to find free + * next to the block it returns. This is just a hint and + * may be ignored by the allocator. + * + * @hint_byte - Hint to the allocator to start searching above the byte + * address passed. It might be ignored. + * + * @ins - This key is modified to record the found hole. It will + * have the following values: + * ins->objectid == start position + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == the size of the hole. + * + * @is_data - Boolean flag indicating whether an extent is + * allocated for data (true) or metadata (false) + * + * @delalloc - Boolean flag indicating whether this allocation is for + * delalloc or not. If 'true' data_rwsem of block groups + * is going to be acquired. + * + * + * Returns 0 when an allocation succeeded or < 0 when an error occurred. In + * case -ENOSPC is returned then @ins->offset will contain the size of the + * largest available hole the allocator managed to find. 
+ */ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, @@ -8699,6 +8708,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 parent; u32 blocksize; struct btrfs_key key; + struct btrfs_key first_key; struct extent_buffer *next; int level = wc->level; int reada = 0; @@ -8719,6 +8729,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + btrfs_node_key_to_cpu(path->nodes[level], &first_key, + path->slots[level]); blocksize = fs_info->nodesize; next = find_extent_buffer(fs_info, bytenr); @@ -8783,7 +8795,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, generation); + next = read_tree_block(fs_info, bytenr, generation, level - 1, + &first_key); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { @@ -9648,7 +9661,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) */ target = get_restripe_target(fs_info, block_group->flags); if (target) { - index = __get_raid_index(extended_to_chunk(target)); + index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target)); } else { /* * this is just a balance, so if we were marked as full @@ -9662,7 +9675,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) goto out; } - index = get_block_group_index(block_group); + index = btrfs_bg_flags_to_raid_index(block_group->flags); } if (index == BTRFS_RAID_RAID10) { @@ -9911,10 +9924,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) return 0; } +/* link_block_group will queue up kobjects to add when we're reclaim-safe */ +void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + struct raid_kobject *rkobj; + LIST_HEAD(list); + int 
index; + int ret = 0; + + spin_lock(&fs_info->pending_raid_kobjs_lock); + list_splice_init(&fs_info->pending_raid_kobjs, &list); + spin_unlock(&fs_info->pending_raid_kobjs_lock); + + list_for_each_entry(rkobj, &list, list) { + space_info = __find_space_info(fs_info, rkobj->flags); + index = btrfs_bg_flags_to_raid_index(rkobj->flags); + + ret = kobject_add(&rkobj->kobj, &space_info->kobj, + "%s", get_raid_name(index)); + if (ret) { + kobject_put(&rkobj->kobj); + break; + } + } + if (ret) + btrfs_warn(fs_info, + "failed to add kobject for block cache, ignoring"); +} + static void link_block_group(struct btrfs_block_group_cache *cache) { struct btrfs_space_info *space_info = cache->space_info; - int index = get_block_group_index(cache); + struct btrfs_fs_info *fs_info = cache->fs_info; + int index = btrfs_bg_flags_to_raid_index(cache->flags); bool first = false; down_write(&space_info->groups_sem); @@ -9924,27 +9967,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache) up_write(&space_info->groups_sem); if (first) { - struct raid_kobject *rkobj; - int ret; - - rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); - if (!rkobj) - goto out_err; - rkobj->raid_type = index; - kobject_init(&rkobj->kobj, &btrfs_raid_ktype); - ret = kobject_add(&rkobj->kobj, &space_info->kobj, - "%s", get_raid_name(index)); - if (ret) { - kobject_put(&rkobj->kobj); - goto out_err; + struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS); + if (!rkobj) { + btrfs_warn(cache->fs_info, + "couldn't alloc memory for raid level kobject"); + return; } + rkobj->flags = cache->flags; + kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + spin_lock(&fs_info->pending_raid_kobjs_lock); + list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs); + spin_unlock(&fs_info->pending_raid_kobjs_lock); space_info->block_group_kobjs[index] = &rkobj->kobj; } - - return; -out_err: - btrfs_warn(cache->fs_info, - "failed to add kobject for block cache, ignoring"); } static struct 
btrfs_block_group_cache * @@ -10160,6 +10196,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) inc_block_group_ro(cache, 1); } + btrfs_add_raid_kobjects(info); init_global_block_rsv(info); ret = 0; error: @@ -10167,9 +10204,9 @@ error: return ret; } -void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group_cache *block_group, *tmp; struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_block_group_item item; @@ -10254,15 +10291,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * with its ->space_info set. */ cache->space_info = __find_space_info(fs_info, cache->flags); - if (!cache->space_info) { - ret = create_space_info(fs_info, cache->flags, - &cache->space_info); - if (ret) { - btrfs_remove_free_space_cache(cache); - btrfs_put_block_group(cache); - return ret; - } - } + ASSERT(cache->space_info); ret = btrfs_add_block_group_cache(fs_info, cache); if (ret) { @@ -10334,7 +10363,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, block_group->key.offset); memcpy(&key, &block_group->key, sizeof(key)); - index = get_block_group_index(block_group); + index = btrfs_bg_flags_to_raid_index(block_group->flags); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) |