From f3bca8028bd934e96257b8bd1143e6474fe98465 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 20 Jul 2016 17:33:44 -0700 Subject: Btrfs: add ASSERT for block group's memory leak This adds several ASSERT()' s to report memory leak of block group cache. Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c2b81b0d3fe0..e962b0e25324 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9942,6 +9942,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) block_group->iref = 0; block_group->inode = NULL; spin_unlock(&block_group->lock); + ASSERT(block_group->io_ctl.inode == NULL); iput(inode); last = block_group->key.objectid + block_group->key.offset; btrfs_put_block_group(block_group); @@ -9999,6 +10000,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) free_excluded_extents(info->extent_root, block_group); btrfs_remove_free_space_cache(block_group); + ASSERT(list_empty(&block_group->dirty_list)); + ASSERT(list_empty(&block_group->io_list)); + ASSERT(list_empty(&block_group->bg_list)); + ASSERT(atomic_read(&block_group->count) == 1); btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); -- cgit v1.2.3 From eecba891d38051ebf7f4af6394d188a5fd151a6a Mon Sep 17 00:00:00 2001 From: Alex Lyakas Date: Sun, 6 Dec 2015 12:32:31 +0200 Subject: btrfs: flush_space: treat return value of do_chunk_alloc properly do_chunk_alloc returns 1 when it succeeds to allocate a new chunk. But flush_space will not convert this to 0, and will also return 1. As a result, reserve_metadata_bytes will think that flush_space failed, and may potentially return this value "1" to the caller (depends how reserve_metadata_bytes was called). The caller will also treat this as an error. For example, btrfs_block_rsv_refill does: int ret = -ENOSPC; ... ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; } return ret; So it will return -ENOSPC. Signed-off-by: Alex Lyakas Reviewed-by: Josef Bacik Reviewed-by: Liu Bo Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e962b0e25324..d2cc16ff3e21 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4882,7 +4882,7 @@ static int flush_space(struct btrfs_root *root, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); - if (ret == -ENOSPC) + if (ret > 0 || ret == -ENOSPC) ret = 0; break; case COMMIT_TRANS: -- cgit v1.2.3 From cb93b52cc005ba0e470845b519c662e661d5113c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 15 Aug 2016 10:36:50 +0800 Subject: btrfs: qgroup: Refactor btrfs_qgroup_insert_dirty_extent() Refactor btrfs_qgroup_insert_dirty_extent() function, to two functions: 1. btrfs_qgroup_insert_dirty_extent_nolock() Almost the same with original code. For delayed_ref usage, which has delayed refs locked. Change the return value type to int, since caller never needs the pointer, but only needs to know if they need to free the allocated memory. 2. btrfs_qgroup_insert_dirty_extent() The more encapsulated version. Will do the delayed_refs lock, memory allocation, quota enabled check and other things. The original design is to keep exported functions to minimal, but since more btrfs hacks exposed, like replacing path in balance, we need to record dirty extents manually, so we have to add such functions. Also, add comment for both functions, to info developers how to keep qgroup correct when doing hacks. Cc: Mark Fasheh Signed-off-by: Qu Wenruo Reviewed-and-Tested-by: Goldwyn Rodrigues Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/delayed-ref.c | 7 ++----- fs/btrfs/extent-tree.c | 37 +++++-------------------------------- fs/btrfs/qgroup.c | 41 +++++++++++++++++++++++++++++++++++------ fs/btrfs/qgroup.h | 33 +++++++++++++++++++++++++++++---- 4 files changed, 71 insertions(+), 47 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index d9ddcfc18c91..ac02e041464b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, - delayed_refs, - qrecord); - if (qexisting) + if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info, + delayed_refs, qrecord)) kfree(qrecord); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d2cc16ff3e21..d10872748c43 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8521,35 +8521,6 @@ reada: wc->reada_slot = slot; } -/* - * These may not be seen by the usual inc/dec ref code so we have to - * add them here. - */ -static int record_one_subtree_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes) -{ - struct btrfs_qgroup_extent_record *qrecord; - struct btrfs_delayed_ref_root *delayed_refs; - - qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); - if (!qrecord) - return -ENOMEM; - - qrecord->bytenr = bytenr; - qrecord->num_bytes = num_bytes; - qrecord->old_roots = NULL; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, - delayed_refs, qrecord)) - kfree(qrecord); - spin_unlock(&delayed_refs->lock); - - return 0; -} - static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) @@ -8583,7 +8554,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + bytenr, num_bytes, GFP_NOFS); if (ret) return ret; } @@ -8732,8 +8704,9 @@ walk_down: btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - ret = record_one_subtree_extent(trans, root, child_bytenr, - root->nodesize); + ret = btrfs_qgroup_insert_dirty_extent(trans, + root->fs_info, child_bytenr, + root->nodesize, GFP_NOFS); if (ret) goto out; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 13eb6a7a4db1..8db2e29fdcf4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, return ret; } -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) +int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; @@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, else if (bytenr > entry->bytenr) p = &(*p)->rb_right; else - return entry; + return 1; } rb_link_node(&record->node, parent_node, p); rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); - return NULL; + return 0; +} + +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag) +{ + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0) + return 0; + if (WARN_ON(trans == NULL)) + return -EINVAL; + record = kmalloc(sizeof(*record), gfp_flag); + if (!record) + return -ENOMEM; + + delayed_refs = &trans->transaction->delayed_refs; + record->bytenr = bytenr; + record->num_bytes = num_bytes; + record->old_roots = NULL; + + spin_lock(&delayed_refs->lock); + ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs, + record); + spin_unlock(&delayed_refs->lock); + if (ret > 0) + kfree(record); + return 0; } #define UPDATE_NEW 0 diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index af3e5578cad7..1bc64c864b62 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -64,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record * -btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * No lock version, caller must acquire delayed ref lock and allocate memory. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Error is not possible + */ +int btrfs_qgroup_insert_dirty_extent_nolock( + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); + +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * Better encapsulated version. + * + * Return 0 if the operation is done. + * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag); + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, -- cgit v1.2.3 From 4824f1f412f75e9f84b9cecbde828e8f4699f82d Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Mon, 25 Jul 2016 15:51:39 +0800 Subject: btrfs: divide btrfs_update_reserved_bytes() into two functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch divides btrfs_update_reserved_bytes() into btrfs_add_reserved_bytes() and btrfs_free_reserved_bytes(), and next patch will extend btrfs_add_reserved_bytes()to fix some false ENOSPC error, please see later patch for detailed info. Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 97 +++++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 40 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d10872748c43..f1121db65878 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -104,9 +104,10 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, - int delalloc); +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int delalloc); +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, @@ -6497,19 +6498,14 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) } /** - * btrfs_update_reserved_bytes - update the block_group and space info counters + * btrfs_add_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @reserve: One of the reservation enums * @delalloc: The blocks are allocated for the delalloc write * - * This is called by the allocator when it reserves space, or by somebody who is - * freeing space that was never actually used on disk. For example if you - * reserve some space for a new leaf in transaction A and before transaction A - * commits you free that leaf, you call this with reserve set to 0 in order to - * clear the reservation. - * - * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * This is called by the allocator when it reserves space. Metadata + * reservations should be called with RESERVE_ALLOC so we do the proper * ENOSPC accounting. For data we handle the reservation through clearing the * delalloc bits in the io_tree. We have to do this since we could end up * allocating less disk space for the amount of data we have reserved in the @@ -6519,44 +6515,65 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) * make the reservation and return -EAGAIN, otherwise this function always * succeeds. */ -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int delalloc) +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); - if (reserve != RESERVE_FREE) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - if (reserve == RESERVE_ALLOC) { - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - num_bytes, 0); - space_info->bytes_may_use -= num_bytes; - } - - if (delalloc) - cache->delalloc_bytes += num_bytes; - } + if (cache->ro) { + ret = -EAGAIN; } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + trace_btrfs_space_reservation(cache->fs_info, + "space_info", space_info->flags, + num_bytes, 0); + space_info->bytes_may_use -= num_bytes; + } if (delalloc) - cache->delalloc_bytes -= num_bytes; + cache->delalloc_bytes += num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; } +/** + * btrfs_free_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk. For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ + +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -7191,7 +7208,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; @@ -7763,8 +7780,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, - alloc_type, delalloc); + ret = btrfs_add_reserved_bytes(block_group, num_bytes, + alloc_type, delalloc); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -7995,7 +8012,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + btrfs_free_reserved_bytes(cache, len, delalloc); trace_btrfs_reserved_extent_free(root, start, len); } @@ -8223,8 +8240,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!block_group) return -EINVAL; - ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT, 0); + ret = btrfs_add_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT, 0); BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); -- cgit v1.2.3 From 18513091af9483ba84328d42092bd4d42a3c958f Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Mon, 25 Jul 2016 15:51:40 +0800 Subject: btrfs: update btrfs_space_info's bytes_may_use timely This patch can fix some false ENOSPC errors, below test script can reproduce one false ENOSPC error: #!/bin/bash dd if=/dev/zero of=fs.img bs=$((1024*1024)) count=128 dev=$(losetup --show -f fs.img) mkfs.btrfs -f -M $dev mkdir /tmp/mntpoint mount $dev /tmp/mntpoint cd /tmp/mntpoint xfs_io -f -c "falloc 0 $((64*1024*1024))" testfile Above script will fail for ENOSPC reason, but indeed fs still has free space to satisfy this request. Please see call graph: btrfs_fallocate() |-> btrfs_alloc_data_chunk_ondemand() | bytes_may_use += 64M |-> btrfs_prealloc_file_range() |-> btrfs_reserve_extent() |-> btrfs_add_reserved_bytes() | alloc_type is RESERVE_ALLOC_NO_ACCOUNT, so it does not | change bytes_may_use, and bytes_reserved += 64M. Now | bytes_may_use + bytes_reserved == 128M, which is greater | than btrfs_space_info's total_bytes, false enospc occurs. | Note, the bytes_may_use decrease operation will be done in | end of btrfs_fallocate(), which is too late. Here is another simple case for buffered write: CPU 1 | CPU 2 | |-> cow_file_range() |-> __btrfs_buffered_write() |-> btrfs_reserve_extent() | | | | | | | | | ..... | |-> btrfs_check_data_free_space() | | | | |-> extent_clear_unlock_delalloc() | In CPU 1, btrfs_reserve_extent()->find_free_extent()-> btrfs_add_reserved_bytes() do not decrease bytes_may_use, the decrease operation will be delayed to be done in extent_clear_unlock_delalloc(). Assume in this case, btrfs_reserve_extent() reserved 128MB data, CPU2's btrfs_check_data_free_space() tries to reserve 100MB data space. If 100MB > data_sinfo->total_bytes - data_sinfo->bytes_used - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - data_sinfo->bytes_may_use btrfs_check_data_free_space() will try to allcate new data chunk or call btrfs_start_delalloc_roots(), or commit current transaction in order to reserve some free space, obviously a lot of work. But indeed it's not necessary as long as decreasing bytes_may_use timely, we still have free space, decreasing 128M from bytes_may_use. To fix this issue, this patch chooses to update bytes_may_use for both data and metadata in btrfs_add_reserved_bytes(). For compress path, real extent length may not be equal to file content length, so introduce a ram_bytes argument for btrfs_reserve_extent(), find_free_extent() and btrfs_add_reserved_bytes(), it's becasue bytes_may_use is increased by file content length. Then compress path can update bytes_may_use correctly. Also now we can discard RESERVE_ALLOC_NO_ACCOUNT, RESERVE_ALLOC and RESERVE_FREE. As we know, usually EXTENT_DO_ACCOUNTING is used for error path. In run_delalloc_nocow(), for inode marked as NODATACOW or extent marked as PREALLOC, we also need to update bytes_may_use, but can not pass EXTENT_DO_ACCOUNTING, because it also clears metadata reservation, so here we introduce EXTENT_CLEAR_DATA_RESV flag to indicate btrfs_clear_bit_hook() to update btrfs_space_info's bytes_may_use. Meanwhile __btrfs_prealloc_file_range() will call btrfs_free_reserved_data_space() internally for both sucessful and failed path, btrfs_prealloc_file_range()'s callers does not need to call btrfs_free_reserved_data_space() any more. Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 56 +++++++++++++++++--------------------------------- fs/btrfs/extent_io.h | 1 + fs/btrfs/file.c | 26 +++++++++++++---------- fs/btrfs/inode-map.c | 3 +-- fs/btrfs/inode.c | 37 ++++++++++++++++++++++++--------- fs/btrfs/relocation.c | 11 ++++++++-- 7 files changed, 73 insertions(+), 63 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index acc6850a118f..09cdff0d58e8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2579,7 +2579,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f1121db65878..133c93b0a0f1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -60,21 +60,6 @@ enum { CHUNK_ALLOC_FORCE = 2, }; -/* - * Control how reservations are dealt with. - * - * RESERVE_FREE - freeing a reservation. - * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for - * ENOSPC accounting - * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update - * bytes_may_use as the ENOSPC accounting is done elsewhere - */ -enum { - RESERVE_FREE = 0, - RESERVE_ALLOC = 1, - RESERVE_ALLOC_NO_ACCOUNT = 2, -}; - static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -105,7 +90,7 @@ static int find_next_key(struct btrfs_path *path, int level, static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int delalloc); + u64 ram_bytes, u64 num_bytes, int delalloc); static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, @@ -3502,7 +3487,6 @@ again: dcs = BTRFS_DC_SETUP; else if (ret == -ENOSPC) set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); - btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -6500,8 +6484,9 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) /** * btrfs_add_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating + * @ram_bytes: The number of bytes of file content, and will be same to + * @num_bytes except for the compress path. * @num_bytes: The number of bytes in question - * @reserve: One of the reservation enums * @delalloc: The blocks are allocated for the delalloc write * * This is called by the allocator when it reserves space. Metadata @@ -6516,7 +6501,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) * succeeds. */ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int delalloc) + u64 ram_bytes, u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; @@ -6528,13 +6513,11 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - if (reserve == RESERVE_ALLOC) { - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - num_bytes, 0); - space_info->bytes_may_use -= num_bytes; - } + trace_btrfs_space_reservation(cache->fs_info, + "space_info", space_info->flags, + ram_bytes, 0); + space_info->bytes_may_use -= ram_bytes; if (delalloc) cache->delalloc_bytes += num_bytes; } @@ -7433,9 +7416,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * the free space extent currently. */ static noinline int find_free_extent(struct btrfs_root *orig_root, - u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, - u64 flags, int delalloc) + u64 ram_bytes, u64 num_bytes, u64 empty_size, + u64 hint_byte, struct btrfs_key *ins, + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -7447,8 +7430,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); - int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? - RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -7780,8 +7761,8 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_add_reserved_bytes(block_group, num_bytes, - alloc_type, delalloc); + ret = btrfs_add_reserved_bytes(block_group, ram_bytes, + num_bytes, delalloc); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; @@ -7953,7 +7934,7 @@ again: up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) @@ -7965,8 +7946,8 @@ int btrfs_reserve_extent(struct btrfs_root *root, flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, - flags, delalloc); + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, + hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(root->fs_info, ins->objectid); @@ -7975,6 +7956,7 @@ again: num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); + ram_bytes = num_bytes; if (num_bytes == min_alloc_size) final_tried = true; goto again; @@ -8241,7 +8223,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, return -EINVAL; ret = btrfs_add_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT, 0); + ins->offset, 0); BUG_ON(ret); /* logic error */ ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -8385,7 +8367,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); - ret = btrfs_reserve_extent(root, blocksize, blocksize, + ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, empty_size, hint, &ins, 0, 0); if (ret) goto out_unuse; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c0c1c4fef6ce..b52ca5db01cb 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,6 +20,7 @@ #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) +#define EXTENT_CLEAR_DATA_RESV (1U << 17) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5842423f8f47..3391f2adf0c8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2675,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start = round_down(offset, blocksize); alloc_end = round_up(offset + len, blocksize); + cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2767,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); - cur_offset = alloc_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); @@ -2794,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte - cur_offset); if (ret < 0) break; + } else { + /* + * Do not need to reserve unwritten extent for this + * range, free reserved data space first, otherwise + * it'll result in false ENOSPC error. + */ + btrfs_free_reserved_data_space(inode, cur_offset, + last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2811,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->start, range->len, 1 << inode->i_blkbits, offset + len, &alloc_hint); + else + btrfs_free_reserved_data_space(inode, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2845,18 +2856,11 @@ out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_KERNEL); out: - /* - * As we waited the extent range, the data_rsv_map must be empty - * in the range, as written data range will be released from it. - * And for prealloacted extent, it will also be released when - * its metadata is written. - * So this is completely used as cleanup. - */ - btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); inode_unlock(inode); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - alloc_start); + if (ret != 0) + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - cur_offset); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index aa6fabaee72e..359ee861b5a4 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -495,10 +495,9 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, 0, prealloc); + btrfs_delalloc_release_metadata(inode, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 29636624a427..bcea20015d9a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -566,6 +566,8 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); goto free_pages_out; } } @@ -742,7 +744,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); - ret = btrfs_reserve_extent(root, + ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); @@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode, unsigned long op; cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) @@ -1489,8 +1492,10 @@ out_check: extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC, PAGE_UNLOCK | - PAGE_SET_PRIVATE2); + EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; @@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode, return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE)) + && do_list && !(state->state & EXTENT_NORESERVE) + && (*bits & (EXTENT_DO_ACCOUNTING | + EXTENT_CLEAR_DATA_RESV))) btrfs_free_reserved_data_space_noquota(inode, state->start, len); @@ -7252,7 +7259,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); - ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, + ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); @@ -7752,6 +7759,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ret = PTR_ERR(em2); goto unlock_err; } + /* + * For inode marked NODATACOW or extent marked PREALLOC, + * use the existing or preallocated extent, so does not + * need to adjust btrfs_space_info's bytes_may_use. + */ + btrfs_free_reserved_data_space_noquota(inode, + start, len); goto unlock; } } @@ -7786,7 +7800,6 @@ unlock: i_size_write(inode, start + len); adjust_dio_outstanding_extents(inode, dio_data, len); - btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -10306,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; + u64 end = start + num_bytes - 1; if (trans) own_trans = false; @@ -10327,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, * sized chunks. */ cur_bytes = min(cur_bytes, last_alloc); - ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, - *alloc_hint, &ins, 1, 0); + ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, + min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); @@ -10414,6 +10428,9 @@ next: if (own_trans) btrfs_end_transaction(trans, root); } + if (cur_offset < end) + btrfs_free_reserved_data_space(inode, cur_offset, + end - cur_offset + 1); return ret; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 71b4b70f56b9..8a2c2a07987b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3040,6 +3040,7 @@ int prealloc_file_extent_cluster(struct inode *inode, int ret = 0; u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; + u64 cur_offset; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); @@ -3049,6 +3050,7 @@ int prealloc_file_extent_cluster(struct inode *inode, if (ret) goto out; + cur_offset = prealloc_start; while (nr < cluster->nr) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -3058,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; + if (cur_offset < start) + btrfs_free_reserved_data_space(inode, cur_offset, + start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); + cur_offset = end + 1; unlock_extent(&BTRFS_I(inode)->io_tree, start, end); if (ret) break; nr++; } - btrfs_free_reserved_data_space(inode, prealloc_start, - prealloc_end + 1 - prealloc_start); + if (cur_offset < prealloc_end) + btrfs_free_reserved_data_space(inode, cur_offset, + prealloc_end + 1 - cur_offset); out: inode_unlock(inode); return ret; -- cgit v1.2.3 From 28b737f6ede3661fe610937706c4a6f50e9ab769 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jul 2016 11:09:50 -0700 Subject: Btrfs: clarify do_chunk_alloc()'s return value Function start_transaction() can return ERR_PTR(1) when flush is BTRFS_RESERVE_FLUSH_LIMIT, so the call graph is start_transaction (return ERR_PTR(1)) -> btrfs_block_rsv_add (return 1) -> reserve_metadata_bytes (return 1) -> flush_space (return 1) -> do_chunk_alloc (return 1) With BTRFS_RESERVE_FLUSH_LIMIT, if flush_space is already on the flush_state of ALLOC_CHUNK and it successfully allocates a new chunk, then instead of trying to reserve space again, reserve_metadata_bytes returns 1 immediately. Eventually the callers who call start_transaction() usually just do the IS_ERR() check which ERR_PTR(1) can pass, then it'll get a panic when dereferencing a pointer which is ERR_PTR(1). The following patch fixes the above problem. "btrfs: flush_space: treat return value of do_chunk_alloc properly" https://patchwork.kernel.org/patch/7778651/ This add comments to clarify do_chunk_alloc()'s return value. Signed-off-by: Liu Bo Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 133c93b0a0f1..d5d3cfb1f66f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4457,6 +4457,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } } +/* + * If force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + * If force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + */ static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force) { -- cgit v1.2.3 From 187ee58c62c1d0d238d3dc4835869d33e1869906 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 18 Aug 2016 15:30:06 -0400 Subject: Btrfs: fix em leak in find_first_block_group We need to call free_extent_map() on the em we look up. Signed-off-by: Josef Bacik Reviewed-by: Omar Sandoval Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d5d3cfb1f66f..60d4ae7ce974 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9887,6 +9887,7 @@ static int find_first_block_group(struct btrfs_root *root, } else { ret = 0; } + free_extent_map(em); goto out; } path->slots[0]++; -- cgit v1.2.3 From e0af24849efb0eea572cf22d22bb65d164cb8a6f Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Wed, 31 Aug 2016 19:46:16 +0800 Subject: btrfs: fix one bug that process may endlessly wait for ticket in wait_reserve_ticket() If can_overcommit() in btrfs_calc_reclaim_metadata_size() returns true, btrfs_async_reclaim_metadata_space() will not reclaim metadata space, just return directly and also forget to wake up process which are waiting for their tickets, so these processes will wait endlessly. Fstests case generic/172 with mount option "-o compress=lzo" have revealed this bug in my test machine. Here if we have tickets to handle, we must handle them first. Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 60d4ae7ce974..64676a16d32a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4901,11 +4901,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, u64 expected; u64 to_reclaim = 0; - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return 0; - list_for_each_entry(ticket, &space_info->tickets, list) to_reclaim += ticket->bytes; list_for_each_entry(ticket, &space_info->priority_tickets, list) @@ -4913,6 +4908,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, if (to_reclaim) return to_reclaim; + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (can_overcommit(root, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) + return 0; + used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; -- cgit v1.2.3 From ed7a6948394305b810d0c6203268648715e5006f Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Fri, 26 Aug 2016 11:33:14 +0800 Subject: btrfs: do not decrease bytes_may_use when replaying extents When replaying extents, there is no need to update bytes_may_use in btrfs_alloc_logged_file_extent(), otherwise it'll trigger a WARN_ON about bytes_may_use. Fixes: ("btrfs: update btrfs_space_info's bytes_may_use timely") Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 64676a16d32a..4483487ef021 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8216,6 +8216,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; /* * Mixed block groups will exclude before processing the log so we only @@ -8231,9 +8232,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!block_group) return -EINVAL; - ret = btrfs_add_reserved_bytes(block_group, ins->offset, - ins->offset, 0); - BUG_ON(ret); /* logic error */ + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + space_info->bytes_reserved += ins->offset; + block_group->reserved += ins->offset; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); btrfs_put_block_group(block_group); -- cgit v1.2.3 From ce129655c9d9aaa7b3bcc46529db1b36693575ed Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Fri, 2 Sep 2016 10:58:46 +0800 Subject: btrfs: introduce tickets_id to determine whether asynchronous metadata reclaim work makes progress In btrfs_async_reclaim_metadata_space(), we use ticket's address to determine whether asynchronous metadata reclaim work is making progress. ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); if (last_ticket == ticket) { flush_state++; } else { last_ticket = ticket; flush_state = FLUSH_DELAYED_ITEMS_NR; if (commit_cycles) commit_cycles--; } But indeed it's wrong, we should not rely on local variable's address to do this check, because addresses may be same. In my test environment, I dd one 168MB file in a 256MB fs, found that for this file, every time wait_reserve_ticket() called, local variable ticket's address is same, For above codes, assume a previous ticket's address is addrA, last_ticket is addrA. Btrfs_async_reclaim_metadata_space() finished this ticket and wake up it, then another ticket is added, but with the same address addrA, now last_ticket will be same to current ticket, then current ticket's flush work will start from current flush_state, not initial FLUSH_DELAYED_ITEMS_NR, which may result in some enospc issues(I have seen this in my test machine). Signed-off-by: Wang Xiaoguang Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ec4154faab61..146d1c7078ed 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -427,6 +427,7 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + u64 tickets_id; struct rw_semaphore groups_sem; /* for block groups in our same type */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4483487ef021..d09cf7aa083b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4966,12 +4966,12 @@ static void wake_all_tickets(struct list_head *head) */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { - struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; int commit_cycles = 0; + u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -4984,8 +4984,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } - last_ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); + last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; @@ -5005,10 +5004,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info); ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (last_ticket == ticket) { + if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { - last_ticket = ticket; + last_tickets_id = space_info->tickets_id; flush_state = FLUSH_DELAYED_ITEMS_NR; if (commit_cycles) commit_cycles--; @@ -5384,6 +5383,7 @@ again: list_del_init(&ticket->list); num_bytes -= ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { ticket->bytes -= num_bytes; @@ -5426,6 +5426,7 @@ again: num_bytes -= ticket->bytes; space_info->bytes_may_use += ticket->bytes; ticket->bytes = 0; + space_info->tickets_id++; wake_up(&ticket->wait); } else { trace_btrfs_space_reservation(fs_info, "space_info", -- cgit v1.2.3