author		Linus Torvalds <torvalds@linux-foundation.org>	2018-04-04 13:03:38 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-04-04 13:03:38 -0700
commit		94514bbe9e5c402c4232af158a295a8fdfd72a2c (patch)
tree		c990c722cbac5abe8a3b28e0564effa722b7c80e /fs
parent		547c43d777968228b1060b6f1b152b96215eb7b2 (diff)
parent		57599c7e7722daf5f8c2dba4b0e4628f5c500771 (diff)
download	linux-94514bbe9e5c402c4232af158a295a8fdfd72a2c.tar.bz2
Merge tag 'for-4.17-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "There are several user visible changes, the rest is mostly invisible
  and continues to clean up the whole code base.

  User visible changes:

   - new mount option nossd_spread (pair for ssd_spread)

   - mount option subvolid will detect junk after the number and fail
     the mount

   - add message after cancelled device replace

   - direct module dependency on libcrc32c, removed own crc wrappers

   - removed user space transaction ioctls

   - use lighter locking when reading /proc/self/mounts, RCU instead of
     a mutex, to avoid unnecessary contention

  Enhancements:

   - skip writeback of the last page when truncating a file to the same
     size

   - send: do not issue unnecessary truncate operations

   - mount option token specifiers: use %u for unsigned values, more
     validation

   - selftests: more tree block validations

  qgroups:

   - preparatory work for splitting reservation types for data and
     metadata; this should allow more accurate tracking, fix some
     issues with underflows and enable further enhancements

   - split metadata reservations for started and joined transactions so
     they do not get mixed up and are accounted correctly at commit
     time

   - with the above, it's possible to revert the patch that potentially
     deadlocks when trying to make more space by explicitly committing
     when the quota limit is hit

   - fix root item corruption when multiple snapshots of the same
     source are created with quota enabled

  RAID56:

   - make sure the target is identical to the source when a raid56
     rebuild fails after dev-replace

   - faster rebuild during scrub, batched by stripes rather than
     block-by-block

   - make more use of cached data when rebuilding from a missing device

  Fixes:

   - NULL pointer dereference when the device replace target is missing

   - fix fsync after hole punching when using the no-holes feature

   - fix lockdep splat when allocating percpu data with the wrong GFP
     flags

  Cleanups, refactoring, core changes:

   - drop redundant parameters from various functions

   - kill and open code trivial helpers

   - __cold/__exit function annotations

   - dead code removal

   - continued audit and documentation of memory barriers

   - error handling: handle removal from the uuid tree

   - error handling: remove handling of impossible conditions

   - more debugging and error messages

   - updated tracepoints

   - one VLA use removed (and one still left)"

* tag 'for-4.17-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (164 commits)
  btrfs: lift errors from add_extent_changeset to the callers
  Btrfs: print error messages when failing to read trees
  btrfs: user proper type for btrfs_mask_flags flags
  btrfs: split dev-replace locking helpers for read and write
  btrfs: remove stale comments about fs_mutex
  btrfs: use RCU in btrfs_show_devname for device list traversal
  btrfs: update barrier in should_cow_block
  btrfs: use lockdep_assert_held for mutexes
  btrfs: use lockdep_assert_held for spinlocks
  btrfs: Validate child tree block's level and first key
  btrfs: tests/qgroup: Fix wrong tree backref level
  Btrfs: fix copy_items() return value when logging an inode
  Btrfs: fix fsync after hole punching when using no-holes feature
  btrfs: use helper to set ulist aux from a qgroup
  Revert "btrfs: qgroups: Retry after commit on getting EDQUOT"
  btrfs: qgroup: Update trace events for metadata reservation
  btrfs: qgroup: Use root::qgroup_meta_rsv_* to record qgroup meta reserved space
  btrfs: delayed-inode: Use new qgroup meta rsv for delayed inode and item
  btrfs: qgroup: Use separate meta reservation type for delalloc
  btrfs: qgroup: Introduce function to convert META_PREALLOC into META_PERTRANS
  ...
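The qgroup rework above splits metadata reservations into META_PREALLOC (taken speculatively before a change is made) and META_PERTRANS (owned by the running transaction). A minimal sketch of the caller-side lifecycle, using only the helpers this series introduces (btrfs_qgroup_reserve_meta_prealloc, btrfs_qgroup_free_meta_prealloc, btrfs_qgroup_convert_reserved_meta); the wrapper function and do_the_update() are hypothetical placeholders, not code from the patch:

/*
 * Sketch only: reserve qgroup metadata as PREALLOC up front, then either
 * convert it to PERTRANS on success or hand it back on failure. This is
 * the pattern the delayed-inode changes below follow.
 */
static int example_qgroup_meta_lifecycle(struct btrfs_root *root, u64 num_bytes)
{
	int ret;

	/* Reserve as META_PREALLOC; 'true' asks for the quota limit to be enforced. */
	ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
	if (ret < 0)
		return ret;

	ret = do_the_update(root);	/* hypothetical: the actual modification */
	if (ret < 0) {
		/* Nothing was committed, so give the PREALLOC bytes back. */
		btrfs_qgroup_free_meta_prealloc(root, num_bytes);
		return ret;
	}

	/* Success: the reservation now belongs to the transaction. */
	btrfs_qgroup_convert_reserved_meta(root, num_bytes);
	return 0;
}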
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/Kconfig | 3
-rw-r--r--  fs/btrfs/Makefile | 2
-rw-r--r--  fs/btrfs/acl.c | 14
-rw-r--r--  fs/btrfs/backref.c | 13
-rw-r--r--  fs/btrfs/backref.h | 2
-rw-r--r--  fs/btrfs/btrfs_inode.h | 3
-rw-r--r--  fs/btrfs/check-integrity.c | 4
-rw-r--r--  fs/btrfs/compression.c | 2
-rw-r--r--  fs/btrfs/compression.h | 2
-rw-r--r--  fs/btrfs/ctree.c | 263
-rw-r--r--  fs/btrfs/ctree.h | 86
-rw-r--r--  fs/btrfs/delayed-inode.c | 62
-rw-r--r--  fs/btrfs/delayed-inode.h | 8
-rw-r--r--  fs/btrfs/delayed-ref.c | 8
-rw-r--r--  fs/btrfs/delayed-ref.h | 2
-rw-r--r--  fs/btrfs/dev-replace.c | 130
-rw-r--r--  fs/btrfs/dev-replace.h | 9
-rw-r--r--  fs/btrfs/dir-item.c | 1
-rw-r--r--  fs/btrfs/disk-io.c | 226
-rw-r--r--  fs/btrfs/disk-io.h | 15
-rw-r--r--  fs/btrfs/extent-tree.c | 317
-rw-r--r--  fs/btrfs/extent_io.c | 82
-rw-r--r--  fs/btrfs/extent_io.h | 19
-rw-r--r--  fs/btrfs/extent_map.c | 6
-rw-r--r--  fs/btrfs/extent_map.h | 2
-rw-r--r--  fs/btrfs/file.c | 30
-rw-r--r--  fs/btrfs/free-space-cache.c | 2
-rw-r--r--  fs/btrfs/free-space-tree.c | 4
-rw-r--r--  fs/btrfs/hash.c | 54
-rw-r--r--  fs/btrfs/hash.h | 43
-rw-r--r--  fs/btrfs/inode-item.c | 1
-rw-r--r--  fs/btrfs/inode-map.c | 4
-rw-r--r--  fs/btrfs/inode.c | 335
-rw-r--r--  fs/btrfs/ioctl.c | 127
-rw-r--r--  fs/btrfs/locking.c | 2
-rw-r--r--  fs/btrfs/lzo.c | 2
-rw-r--r--  fs/btrfs/ordered-data.c | 4
-rw-r--r--  fs/btrfs/ordered-data.h | 6
-rw-r--r--  fs/btrfs/print-tree.c | 10
-rw-r--r--  fs/btrfs/props.c | 8
-rw-r--r--  fs/btrfs/qgroup.c | 406
-rw-r--r--  fs/btrfs/qgroup.h | 106
-rw-r--r--  fs/btrfs/raid56.c | 28
-rw-r--r--  fs/btrfs/reada.c | 10
-rw-r--r--  fs/btrfs/ref-verify.c | 7
-rw-r--r--  fs/btrfs/relocation.c | 34
-rw-r--r--  fs/btrfs/scrub.c | 126
-rw-r--r--  fs/btrfs/send.c | 35
-rw-r--r--  fs/btrfs/super.c | 259
-rw-r--r--  fs/btrfs/sysfs.c | 4
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 3
-rw-r--r--  fs/btrfs/tests/extent-map-tests.c | 2
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 2
-rw-r--r--  fs/btrfs/transaction.c | 231
-rw-r--r--  fs/btrfs/transaction.h | 25
-rw-r--r--  fs/btrfs/tree-checker.c | 151
-rw-r--r--  fs/btrfs/tree-checker.h | 7
-rw-r--r--  fs/btrfs/tree-defrag.c | 5
-rw-r--r--  fs/btrfs/tree-log.c | 88
-rw-r--r--  fs/btrfs/tree-log.h | 2
-rw-r--r--  fs/btrfs/uuid-tree.c | 2
-rw-r--r--  fs/btrfs/volumes.c | 162
-rw-r--r--  fs/btrfs/volumes.h | 31
-rw-r--r--  fs/btrfs/xattr.c | 12
-rw-r--r--  fs/btrfs/xattr.h | 7
65 files changed, 1967 insertions, 1691 deletions
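A pattern that recurs throughout the hunks below is the extended read_tree_block()/btrfs_read_buffer() interface: callers now pass the child block's expected level and first key, taken from the parent's key/pointer pair, so misplaced or corrupted blocks can be rejected at read time. A condensed sketch of the caller side, mirroring the read_node_slot() change in ctree.c (the function name here is illustrative, not from the patch):

static struct extent_buffer *read_child_checked(struct btrfs_fs_info *fs_info,
						struct extent_buffer *parent,
						int slot)
{
	int level = btrfs_header_level(parent);
	struct btrfs_key first_key;
	struct extent_buffer *eb;

	/* The parent's key/ptr pair says what the child must look like. */
	btrfs_node_key_to_cpu(parent, &first_key, slot);

	/* Expected level and first key let the read path validate the block. */
	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}
	return eb;
}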
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 273351ee4c46..167e5dc7eadd 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,7 +1,6 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
- select CRYPTO
- select CRYPTO_CRC32C
+ select LIBCRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0c4373628eb4..ca693dd554e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o
+ uuid-tree.o props.o free-space-tree.o tree-checker.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 1ba49ebe67da..0066d95b133f 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -46,12 +46,12 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
BUG();
}
- size = __btrfs_getxattr(inode, name, "", 0);
+ size = btrfs_getxattr(inode, name, "", 0);
if (size > 0) {
value = kzalloc(size, GFP_KERNEL);
if (!value)
return ERR_PTR(-ENOMEM);
- size = __btrfs_getxattr(inode, name, value, size);
+ size = btrfs_getxattr(inode, name, value, size);
}
if (size > 0) {
acl = posix_acl_from_xattr(&init_user_ns, value, size);
@@ -65,9 +65,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
return acl;
}
-/*
- * Needs to be called with fs_mutex held
- */
static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct posix_acl *acl, int type)
{
@@ -101,7 +98,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
goto out;
}
- ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
+ ret = btrfs_setxattr(trans, inode, name, value, size, 0);
out:
kfree(value);
@@ -127,11 +124,6 @@ int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
return ret;
}
-/*
- * btrfs_init_acl is already generally called under fs_mutex, so the locking
- * stuff has been fixed to work with that. If the locking stuff changes, we
- * need to re-evaluate the acl locking stuff.
- */
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir)
{
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 26484648d090..571024bc632e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -170,7 +170,7 @@ int __init btrfs_prelim_ref_init(void)
return 0;
}
-void btrfs_prelim_ref_exit(void)
+void __cold btrfs_prelim_ref_exit(void)
{
kmem_cache_destroy(btrfs_prelim_ref_cache);
}
@@ -738,7 +738,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
BUG_ON(ref->key_for_search.type);
BUG_ON(!ref->wanted_disk_byte);
- eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0);
+ eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0,
+ ref->level - 1, NULL);
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
@@ -773,15 +774,12 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
struct btrfs_key tmp_op_key;
- struct btrfs_key *op_key = NULL;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key) {
+ if (extent_op && extent_op->update_key)
btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
- op_key = &tmp_op_key;
- }
spin_lock(&head->lock);
for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
@@ -1291,7 +1289,8 @@ again:
ref->level == 0) {
struct extent_buffer *eb;
- eb = read_tree_block(fs_info, ref->parent, 0);
+ eb = read_tree_block(fs_info, ref->parent, 0,
+ ref->level, NULL);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 0c2fab8514ff..0a30028d5196 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -73,7 +73,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr);
int __init btrfs_prelim_ref_init(void);
-void btrfs_prelim_ref_exit(void);
+void __cold btrfs_prelim_ref_exit(void);
struct prelim_ref {
struct rb_node rbnode;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 63f0ccc92a71..ca15be569d69 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -195,7 +195,6 @@ struct btrfs_inode {
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
- long delayed_iput_count;
/*
* To avoid races between lockless (i_mutex not held) direct IO writes
@@ -365,6 +364,4 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
logical_start, csum, csum_expected, mirror_num);
}
-bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
-
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7d51b5a5b505..3baebbc021c5 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -96,9 +96,9 @@
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/string.h>
+#include <linux/crc32c.h>
#include "ctree.h"
#include "disk-io.h"
-#include "hash.h"
#include "transaction.h"
#include "extent_io.h"
#include "volumes.h"
@@ -1736,7 +1736,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
size_t sublen = i ? PAGE_SIZE :
(PAGE_SIZE - BTRFS_CSUM_SIZE);
- crc = btrfs_crc32c(crc, data, sublen);
+ crc = crc32c(crc, data, sublen);
}
btrfs_csum_final(crc, csum);
if (memcmp(csum, h->csum, state->csum_size))
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 07d049c0c20f..562c3e633403 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1133,7 +1133,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
return ret;
}
-void btrfs_exit_compress(void)
+void __cold btrfs_exit_compress(void)
{
free_workspaces();
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 677fa4aa0bd7..ce796557a918 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -76,7 +76,7 @@ struct compressed_bio {
};
void __init btrfs_init_compress(void);
-void btrfs_exit_compress(void);
+void __cold btrfs_exit_compress(void);
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
u64 start, struct page **pages,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b88a79e69ddf..a2c9d21176e2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -41,8 +41,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
int level, int slot);
-static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb);
struct btrfs_path *btrfs_alloc_path(void)
{
@@ -301,11 +299,6 @@ enum mod_log_op {
MOD_LOG_ROOT_REPLACE,
};
-struct tree_mod_move {
- int dst_slot;
- int nr_items;
-};
-
struct tree_mod_root {
u64 logical;
u8 level;
@@ -328,32 +321,15 @@ struct tree_mod_elem {
u64 blockptr;
/* this is used for op == MOD_LOG_MOVE_KEYS */
- struct tree_mod_move move;
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
/* this is used for op == MOD_LOG_ROOT_REPLACE */
struct tree_mod_root old_root;
};
-static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
-{
- read_lock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
-{
- read_unlock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
-{
- write_lock(&fs_info->tree_mod_log_lock);
-}
-
-static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
-{
- write_unlock(&fs_info->tree_mod_log_lock);
-}
-
/*
* Pull a new tree mod seq number for our operation.
*/
@@ -373,14 +349,14 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
- tree_mod_log_write_lock(fs_info);
+ write_lock(&fs_info->tree_mod_log_lock);
spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) {
elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
spin_unlock(&fs_info->tree_mod_seq_lock);
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
return elem->seq;
}
@@ -422,7 +398,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
- tree_mod_log_write_lock(fs_info);
+ write_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
@@ -432,7 +408,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
rb_erase(node, tm_root);
kfree(tm);
}
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
}
/*
@@ -443,7 +419,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* for root replace operations, or the logical address of the affected
* block for all other operations.
*
- * Note: must be called with write lock (tree_mod_log_write_lock).
+ * Note: must be called with write lock for fs_info::tree_mod_log_lock.
*/
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -481,7 +457,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
* Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
* returns zero with the tree_mod_log_lock acquired. The caller must hold
* this until all tree mod log insertions are recorded in the rb tree and then
- * call tree_mod_log_write_unlock() to release.
+ * write unlock fs_info::tree_mod_log_lock.
*/
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb) {
@@ -491,9 +467,9 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
if (eb && btrfs_header_level(eb) == 0)
return 1;
- tree_mod_log_write_lock(fs_info);
+ write_lock(&fs_info->tree_mod_log_lock);
if (list_empty(&(fs_info)->tree_mod_seq_list)) {
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
return 1;
}
@@ -536,38 +512,34 @@ alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
return tm;
}
-static noinline int
-tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
+static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
{
struct tree_mod_elem *tm;
int ret;
- if (!tree_mod_need_log(fs_info, eb))
+ if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
tm = alloc_tree_mod_elem(eb, slot, op, flags);
if (!tm)
return -ENOMEM;
- if (tree_mod_dont_log(fs_info, eb)) {
+ if (tree_mod_dont_log(eb->fs_info, eb)) {
kfree(tm);
return 0;
}
- ret = __tree_mod_log_insert(fs_info, tm);
- tree_mod_log_write_unlock(fs_info);
+ ret = __tree_mod_log_insert(eb->fs_info, tm);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
kfree(tm);
return ret;
}
-static noinline int
-tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int dst_slot, int src_slot,
- int nr_items)
+static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot, int nr_items)
{
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
@@ -575,7 +547,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
int i;
int locked = 0;
- if (!tree_mod_need_log(fs_info, eb))
+ if (!tree_mod_need_log(eb->fs_info, eb))
return 0;
tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
@@ -603,7 +575,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
}
}
- if (tree_mod_dont_log(fs_info, eb))
+ if (tree_mod_dont_log(eb->fs_info, eb))
goto free_tms;
locked = 1;
@@ -613,26 +585,26 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
* buffer, i.e. dst_slot < src_slot.
*/
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+ ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
if (ret)
goto free_tms;
}
- ret = __tree_mod_log_insert(fs_info, tm);
+ ret = __tree_mod_log_insert(eb->fs_info, tm);
if (ret)
goto free_tms;
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
free_tms:
for (i = 0; i < nr_items; i++) {
if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
kfree(tm_list[i]);
}
if (locked)
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
kfree(tm_list);
kfree(tm);
@@ -660,12 +632,10 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
return 0;
}
-static noinline int
-tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
- struct extent_buffer *old_root,
- struct extent_buffer *new_root,
- int log_removal)
+static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root, int log_removal)
{
+ struct btrfs_fs_info *fs_info = old_root->fs_info;
struct tree_mod_elem *tm = NULL;
struct tree_mod_elem **tm_list = NULL;
int nritems = 0;
@@ -713,7 +683,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
if (!ret)
ret = __tree_mod_log_insert(fs_info, tm);
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
@@ -740,7 +710,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
struct tree_mod_elem *cur = NULL;
struct tree_mod_elem *found = NULL;
- tree_mod_log_read_lock(fs_info);
+ read_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
node = tm_root->rb_node;
while (node) {
@@ -768,7 +738,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
break;
}
}
- tree_mod_log_read_unlock(fs_info);
+ read_unlock(&fs_info->tree_mod_log_lock);
return found;
}
@@ -849,7 +819,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
goto free_tms;
}
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return 0;
@@ -861,36 +831,13 @@ free_tms:
kfree(tm_list[i]);
}
if (locked)
- tree_mod_log_write_unlock(fs_info);
+ write_unlock(&fs_info->tree_mod_log_lock);
kfree(tm_list);
return ret;
}
-static inline void
-tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
- int dst_offset, int src_offset, int nr_items)
-{
- int ret;
- ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
- nr_items);
- BUG_ON(ret < 0);
-}
-
-static noinline void
-tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int slot, int atomic)
-{
- int ret;
-
- ret = tree_mod_log_insert_key(fs_info, eb, slot,
- MOD_LOG_KEY_REPLACE,
- atomic ? GFP_ATOMIC : GFP_NOFS);
- BUG_ON(ret < 0);
-}
-
-static noinline int
-tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
struct tree_mod_elem **tm_list = NULL;
int nritems = 0;
@@ -900,7 +847,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
if (btrfs_header_level(eb) == 0)
return 0;
- if (!tree_mod_need_log(fs_info, NULL))
+ if (!tree_mod_need_log(eb->fs_info, NULL))
return 0;
nritems = btrfs_header_nritems(eb);
@@ -917,11 +864,11 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
}
}
- if (tree_mod_dont_log(fs_info, eb))
+ if (tree_mod_dont_log(eb->fs_info, eb))
goto free_tms;
- ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
- tree_mod_log_write_unlock(fs_info);
+ ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
if (ret)
goto free_tms;
kfree(tm_list);
@@ -936,17 +883,6 @@ free_tms:
return ret;
}
-static noinline void
-tree_mod_log_set_root_pointer(struct btrfs_root *root,
- struct extent_buffer *new_root_node,
- int log_removal)
-{
- int ret;
- ret = tree_mod_log_insert_root(root->fs_info, root->node,
- new_root_node, log_removal);
- BUG_ON(ret < 0);
-}
-
/*
* check if the tree block can be shared by multiple trees
*/
@@ -1173,7 +1109,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
extent_buffer_get(cow);
- tree_mod_log_set_root_pointer(root, cow, 1);
+ ret = tree_mod_log_insert_root(root->node, cow, 1);
+ BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
@@ -1182,7 +1119,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
- tree_mod_log_insert_key(fs_info, parent, parent_slot,
+ tree_mod_log_insert_key(parent, parent_slot,
MOD_LOG_KEY_REPLACE, GFP_NOFS);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
@@ -1190,7 +1127,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
trans->transid);
btrfs_mark_buffer_dirty(parent);
if (last_ref) {
- ret = tree_mod_log_free_eb(fs_info, buf);
+ ret = tree_mod_log_free_eb(buf);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -1211,9 +1148,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
* returns the logical address of the oldest predecessor of the given root.
* entries older than time_seq are ignored.
*/
-static struct tree_mod_elem *
-__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb_root, u64 time_seq)
+static struct tree_mod_elem *__tree_mod_log_oldest_root(
+ struct extent_buffer *eb_root, u64 time_seq)
{
struct tree_mod_elem *tm;
struct tree_mod_elem *found = NULL;
@@ -1230,7 +1166,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
* first operation that's logged for this root.
*/
while (1) {
- tm = tree_mod_log_search_oldest(fs_info, root_logical,
+ tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
time_seq);
if (!looped && !tm)
return NULL;
@@ -1279,7 +1215,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
unsigned long p_size = sizeof(struct btrfs_key_ptr);
n = btrfs_header_nritems(eb);
- tree_mod_log_read_lock(fs_info);
+ read_lock(&fs_info->tree_mod_log_lock);
while (tm && tm->seq >= time_seq) {
/*
* all the operations are recorded with the operator used for
@@ -1334,7 +1270,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
if (tm->logical != first_tm->logical)
break;
}
- tree_mod_log_read_unlock(fs_info);
+ read_unlock(&fs_info->tree_mod_log_lock);
btrfs_set_header_nritems(eb, n);
}
@@ -1418,9 +1354,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
u64 logical;
+ int level;
eb_root = btrfs_read_lock_root_node(root);
- tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq);
+ tm = __tree_mod_log_oldest_root(eb_root, time_seq);
if (!tm)
return eb_root;
@@ -1428,15 +1365,17 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
old_root = &tm->old_root;
old_generation = tm->generation;
logical = old_root->logical;
+ level = old_root->level;
} else {
logical = eb_root->start;
+ level = btrfs_header_level(eb_root);
}
tm = tree_mod_log_search(fs_info, logical, time_seq);
if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, 0);
+ old = read_tree_block(fs_info, logical, 0, level, NULL);
if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
if (!IS_ERR(old))
free_extent_buffer(old);
@@ -1484,7 +1423,7 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
int level;
struct extent_buffer *eb_root = btrfs_root_node(root);
- tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+ tm = __tree_mod_log_oldest_root(eb_root, time_seq);
if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
level = tm->old_root.level;
} else {
@@ -1502,8 +1441,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
if (btrfs_is_testing(root->fs_info))
return 0;
- /* ensure we can see the force_cow */
- smp_rmb();
+ /* Ensure we can see the FORCE_COW bit */
+ smp_mb__before_atomic();
/*
* We do not need to cow a block if
@@ -1656,6 +1595,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(parent);
for (i = start_slot; i <= end_slot; i++) {
+ struct btrfs_key first_key;
int close = 1;
btrfs_node_key(parent, &disk_key, i);
@@ -1665,6 +1605,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
progress_passed = 1;
blocknr = btrfs_node_blockptr(parent, i);
gen = btrfs_node_ptr_generation(parent, i);
+ btrfs_node_key_to_cpu(parent, &first_key, i);
if (last_block == 0)
last_block = blocknr;
@@ -1688,7 +1629,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
uptodate = 0;
if (!cur || !uptodate) {
if (!cur) {
- cur = read_tree_block(fs_info, blocknr, gen);
+ cur = read_tree_block(fs_info, blocknr, gen,
+ parent_level - 1,
+ &first_key);
if (IS_ERR(cur)) {
return PTR_ERR(cur);
} else if (!extent_buffer_uptodate(cur)) {
@@ -1696,7 +1639,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
return -EIO;
}
} else if (!uptodate) {
- err = btrfs_read_buffer(cur, gen);
+ err = btrfs_read_buffer(cur, gen,
+ parent_level - 1, &first_key);
if (err) {
free_extent_buffer(cur);
return err;
@@ -1849,14 +1793,17 @@ read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
{
int level = btrfs_header_level(parent);
struct extent_buffer *eb;
+ struct btrfs_key first_key;
if (slot < 0 || slot >= btrfs_header_nritems(parent))
return ERR_PTR(-ENOENT);
BUG_ON(level == 0);
+ btrfs_node_key_to_cpu(parent, &first_key, slot);
eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
- btrfs_node_ptr_generation(parent, slot));
+ btrfs_node_ptr_generation(parent, slot),
+ level - 1, &first_key);
if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
eb = ERR_PTR(-EIO);
@@ -1928,7 +1875,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
- tree_mod_log_set_root_pointer(root, child, 1);
+ ret = tree_mod_log_insert_root(root->node, child, 1);
+ BUG_ON(ret < 0);
rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
@@ -2007,8 +1955,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
- tree_mod_log_set_node_key(fs_info, parent,
- pslot + 1, 0);
+ ret = tree_mod_log_insert_key(parent, pslot + 1,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
}
@@ -2052,7 +2001,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
+ ret = tree_mod_log_insert_key(parent, pslot,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
}
@@ -2153,7 +2104,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
- tree_mod_log_set_node_key(fs_info, parent, pslot, 0);
+ ret = tree_mod_log_insert_key(parent, pslot,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
if (btrfs_header_nritems(left) > orig_slot) {
@@ -2207,8 +2160,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
- tree_mod_log_set_node_key(fs_info, parent,
- pslot + 1, 0);
+ ret = tree_mod_log_insert_key(parent, pslot + 1,
+ MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2445,10 +2399,14 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
u64 gen;
struct extent_buffer *b = *eb_ret;
struct extent_buffer *tmp;
+ struct btrfs_key first_key;
int ret;
+ int parent_level;
blocknr = btrfs_node_blockptr(b, slot);
gen = btrfs_node_ptr_generation(b, slot);
+ parent_level = btrfs_header_level(b);
+ btrfs_node_key_to_cpu(b, &first_key, slot);
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
@@ -2467,7 +2425,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_set_path_blocking(p);
/* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_buffer(tmp, gen);
+ ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
if (!ret) {
*eb_ret = tmp;
return 0;
@@ -2494,7 +2452,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_release_path(p);
ret = -EAGAIN;
- tmp = read_tree_block(fs_info, blocknr, 0);
+ tmp = read_tree_block(fs_info, blocknr, 0, parent_level - 1,
+ &first_key);
if (!IS_ERR(tmp)) {
/*
* If the read above didn't mark this buffer up to date,
@@ -3161,13 +3120,17 @@ static void fixup_low_keys(struct btrfs_fs_info *fs_info,
{
int i;
struct extent_buffer *t;
+ int ret;
for (i = level; i < BTRFS_MAX_LEVEL; i++) {
int tslot = path->slots[i];
+
if (!path->nodes[i])
break;
t = path->nodes[i];
- tree_mod_log_set_node_key(fs_info, t, tslot, 1);
+ ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
+ GFP_ATOMIC);
+ BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
if (tslot != 0)
@@ -3264,8 +3227,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
if (push_items < src_nritems) {
/*
- * don't call tree_mod_log_eb_move here, key removal was already
- * fully logged by tree_mod_log_eb_copy above.
+ * Don't call tree_mod_log_insert_move here, key removal was
+ * already fully logged by tree_mod_log_eb_copy above.
*/
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
@@ -3320,7 +3283,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
- tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems);
+ ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
+ BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
@@ -3363,6 +3327,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
struct extent_buffer *c;
struct extent_buffer *old;
struct btrfs_disk_key lower_key;
+ int ret;
BUG_ON(path->nodes[level]);
BUG_ON(path->nodes[level-1] != root->node);
@@ -3401,7 +3366,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
- tree_mod_log_set_root_pointer(root, c, 0);
+ ret = tree_mod_log_insert_root(root->node, c, 0);
+ BUG_ON(ret < 0);
rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
@@ -3438,17 +3404,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(slot > nritems);
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
if (slot != nritems) {
- if (level)
- tree_mod_log_eb_move(fs_info, lower, slot + 1,
- slot, nritems - slot);
+ if (level) {
+ ret = tree_mod_log_insert_move(lower, slot + 1, slot,
+ nritems - slot);
+ BUG_ON(ret < 0);
+ }
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
btrfs_node_key_ptr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
- ret = tree_mod_log_insert_key(fs_info, lower, slot,
- MOD_LOG_KEY_ADD, GFP_NOFS);
+ ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
+ GFP_NOFS);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
@@ -4911,17 +4879,19 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
- if (level)
- tree_mod_log_eb_move(fs_info, parent, slot,
- slot + 1, nritems - slot - 1);
+ if (level) {
+ ret = tree_mod_log_insert_move(parent, slot, slot + 1,
+ nritems - slot - 1);
+ BUG_ON(ret < 0);
+ }
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(slot),
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
- ret = tree_mod_log_insert_key(fs_info, parent, slot,
- MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
+ GFP_NOFS);
BUG_ON(ret < 0);
}
@@ -5145,9 +5115,6 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
* into min_key, so you can call btrfs_search_slot with cow=1 on the
* key and get a writable path.
*
- * This does lock as it descends, and path->keep_locks should be set
- * to 1 by the caller.
- *
* This honors path->lowest_level to prevent descent past a given level
* of the tree.
*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index da308774b8a4..0eb55825862a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,6 +40,7 @@
#include <linux/sizes.h>
#include <linux/dynamic_debug.h>
#include <linux/refcount.h>
+#include <linux/crc32c.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -65,6 +66,8 @@ struct btrfs_ordered_sum;
#define BTRFS_MAX_LEVEL 8
+#define BTRFS_OLDEST_GENERATION 0ULL
+
#define BTRFS_COMPAT_EXTENT_TREE_V0
/*
@@ -86,9 +89,9 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_LINK_MAX 65535U
+/* four bytes for CRC32 */
static const int btrfs_csum_sizes[] = { 4 };
-/* four bytes for CRC32 */
#define BTRFS_EMPTY_DIR_SIZE 0
/* ioprio of readahead is set to idle */
@@ -98,6 +101,7 @@ static const int btrfs_csum_sizes[] = { 4 };
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+
/*
* Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
*/
@@ -381,8 +385,9 @@ struct btrfs_dev_replace {
/* For raid type sysfs entries */
struct raid_kobject {
- int raid_type;
+ u64 flags;
struct kobject kobj;
+ struct list_head list;
};
struct btrfs_space_info {
@@ -707,7 +712,6 @@ struct btrfs_delayed_root;
#define BTRFS_FS_LOG_RECOVERING 4
#define BTRFS_FS_OPEN 5
#define BTRFS_FS_QUOTA_ENABLED 6
-#define BTRFS_FS_QUOTA_ENABLING 7
#define BTRFS_FS_UPDATE_UUID_TREE_GEN 9
#define BTRFS_FS_CREATING_FREE_SPACE_TREE 10
#define BTRFS_FS_BTREE_ERR 11
@@ -788,7 +792,7 @@ struct btrfs_fs_info {
unsigned long pending_changes;
unsigned long compress_type:4;
unsigned int compress_level;
- int commit_interval;
+ u32 commit_interval;
/*
* It is a suggestive number, the read side is safe even it gets a
* wrong number because we will write out the data into a regular
@@ -877,7 +881,6 @@ struct btrfs_fs_info {
struct rb_root tree_mod_log;
atomic_t async_delalloc_pages;
- atomic_t open_ioctl_trans;
/*
* this is used to protect the following list -- ordered_roots.
@@ -935,9 +938,11 @@ struct btrfs_fs_info {
struct btrfs_workqueue *extent_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
- int thread_pool_size;
+ u32 thread_pool_size;
struct kobject *space_info_kobj;
+ struct list_head pending_raid_kobjs;
+ spinlock_t pending_raid_kobjs_lock; /* uncontended */
u64 total_pinned;
@@ -952,9 +957,9 @@ struct btrfs_fs_info {
struct btrfs_fs_devices *fs_devices;
/*
- * the space_info list is almost entirely read only. It only changes
- * when we add a new raid type to the FS, and that happens
- * very rarely. RCU is used to protect it.
+ * The space_info list is effectively read only after initial
+ * setup. It is populated at mount time and cleaned up after
+ * all block groups are removed. RCU is used to protect it.
*/
struct list_head space_info;
@@ -993,8 +998,8 @@ struct btrfs_fs_info {
struct btrfs_balance_control *balance_ctl;
wait_queue_head_t balance_wait_q;
- unsigned data_chunk_allocations;
- unsigned metadata_ratio;
+ u32 data_chunk_allocations;
+ u32 metadata_ratio;
void *bdev_holder;
@@ -1260,12 +1265,13 @@ struct btrfs_root {
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshotted;
- /* For qgroup metadata space reserve */
- atomic64_t qgroup_meta_rsv;
+ /* For qgroup metadata reserved space */
+ spinlock_t qgroup_meta_rsv_lock;
+ u64 qgroup_meta_rsv_pertrans;
+ u64 qgroup_meta_rsv_prealloc;
};
struct btrfs_file_private {
- struct btrfs_trans_handle *trans;
void *filldir_buf;
};
@@ -2554,6 +2560,20 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \
btrfs_item_offset_nr(leaf, slot)))
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+ return crc32c((u32)~1, name, len);
+}
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+ int len)
+{
+ return (u64) crc32c(parent_objectid, name, len);
+}
+
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
@@ -2608,7 +2628,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, unsigned long count);
+ unsigned long count);
int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
unsigned long count, u64 transid, int wait);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
@@ -2628,7 +2648,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
u64 bytenr);
void btrfs_get_block_group(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
-int get_block_group_index(struct btrfs_block_group_cache *cache);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -2668,15 +2687,13 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset);
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
@@ -2688,6 +2705,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr);
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info);
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
@@ -2697,8 +2715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info);
u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info);
u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info);
@@ -2730,11 +2747,10 @@ int btrfs_check_data_free_space(struct inode *inode,
void btrfs_free_reserved_data_space(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved, u64 start, u64 len);
+ struct extent_changeset *reserved,
+ u64 start, u64 len, bool qgroup_free);
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
@@ -2745,10 +2761,12 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
u64 *qgroup_reserved, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free);
int btrfs_delalloc_reserve_space(struct inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
@@ -2792,7 +2810,6 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-int __get_raid_index(u64 flags);
int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
@@ -3195,8 +3212,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
-void btrfs_destroy_cachep(void);
-long btrfs_ioctl_trans_end(struct file *file);
+void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
@@ -3246,7 +3262,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
/* file.c */
int __init btrfs_auto_defrag_init(void);
-void btrfs_auto_defrag_exit(void);
+void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
@@ -3281,25 +3297,23 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/* sysfs.c */
int __init btrfs_init_sysfs(void);
-void btrfs_exit_sysfs(void);
+void __cold btrfs_exit_sysfs(void);
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
-/* xattr.c */
-ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-
/* super.c */
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
-static inline __printf(2, 3)
+static inline __printf(2, 3) __cold
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
#ifdef CONFIG_PRINTK
__printf(2, 3)
+__cold
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#else
#define btrfs_printk(fs_info, fmt, args...) \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0530f6f2e4ba..86ec2edc05e8 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -23,6 +23,7 @@
#include "disk-io.h"
#include "transaction.h"
#include "ctree.h"
+#include "qgroup.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@@ -42,7 +43,7 @@ int __init btrfs_delayed_inode_init(void)
return 0;
}
-void btrfs_delayed_inode_exit(void)
+void __cold btrfs_delayed_inode_exit(void)
{
kmem_cache_destroy(delayed_node_cache);
}
@@ -552,11 +553,12 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item(
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *src_rsv;
struct btrfs_block_rsv *dst_rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 num_bytes;
int ret;
@@ -578,15 +580,17 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
return ret;
}
-static void btrfs_delayed_item_release_metadata(struct btrfs_fs_info *fs_info,
+static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
struct btrfs_delayed_item *item)
{
struct btrfs_block_rsv *rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
if (!item->bytes_reserved)
return;
rsv = &fs_info->delayed_block_rsv;
+ btrfs_qgroup_convert_reserved_meta(root, item->bytes_reserved);
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid, item->bytes_reserved,
0);
@@ -611,6 +615,9 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ if (ret < 0)
+ return ret;
/*
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
* which doesn't reserve space for speed. This is a problem since we
@@ -630,8 +637,10 @@ static int btrfs_delayed_inode_reserve_metadata(
* EAGAIN to make us stop the transaction we have, so return
* ENOSPC instead so that btrfs_dirty_inode knows what to do.
*/
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
ret = -ENOSPC;
+ btrfs_qgroup_free_meta_prealloc(root, num_bytes);
+ }
if (!ret) {
node->bytes_reserved = num_bytes;
trace_btrfs_space_reservation(fs_info,
@@ -653,7 +662,8 @@ static int btrfs_delayed_inode_reserve_metadata(
}
static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_node *node)
+ struct btrfs_delayed_node *node,
+ bool qgroup_free)
{
struct btrfs_block_rsv *rsv;
@@ -665,6 +675,12 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info,
node->inode_id, node->bytes_reserved, 0);
btrfs_block_rsv_release(fs_info, rsv,
node->bytes_reserved);
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(node->root,
+ node->bytes_reserved);
+ else
+ btrfs_qgroup_convert_reserved_meta(node->root,
+ node->bytes_reserved);
node->bytes_reserved = 0;
}
@@ -766,7 +782,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
curr->data_len);
slot++;
- btrfs_delayed_item_release_metadata(fs_info, curr);
+ btrfs_delayed_item_release_metadata(root, curr);
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
@@ -788,7 +804,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *delayed_item)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
char *ptr;
int ret;
@@ -806,7 +821,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
delayed_item->data_len);
btrfs_mark_buffer_dirty(leaf);
- btrfs_delayed_item_release_metadata(fs_info, delayed_item);
+ btrfs_delayed_item_release_metadata(root, delayed_item);
return 0;
}
@@ -858,7 +873,6 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_delayed_item *item)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_delayed_item *curr, *next;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -908,7 +922,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
goto out;
list_for_each_entry_safe(curr, next, &head, tree_list) {
- btrfs_delayed_item_release_metadata(fs_info, curr);
+ btrfs_delayed_item_release_metadata(root, curr);
list_del(&curr->tree_list);
btrfs_release_delayed_item(curr);
}
@@ -1051,7 +1065,7 @@ out:
no_iref:
btrfs_release_path(path);
err_out:
- btrfs_delayed_inode_release_metadata(fs_info, node);
+ btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
return ret;
@@ -1115,9 +1129,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
* Returns < 0 on error and returns with an aborted transaction with any
* outstanding delayed items cleaned up.
*/
-static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, int nr)
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_root *delayed_root;
struct btrfs_delayed_node *curr_node, *prev_node;
struct btrfs_path *path;
@@ -1162,16 +1176,14 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans)
{
- return __btrfs_run_delayed_items(trans, fs_info, -1);
+ return __btrfs_run_delayed_items(trans, -1);
}
-int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, int nr)
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr)
{
- return __btrfs_run_delayed_items(trans, fs_info, nr);
+ return __btrfs_run_delayed_items(trans, nr);
}
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
@@ -1443,7 +1455,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_type(dir_item, type);
memcpy((char *)(dir_item + 1), name, name_len);
- ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, delayed_item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible
@@ -1480,7 +1492,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info,
return 1;
}
- btrfs_delayed_item_release_metadata(fs_info, item);
+ btrfs_delayed_item_release_metadata(node->root, item);
btrfs_release_delayed_item(item);
mutex_unlock(&node->mutex);
return 0;
@@ -1515,7 +1527,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
item->key = item_key;
- ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, item);
+ ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item);
/*
* we have reserved enough space when we start a new transaction,
* so reserving metadata failure is impossible.
@@ -1880,7 +1892,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
mutex_lock(&delayed_node->mutex);
curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(fs_info, curr_item);
+ btrfs_delayed_item_release_metadata(root, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
@@ -1888,7 +1900,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
while (curr_item) {
- btrfs_delayed_item_release_metadata(fs_info, curr_item);
+ btrfs_delayed_item_release_metadata(root, curr_item);
prev_item = curr_item;
curr_item = __btrfs_next_delayed_item(prev_item);
btrfs_release_delayed_item(prev_item);
@@ -1898,7 +1910,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
btrfs_release_delayed_iref(delayed_node);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- btrfs_delayed_inode_release_metadata(fs_info, delayed_node);
+ btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
btrfs_release_delayed_inode(delayed_node);
}
mutex_unlock(&delayed_node->mutex);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index c4189d495934..100a91e26b55 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -111,10 +111,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode);
-int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, int nr);
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr);
void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info);
@@ -151,7 +149,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
/* for init */
int __init btrfs_delayed_inode_init(void);
-void btrfs_delayed_inode_exit(void);
+void __cold btrfs_delayed_inode_exit(void);
/* for debugging */
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 7ab5e0128f0c..2677257c149d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -216,7 +216,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- assert_spin_locked(&delayed_refs->lock);
+ lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex))
return 0;
@@ -239,7 +239,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
- assert_spin_locked(&head->lock);
+ lockdep_assert_held(&head->lock);
rb_erase(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
@@ -307,7 +307,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct rb_node *node;
u64 seq = 0;
- assert_spin_locked(&head->lock);
+ lockdep_assert_held(&head->lock);
if (RB_EMPTY_ROOT(&head->ref_tree))
return;
@@ -930,7 +930,7 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
return find_ref_head(&delayed_refs->href_root, bytenr, 0);
}
-void btrfs_delayed_ref_exit(void)
+void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
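(Illustration, not part of the patch.) lockdep_assert_held() is stricter than the assert_spin_locked() it replaces: it verifies that the *current* task holds the lock rather than that the lock merely happens to be locked, and it compiles away without lockdep. A minimal sketch of the pattern; the structure and helper names below are assumptions:

    #include <linux/list.h>
    #include <linux/spinlock.h>
    #include <linux/lockdep.h>

    struct demo_head {
            spinlock_t lock;
            struct list_head items;
    };

    /* Caller must hold head->lock; lockdep checks it is held by this task. */
    static void demo_drop_entry(struct demo_head *head, struct list_head *entry)
    {
            lockdep_assert_held(&head->lock);
            list_del_init(entry);
    }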
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index c4f625e5a691..9e3e5aff0937 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -204,7 +204,7 @@ extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
int __init btrfs_delayed_ref_init(void);
-void btrfs_delayed_ref_exit(void);
+void __cold btrfs_delayed_ref_exit(void);
static inline struct btrfs_delayed_extent_op *
btrfs_alloc_delayed_extent_op(void)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7efbc4d1128b..0d203633bb96 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -44,7 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev);
-static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
static int btrfs_dev_replace_kthread(void *data);
static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
@@ -174,8 +173,14 @@ no_valid_dev_replace_entry_found:
}
set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&dev_replace->tgtdev->dev_state);
- btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
- dev_replace->tgtdev);
+
+ WARN_ON(fs_info->fs_devices->rw_devices == 0);
+ dev_replace->tgtdev->io_width = fs_info->sectorsize;
+ dev_replace->tgtdev->io_align = fs_info->sectorsize;
+ dev_replace->tgtdev->sector_size = fs_info->sectorsize;
+ dev_replace->tgtdev->fs_info = fs_info;
+ set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &dev_replace->tgtdev->dev_state);
}
break;
}
@@ -200,13 +205,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_dev_replace_item *ptr;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_lock(dev_replace, 0);
+ btrfs_dev_replace_read_lock(dev_replace);
if (!dev_replace->is_valid ||
!dev_replace->item_needs_writeback) {
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
return 0;
}
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
@@ -264,7 +269,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_dev_replace_item);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
if (dev_replace->srcdev)
btrfs_set_dev_replace_src_devid(eb, ptr,
dev_replace->srcdev->devid);
@@ -287,7 +292,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
btrfs_set_dev_replace_cursor_right(eb, ptr,
dev_replace->cursor_right);
dev_replace->item_needs_writeback = 0;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_mark_buffer_dirty(eb);
@@ -307,7 +312,7 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
static char* btrfs_dev_name(struct btrfs_device *device)
{
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
return "<missing disk>";
else
return rcu_str_deref(device->name);
@@ -352,7 +357,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(trans);
}
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -390,7 +395,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->item_needs_writeback = 1;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
@@ -402,7 +407,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
goto leave;
}
@@ -426,7 +431,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
leave:
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
return ret;
}
@@ -493,18 +498,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* don't allow cancel or unmount to disturb the finishing procedure */
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_lock(dev_replace, 0);
+ btrfs_dev_replace_read_lock(dev_replace);
/* was the operation canceled, or is it finished? */
if (dev_replace->replace_state !=
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return 0;
}
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
/*
* flush all outstanding I/O and inode extent mappings before the
@@ -529,7 +534,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* keep away write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
@@ -549,7 +554,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&uuid_mutex);
@@ -586,7 +591,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_rm_dev_replace_blocked(fs_info);
@@ -679,7 +684,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_lock(dev_replace, 0);
+ btrfs_dev_replace_read_lock(dev_replace);
/* even if !dev_replace_is_valid, the values are good enough for
* the replace_status ioctl */
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
@@ -691,41 +696,36 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
args->status.num_uncorrectable_read_errors =
atomic64_read(&dev_replace->num_uncorrectable_read_errors);
args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
}
-int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
- struct btrfs_ioctl_dev_replace_args *args)
-{
- args->result = __btrfs_dev_replace_cancel(fs_info);
- return 0;
-}
-
-static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *tgt_device = NULL;
+ struct btrfs_device *src_device = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = fs_info->tree_root;
- u64 result;
+ int result;
int ret;
if (sb_rdonly(fs_info->sb))
return -EROFS;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
goto leave;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
+ src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
break;
@@ -733,7 +733,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
@@ -743,6 +743,12 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
}
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
+
+ btrfs_info_in_rcu(fs_info,
+ "dev_replace from %s (devid %llu) to %s canceled",
+ btrfs_dev_name(src_device), src_device->devid,
+ btrfs_dev_name(tgt_device));
+
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
@@ -756,7 +762,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -772,7 +778,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
break;
}
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}
@@ -782,12 +788,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
struct task_struct *task;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- btrfs_dev_replace_lock(dev_replace, 1);
+ btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
return 0;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
break;
@@ -801,10 +807,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}
- btrfs_dev_replace_unlock(dev_replace, 1);
+ btrfs_dev_replace_write_unlock(dev_replace);
WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
@@ -873,37 +879,37 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
+void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
{
- if (rw == 1) {
- /* write */
-again:
- wait_event(dev_replace->read_lock_wq,
- atomic_read(&dev_replace->blocking_readers) == 0);
- write_lock(&dev_replace->lock);
- if (atomic_read(&dev_replace->blocking_readers)) {
- write_unlock(&dev_replace->lock);
- goto again;
- }
- } else {
- read_lock(&dev_replace->lock);
- atomic_inc(&dev_replace->read_locks);
- }
+ read_lock(&dev_replace->lock);
+ atomic_inc(&dev_replace->read_locks);
+}
+
+void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
+{
+ ASSERT(atomic_read(&dev_replace->read_locks) > 0);
+ atomic_dec(&dev_replace->read_locks);
+ read_unlock(&dev_replace->lock);
}
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
+void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace)
{
- if (rw == 1) {
- /* write */
- ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
+again:
+ wait_event(dev_replace->read_lock_wq,
+ atomic_read(&dev_replace->blocking_readers) == 0);
+ write_lock(&dev_replace->lock);
+ if (atomic_read(&dev_replace->blocking_readers)) {
write_unlock(&dev_replace->lock);
- } else {
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- atomic_dec(&dev_replace->read_locks);
- read_unlock(&dev_replace->lock);
+ goto again;
}
}
+void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
+{
+ ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
+ write_unlock(&dev_replace->lock);
+}
+
/* inc blocking cnt and release read lock */
void btrfs_dev_replace_set_lock_blocking(
struct btrfs_dev_replace *dev_replace)
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index f94a76844ae7..8566a02ef222 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -32,13 +32,14 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
int read_src);
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
-int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
- struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw);
-void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw);
+void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_clear_lock_blocking(
struct btrfs_dev_replace *dev_replace);
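With the split helpers declared above, call sites name the lock mode explicitly instead of passing an rw flag. A short usage sketch, following the pattern of the converted call sites in this patch (the state shown is just an example):

    /* Read side: short sections such as status queries. */
    btrfs_dev_replace_read_lock(dev_replace);
    state = dev_replace->replace_state;
    btrfs_dev_replace_read_unlock(dev_replace);

    /* Write side: state transitions (start, suspend, cancel, finish). */
    btrfs_dev_replace_write_lock(dev_replace);
    dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
    dev_replace->item_needs_writeback = 1;
    btrfs_dev_replace_write_unlock(dev_replace);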
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index cbe421605cd5..29e967b2c667 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -18,7 +18,6 @@
#include "ctree.h"
#include "disk-io.h"
-#include "hash.h"
#include "transaction.h"
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 21f34ad0d411..07b5e6f7df67 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -31,10 +31,10 @@
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
+#include <linux/crc32c.h>
#include <asm/unaligned.h>
#include "ctree.h"
#include "disk-io.h"
-#include "hash.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
@@ -110,7 +110,7 @@ int __init btrfs_end_io_wq_init(void)
return 0;
}
-void btrfs_end_io_wq_exit(void)
+void __cold btrfs_end_io_wq_exit(void)
{
kmem_cache_destroy(btrfs_end_io_wq_cache);
}
@@ -124,8 +124,8 @@ struct async_submit_bio {
void *private_data;
struct btrfs_fs_info *fs_info;
struct bio *bio;
- extent_submit_bio_hook_t *submit_bio_start;
- extent_submit_bio_hook_t *submit_bio_done;
+ extent_submit_bio_start_t *submit_bio_start;
+ extent_submit_bio_done_t *submit_bio_done;
int mirror_num;
unsigned long bio_flags;
/*
@@ -270,7 +270,7 @@ out:
u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
{
- return btrfs_crc32c(seed, data, len);
+ return crc32c(seed, data, len);
}
void btrfs_csum_final(u32 crc, u8 *result)
@@ -403,8 +403,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
u32 crc = ~(u32)0;
- const int csum_size = sizeof(crc);
- char result[csum_size];
+ char result[sizeof(crc)];
/*
* The super_block structure does not span the whole
@@ -415,7 +414,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
btrfs_csum_final(crc, result);
- if (memcmp(raw_disk_sb, result, csum_size))
+ if (memcmp(raw_disk_sb, result, sizeof(result)))
ret = 1;
}
@@ -428,13 +427,59 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
return ret;
}
+static int verify_level_key(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb, int level,
+ struct btrfs_key *first_key)
+{
+ int found_level;
+ struct btrfs_key found_key;
+ int ret;
+
+ found_level = btrfs_header_level(eb);
+ if (found_level != level) {
+#ifdef CONFIG_BTRFS_DEBUG
+ WARN_ON(1);
+ btrfs_err(fs_info,
+"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
+ eb->start, level, found_level);
+#endif
+ return -EIO;
+ }
+
+ if (!first_key)
+ return 0;
+
+ if (found_level)
+ btrfs_node_key_to_cpu(eb, &found_key, 0);
+ else
+ btrfs_item_key_to_cpu(eb, &found_key, 0);
+ ret = btrfs_comp_cpu_keys(first_key, &found_key);
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (ret) {
+ WARN_ON(1);
+ btrfs_err(fs_info,
+"tree first key mismatch detected, bytenr=%llu key expected=(%llu, %u, %llu) has=(%llu, %u, %llu)",
+ eb->start, first_key->objectid, first_key->type,
+ first_key->offset, found_key.objectid,
+ found_key.type, found_key.offset);
+ }
+#endif
+ return ret;
+}
+
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
+ *
+ * @parent_transid: expected transid, skip check if 0
+ * @level: expected level, mandatory check
+ * @first_key: expected key of first slot, skip check if NULL
*/
static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb,
- u64 parent_transid)
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct extent_io_tree *io_tree;
int failed = 0;
@@ -449,11 +494,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
mirror_num);
if (!ret) {
- if (!verify_parent_transid(io_tree, eb,
+ if (verify_parent_transid(io_tree, eb,
parent_transid, 0))
- break;
- else
ret = -EIO;
+ else if (verify_level_key(fs_info, eb, level,
+ first_key))
+ ret = -EUCLEAN;
+ else
+ break;
}
/*
@@ -461,7 +509,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
* there is no reason to read the other copies, they won't be
* any less wrong.
*/
- if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags) ||
+ ret == -EUCLEAN)
break;
num_copies = btrfs_num_copies(fs_info,
@@ -602,12 +651,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
* that we don't try and read the other copies of this block, just
* return -EIO.
*/
- if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
+ if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
- if (found_level > 0 && btrfs_check_node(root, eb))
+ if (found_level > 0 && btrfs_check_node(fs_info, eb))
ret = -EIO;
if (!ret)
@@ -710,14 +759,6 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
return 0;
}
-unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
-{
- unsigned long limit = min_t(unsigned long,
- info->thread_pool_size,
- info->fs_devices->open_devices);
- return 256 * limit;
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -725,7 +766,6 @@ static void run_one_async_start(struct btrfs_work *work)
async = container_of(work, struct async_submit_bio, work);
ret = async->submit_bio_start(async->private_data, async->bio,
- async->mirror_num, async->bio_flags,
async->bio_offset);
if (ret)
async->status = ret;
@@ -744,8 +784,7 @@ static void run_one_async_done(struct btrfs_work *work)
return;
}
- async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
- async->bio_flags, async->bio_offset);
+ async->submit_bio_done(async->private_data, async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -759,8 +798,8 @@ static void run_one_async_free(struct btrfs_work *work)
blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset, void *private_data,
- extent_submit_bio_hook_t *submit_bio_start,
- extent_submit_bio_hook_t *submit_bio_done)
+ extent_submit_bio_start_t *submit_bio_start,
+ extent_submit_bio_done_t *submit_bio_done)
{
struct async_submit_bio *async;
@@ -807,8 +846,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
-static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
+static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
u64 bio_offset)
{
/*
@@ -818,9 +856,8 @@ static blk_status_t __btree_submit_bio_start(void *private_data, struct bio *bio
return btree_csum_one_bio(bio);
}
-static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btree_submit_bio_done(void *private_data, struct bio *bio,
+ int mirror_num)
{
struct inode *inode = private_data;
blk_status_t ret;
@@ -879,8 +916,8 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio,
*/
ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
bio_offset, private_data,
- __btree_submit_bio_start,
- __btree_submit_bio_done);
+ btree_submit_bio_start,
+ btree_submit_bio_done);
}
if (ret)
@@ -1062,8 +1099,17 @@ void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
buf->start, buf->start + buf->len - 1);
}
+/*
+ * Read tree block at logical address @bytenr and perform basic but critical
+ * verification.
+ *
+ * @parent_transid: expected transid of this tree block, skip check if 0
+ * @level: expected level, mandatory check
+ * @first_key: expected key in slot 0, skip check if NULL
+ */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 parent_transid)
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct extent_buffer *buf = NULL;
int ret;
@@ -1072,7 +1118,8 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
if (IS_ERR(buf))
return buf;
- ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+ ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+ level, first_key);
if (ret) {
free_extent_buffer(buf);
return ERR_PTR(ret);
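Callers of read_tree_block() now supply the expected level and, when known, the expected first key, so verify_level_key() can reject a block that does not match what its parent points to. A sketch of a caller, modelled on the root-item path converted later in this patch (variable names are assumptions):

    u64 gen = btrfs_root_generation(&root->root_item);
    int level = btrfs_root_level(&root->root_item);
    struct extent_buffer *eb;

    /* No expected first key for a root node, so pass NULL to skip that
     * check; a parent_transid of 0 would likewise skip the transid check. */
    eb = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
                         gen, level, NULL);
    if (IS_ERR(eb))
            return PTR_ERR(eb);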
@@ -1108,7 +1155,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
if (!writers)
return ERR_PTR(-ENOMEM);
- ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
if (ret < 0) {
kfree(writers);
return ERR_PTR(ret);
@@ -1160,6 +1207,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
spin_lock_init(&root->accounting_lock);
spin_lock_init(&root->log_extents_lock[0]);
spin_lock_init(&root->log_extents_lock[1]);
+ spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
@@ -1176,7 +1224,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->orphan_inodes, 0);
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
- atomic64_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -1401,6 +1448,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_path *path;
u64 generation;
int ret;
+ int level;
path = btrfs_alloc_path();
if (!path)
@@ -1423,9 +1471,10 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
}
generation = btrfs_root_generation(&root->root_item);
+ level = btrfs_root_level(&root->root_item);
root->node = read_tree_block(fs_info,
btrfs_root_bytenr(&root->root_item),
- generation);
+ generation, level, NULL);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
goto find_fail;
@@ -1808,12 +1857,10 @@ sleep:
if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
&fs_info->fs_state)))
btrfs_cleanup_transaction(fs_info);
- set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
(!btrfs_transaction_blocked(fs_info) ||
cannot_commit))
- schedule_timeout(delay);
- __set_current_state(TASK_RUNNING);
+ schedule_timeout_interruptible(delay);
} while (!kthread_should_stop());
return 0;
}
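schedule_timeout_interruptible() bundles the set_current_state(TASK_INTERRUPTIBLE) / schedule_timeout() / __set_current_state(TASK_RUNNING) sequence into a single call with the same semantics; a sketch of the equivalence:

    /* Open-coded form removed above ... */
    set_current_state(TASK_INTERRUPTIBLE);
    schedule_timeout(delay);
    __set_current_state(TASK_RUNNING);

    /* ... and its one-line replacement. */
    schedule_timeout_interruptible(delay);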
@@ -2183,7 +2230,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
- int max_active = fs_info->thread_pool_size;
+ u32 max_active = fs_info->thread_pool_size;
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
@@ -2276,6 +2323,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_root *log_tree_root;
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 bytenr = btrfs_super_log_root(disk_super);
+ int level = btrfs_super_log_root_level(disk_super);
if (fs_devices->rw_devices == 0) {
btrfs_warn(fs_info, "log replay required on RO media");
@@ -2289,7 +2337,8 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
__setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
log_tree_root->node = read_tree_block(fs_info, bytenr,
- fs_info->generation + 1);
+ fs_info->generation + 1,
+ level, NULL);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@@ -2334,23 +2383,29 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
location.offset = 0;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->extent_root = root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
btrfs_init_devices_late(fs_info);
location.objectid = BTRFS_CSUM_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->csum_root = root;
@@ -2367,7 +2422,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (IS_ERR(root)) {
ret = PTR_ERR(root);
if (ret != -ENOENT)
- return ret;
+ goto out;
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->uuid_root = root;
@@ -2376,13 +2431,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
root = btrfs_read_tree_root(tree_root, &location);
- if (IS_ERR(root))
- return PTR_ERR(root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->free_space_root = root;
}
return 0;
+out:
+ btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
+ location.objectid, ret);
+ return ret;
}
int open_ctree(struct super_block *sb,
@@ -2404,8 +2465,8 @@ int open_ctree(struct super_block *sb,
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
- int max_active;
int clear_free_space_tree = 0;
+ int level;
tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
@@ -2447,6 +2508,8 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->delayed_iputs);
INIT_LIST_HEAD(&fs_info->delalloc_roots);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
+ INIT_LIST_HEAD(&fs_info->pending_raid_kobjs);
+ spin_lock_init(&fs_info->pending_raid_kobjs_lock);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
@@ -2713,8 +2776,6 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- max_active = fs_info->thread_pool_size;
-
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -2741,12 +2802,13 @@ int open_ctree(struct super_block *sb,
}
generation = btrfs_super_chunk_root_generation(disk_super);
+ level = btrfs_super_chunk_root_level(disk_super);
__setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
chunk_root->node = read_tree_block(fs_info,
btrfs_super_chunk_root(disk_super),
- generation);
+ generation, level, NULL);
if (IS_ERR(chunk_root->node) ||
!extent_buffer_uptodate(chunk_root->node)) {
btrfs_err(fs_info, "failed to read chunk root");
@@ -2768,10 +2830,10 @@ int open_ctree(struct super_block *sb,
}
/*
- * keep the device that is marked to be the target device for the
- * dev_replace procedure
+ * Keep the devid that is marked to be the target device for the
+ * device replace procedure
*/
- btrfs_close_extra_devices(fs_devices, 0);
+ btrfs_free_extra_devids(fs_devices, 0);
if (!fs_devices->latest_bdev) {
btrfs_err(fs_info, "failed to read devices");
@@ -2780,10 +2842,11 @@ int open_ctree(struct super_block *sb,
retry_root_backup:
generation = btrfs_super_generation(disk_super);
+ level = btrfs_super_root_level(disk_super);
tree_root->node = read_tree_block(fs_info,
btrfs_super_root(disk_super),
- generation);
+ generation, level, NULL);
if (IS_ERR(tree_root->node) ||
!extent_buffer_uptodate(tree_root->node)) {
btrfs_warn(fs_info, "failed to read tree root");
@@ -2834,7 +2897,7 @@ retry_root_backup:
goto fail_block_groups;
}
- btrfs_close_extra_devices(fs_devices, 1);
+ btrfs_free_extra_devids(fs_devices, 1);
ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
if (ret) {
@@ -2953,6 +3016,7 @@ retry_root_backup:
fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
+ btrfs_warn(fs_info, "failed to read fs tree: %d", err);
goto fail_qgroup;
}
@@ -3290,6 +3354,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
struct buffer_head *bh;
int i;
int errors = 0;
+ bool primary_failed = false;
u64 bytenr;
if (max_mirrors == 0)
@@ -3306,11 +3371,16 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
BTRFS_SUPER_INFO_SIZE);
if (!bh) {
errors++;
+ if (i == 0)
+ primary_failed = true;
continue;
}
wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
+ if (!buffer_uptodate(bh)) {
errors++;
+ if (i == 0)
+ primary_failed = true;
+ }
/* drop our reference */
brelse(bh);
@@ -3319,6 +3389,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
brelse(bh);
}
+ /* log error, force error return */
+ if (primary_failed) {
+ btrfs_err(device->fs_info, "error writing primary super block to device %llu",
+ device->devid);
+ return -1;
+ }
+
return errors < i ? 0 : -1;
}
@@ -3851,7 +3928,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
* So here we should only check item pointers, not item data.
*/
if (btrfs_header_level(buf) == 0 &&
- btrfs_check_leaf_relaxed(root, buf)) {
+ btrfs_check_leaf_relaxed(fs_info, buf)) {
btrfs_print_leaf(buf);
ASSERT(0);
}
@@ -3890,12 +3967,14 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
__btrfs_btree_balance_dirty(fs_info, 0);
}
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
+ struct btrfs_key *first_key)
{
struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- return btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
+ return btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
+ level, first_key);
}
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
@@ -4314,11 +4393,6 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
dirty_list);
- if (!cache) {
- btrfs_err(fs_info, "orphan block group dirty_bgs list");
- spin_unlock(&cur_trans->dirty_bgs_lock);
- return;
- }
if (!list_empty(&cache->io_list)) {
spin_unlock(&cur_trans->dirty_bgs_lock);
@@ -4338,14 +4412,14 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
}
spin_unlock(&cur_trans->dirty_bgs_lock);
+ /*
+ * Refer to the definition of the io_bgs member for details on why it is
+ * safe to use without any locking
+ */
while (!list_empty(&cur_trans->io_bgs)) {
cache = list_first_entry(&cur_trans->io_bgs,
struct btrfs_block_group_cache,
io_list);
- if (!cache) {
- btrfs_err(fs_info, "orphan block group on io_bgs list");
- return;
- }
list_del_init(&cache->io_list);
spin_lock(&cache->lock);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 301151a50ac1..453ea9f5d4e9 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,8 +52,9 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
-struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info,
- u64 bytenr, u64 parent_transid);
+struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 parent_transid, int level,
+ struct btrfs_key *first_key);
void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
int mirror_num, struct extent_buffer **eb);
@@ -123,7 +124,8 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root)
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
+ struct btrfs_key *first_key);
u32 btrfs_csum_data(const char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, u8 *result);
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
@@ -131,9 +133,8 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset, void *private_data,
- extent_submit_bio_hook_t *submit_bio_start,
- extent_submit_bio_hook_t *submit_bio_done);
-unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+ extent_submit_bio_start_t *submit_bio_start,
+ extent_submit_bio_done_t *submit_bio_done);
int btrfs_write_tree_block(struct extent_buffer *buf);
void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@@ -154,7 +155,7 @@ struct extent_map *btree_get_extent(struct btrfs_inode *inode,
int create);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
-void btrfs_end_io_wq_exit(void);
+void __cold btrfs_end_io_wq_exit(void);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e0460d7b5622..e08d0d45af4f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -27,7 +27,7 @@
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
-#include "hash.h"
+#include <linux/crc32c.h>
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
@@ -535,13 +535,11 @@ static noinline void caching_thread(struct btrfs_work *work)
struct btrfs_block_group_cache *block_group;
struct btrfs_fs_info *fs_info;
struct btrfs_caching_control *caching_ctl;
- struct btrfs_root *extent_root;
int ret;
caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
fs_info = block_group->fs_info;
- extent_root = fs_info->extent_root;
mutex_lock(&caching_ctl->mutex);
down_read(&fs_info->commit_root_sem);
@@ -1203,11 +1201,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -2652,9 +2650,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
unsigned long nr)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
@@ -2994,7 +2992,7 @@ static void delayed_ref_async_start(struct btrfs_work *work)
if (trans->transid > async->transid)
goto end;
- ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
+ ret = btrfs_run_delayed_refs(trans, async->count);
if (ret)
async->error = ret;
end:
@@ -3053,8 +3051,9 @@ int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
* Returns <0 on error and aborts the transaction
*/
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, unsigned long count)
+ unsigned long count)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
@@ -3078,7 +3077,7 @@ again:
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
trans->can_flush_pending_bgs = false;
- ret = __btrfs_run_delayed_refs(trans, fs_info, count);
+ ret = __btrfs_run_delayed_refs(trans, count);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3086,7 +3085,7 @@ again:
if (run_all) {
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
node = rb_first(&delayed_refs->href_root);
@@ -3660,9 +3659,9 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
* the commit latency by getting rid of the easy block groups while
* we're still allowing others to join the commit.
*/
-int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
@@ -3686,7 +3685,7 @@ again:
* make sure all the block groups on our dirty list actually
* exist
*/
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
if (!path) {
path = btrfs_alloc_path();
@@ -3741,8 +3740,9 @@ again:
should_put = 0;
/*
- * the cache_write_mutex is protecting
- * the io_list
+ * The cache_write_mutex protects the
+ * io_list; see the definition of
+ * btrfs_transaction::io_bgs for more details
*/
list_add_tail(&cache->io_list, io);
} else {
@@ -3800,7 +3800,7 @@ again:
* go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
- ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+ ret = btrfs_run_delayed_refs(trans, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3882,7 +3882,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache_save_setup(cache, trans, path);
if (!ret)
- ret = btrfs_run_delayed_refs(trans, fs_info,
+ ret = btrfs_run_delayed_refs(trans,
(unsigned long) -1);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
@@ -3934,6 +3934,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
}
spin_unlock(&cur_trans->dirty_bgs_lock);
+ /*
+ * Refer to the definition of the io_bgs member for details on why it is
+ * safe to use without any locking
+ */
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
io_list);
@@ -4332,8 +4336,7 @@ again:
/* commit the current transaction and try again */
commit_trans:
- if (need_commit &&
- !atomic_read(&fs_info->open_ioctl_trans)) {
+ if (need_commit) {
need_commit--;
if (need_commit > 0) {
@@ -4541,7 +4544,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
* Needed because we can end up allocating a system chunk and for an
* atomic and race free space reservation in the chunk block reserve.
*/
- ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+ lockdep_assert_held(&fs_info->chunk_mutex);
info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
spin_lock(&info->lock);
@@ -4602,11 +4605,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
return -ENOSPC;
space_info = __find_space_info(fs_info, flags);
- if (!space_info) {
- ret = create_space_info(fs_info, flags, &space_info);
- if (ret)
- return ret;
- }
+ ASSERT(space_info);
again:
spin_lock(&space_info->lock);
@@ -4705,7 +4704,7 @@ out:
*/
if (trans->can_flush_pending_bgs &&
trans->chunk_bytes_reserved >= (u64)SZ_2M) {
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
}
return ret;
@@ -4826,7 +4825,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
long time_left;
unsigned long nr_pages;
int loops;
- enum btrfs_reserve_flush_enum flush;
/* Calc the number of the pages we need flush for space reservation */
items = calc_reclaim_items_nr(fs_info, to_reclaim);
@@ -4867,10 +4865,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
atomic_read(&fs_info->async_delalloc_pages) <=
(int)max_reclaim);
skip_async:
- if (!trans)
- flush = BTRFS_RESERVE_FLUSH_ALL;
- else
- flush = BTRFS_RESERVE_NO_FLUSH;
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -4993,7 +4987,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(trans);
break;
}
- ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+ ret = btrfs_run_delayed_items_nr(trans, nr);
btrfs_end_transaction(trans);
break;
case FLUSH_DELALLOC:
@@ -5388,10 +5382,15 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
!block_rsv_use_bytes(global_rsv, orig_bytes))
ret = 0;
}
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
block_rsv->space_info->flags,
orig_bytes, 1);
+
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ dump_space_info(fs_info, block_rsv->space_info,
+ orig_bytes, 0);
+ }
return ret;
}
@@ -5760,6 +5759,9 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
if (num_bytes == 0)
return 0;
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ if (ret)
+ return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
block_rsv_add_bytes(block_rsv, num_bytes, 0);
@@ -5772,11 +5774,15 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
/**
* btrfs_inode_rsv_release - release any excessive reservation.
* @inode - the inode we need to release from.
+ * @qgroup_free - free or convert qgroup meta.
+ * Unlike normal operation, qgroup meta reservation needs to know if we are
+ * freeing qgroup reservation or just converting it into per-trans. Normally
+ * @qgroup_free is true for error handling, and false for normal release.
*
* This is the same as btrfs_block_rsv_release, except that it handles the
* tracepoint for the reservation.
*/
-static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
+static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -5792,6 +5798,10 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode)
if (released > 0)
trace_btrfs_space_reservation(fs_info, "delalloc",
btrfs_ino(inode), released, 0);
+ if (qgroup_free)
+ btrfs_qgroup_free_meta_prealloc(inode->root, released);
+ else
+ btrfs_qgroup_convert_reserved_meta(inode->root, released);
}
void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
@@ -5892,24 +5902,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}
-void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- if (!trans->block_rsv) {
- ASSERT(!trans->bytes_reserved);
- return;
- }
-
- if (!trans->bytes_reserved)
- return;
-
- ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid, trans->bytes_reserved, 0);
- btrfs_block_rsv_release(fs_info, trans->block_rsv,
- trans->bytes_reserved);
- trans->bytes_reserved = 0;
-}
/*
* To be called after all the new block groups attached to the transaction
@@ -5951,7 +5943,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
*/
u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
- trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
+ trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
num_bytes, 1);
return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
}
@@ -5995,7 +5987,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
if (ret)
return ret;
} else {
@@ -6014,7 +6006,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
if (ret && *qgroup_reserved)
- btrfs_qgroup_free_meta(root, *qgroup_reserved);
+ btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
return ret;
}
@@ -6051,7 +6043,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
unsigned nr_extents;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
@@ -6068,13 +6059,13 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
delalloc_lock = false;
- } else if (current->journal_info) {
- flush = BTRFS_RESERVE_FLUSH_LIMIT;
- }
+ } else {
+ if (current->journal_info)
+ flush = BTRFS_RESERVE_FLUSH_LIMIT;
- if (flush != BTRFS_RESERVE_NO_FLUSH &&
- btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
+ if (btrfs_transaction_in_commit(fs_info))
+ schedule_timeout(1);
+ }
if (delalloc_lock)
mutex_lock(&inode->delalloc_mutex);
@@ -6089,19 +6080,9 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- ret = btrfs_qgroup_reserve_meta(root,
- nr_extents * fs_info->nodesize, true);
- if (ret)
- goto out_fail;
- }
-
ret = btrfs_inode_rsv_refill(inode, flush);
- if (unlikely(ret)) {
- btrfs_qgroup_free_meta(root,
- nr_extents * fs_info->nodesize);
+ if (unlikely(ret))
goto out_fail;
- }
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
@@ -6115,7 +6096,7 @@ out_fail:
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
- btrfs_inode_rsv_release(inode);
+ btrfs_inode_rsv_release(inode, true);
if (delalloc_lock)
mutex_unlock(&inode->delalloc_mutex);
return ret;
@@ -6125,12 +6106,14 @@ out_fail:
* btrfs_delalloc_release_metadata - release a metadata reservation for an inode
* @inode: the inode to release the reservation for.
* @num_bytes: the number of bytes we are releasing.
+ * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
*
* This will release the metadata reservation for an inode. This can be called
* once we complete IO for a given set of bytes to release their metadata
* reservations, or on error for the same reason.
*/
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
@@ -6143,13 +6126,14 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_testing(fs_info))
return;
- btrfs_inode_rsv_release(inode);
+ btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
* btrfs_delalloc_release_extents - release our outstanding_extents
* @inode: the inode to balance the reservation for.
* @num_bytes: the number of bytes we originally reserved with
+ * @qgroup_free: do we need to free qgroup meta reservation or convert them.
*
* When we reserve space we increase outstanding_extents for the extents we may
* add. Once we've set the range as delalloc or created our ordered extents we
@@ -6157,7 +6141,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
* temporarily tracked outstanding_extents. This _must_ be used in conjunction
* with btrfs_delalloc_reserve_metadata.
*/
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
+ bool qgroup_free)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
unsigned num_extents;
@@ -6171,7 +6156,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
if (btrfs_is_testing(fs_info))
return;
- btrfs_inode_rsv_release(inode);
+ btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
@@ -6227,9 +6212,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
*/
void btrfs_delalloc_release_space(struct inode *inode,
struct extent_changeset *reserved,
- u64 start, u64 len)
+ u64 start, u64 len, bool qgroup_free)
{
- btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
btrfs_free_reserved_data_space(inode, reserved, start, len);
}
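The new bool decides the fate of the qgroup meta reservation on release: error paths free the preallocated bytes, while the normal path converts them to the per-trans type so they are accounted at commit. A hedged sketch of the usual write-path pairing (the surrounding buffered-write logic and do_the_write() are placeholders; only the reserve/release calls follow this patch):

    struct extent_changeset *data_reserved = NULL;
    int ret;

    ret = btrfs_delalloc_reserve_space(inode, &data_reserved, pos, len);
    if (ret)
            goto out;

    ret = do_the_write();              /* placeholder for the actual work */
    if (ret) {
            /* Error: nothing was queued, free the qgroup reservation too. */
            btrfs_delalloc_release_space(inode, data_reserved, pos, len, true);
            goto out;
    }

    /* Success: drop the temporary outstanding_extents reservation but keep
     * the qgroup bytes as a per-trans reservation (qgroup_free == false). */
    btrfs_delalloc_release_extents(BTRFS_I(inode), len, false);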
@@ -6783,9 +6768,9 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *block_group, *tmp;
struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
@@ -7351,29 +7336,6 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return ret;
}
-int __get_raid_index(u64 flags)
-{
- if (flags & BTRFS_BLOCK_GROUP_RAID10)
- return BTRFS_RAID_RAID10;
- else if (flags & BTRFS_BLOCK_GROUP_RAID1)
- return BTRFS_RAID_RAID1;
- else if (flags & BTRFS_BLOCK_GROUP_DUP)
- return BTRFS_RAID_DUP;
- else if (flags & BTRFS_BLOCK_GROUP_RAID0)
- return BTRFS_RAID_RAID0;
- else if (flags & BTRFS_BLOCK_GROUP_RAID5)
- return BTRFS_RAID_RAID5;
- else if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return BTRFS_RAID_RAID6;
-
- return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
-}
-
-int get_block_group_index(struct btrfs_block_group_cache *cache)
-{
- return __get_raid_index(cache->flags);
-}
-
static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = "raid10",
[BTRFS_RAID_RAID1] = "raid1",
@@ -7488,7 +7450,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = __get_raid_index(flags);
+ int index = btrfs_bg_flags_to_raid_index(flags);
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
@@ -7574,7 +7536,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(
+ block_group->flags);
btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
@@ -7584,7 +7547,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
}
search:
have_caching_bg = false;
- if (index == 0 || index == __get_raid_index(flags))
+ if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
@@ -7842,7 +7805,8 @@ checks:
loop:
failed_cluster_refill = false;
failed_alloc = false;
- BUG_ON(index != get_block_group_index(block_group));
+ BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
+ index);
btrfs_release_block_group(block_group, delalloc);
cond_resched();
}
@@ -7996,6 +7960,51 @@ again:
up_read(&info->groups_sem);
}
+/*
+ * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
+ * hole that is at least as big as @num_bytes.
+ *
+ * @root - The root that will contain this extent
+ *
+ * @ram_bytes - The amount of space in RAM that @num_bytes takes. This
+ * is used for accounting purposes. This value differs
+ * from @num_bytes only in the case of compressed extents.
+ *
+ * @num_bytes - Number of bytes to allocate on-disk.
+ *
+ * @min_alloc_size - Indicates the minimum amount of space that the
+ * allocator should try to satisfy. In some cases
+ * @num_bytes may be larger than what is required and if
+ * the filesystem is fragmented then allocation fails.
+ * However, the presence of @min_alloc_size gives a
+ * chance to try and satisfy the smaller allocation.
+ *
+ * @empty_size - A hint that you plan on doing more COW. This is the
+ * size in bytes the allocator should try to find free
+ * next to the block it returns. This is just a hint and
+ * may be ignored by the allocator.
+ *
+ * @hint_byte - Hint to the allocator to start searching above the byte
+ * address passed. It might be ignored.
+ *
+ * @ins - This key is modified to record the found hole. It will
+ * have the following values:
+ * ins->objectid == start position
+ * ins->type == BTRFS_EXTENT_ITEM_KEY
+ * ins->offset == the size of the hole.
+ *
+ * @is_data - Boolean flag indicating whether an extent is
+ * allocated for data (true) or metadata (false)
+ *
+ * @delalloc - Boolean flag indicating whether this allocation is for
+ * delalloc or not. If 'true' data_rwsem of block groups
+ * is going to be acquired.
+ *
+ *
+ * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
+ * case -ENOSPC is returned then @ins->offset will contain the size of the
+ * largest available hole the allocator managed to find.
+ */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
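A hedged caller sketch for the contract documented above; the concrete sizes and the btrfs_debug() reporting are assumptions, while the meaning of @ins on success and on -ENOSPC comes from the comment:

    struct btrfs_key ins = { 0 };
    u64 start;
    int ret;

    ret = btrfs_reserve_extent(root, ram_bytes, num_bytes, min_alloc_size,
                               0 /* empty_size */, 0 /* hint_byte */,
                               &ins, 1 /* is_data */, 0 /* delalloc */);
    if (ret == -ENOSPC) {
            /* ins.offset reports the largest hole the allocator could find. */
            btrfs_debug(fs_info, "enospc, largest hole %llu", ins.offset);
    } else if (!ret) {
            /* ins.objectid is the start, ins.offset the allocated size. */
            start = ins.objectid;
    }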
@@ -8699,6 +8708,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 parent;
u32 blocksize;
struct btrfs_key key;
+ struct btrfs_key first_key;
struct extent_buffer *next;
int level = wc->level;
int reada = 0;
@@ -8719,6 +8729,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+ path->slots[level]);
blocksize = fs_info->nodesize;
next = find_extent_buffer(fs_info, bytenr);
@@ -8783,7 +8795,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
- next = read_tree_block(fs_info, bytenr, generation);
+ next = read_tree_block(fs_info, bytenr, generation, level - 1,
+ &first_key);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@@ -9648,7 +9661,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
*/
target = get_restripe_target(fs_info, block_group->flags);
if (target) {
- index = __get_raid_index(extended_to_chunk(target));
+ index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
} else {
/*
* this is just a balance, so if we were marked as full
@@ -9662,7 +9675,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
goto out;
}
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
}
if (index == BTRFS_RAID_RAID10) {
@@ -9911,10 +9924,40 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
return 0;
}
+/* link_block_group will queue up kobjects to add when we're reclaim-safe */
+void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+ struct raid_kobject *rkobj;
+ LIST_HEAD(list);
+ int index;
+ int ret = 0;
+
+ spin_lock(&fs_info->pending_raid_kobjs_lock);
+ list_splice_init(&fs_info->pending_raid_kobjs, &list);
+ spin_unlock(&fs_info->pending_raid_kobjs_lock);
+
+ list_for_each_entry(rkobj, &list, list) {
+ space_info = __find_space_info(fs_info, rkobj->flags);
+ index = btrfs_bg_flags_to_raid_index(rkobj->flags);
+
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+ "%s", get_raid_name(index));
+ if (ret) {
+ kobject_put(&rkobj->kobj);
+ break;
+ }
+ }
+ if (ret)
+ btrfs_warn(fs_info,
+ "failed to add kobject for block cache, ignoring");
+}
+
static void link_block_group(struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
- int index = get_block_group_index(cache);
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ int index = btrfs_bg_flags_to_raid_index(cache->flags);
bool first = false;
down_write(&space_info->groups_sem);
@@ -9924,27 +9967,20 @@ static void link_block_group(struct btrfs_block_group_cache *cache)
up_write(&space_info->groups_sem);
if (first) {
- struct raid_kobject *rkobj;
- int ret;
-
- rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
- if (!rkobj)
- goto out_err;
- rkobj->raid_type = index;
- kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
- ret = kobject_add(&rkobj->kobj, &space_info->kobj,
- "%s", get_raid_name(index));
- if (ret) {
- kobject_put(&rkobj->kobj);
- goto out_err;
+ struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj) {
+ btrfs_warn(cache->fs_info,
+ "couldn't alloc memory for raid level kobject");
+ return;
}
+ rkobj->flags = cache->flags;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ spin_lock(&fs_info->pending_raid_kobjs_lock);
+ list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
+ spin_unlock(&fs_info->pending_raid_kobjs_lock);
space_info->block_group_kobjs[index] = &rkobj->kobj;
}
-
- return;
-out_err:
- btrfs_warn(cache->fs_info,
- "failed to add kobject for block cache, ignoring");
}
static struct btrfs_block_group_cache *
@@ -10160,6 +10196,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
inc_block_group_ro(cache, 1);
}
+ btrfs_add_raid_kobjects(info);
init_global_block_rsv(info);
ret = 0;
error:
@@ -10167,9 +10204,9 @@ error:
return ret;
}
-void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group_cache *block_group, *tmp;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
@@ -10254,15 +10291,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* with its ->space_info set.
*/
cache->space_info = __find_space_info(fs_info, cache->flags);
- if (!cache->space_info) {
- ret = create_space_info(fs_info, cache->flags,
- &cache->space_info);
- if (ret) {
- btrfs_remove_free_space_cache(cache);
- btrfs_put_block_group(cache);
- return ret;
- }
- }
+ ASSERT(cache->space_info);
ret = btrfs_add_block_group_cache(fs_info, cache);
if (ret) {
@@ -10334,7 +10363,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->key.offset);
memcpy(&key, &block_group->key, sizeof(key));
- index = get_block_group_index(block_group);
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10))
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dfeb74a0be77..47a8fe9d22e8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -76,8 +76,8 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&buffers)) {
eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n",
- eb->start, eb->len, atomic_read(&eb->refs));
+ pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
+ eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
list_del(&eb->leak_list);
kmem_cache_free(extent_buffer_cache, eb);
}
@@ -119,23 +119,22 @@ struct extent_page_data {
unsigned int sync_io:1;
};
-static void add_extent_changeset(struct extent_state *state, unsigned bits,
+static int add_extent_changeset(struct extent_state *state, unsigned bits,
struct extent_changeset *changeset,
int set)
{
int ret;
if (!changeset)
- return;
+ return 0;
if (set && (state->state & bits) == bits)
- return;
+ return 0;
if (!set && (state->state & bits) == 0)
- return;
+ return 0;
changeset->bytes_changed += state->end - state->start + 1;
ret = ulist_add(&changeset->range_changed, state->start, state->end,
GFP_ATOMIC);
- /* ENOMEM */
- BUG_ON(ret < 0);
+ return ret;
}
static void flush_write_bio(struct extent_page_data *epd);
@@ -187,7 +186,7 @@ free_state_cache:
return -ENOMEM;
}
-void extent_io_exit(void)
+void __cold extent_io_exit(void)
{
btrfs_leak_debug_check();
@@ -527,6 +526,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
{
struct extent_state *next;
unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ int ret;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -534,7 +534,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
- add_extent_changeset(state, bits_to_clear, changeset, 0);
+ ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
+ BUG_ON(ret < 0);
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
@@ -805,13 +806,15 @@ static void set_state_bits(struct extent_io_tree *tree,
unsigned *bits, struct extent_changeset *changeset)
{
unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
+ int ret;
set_state_cb(tree, state, bits);
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
- add_extent_changeset(state, bits_to_set, changeset, 1);
+ ret = add_extent_changeset(state, bits_to_set, changeset, 1);
+ BUG_ON(ret < 0);
state->state |= bits_to_set;
}
@@ -2744,20 +2747,21 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
return blk_status_to_errno(ret);
}
-static int merge_bio(struct extent_io_tree *tree, struct page *page,
- unsigned long offset, size_t size, struct bio *bio,
- unsigned long bio_flags)
-{
- int ret = 0;
- if (tree->ops)
- ret = tree->ops->merge_bio_hook(page, offset, size, bio,
- bio_flags);
- return ret;
-
-}
-
/*
* @opf: bio REQ_OP_* and REQ_* flags as one value
+ * @tree: tree so we can call our merge_bio hook
+ * @wbc: optional writeback control for io accounting
+ * @page: page to add to the bio
+ * @pg_offset: offset of the new bio or to check whether we are adding
+ * a contiguous page to the previous one
+ * @size: portion of page that we want to write
+ * @offset: starting offset in the page
+ * @bdev: attach newly created bios to this bdev
+ * @bio_ret: must be valid pointer, newly allocated bio will be stored there
+ * @end_io_func: end_io callback for new bio
+ * @mirror_num: desired mirror to read/write
+ * @prev_bio_flags: flags of previous bio to see if we can merge the current one
+ * @bio_flags: flags of the current bio to see if we can merge them
*/
static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
struct writeback_control *wbc,
@@ -2773,21 +2777,27 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
{
int ret = 0;
struct bio *bio;
- int contig = 0;
- int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
size_t page_size = min_t(size_t, size, PAGE_SIZE);
sector_t sector = offset >> 9;
- if (bio_ret && *bio_ret) {
+ ASSERT(bio_ret);
+
+ if (*bio_ret) {
+ bool contig;
+ bool can_merge = true;
+
bio = *bio_ret;
- if (old_compressed)
+ if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
- if (prev_bio_flags != bio_flags || !contig ||
+ if (tree->ops && tree->ops->merge_bio_hook(page, offset,
+ page_size, bio, bio_flags))
+ can_merge = false;
+
+ if (prev_bio_flags != bio_flags || !contig || !can_merge ||
force_bio_submit ||
- merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, pg_offset) < page_size) {
ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
if (ret < 0) {
@@ -2813,10 +2823,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
wbc_account_io(wbc, page, page_size);
}
- if (bio_ret)
- *bio_ret = bio;
- else
- ret = submit_one_bio(bio, mirror_num, bio_flags);
+ *bio_ret = bio;
return ret;
}
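A sketch of the caller-side contract implied by the parameter comment above (illustrative only, with most of the argument list elided): @bio_ret must point to a valid struct bio *, the helper keeps the bio it is building there across calls, and the caller submits whatever bio is left at the end.

	struct bio *bio = NULL;		/* accumulated across pages */
	int ret;

	/* for each page in the range ... */
	ret = submit_extent_page(REQ_OP_READ, tree, /* ... */, &bio, /* ... */);

	/* once the whole range has been walked, submit the final partial bio */
	if (bio)
		ret = submit_one_bio(bio, mirror_num, bio_flags);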
@@ -2886,8 +2893,7 @@ static int __do_readpage(struct extent_io_tree *tree,
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
- u64 page_end = start + PAGE_SIZE - 1;
- u64 end;
+ const u64 end = start + PAGE_SIZE - 1;
u64 cur = start;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
@@ -2905,7 +2911,6 @@ static int __do_readpage(struct extent_io_tree *tree,
set_page_extent_mapped(page);
- end = page_end;
if (!PageUptodate(page)) {
if (cleancache_get_page(page) == 0) {
BUG_ON(blocksize != PAGE_SIZE);
@@ -5230,11 +5235,6 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb)
}
}
-int extent_buffer_uptodate(struct extent_buffer *eb)
-{
- return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-}
-
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, int wait, int mirror_num)
{
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a7a850abd600..b77d84909863 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -83,8 +83,8 @@ static inline int le_test_bit(int nr, const u8 *addr)
return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1)));
}
-extern void le_bitmap_set(u8 *map, unsigned int start, int len);
-extern void le_bitmap_clear(u8 *map, unsigned int start, int len);
+void le_bitmap_set(u8 *map, unsigned int start, int len);
+void le_bitmap_clear(u8 *map, unsigned int start, int len);
struct extent_state;
struct btrfs_root;
@@ -95,6 +95,13 @@ struct io_failure_record;
typedef blk_status_t (extent_submit_bio_hook_t)(void *private_data, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 bio_offset);
+
+typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
+ struct bio *bio, u64 bio_offset);
+
+typedef blk_status_t (extent_submit_bio_done_t)(void *private_data,
+ struct bio *bio, int mirror_num);
+
struct extent_io_ops {
/*
* The following callbacks must always be defined, the function
@@ -286,7 +293,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, int mirror_num);
int __init extent_io_init(void);
-void extent_io_exit(void);
+void __cold extent_io_exit(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
@@ -455,6 +462,11 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
atomic_inc(&eb->refs);
}
+static inline int extent_buffer_uptodate(struct extent_buffer *eb)
+{
+ return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+}
+
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len);
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
@@ -489,7 +501,6 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
-int extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(const struct extent_buffer *eb,
unsigned long offset, unsigned long min_len,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index d3bd02105d1c..53a0633c6ef7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2,7 +2,6 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/hardirq.h>
#include "ctree.h"
#include "extent_map.h"
#include "compression.h"
@@ -20,7 +19,7 @@ int __init extent_map_init(void)
return 0;
}
-void extent_map_exit(void)
+void __cold extent_map_exit(void)
{
kmem_cache_destroy(extent_map_cache);
}
@@ -552,6 +551,9 @@ int btrfs_add_extent_mapping(struct extent_map_tree *em_tree,
ret = 0;
existing = search_extent_mapping(em_tree, start, len);
+
+ trace_btrfs_handle_em_exist(existing, em, start, len);
+
/*
* existing will always be non-NULL, since there must be an
* extent causing the -EEXIST.
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index b29f77bc0732..f6f8ba114977 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -86,7 +86,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
-void extent_map_exit(void);
+void __cold extent_map_exit(void);
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 41ab9073d1d4..f247300170e5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1691,7 +1691,7 @@ again:
force_page_uptodate);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
+ reserve_bytes, true);
break;
}
@@ -1703,7 +1703,7 @@ again:
if (extents_locked == -EAGAIN)
goto again;
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
+ reserve_bytes, true);
ret = extents_locked;
break;
}
@@ -1738,7 +1738,7 @@ again:
fs_info->sb->s_blocksize_bits;
if (only_release_metadata) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes);
+ release_bytes, true);
} else {
u64 __pos;
@@ -1747,7 +1747,7 @@ again:
(dirty_pages << PAGE_SHIFT);
btrfs_delalloc_release_space(inode,
data_reserved, __pos,
- release_bytes);
+ release_bytes, true);
}
}
@@ -1760,7 +1760,8 @@ again:
if (extents_locked)
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
lockstart, lockend, &cached_state);
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
+ (ret != 0));
if (ret) {
btrfs_drop_pages(pages, num_pages);
break;
@@ -1800,11 +1801,11 @@ again:
if (only_release_metadata) {
btrfs_end_write_no_snapshotting(root);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes);
+ release_bytes, true);
} else {
btrfs_delalloc_release_space(inode, data_reserved,
round_down(pos, fs_info->sectorsize),
- release_bytes);
+ release_bytes, true);
}
}
@@ -1997,8 +1998,6 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data;
- if (private && private->trans)
- btrfs_ioctl_trans_end(filp);
if (private && private->filldir_buf)
kfree(private->filldir_buf);
kfree(private);
@@ -2190,12 +2189,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
/*
- * ok we haven't committed the transaction yet, lets do a commit
- */
- if (file->private_data)
- btrfs_ioctl_trans_end(file);
-
- /*
* We use start here because we will need to wait on the IO to complete
* in btrfs_sync_log, which could require joining a transaction (for
* example checking cross references in the nocow path). If we use join
@@ -2214,7 +2207,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
trans->sync = true;
- ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2482,7 +2475,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
if ((!ordered ||
(ordered->file_offset + ordered->len <= lockstart ||
ordered->file_offset > lockend)) &&
- !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+ !filemap_range_has_page(inode->i_mapping,
+ lockstart, lockend)) {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
@@ -3378,7 +3372,7 @@ const struct file_operations btrfs_file_operations = {
.dedupe_file_range = btrfs_dedupe_file_range,
};
-void btrfs_auto_defrag_exit(void)
+void __cold btrfs_auto_defrag_exit(void)
{
kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a9f22ac50d6a..d0dde9e6afd7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3547,7 +3547,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
if (ret) {
if (release_metadata)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- inode->i_size);
+ inode->i_size, true);
#ifdef DEBUG
btrfs_err(fs_info,
"failed to write free ino cache for root %llu",
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index fe5e0324dca9..af36a6a971fe 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1071,7 +1071,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 1;
+ path->reada = READA_FORWARD;
path2 = btrfs_alloc_path();
if (!path2) {
@@ -1573,7 +1573,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
*/
path->skip_locking = 1;
path->search_commit_root = 1;
- path->reada = 1;
+ path->reada = READA_FORWARD;
info = search_free_space_info(NULL, fs_info, block_group, path, 0);
if (IS_ERR(info)) {
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
deleted file mode 100644
index baacc1866861..000000000000
--- a/fs/btrfs/hash.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-
-#include <crypto/hash.h>
-#include <linux/err.h>
-#include "hash.h"
-
-static struct crypto_shash *tfm;
-
-int __init btrfs_hash_init(void)
-{
- tfm = crypto_alloc_shash("crc32c", 0, 0);
-
- return PTR_ERR_OR_ZERO(tfm);
-}
-
-const char* btrfs_crc32c_impl(void)
-{
- return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm));
-}
-
-void btrfs_hash_exit(void)
-{
- crypto_free_shash(tfm);
-}
-
-u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
-{
- SHASH_DESC_ON_STACK(shash, tfm);
- u32 *ctx = (u32 *)shash_desc_ctx(shash);
- u32 retval;
- int err;
-
- shash->tfm = tfm;
- shash->flags = 0;
- *ctx = crc;
-
- err = crypto_shash_update(shash, address, length);
- BUG_ON(err);
-
- retval = *ctx;
- barrier_data(ctx);
- return retval;
-}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
deleted file mode 100644
index c3a2ec554361..000000000000
--- a/fs/btrfs/hash.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __HASH__
-#define __HASH__
-
-int __init btrfs_hash_init(void);
-
-void btrfs_hash_exit(void);
-const char* btrfs_crc32c_impl(void);
-
-u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
-
-static inline u64 btrfs_name_hash(const char *name, int len)
-{
- return btrfs_crc32c((u32)~1, name, len);
-}
-
-/*
- * Figure the key offset of an extended inode ref
- */
-static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
- int len)
-{
- return (u64) btrfs_crc32c(parent_objectid, name, len);
-}
-
-#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 65e1a76bf755..1d5631ef2738 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -18,7 +18,6 @@
#include "ctree.h"
#include "disk-io.h"
-#include "hash.h"
#include "transaction.h"
#include "print-tree.h"
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 022b19336fee..9409dcc7020d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -500,12 +500,12 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
goto out_put;
}
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
out_put:
iput(inode);
out_release:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f53470112670..1f091c2358a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -58,7 +58,6 @@
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
-#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"
@@ -102,7 +101,7 @@ static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
};
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
-static int btrfs_truncate(struct inode *inode);
+static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
@@ -277,12 +276,12 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct btrfs_root *root,
- struct inode *inode, u64 start,
+static noinline int cow_file_range_inline(struct inode *inode, u64 start,
u64 end, size_t compressed_size,
int compress_type,
struct page **compressed_pages)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
u64 isize = i_size_read(inode);
@@ -458,7 +457,6 @@ static noinline void compress_file_range(struct inode *inode,
int *num_added)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
u64 blocksize = fs_info->sectorsize;
u64 actual_end;
u64 isize = i_size_read(inode);
@@ -580,11 +578,11 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(root, inode, start, end,
- 0, BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(inode, start, end, 0,
+ BTRFS_COMPRESS_NONE, NULL);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(root, inode, start, end,
+ ret = cow_file_range_inline(inode, start, end,
total_compressed,
compress_type, pages);
}
@@ -961,7 +959,6 @@ static noinline int cow_file_range(struct inode *inode,
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
- u64 disk_num_bytes;
u64 cur_alloc_size = 0;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
@@ -979,14 +976,14 @@ static noinline int cow_file_range(struct inode *inode,
num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
- disk_num_bytes = num_bytes;
+ ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
if (start == 0) {
/* lets try to make an inline extent */
- ret = cow_file_range_inline(root, inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(inode, start, end, 0,
+ BTRFS_COMPRESS_NONE, NULL);
if (ret == 0) {
/*
* We use DO_ACCOUNTING here because we need the
@@ -1010,15 +1007,12 @@ static noinline int cow_file_range(struct inode *inode,
}
}
- BUG_ON(disk_num_bytes >
- btrfs_super_total_bytes(fs_info->super_copy));
-
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(BTRFS_I(inode), start,
start + num_bytes - 1, 0);
- while (disk_num_bytes > 0) {
- cur_alloc_size = disk_num_bytes;
+ while (num_bytes > 0) {
+ cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
fs_info->sectorsize, 0, alloc_hint,
&ins, 1, 1);
@@ -1082,11 +1076,10 @@ static noinline int cow_file_range(struct inode *inode,
delalloc_end, locked_page,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
- if (disk_num_bytes < cur_alloc_size)
- disk_num_bytes = 0;
+ if (num_bytes < cur_alloc_size)
+ num_bytes = 0;
else
- disk_num_bytes -= cur_alloc_size;
- num_bytes -= cur_alloc_size;
+ num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
extent_reserved = false;
@@ -1262,6 +1255,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
list_del(&sums->list);
kfree(sums);
}
+ if (ret < 0)
+ return ret;
return 1;
}
@@ -1394,10 +1389,23 @@ next_slot:
goto out_check;
if (btrfs_extent_readonly(fs_info, disk_bytenr))
goto out_check;
- if (btrfs_cross_ref_exist(root, ino,
- found_key.offset -
- extent_offset, disk_bytenr))
+ ret = btrfs_cross_ref_exist(root, ino,
+ found_key.offset -
+ extent_offset, disk_bytenr);
+ if (ret) {
+ /*
+ * ret could be -EIO if the above fails to read
+ * metadata.
+ */
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ }
+
+ WARN_ON_ONCE(nolock);
goto out_check;
+ }
disk_bytenr += extent_offset;
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
@@ -1415,10 +1423,22 @@ next_slot:
* this ensures that csums for a given extent are
* either valid or do not exist.
*/
- if (csum_exist_in_range(fs_info, disk_bytenr,
- num_bytes)) {
+ ret = csum_exist_in_range(fs_info, disk_bytenr,
+ num_bytes);
+ if (ret) {
if (!nolock)
btrfs_end_write_no_snapshotting(root);
+
+ /*
+ * ret could be -EIO if the above fails to read
+ * metadata.
+ */
+ if (ret < 0) {
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+ goto error;
+ }
+ WARN_ON_ONCE(nolock);
goto out_check;
}
if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
@@ -1847,7 +1867,7 @@ static void btrfs_clear_bit_hook(void *private_data,
*/
if (*bits & EXTENT_CLEAR_META_RESV &&
root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(inode, len);
+ btrfs_delalloc_release_metadata(inode, len, false);
/* For sanity tests. */
if (btrfs_is_testing(fs_info))
@@ -1921,8 +1941,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
* At IO completion time the csums attached to the ordered extent record
* are inserted into the btree
*/
-static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
+static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
u64 bio_offset)
{
struct inode *inode = private_data;
@@ -1941,9 +1960,8 @@ static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio
* At IO completion time the csums attached to the ordered extent record
* are inserted into the btree
*/
-static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
- int mirror_num, unsigned long bio_flags,
- u64 bio_offset)
+static blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
+ int mirror_num)
{
struct inode *inode = private_data;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2015,8 +2033,8 @@ static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
/* we're doing a write, do the async checksumming */
ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
bio_offset, inode,
- __btrfs_submit_bio_start,
- __btrfs_submit_bio_done);
+ btrfs_submit_bio_start,
+ btrfs_submit_bio_done);
goto out;
} else if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, 0, 0);
@@ -2134,7 +2152,7 @@ again:
ClearPageChecked(page);
set_page_dirty(page);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
@@ -2754,12 +2772,10 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
struct sa_defrag_extent_backref *backref;
struct sa_defrag_extent_backref *prev = NULL;
struct inode *inode;
- struct btrfs_root *root;
struct rb_node *node;
int ret;
inode = new->inode;
- root = BTRFS_I(inode)->root;
path = btrfs_alloc_path();
if (!path)
@@ -3247,6 +3263,16 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
start, (size_t)(end - start + 1));
}
+/*
+ * btrfs_add_delayed_iput - perform a delayed iput on @inode
+ *
+ * @inode: The inode we want to perform iput on
+ *
+ * This function uses the generic vfs_inode::i_count to track whether we
+ * should just decrement it (when it is > 1) or, if this is the last iput,
+ * link the inode to the delayed iput machinery. Delayed iputs are processed
+ * at transaction commit time, at superblock commit, or by the cleaner kthread.
+ */
void btrfs_add_delayed_iput(struct inode *inode)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -3256,12 +3282,8 @@ void btrfs_add_delayed_iput(struct inode *inode)
return;
spin_lock(&fs_info->delayed_iput_lock);
- if (binode->delayed_iput_count == 0) {
- ASSERT(list_empty(&binode->delayed_iput));
- list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
- } else {
- binode->delayed_iput_count++;
- }
+ ASSERT(list_empty(&binode->delayed_iput));
+ list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
spin_unlock(&fs_info->delayed_iput_lock);
}
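Illustrative sketch (not from the patch) of the i_count handling described in the comment above; the exact test in the kernel may differ, but the shape is: drop one reference if others remain, otherwise queue the inode for a delayed final iput.

	if (atomic_add_unless(&inode->i_count, -1, 1))
		return;			/* i_count was > 1: just decrement */

	/* last reference: defer the real iput to commit/cleaner time */
	spin_lock(&fs_info->delayed_iput_lock);
	ASSERT(list_empty(&binode->delayed_iput));
	list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
	spin_unlock(&fs_info->delayed_iput_lock);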
@@ -3274,13 +3296,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
- if (inode->delayed_iput_count) {
- inode->delayed_iput_count--;
- list_move_tail(&inode->delayed_iput,
- &fs_info->delayed_iputs);
- } else {
- list_del_init(&inode->delayed_iput);
- }
+ list_del_init(&inode->delayed_iput);
spin_unlock(&fs_info->delayed_iput_lock);
iput(&inode->vfs_inode);
spin_lock(&fs_info->delayed_iput_lock);
@@ -3350,7 +3366,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
struct btrfs_root *root = inode->root;
struct btrfs_block_rsv *block_rsv = NULL;
int reserve = 0;
- int insert = 0;
+ bool insert = false;
int ret;
if (!root->orphan_block_rsv) {
@@ -3360,7 +3376,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
+ if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+ &inode->runtime_flags))
+ insert = true;
+
+ if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+ &inode->runtime_flags))
+ reserve = 1;
+
spin_lock(&root->orphan_lock);
+ /* If someone has created ->orphan_block_rsv, be happy to use it. */
if (!root->orphan_block_rsv) {
root->orphan_block_rsv = block_rsv;
} else if (block_rsv) {
@@ -3368,26 +3393,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
block_rsv = NULL;
}
- if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
- &inode->runtime_flags)) {
-#if 0
- /*
- * For proper ENOSPC handling, we should do orphan
- * cleanup when mounting. But this introduces backward
- * compatibility issue.
- */
- if (!xchg(&root->orphan_item_inserted, 1))
- insert = 2;
- else
- insert = 1;
-#endif
- insert = 1;
+ if (insert)
atomic_inc(&root->orphan_inodes);
- }
-
- if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
- &inode->runtime_flags))
- reserve = 1;
spin_unlock(&root->orphan_lock);
/* grab metadata reservation from transaction handle */
@@ -3411,7 +3418,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
}
/* insert an orphan item to track this unlinked/truncated file */
- if (insert >= 1) {
+ if (insert) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
if (ret) {
if (reserve) {
@@ -3435,15 +3442,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans,
ret = 0;
}
- /* insert an orphan item to track subvolume contains orphan files */
- if (insert >= 2) {
- ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
- root->root_key.objectid);
- if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, ret);
- return ret;
- }
- }
return 0;
}
@@ -3644,7 +3642,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
goto out;
}
- ret = btrfs_truncate(inode);
+ ret = btrfs_truncate(inode, false);
if (ret)
btrfs_orphan_del(NULL, BTRFS_I(inode));
} else {
@@ -4711,7 +4709,6 @@ delete:
if (updates) {
trans->delayed_ref_updates = 0;
ret = btrfs_run_delayed_refs(trans,
- fs_info,
updates * 2);
if (ret && !err)
err = ret;
@@ -4751,8 +4748,7 @@ error:
unsigned long updates = trans->delayed_ref_updates;
if (updates) {
trans->delayed_ref_updates = 0;
- ret = btrfs_run_delayed_refs(trans, fs_info,
- updates * 2);
+ ret = btrfs_run_delayed_refs(trans, updates * 2);
if (ret && !err)
err = ret;
}
@@ -4806,8 +4802,8 @@ again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
btrfs_delalloc_release_space(inode, data_reserved,
- block_start, blocksize);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ block_start, blocksize, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
ret = -ENOMEM;
goto out;
}
@@ -4874,8 +4870,8 @@ again:
out_unlock:
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
+ blocksize, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
unlock_page(page);
put_page(page);
out:
@@ -5130,7 +5126,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
inode_dio_wait(inode);
btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
- ret = btrfs_truncate(inode);
+ ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
int err;
@@ -5466,7 +5462,8 @@ no_delete:
/*
* this returns the key found in the dir entry in the location pointer.
- * If no dir entries were found, location->objectid is 0.
+ * If no dir entries were found, returns -ENOENT.
+ * If found a corrupted location in dir entry, returns -EUCLEAN.
*/
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
struct btrfs_key *location)
@@ -5484,27 +5481,27 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
name, namelen, 0);
- if (IS_ERR(di))
+ if (!di) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (IS_ERR(di)) {
ret = PTR_ERR(di);
-
- if (IS_ERR_OR_NULL(di))
- goto out_err;
+ goto out;
+ }
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
if (location->type != BTRFS_INODE_ITEM_KEY &&
location->type != BTRFS_ROOT_ITEM_KEY) {
+ ret = -EUCLEAN;
btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
__func__, name, btrfs_ino(BTRFS_I(dir)),
location->objectid, location->type, location->offset);
- goto out_err;
}
out:
btrfs_free_path(path);
return ret;
-out_err:
- location->objectid = 0;
- goto out;
}
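A hypothetical caller, only to illustrate the new return contract spelled out in the comment above (previously callers had to test location->objectid == 0):

	struct btrfs_key location;
	int ret;

	ret = btrfs_inode_by_name(dir, dentry, &location);
	if (ret == -ENOENT)
		return ERR_PTR(-ENOENT);	/* no such directory entry */
	if (ret < 0)
		return ERR_PTR(ret);		/* e.g. -EUCLEAN on a corrupted item */
	/* location now holds an INODE_ITEM or ROOT_ITEM key */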
/*
@@ -5807,9 +5804,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret < 0)
return ERR_PTR(ret);
- if (location.objectid == 0)
- return ERR_PTR(-ENOENT);
-
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(dir->i_sb, &location, root, NULL);
return inode;
@@ -7443,76 +7437,6 @@ out:
return ret;
}
-bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
-{
- struct radix_tree_root *root = &inode->i_mapping->page_tree;
- bool found = false;
- void **pagep = NULL;
- struct page *page = NULL;
- unsigned long start_idx;
- unsigned long end_idx;
-
- start_idx = start >> PAGE_SHIFT;
-
- /*
- * end is the last byte in the last page. end == start is legal
- */
- end_idx = end >> PAGE_SHIFT;
-
- rcu_read_lock();
-
- /* Most of the code in this while loop is lifted from
- * find_get_page. It's been modified to begin searching from a
- * page and return just the first page found in that range. If the
- * found idx is less than or equal to the end idx then we know that
- * a page exists. If no pages are found or if those pages are
- * outside of the range then we're fine (yay!) */
- while (page == NULL &&
- radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
- page = radix_tree_deref_slot(pagep);
- if (unlikely(!page))
- break;
-
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- page = NULL;
- continue;
- }
- /*
- * Otherwise, shmem/tmpfs must be storing a swap entry
- * here as an exceptional entry: so return it without
- * attempting to raise page count.
- */
- page = NULL;
- break; /* TODO: Is this relevant for this use case? */
- }
-
- if (!page_cache_get_speculative(page)) {
- page = NULL;
- continue;
- }
-
- /*
- * Has the page moved?
- * This is part of the lockless pagecache protocol. See
- * include/linux/pagemap.h for details.
- */
- if (unlikely(page != *pagep)) {
- put_page(page);
- page = NULL;
- }
- }
-
- if (page) {
- if (page->index <= end_idx)
- found = true;
- put_page(page);
- }
-
- rcu_read_unlock();
- return found;
-}
-
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct extent_state **cached_state, int writing)
{
@@ -7538,8 +7462,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* get stale data.
*/
if (!ordered &&
- (!writing ||
- !btrfs_page_exists_in_range(inode, lockstart, lockend)))
+ (!writing || !filemap_range_has_page(inode->i_mapping,
+ lockstart, lockend)))
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -8270,9 +8194,8 @@ static void btrfs_endio_direct_write(struct bio *bio)
bio_put(bio);
}
-static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data,
- struct bio *bio, int mirror_num,
- unsigned long bio_flags, u64 offset)
+static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
+ struct bio *bio, u64 offset)
{
struct inode *inode = private_data;
blk_status_t ret;
@@ -8298,13 +8221,13 @@ static void btrfs_end_dio_bio(struct bio *bio)
err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
if (err) {
- dip->errors = 1;
-
/*
- * before atomic variable goto zero, we must make sure
- * dip->errors is perceived to be set.
+ * We want to perceive the errors flag being set before
+ * decrementing the reference count. We don't need a barrier
+ * since atomic operations with a return value are fully
+ * ordered as per atomic_t.txt
*/
- smp_mb__before_atomic();
+ dip->errors = 1;
}
/* if there are more bios still pending for this dio, just exit */
@@ -8352,9 +8275,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
return 0;
}
-static inline blk_status_t
-__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
- int async_submit)
+static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
+ struct inode *inode, u64 file_offset, int async_submit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_dio_private *dip = bio->bi_private;
@@ -8377,8 +8299,8 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
if (write && async_submit) {
ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
file_offset, inode,
- __btrfs_submit_bio_start_direct_io,
- __btrfs_submit_bio_done);
+ btrfs_submit_bio_start_direct_io,
+ btrfs_submit_bio_done);
goto err;
} else if (write) {
/*
@@ -8464,7 +8386,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
*/
atomic_inc(&dip->pending_bios);
- status = __btrfs_submit_dio_bio(bio, inode, file_offset,
+ status = btrfs_submit_dio_bio(bio, inode, file_offset,
async_submit);
if (status) {
bio_put(bio);
@@ -8484,7 +8406,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
} while (submit_len > 0);
submit:
- status = __btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
+ status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
if (!status)
return 0;
@@ -8492,10 +8414,11 @@ submit:
out_err:
dip->errors = 1;
/*
- * before atomic variable goto zero, we must
- * make sure dip->errors is perceived to be set.
+ * Before the atomic variable goes to zero, we must make sure dip->errors
+ * is perceived to be set. This ordering is ensured by the fact that
+ * atomic operations with a return value are fully ordered as per
+ * atomic_t.txt
*/
- smp_mb__before_atomic();
if (atomic_dec_and_test(&dip->pending_bios))
bio_io_error(dip->orig_bio);
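Putting the two ordering comments above together (illustrative only, not new code): the plain store to dip->errors is ordered against the reference-count drop because value-returning atomics are fully ordered, so the explicit barrier can go.

	dip->errors = 1;				/* plain store on the failing path */
	if (atomic_dec_and_test(&dip->pending_bios))	/* fully ordered RMW */
		bio_io_error(dip->orig_bio);
	/*
	 * Whoever sees pending_bios reach zero through the same fully ordered
	 * RMW is guaranteed to also see dip->errors == 1, hence the removed
	 * smp_mb__before_atomic() (see Documentation/atomic_t.txt).
	 */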
@@ -8713,7 +8636,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (ret < 0 && ret != -EIOCBQUEUED) {
if (dio_data.reserve)
btrfs_delalloc_release_space(inode, data_reserved,
- offset, dio_data.reserve);
+ offset, dio_data.reserve, true);
/*
* On error we might have left some ordered extents
* without submitting corresponding bios for them, so
@@ -8729,8 +8652,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
false);
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, data_reserved,
- offset, count - (size_t)ret);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
+ offset, count - (size_t)ret, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
}
out:
if (wakeup)
@@ -9045,7 +8968,8 @@ again:
if (reserved_space < PAGE_SIZE) {
end = page_start + reserved_space - 1;
btrfs_delalloc_release_space(inode, data_reserved,
- page_start, PAGE_SIZE - reserved_space);
+ page_start, PAGE_SIZE - reserved_space,
+ true);
}
}
@@ -9095,23 +9019,23 @@ again:
out_unlock:
if (!ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
}
unlock_page(page);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
btrfs_delalloc_release_space(inode, data_reserved, page_start,
- reserved_space);
+ reserved_space, (ret != 0));
out_noreserve:
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return ret;
}
-static int btrfs_truncate(struct inode *inode)
+static int btrfs_truncate(struct inode *inode, bool skip_writeback)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -9122,10 +9046,12 @@ static int btrfs_truncate(struct inode *inode)
u64 mask = fs_info->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
- ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
- (u64)-1);
- if (ret)
- return ret;
+ if (!skip_writeback) {
+ ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+ (u64)-1);
+ if (ret)
+ return ret;
+ }
/*
* Yes ladies and gentlemen, this is indeed ugly. The fact is we have
@@ -9335,7 +9261,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_log_commit = 0;
- ei->delayed_iput_count = 0;
spin_lock_init(&ei->lock);
ei->outstanding_extents = 0;
@@ -9455,7 +9380,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-void btrfs_destroy_cachep(void)
+void __cold btrfs_destroy_cachep(void)
{
/*
* Make sure all delayed rcu free inodes are flushed before we
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3278ae592a2c..b2db3988813f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -106,7 +106,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
int no_time_update);
/* Mask out flags that are inappropriate for the given type of inode. */
-static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
+static unsigned int btrfs_mask_flags(umode_t mode, unsigned int flags)
{
if (S_ISDIR(mode))
return flags;
@@ -1197,7 +1197,7 @@ again:
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
- (page_cnt - i_done) << PAGE_SHIFT);
+ (page_cnt - i_done) << PAGE_SHIFT, true);
}
@@ -1215,7 +1215,8 @@ again:
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
+ false);
extent_changeset_free(data_reserved);
return i_done;
out:
@@ -1225,8 +1226,9 @@ out:
}
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
- page_cnt << PAGE_SHIFT);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
+ page_cnt << PAGE_SHIFT, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
+ true);
extent_changeset_free(data_reserved);
return ret;
@@ -2600,7 +2602,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
range->len = (u64)-1;
}
ret = btrfs_defrag_file(file_inode(file), file,
- range, 0, 0);
+ range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
kfree(range);
@@ -3936,73 +3938,6 @@ int btrfs_clone_file_range(struct file *src_file, loff_t off,
return btrfs_clone_files(dst_file, src_file, off, len, destoff);
}
-/*
- * there are many ways the trans_start and trans_end ioctls can lead
- * to deadlocks. They should only be used by applications that
- * basically own the machine, and have a very in depth understanding
- * of all the possible deadlocks and enospc problems.
- */
-static long btrfs_ioctl_trans_start(struct file *file)
-{
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
- struct btrfs_file_private *private;
- int ret;
- static bool warned = false;
-
- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
- if (!warned) {
- btrfs_warn(fs_info,
- "Userspace transaction mechanism is considered "
- "deprecated and slated to be removed in 4.17. "
- "If you have a valid use case please "
- "speak up on the mailing list");
- WARN_ON(1);
- warned = true;
- }
-
- ret = -EINPROGRESS;
- private = file->private_data;
- if (private && private->trans)
- goto out;
- if (!private) {
- private = kzalloc(sizeof(struct btrfs_file_private),
- GFP_KERNEL);
- if (!private)
- return -ENOMEM;
- file->private_data = private;
- }
-
- ret = -EROFS;
- if (btrfs_root_readonly(root))
- goto out;
-
- ret = mnt_want_write_file(file);
- if (ret)
- goto out;
-
- atomic_inc(&fs_info->open_ioctl_trans);
-
- ret = -ENOMEM;
- trans = btrfs_start_ioctl_transaction(root);
- if (IS_ERR(trans))
- goto out_drop;
-
- private->trans = trans;
- return 0;
-
-out_drop:
- atomic_dec(&fs_info->open_ioctl_trans);
- mnt_drop_write_file(file);
-out:
- return ret;
-}
-
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
@@ -4244,30 +4179,6 @@ out:
return ret;
}
-/*
- * there are many ways the trans_start and trans_end ioctls can lead
- * to deadlocks. They should only be used by applications that
- * basically own the machine, and have a very in depth understanding
- * of all the possible deadlocks and enospc problems.
- */
-long btrfs_ioctl_trans_end(struct file *file)
-{
- struct inode *inode = file_inode(file);
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_file_private *private = file->private_data;
-
- if (!private || !private->trans)
- return -EINVAL;
-
- btrfs_end_transaction(private->trans);
- private->trans = NULL;
-
- atomic_dec(&root->fs_info->open_ioctl_trans);
-
- mnt_drop_write_file(file);
- return 0;
-}
-
static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
void __user *argp)
{
@@ -4429,7 +4340,8 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = 0;
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
- ret = btrfs_dev_replace_cancel(fs_info, p);
+ p->result = btrfs_dev_replace_cancel(fs_info);
+ ret = 0;
break;
default:
ret = -EINVAL;
@@ -5138,10 +5050,17 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
BTRFS_UUID_SIZE);
if (received_uuid_changed &&
- !btrfs_is_empty_uuid(root_item->received_uuid))
- btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid,
- BTRFS_UUID_KEY_RECEIVED_SUBVOL,
- root->root_key.objectid);
+ !btrfs_is_empty_uuid(root_item->received_uuid)) {
+ ret = btrfs_uuid_tree_rem(trans, fs_info,
+ root_item->received_uuid,
+ BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+ root->root_key.objectid);
+ if (ret && ret != -ENOENT) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
btrfs_set_root_stransid(root_item, sa->stransid);
btrfs_set_root_rtransid(root_item, sa->rtransid);
@@ -5574,10 +5493,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_dev_info(fs_info, argp);
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
- case BTRFS_IOC_TRANS_START:
- return btrfs_ioctl_trans_start(file);
- case BTRFS_IOC_TRANS_END:
- return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(file, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d13128c70ddd..621083f8932c 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -290,7 +290,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
/*
* Make sure counter is updated before we wake up waiters.
*/
- smp_mb();
+ smp_mb__after_atomic();
if (waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
} else {
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 6c7f18cd3b61..1c7f7f70caf4 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -382,14 +382,12 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
- size_t tot_len;
int ret = 0;
char *kaddr;
unsigned long bytes;
BUG_ON(srclen < LZO_LEN);
- tot_len = read_compress_length(data_in);
data_in += LZO_LEN;
in_len = read_compress_length(data_in);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5b311aeddcc8..661cc3db0c7c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -610,7 +610,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
if (root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(btrfs_inode, entry->len);
+ btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false);
tree = &btrfs_inode->ordered_tree;
spin_lock_irq(&tree->lock);
@@ -1154,7 +1154,7 @@ int __init ordered_data_init(void)
return 0;
}
-void ordered_data_exit(void)
+void __cold ordered_data_exit(void)
{
kmem_cache_destroy(btrfs_ordered_extent_cache);
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 56c4c0ee6381..4a1672a13ba6 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -151,7 +151,9 @@ static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info,
unsigned long bytes)
{
int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
- return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
+ int csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+ return sizeof(struct btrfs_ordered_sum) + num_sectors * csum_size;
}
static inline void
@@ -215,5 +217,5 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
-void ordered_data_exit(void);
+void __cold ordered_data_exit(void);
#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 569205e651c7..4a8770485f77 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -365,9 +365,13 @@ void btrfs_print_tree(struct extent_buffer *c)
btrfs_node_blockptr(c, i));
}
for (i = 0; i < nr; i++) {
- struct extent_buffer *next = read_tree_block(fs_info,
- btrfs_node_blockptr(c, i),
- btrfs_node_ptr_generation(c, i));
+ struct btrfs_key first_key;
+ struct extent_buffer *next;
+
+ btrfs_node_key_to_cpu(c, &first_key, i);
+ next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
+ btrfs_node_ptr_generation(c, i),
+ level - 1, &first_key);
if (IS_ERR(next)) {
continue;
} else if (!extent_buffer_uptodate(next)) {
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index b30a056963ab..5859f7d3cf3e 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -19,8 +19,8 @@
#include <linux/hashtable.h>
#include "props.h"
#include "btrfs_inode.h"
-#include "hash.h"
#include "transaction.h"
+#include "ctree.h"
#include "xattr.h"
#include "compression.h"
@@ -116,7 +116,7 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
return -EINVAL;
if (value_len == 0) {
- ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ ret = btrfs_setxattr(trans, inode, handler->xattr_name,
NULL, 0, flags);
if (ret)
return ret;
@@ -130,13 +130,13 @@ static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
ret = handler->validate(value, value_len);
if (ret)
return ret;
- ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ ret = btrfs_setxattr(trans, inode, handler->xattr_name,
value, value_len, flags);
if (ret)
return ret;
ret = handler->apply(inode, value, value_len);
if (ret) {
- __btrfs_setxattr(trans, inode, handler->xattr_name,
+ btrfs_setxattr(trans, inode, handler->xattr_name,
NULL, 0, flags);
return ret;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index aa259d6986e1..f583f13ff26e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -47,6 +47,82 @@
* - check all ioctl parameters
*/
+/*
+ * Helpers to access qgroup reservation
+ *
+ * Callers should ensure the lock context and type are valid
+ */
+
+static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
+{
+ u64 ret = 0;
+ int i;
+
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+ ret += qgroup->rsv.values[i];
+
+ return ret;
+}
+
+#ifdef CONFIG_BTRFS_DEBUG
+static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
+{
+ if (type == BTRFS_QGROUP_RSV_DATA)
+ return "data";
+ if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
+ return "meta_pertrans";
+ if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
+ return "meta_prealloc";
+ return NULL;
+}
+#endif
+
+static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
+ qgroup->rsv.values[type] += num_bytes;
+}
+
+static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *qgroup, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
+ if (qgroup->rsv.values[type] >= num_bytes) {
+ qgroup->rsv.values[type] -= num_bytes;
+ return;
+ }
+#ifdef CONFIG_BTRFS_DEBUG
+ WARN_RATELIMIT(1,
+ "qgroup %llu %s reserved space underflow, have %llu to free %llu",
+ qgroup->qgroupid, qgroup_rsv_type_str(type),
+ qgroup->rsv.values[type], num_bytes);
+#endif
+ qgroup->rsv.values[type] = 0;
+}
+
+static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *dest,
+ struct btrfs_qgroup *src)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+ qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
+}
+
+static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup *dest,
+ struct btrfs_qgroup *src)
+{
+ int i;
+
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
+ qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
+}
+
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
int mod)
{
@@ -826,10 +902,8 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans,
int slot;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (fs_info->quota_root) {
- set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags);
+ if (fs_info->quota_root)
goto out;
- }
fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
if (!fs_info->qgroup_ulist) {
@@ -923,8 +997,15 @@ out_add_root:
}
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
- set_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags);
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
spin_unlock(&fs_info->qgroup_lock);
+ ret = qgroup_rescan_init(fs_info, 0, 1);
+ if (!ret) {
+ qgroup_rescan_zero_tracking(fs_info);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ }
+
out_free_path:
btrfs_free_path(path);
out_free_root:
@@ -991,33 +1072,29 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
-static void report_reserved_underflow(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup *qgroup,
- u64 num_bytes)
-{
-#ifdef CONFIG_BTRFS_DEBUG
- WARN_ON(qgroup->reserved < num_bytes);
- btrfs_debug(fs_info,
- "qgroup %llu reserved space underflow, have: %llu, to free: %llu",
- qgroup->qgroupid, qgroup->reserved, num_bytes);
-#endif
- qgroup->reserved = 0;
-}
/*
- * The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their reference and
- * exclusive counts adjusted.
+ * The easy accounting: we're updating a qgroup relationship whose child
+ * qgroup only has exclusive extents.
+ *
+ * In this case, all exclusive extents will also be exclusive for the
+ * parent, so excl/rfer just get added/removed.
+ *
+ * The same applies to the qgroup reservation space, which should also be
+ * added to/removed from the parent.
+ * Otherwise, when the child tries to release reservation space, the parent
+ * would underflow its reservation (for the relationship-adding case).
*
* Caller should hold fs_info->qgroup_lock.
*/
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
struct ulist *tmp, u64 ref_root,
- u64 num_bytes, int sign)
+ struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup_list *glist;
struct ulist_node *unode;
struct ulist_iterator uiter;
+ u64 num_bytes = src->excl;
int ret = 0;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -1030,13 +1107,11 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
qgroup->excl_cmpr += sign * num_bytes;
- if (sign > 0) {
- trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes);
- if (qgroup->reserved < num_bytes)
- report_reserved_underflow(fs_info, qgroup, num_bytes);
- else
- qgroup->reserved -= num_bytes;
- }
+
+ if (sign > 0)
+ qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
+ else
+ qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
qgroup_dirty(fs_info, qgroup);
@@ -1056,15 +1131,10 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup->rfer_cmpr += sign * num_bytes;
WARN_ON(sign < 0 && qgroup->excl < num_bytes);
qgroup->excl += sign * num_bytes;
- if (sign > 0) {
- trace_qgroup_update_reserve(fs_info, qgroup,
- -(s64)num_bytes);
- if (qgroup->reserved < num_bytes)
- report_reserved_underflow(fs_info, qgroup,
- num_bytes);
- else
- qgroup->reserved -= num_bytes;
- }
+ if (sign > 0)
+ qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
+ else
+ qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
qgroup->excl_cmpr += sign * num_bytes;
qgroup_dirty(fs_info, qgroup);
@@ -1107,7 +1177,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info,
if (qgroup->excl == qgroup->rfer) {
ret = 0;
err = __qgroup_excl_accounting(fs_info, tmp, dst,
- qgroup->excl, sign);
+ qgroup, sign);
if (err < 0) {
ret = err;
goto out;
@@ -1414,7 +1484,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
- assert_spin_locked(&delayed_refs->lock);
+ lockdep_assert_held(&delayed_refs->lock);
trace_btrfs_qgroup_trace_extent(fs_info, record);
while (*p) {
@@ -1614,7 +1684,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
return 0;
if (!extent_buffer_uptodate(root_eb)) {
- ret = btrfs_read_buffer(root_eb, root_gen);
+ ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
if (ret)
goto out;
}
@@ -1645,6 +1715,7 @@ walk_down:
level = root_level;
while (level >= 0) {
if (path->nodes[level] == NULL) {
+ struct btrfs_key first_key;
int parent_slot;
u64 child_gen;
u64 child_bytenr;
@@ -1657,8 +1728,10 @@ walk_down:
parent_slot = path->slots[level + 1];
child_bytenr = btrfs_node_blockptr(eb, parent_slot);
child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
- eb = read_tree_block(fs_info, child_bytenr, child_gen);
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ level, &first_key);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
@@ -2009,9 +2082,9 @@ out_free:
return ret;
}
-int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
@@ -2080,17 +2153,9 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
{
struct btrfs_root *quota_root = fs_info->quota_root;
int ret = 0;
- int start_rescan_worker = 0;
if (!quota_root)
- goto out;
-
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
- test_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags))
- start_rescan_worker = 1;
-
- if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags))
- set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ return ret;
spin_lock(&fs_info->qgroup_lock);
while (!list_empty(&fs_info->dirty_qgroups)) {
@@ -2119,18 +2184,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
if (ret)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- if (!ret && start_rescan_worker) {
- ret = qgroup_rescan_init(fs_info, 0, 1);
- if (!ret) {
- qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_work(fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
- }
- ret = 0;
- }
-
-out:
-
return ret;
}
@@ -2338,24 +2391,24 @@ out:
static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
- qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer)
+ qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
- qg->reserved + (s64)qg->excl + num_bytes > qg->max_excl)
+ qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
return true;
}
-static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
+ enum btrfs_qgroup_rsv_type type)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
int ret = 0;
- int retried = 0;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -2369,7 +2422,6 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce)
capable(CAP_SYS_RESOURCE))
enforce = false;
-retry:
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
if (!quota_root)
@@ -2385,7 +2437,7 @@ retry:
*/
ulist_reinit(fs_info->qgroup_ulist);
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- (uintptr_t)qgroup, GFP_ATOMIC);
+ qgroup_to_aux(qgroup), GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
@@ -2396,27 +2448,6 @@ retry:
qg = unode_aux_to_qgroup(unode);
if (enforce && !qgroup_check_limits(qg, num_bytes)) {
- /*
- * Commit the tree and retry, since we may have
- * deletions which would free up space.
- */
- if (!retried && qg->reserved > 0) {
- struct btrfs_trans_handle *trans;
-
- spin_unlock(&fs_info->qgroup_lock);
- ret = btrfs_start_delalloc_inodes(root, 0);
- if (ret)
- return ret;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- retried++;
- goto retry;
- }
ret = -EDQUOT;
goto out;
}
@@ -2424,7 +2455,7 @@ retry:
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(fs_info->qgroup_ulist,
glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
@@ -2439,8 +2470,8 @@ retry:
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, num_bytes);
- qg->reserved += num_bytes;
+ trace_qgroup_update_reserve(fs_info, qg, num_bytes, type);
+ qgroup_rsv_add(fs_info, qg, num_bytes, type);
}
out:
@@ -2448,8 +2479,18 @@ out:
return ret;
}
+/*
+ * Free @num_bytes of reserved space of @type for a qgroup (normally the
+ * level-0 qgroup of @ref_root).
+ *
+ * All higher-level qgroups are handled as well.
+ *
+ * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
+ * This special case is only used for the META_PERTRANS type.
+ */
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
- u64 ref_root, u64 num_bytes)
+ u64 ref_root, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
@@ -2463,6 +2504,10 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
if (num_bytes == 0)
return;
+ if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
+ WARN(1, "%s: Invalid type to free", __func__);
+ return;
+ }
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
@@ -2473,9 +2518,16 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
if (!qgroup)
goto out;
+ if (num_bytes == (u64)-1)
+ /*
+ * We're freeing all pertrans rsv, so get the reserved value from
+ * the level-0 qgroup as the real num_bytes to free.
+ */
+ num_bytes = qgroup->rsv.values[type];
+
ulist_reinit(fs_info->qgroup_ulist);
ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
- (uintptr_t)qgroup, GFP_ATOMIC);
+ qgroup_to_aux(qgroup), GFP_ATOMIC);
if (ret < 0)
goto out;
ULIST_ITER_INIT(&uiter);
@@ -2485,16 +2537,13 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
qg = unode_aux_to_qgroup(unode);
- trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes);
- if (qg->reserved < num_bytes)
- report_reserved_underflow(fs_info, qg, num_bytes);
- else
- qg->reserved -= num_bytes;
+ trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes, type);
+ qgroup_rsv_release(fs_info, qg, num_bytes, type);
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(fs_info->qgroup_ulist,
glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
if (ret < 0)
goto out;
}
@@ -2877,7 +2926,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
to_reserve, QGROUP_RESERVE);
if (ret < 0)
goto cleanup;
- ret = qgroup_reserve(root, to_reserve, true);
+ ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
if (ret < 0)
goto cleanup;
@@ -2940,7 +2989,8 @@ static int qgroup_free_reserved_data(struct inode *inode,
goto out;
freed += changeset.bytes_changed;
}
- btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed);
+ btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
+ BTRFS_QGROUP_RSV_DATA);
ret = freed;
out:
extent_changeset_release(&changeset);
@@ -2972,7 +3022,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
if (free)
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
BTRFS_I(inode)->root->objectid,
- changeset.bytes_changed);
+ changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
extent_changeset_release(&changeset);
@@ -3017,8 +3067,48 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
}
-int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- bool enforce)
+static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
+ type != BTRFS_QGROUP_RSV_META_PERTRANS)
+ return;
+ if (num_bytes == 0)
+ return;
+
+ spin_lock(&root->qgroup_meta_rsv_lock);
+ if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
+ root->qgroup_meta_rsv_prealloc += num_bytes;
+ else
+ root->qgroup_meta_rsv_pertrans += num_bytes;
+ spin_unlock(&root->qgroup_meta_rsv_lock);
+}
+
+static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type)
+{
+ if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
+ type != BTRFS_QGROUP_RSV_META_PERTRANS)
+ return 0;
+ if (num_bytes == 0)
+ return 0;
+
+ spin_lock(&root->qgroup_meta_rsv_lock);
+ if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
+ num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
+ num_bytes);
+ root->qgroup_meta_rsv_prealloc -= num_bytes;
+ } else {
+ num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
+ num_bytes);
+ root->qgroup_meta_rsv_pertrans -= num_bytes;
+ }
+ spin_unlock(&root->qgroup_meta_rsv_lock);
+ return num_bytes;
+}
+
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3028,31 +3118,39 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, (s64)num_bytes);
- ret = qgroup_reserve(root, num_bytes, enforce);
+ trace_qgroup_meta_reserve(root, type, (s64)num_bytes);
+ ret = qgroup_reserve(root, num_bytes, enforce, type);
if (ret < 0)
return ret;
- atomic64_add(num_bytes, &root->qgroup_meta_rsv);
+ /*
+ * Record what we have reserved into root.
+ *
+ * This avoids underflow across a quota disabled -> enabled transition:
+ * in that case we may try to free space we haven't reserved (since
+ * quota was disabled at reservation time), so recording it in the root
+ * ensures a later release won't underflow this number.
+ */
+ add_root_meta_rsv(root, num_bytes, type);
return ret;
}
-void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 reserved;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
!is_fstree(root->objectid))
return;
- reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0);
- if (reserved == 0)
- return;
- trace_qgroup_meta_reserve(root, -(s64)reserved);
- btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved);
+ /* TODO: Update trace point to handle such free */
+ trace_qgroup_meta_free_all_pertrans(root);
+ /* Special value -1 means to free all reserved space */
+ btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
}
-void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3060,11 +3158,75 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
!is_fstree(root->objectid))
return;
+ /*
+ * Reservations for META_PREALLOC can happen before quota is enabled,
+ * which can lead to underflow.
+ * Here, ensure we only free what we have really reserved.
+ */
+ num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes);
- atomic64_sub(num_bytes, &root->qgroup_meta_rsv);
- trace_qgroup_meta_reserve(root, -(s64)num_bytes);
- btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes);
+ trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
+ btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
+}
+
+static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
+ int num_bytes)
+{
+ struct btrfs_root *quota_root = fs_info->quota_root;
+ struct btrfs_qgroup *qgroup;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret = 0;
+
+ if (num_bytes == 0)
+ return;
+ if (!quota_root)
+ return;
+
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, ref_root);
+ if (!qgroup)
+ goto out;
+ ulist_reinit(fs_info->qgroup_ulist);
+ ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
+ qgroup_to_aux(qgroup), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_list *glist;
+
+ qg = unode_aux_to_qgroup(unode);
+
+ qgroup_rsv_release(fs_info, qg, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC);
+ qgroup_rsv_add(fs_info, qg, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(fs_info->qgroup_ulist,
+ glist->group->qgroupid,
+ qgroup_to_aux(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+}
+
+void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
+ !is_fstree(root->objectid))
+ return;
+ /* Same as btrfs_qgroup_free_meta_prealloc() */
+ num_bytes = sub_root_meta_rsv(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC);
+ trace_qgroup_meta_convert(root, num_bytes);
+ qgroup_convert_meta(fs_info, root->objectid, num_bytes);
}
/*
@@ -3092,7 +3254,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
}
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
BTRFS_I(inode)->root->objectid,
- changeset.bytes_changed);
+ changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
extent_changeset_release(&changeset);
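
Usage note (illustrative, summarizing the hunks above rather than adding new code): every release now names the reservation type, and (u64)-1 is the documented shortcut for draining all per-transaction bytes of a root:

	/* Free a known amount of data reservation for this root. */
	btrfs_qgroup_free_refroot(fs_info, root->objectid, freed,
				  BTRFS_QGROUP_RSV_DATA);

	/* Drain every remaining per-transaction byte at commit time. */
	btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
				  BTRFS_QGROUP_RSV_META_PERTRANS);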
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index d9984e87cddf..e63e2d497a8e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -62,6 +62,48 @@ struct btrfs_qgroup_extent_record {
};
/*
+ * Qgroup reservation types:
+ *
+ * DATA:
+ * Space reserved for data.
+ *
+ * META_PERTRANS:
+ * Space reserved for metadata (per-transaction).
+ * Because qgroup data is only updated at transaction commit time,
+ * reserved metadata space must be kept until the transaction commits.
+ * Any metadata reservation used via btrfs_start_transaction() should be
+ * of this type.
+ *
+ * META_PREALLOC:
+ * There are cases where metadata space is reserved before starting a
+ * transaction, with btrfs_join_transaction() used later to get a trans
+ * handle.
+ * Any metadata reserved for such usage should be of this type, and after
+ * join_transaction() part (or all) of the reservation should be converted
+ * into META_PERTRANS.
+ */
+enum btrfs_qgroup_rsv_type {
+ BTRFS_QGROUP_RSV_DATA = 0,
+ BTRFS_QGROUP_RSV_META_PERTRANS,
+ BTRFS_QGROUP_RSV_META_PREALLOC,
+ BTRFS_QGROUP_RSV_LAST,
+};
+
+/*
+ * Represents how many bytes we have reserved for this qgroup.
+ *
+ * Each type has different reservation behavior.
+ * E.g. data follows its io_tree flag modifications, while *currently*
+ * metadata is just reserve-and-clear during a transaction.
+ *
+ * TODO: Add a new type for reservations that can survive a transaction
+ * commit.  The current metadata reservation behavior is not suitable
+ * for such a case.
+ */
+struct btrfs_qgroup_rsv {
+ u64 values[BTRFS_QGROUP_RSV_LAST];
+};
+
+/*
* one struct for each qgroup, organized in fs_info->qgroup_tree.
*/
struct btrfs_qgroup {
@@ -87,7 +129,7 @@ struct btrfs_qgroup {
/*
* reservation tracking
*/
- u64 reserved;
+ struct btrfs_qgroup_rsv rsv;
/*
* lists
@@ -220,20 +262,21 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes,
struct ulist *old_roots, struct ulist *new_roots);
-int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
- u64 ref_root, u64 num_bytes);
+ u64 ref_root, u64 num_bytes,
+ enum btrfs_qgroup_rsv_type type);
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
- btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+ btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
+ BTRFS_QGROUP_RSV_DATA);
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -248,9 +291,54 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
-int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- bool enforce);
-void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
-void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce);
+/* Reserve metadata space for the pertrans and prealloc types */
+static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
+ int num_bytes, bool enforce)
+{
+ return __btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
+}
+static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
+ int num_bytes, bool enforce)
+{
+ return __btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
+}
+
+void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type);
+
+/* Free per-transaction meta reservation for error handling */
+static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
+ int num_bytes)
+{
+ __btrfs_qgroup_free_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PERTRANS);
+}
+
+/* Pre-allocated meta reservation can be freed as needed */
+static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
+ int num_bytes)
+{
+ __btrfs_qgroup_free_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC);
+}
+
+/*
+ * Per-transaction meta reservations should all be freed at transaction
+ * commit time.
+ */
+void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
+
+/*
+ * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS.
+ *
+ * This is called when a preallocated meta reservation needs to be used,
+ * normally after a btrfs_join_transaction() call.
+ */
+void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
+
void btrfs_qgroup_check_reserved_leak(struct inode *inode);
#endif /* __BTRFS_QGROUP__ */
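
Illustrative sketch (not taken from any caller in this patch; nr_bytes is assumed to be nodesize-aligned): the intended lifecycle of a META_PREALLOC reservation that becomes META_PERTRANS once a transaction handle exists:

	struct btrfs_trans_handle *trans;
	int ret;

	/* Reserve metadata before we know which transaction we will join. */
	ret = btrfs_qgroup_reserve_meta_prealloc(root, nr_bytes, true);
	if (ret < 0)
		return ret;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		/* Nothing was joined, hand the prealloc bytes back. */
		btrfs_qgroup_free_meta_prealloc(root, nr_bytes);
		return PTR_ERR(trans);
	}

	/* The reservation is now tied to this transaction until it commits. */
	btrfs_qgroup_convert_reserved_meta(root, nr_bytes);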
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fcfc20de2df3..c3a2bc8af675 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1987,7 +1987,13 @@ cleanup:
kfree(pointers);
cleanup_io:
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+ /*
+ * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
+ * valid rbio which is consistent with the on-disk content, so such a
+ * valid rbio can be cached to avoid further disk reads.
+ */
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
/*
* - In case of two failures, where rbio->failb != -1:
*
@@ -2009,8 +2015,6 @@ cleanup_io:
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
rbio_orig_end_io(rbio, err);
- } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
- rbio_orig_end_io(rbio, err);
} else if (err == BLK_STS_OK) {
rbio->faila = -1;
rbio->failb = -1;
@@ -2768,24 +2772,8 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
return rbio;
}
-static void missing_raid56_work(struct btrfs_work *work)
-{
- struct btrfs_raid_bio *rbio;
-
- rbio = container_of(work, struct btrfs_raid_bio, work);
- __raid56_parity_recover(rbio);
-}
-
-static void async_missing_raid56(struct btrfs_raid_bio *rbio)
-{
- btrfs_init_work(&rbio->work, btrfs_rmw_helper,
- missing_raid56_work, NULL, NULL);
-
- btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
-}
-
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
if (!lock_stripe_add(rbio))
- async_missing_raid56(rbio);
+ async_read_rebuild(rbio);
}
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ab852b8e3e37..a52dd12af648 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -395,20 +395,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
goto error;
/* insert extent in reada_tree + all per-device trees, all or nothing */
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&fs_info->reada_tree, index, re);
if (ret == -EEXIST) {
re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
re_exist->refcnt++;
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
radix_tree_preload_end();
goto error;
}
if (ret) {
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
radix_tree_preload_end();
goto error;
}
@@ -451,13 +451,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
}
radix_tree_delete(&fs_info->reada_tree, index);
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
goto error;
}
have_zone = 1;
}
spin_unlock(&fs_info->reada_lock);
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
if (!have_zone)
goto error;
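
Side note (illustrative only): the old btrfs_dev_replace_lock(lock, rw) pairs are split into explicit read/write variants, making the intent at each call site visible:

	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
	/* ... read-only access to the dev_replace state ... */
	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);

	btrfs_dev_replace_write_lock(&fs_info->dev_replace);
	dev_replace->item_needs_writeback = 1;	/* mutating access */
	btrfs_dev_replace_write_unlock(&fs_info->dev_replace);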
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 171f3cce30e6..35fab67dcbe8 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -579,11 +579,16 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path,
while (level >= 0) {
if (level) {
+ struct btrfs_key first_key;
+
block_bytenr = btrfs_node_blockptr(path->nodes[level],
path->slots[level]);
gen = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
- eb = read_tree_block(fs_info, block_bytenr, gen);
+ btrfs_node_key_to_cpu(path->nodes[level], &first_key,
+ path->slots[level]);
+ eb = read_tree_block(fs_info, block_bytenr, gen,
+ level - 1, &first_key);
if (IS_ERR(eb))
return PTR_ERR(eb);
if (!extent_buffer_uptodate(eb)) {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cd2298d185dd..4874c09f6d3c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1839,6 +1839,8 @@ again:
parent = eb;
while (1) {
+ struct btrfs_key first_key;
+
level = btrfs_header_level(parent);
BUG_ON(level < lowest_level);
@@ -1852,6 +1854,7 @@ again:
old_bytenr = btrfs_node_blockptr(parent, slot);
blocksize = fs_info->nodesize;
old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
+ btrfs_node_key_to_cpu(parent, &first_key, slot);
if (level <= max_level) {
eb = path->nodes[level];
@@ -1876,7 +1879,8 @@ again:
break;
}
- eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen);
+ eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
+ level - 1, &first_key);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
@@ -2036,6 +2040,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
last_snapshot = btrfs_root_last_snapshot(&root->root_item);
for (i = *level; i > 0; i--) {
+ struct btrfs_key first_key;
+
eb = path->nodes[i];
nritems = btrfs_header_nritems(eb);
while (path->slots[i] < nritems) {
@@ -2056,7 +2062,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
}
bytenr = btrfs_node_blockptr(eb, path->slots[i]);
- eb = read_tree_block(fs_info, bytenr, ptr_gen);
+ btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
+ eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
+ &first_key);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
@@ -2714,6 +2722,8 @@ static int do_relocation(struct btrfs_trans_handle *trans,
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
+ struct btrfs_key first_key;
+
cond_resched();
upper = edge->node[UPPER];
@@ -2779,7 +2789,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
blocksize = root->fs_info->nodesize;
generation = btrfs_node_ptr_generation(upper->eb, slot);
- eb = read_tree_block(fs_info, bytenr, generation);
+ btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
+ eb = read_tree_block(fs_info, bytenr, generation,
+ upper->level - 1, &first_key);
if (IS_ERR(eb)) {
err = PTR_ERR(eb);
goto next;
@@ -2944,7 +2956,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb;
BUG_ON(block->key_ready);
- eb = read_tree_block(fs_info, block->bytenr, block->key.offset);
+ eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
+ block->level, NULL);
if (IS_ERR(eb)) {
return PTR_ERR(eb);
} else if (!extent_buffer_uptodate(eb)) {
@@ -3226,7 +3239,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
mask);
if (!page) {
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE);
+ PAGE_SIZE, true);
ret = -ENOMEM;
goto out;
}
@@ -3245,9 +3258,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
unlock_page(page);
put_page(page);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE);
+ PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
+ PAGE_SIZE, true);
ret = -EIO;
goto out;
}
@@ -3274,9 +3287,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
unlock_page(page);
put_page(page);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
- PAGE_SIZE);
+ PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE);
+ PAGE_SIZE, true);
clear_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end,
@@ -3292,7 +3305,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
put_page(page);
index++;
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE,
+ false);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ec56f33feea9..1a2066ac6fe7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -371,7 +371,7 @@ static struct full_stripe_lock *insert_full_stripe_lock(
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
- WARN_ON(!mutex_is_locked(&locks_root->lock));
+ lockdep_assert_held(&locks_root->lock);
p = &locks_root->root.rb_node;
while (*p) {
@@ -413,7 +413,7 @@ static struct full_stripe_lock *search_full_stripe_lock(
struct rb_node *node;
struct full_stripe_lock *entry;
- WARN_ON(!mutex_is_locked(&locks_root->lock));
+ lockdep_assert_held(&locks_root->lock);
node = locks_root->root.rb_node;
while (node) {
@@ -1111,7 +1111,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
struct scrub_ctx *sctx = sblock_to_check->sctx;
struct btrfs_device *dev;
struct btrfs_fs_info *fs_info;
- u64 length;
u64 logical;
unsigned int failed_mirror_index;
unsigned int is_metadata;
@@ -1139,7 +1138,6 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_unlock(&sctx->stat_lock);
return 0;
}
- length = sblock_to_check->page_count * PAGE_SIZE;
logical = sblock_to_check->pagev[0]->logical;
BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
@@ -1412,8 +1410,17 @@ nodatasum_case:
if (!page_bad->io_error && !sctx->is_dev_replace)
continue;
- /* try to find no-io-error page in mirrors */
- if (page_bad->io_error) {
+ if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
+ /*
+ * In case of dev replace, if the raid56 rebuild process
+ * did not produce correct data, copy the content of
+ * sblock_bad so the target device stays identical to
+ * the source device, instead of writing the garbage
+ * data from the sblock_for_recheck array to the target.
+ */
+ sblock_other = NULL;
+ } else if (page_bad->io_error) {
+ /* try to find no-io-error page in mirrors */
for (mirror_index = 0;
mirror_index < BTRFS_MAX_MIRRORS &&
sblocks_for_recheck[mirror_index].page_count > 0;
@@ -1718,6 +1725,45 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
return blk_status_to_errno(bio->bi_status);
}
+static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
+ struct scrub_block *sblock)
+{
+ struct scrub_page *first_page = sblock->pagev[0];
+ struct bio *bio;
+ int page_num;
+
+ /* All pages in sblock belong to the same stripe on the same device. */
+ ASSERT(first_page->dev);
+ if (!first_page->dev->bdev)
+ goto out;
+
+ bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
+ bio_set_dev(bio, first_page->dev->bdev);
+
+ for (page_num = 0; page_num < sblock->page_count; page_num++) {
+ struct scrub_page *page = sblock->pagev[page_num];
+
+ WARN_ON(!page->page);
+ bio_add_page(bio, page->page, PAGE_SIZE, 0);
+ }
+
+ if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
+ bio_put(bio);
+ goto out;
+ }
+
+ bio_put(bio);
+
+ scrub_recheck_block_checksum(sblock);
+
+ return;
+out:
+ for (page_num = 0; page_num < sblock->page_count; page_num++)
+ sblock->pagev[page_num]->io_error = 1;
+
+ sblock->no_io_error_seen = 0;
+}
+
/*
* this function will check the on disk data for checksum errors, header
* errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1733,6 +1779,10 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
sblock->no_io_error_seen = 1;
+ /* short cut for raid56 */
+ if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
+ return scrub_recheck_block_on_raid56(fs_info, sblock);
+
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
struct scrub_page *page = sblock->pagev[page_num];
@@ -1748,19 +1798,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
bio_set_dev(bio, page->dev->bdev);
bio_add_page(bio, page->page, PAGE_SIZE, 0);
- if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
- if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
- page->io_error = 1;
- sblock->no_io_error_seen = 0;
- }
- } else {
- bio->bi_iter.bi_sector = page->physical >> 9;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_iter.bi_sector = page->physical >> 9;
+ bio->bi_opf = REQ_OP_READ;
- if (btrfsic_submit_bio_wait(bio)) {
- page->io_error = 1;
- sblock->no_io_error_seen = 0;
- }
+ if (btrfsic_submit_bio_wait(bio)) {
+ page->io_error = 1;
+ sblock->no_io_error_seen = 0;
}
bio_put(bio);
@@ -2728,7 +2771,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
}
/* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
+ u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u64 physical_for_dev_replace)
{
@@ -2737,13 +2781,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
u32 blocksize;
if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sctx->fs_info->sectorsize;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ blocksize = map->stripe_len;
+ else
+ blocksize = sctx->fs_info->sectorsize;
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
spin_unlock(&sctx->stat_lock);
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sctx->fs_info->nodesize;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+ blocksize = map->stripe_len;
+ else
+ blocksize = sctx->fs_info->nodesize;
spin_lock(&sctx->stat_lock);
sctx->stat.tree_extents_scrubbed++;
sctx->stat.tree_bytes_scrubbed += len;
@@ -2883,9 +2933,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
}
if (flags & BTRFS_EXTENT_FLAG_DATA) {
- blocksize = sctx->fs_info->sectorsize;
+ blocksize = sparity->stripe_len;
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- blocksize = sctx->fs_info->nodesize;
+ blocksize = sparity->stripe_len;
} else {
blocksize = sctx->fs_info->sectorsize;
WARN_ON(1);
@@ -3595,7 +3645,7 @@ again:
if (ret)
goto out;
- ret = scrub_extent(sctx, extent_logical, extent_len,
+ ret = scrub_extent(sctx, map, extent_logical, extent_len,
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
extent_logical - logical + physical);
@@ -3885,11 +3935,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
}
- btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
+ btrfs_dev_replace_write_lock(&fs_info->dev_replace);
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
+ btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
found_key.offset, cache, is_dev_replace);
@@ -3925,10 +3975,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
scrub_pause_off(fs_info);
- btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
+ btrfs_dev_replace_write_lock(&fs_info->dev_replace);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
+ btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
if (ro_set)
btrfs_dec_block_group_ro(cache);
@@ -4144,16 +4194,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -EIO;
}
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (dev->scrub_ctx ||
(!is_dev_replace &&
btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
return -EINPROGRESS;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
ret = scrub_workers_get(fs_info, is_dev_replace);
if (ret) {
@@ -4480,7 +4530,8 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
* move on to the next inode.
*/
if (em->block_start > logical ||
- em->block_start + em->block_len < logical + len) {
+ em->block_start + em->block_len < logical + len ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
free_extent_map(em);
ret = 1;
goto out_unlock;
@@ -4620,7 +4671,6 @@ static int write_page_nocow(struct scrub_ctx *sctx,
{
struct bio *bio;
struct btrfs_device *dev;
- int ret;
dev = sctx->wr_tgtdev;
if (!dev)
@@ -4635,17 +4685,15 @@ static int write_page_nocow(struct scrub_ctx *sctx,
bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
bio_set_dev(bio, dev->bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
- ret = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (ret != PAGE_SIZE) {
-leave_with_eio:
+ /* bio_add_page won't fail on a freshly allocated bio */
+ bio_add_page(bio, page, PAGE_SIZE, 0);
+
+ if (btrfsic_submit_bio_wait(bio)) {
bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
}
- if (btrfsic_submit_bio_wait(bio))
- goto leave_with_eio;
-
bio_put(bio);
return 0;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 484e2af793de..1f5748c7d1c7 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -27,10 +27,10 @@
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
+#include <linux/crc32c.h>
#include "send.h"
#include "backref.h"
-#include "hash.h"
#include "locking.h"
#include "disk-io.h"
#include "btrfs_inode.h"
@@ -112,6 +112,7 @@ struct send_ctx {
u64 cur_inode_mode;
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
+ u64 cur_inode_next_write_offset;
u64 send_progress;
@@ -270,6 +271,7 @@ struct name_cache_entry {
char name[];
};
+__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
enum btrfs_compare_tree_result result,
const char *what)
@@ -611,9 +613,9 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
}
-#define TLV_PUT(sctx, attrtype, attrlen, data) \
+#define TLV_PUT(sctx, attrtype, data, attrlen) \
do { \
- ret = tlv_put(sctx, attrtype, attrlen, data); \
+ ret = tlv_put(sctx, attrtype, data, attrlen); \
if (ret < 0) \
goto tlv_put_failure; \
} while (0)
@@ -695,7 +697,7 @@ static int send_cmd(struct send_ctx *sctx)
hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
hdr->crc = 0;
- crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -5029,6 +5031,7 @@ static int send_hole(struct send_ctx *sctx, u64 end)
break;
offset += len;
}
+ sctx->cur_inode_next_write_offset = offset;
tlv_put_failure:
fs_path_free(p);
return ret;
@@ -5264,6 +5267,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
} else {
ret = send_extent_data(sctx, offset, len);
}
+ sctx->cur_inode_next_write_offset = offset + len;
out:
return ret;
}
@@ -5788,6 +5792,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
u64 right_gid;
int need_chmod = 0;
int need_chown = 0;
+ int need_truncate = 1;
int pending_move = 0;
int refs_processed = 0;
@@ -5825,9 +5830,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode))
need_chmod = 1;
+ if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
+ need_truncate = 0;
} else {
+ u64 old_size;
+
ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
- NULL, NULL, &right_mode, &right_uid,
+ &old_size, NULL, &right_mode, &right_uid,
&right_gid, NULL);
if (ret < 0)
goto out;
@@ -5836,6 +5845,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
need_chown = 1;
if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
need_chmod = 1;
+ if ((old_size == sctx->cur_inode_size) ||
+ (sctx->cur_inode_size > old_size &&
+ sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
+ need_truncate = 0;
}
if (S_ISREG(sctx->cur_inode_mode)) {
@@ -5854,10 +5867,13 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
}
}
- ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
- sctx->cur_inode_size);
- if (ret < 0)
- goto out;
+ if (need_truncate) {
+ ret = send_truncate(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen,
+ sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ }
}
if (need_chown) {
@@ -5911,6 +5927,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_ino = key->objectid;
sctx->cur_inode_new_gen = 0;
sctx->cur_inode_last_extent = (u64)-1;
+ sctx->cur_inode_next_write_offset = 0;
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4b817947e00f..170baef49fae 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,6 +41,7 @@
#include <linux/slab.h>
#include <linux/cleancache.h>
#include <linux/ratelimit.h>
+#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include "delayed-inode.h"
#include "ctree.h"
@@ -48,7 +49,6 @@
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
-#include "hash.h"
#include "props.h"
#include "xattr.h"
#include "volumes.h"
@@ -308,21 +308,50 @@ static void btrfs_put_super(struct super_block *sb)
}
enum {
- Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
- Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
- Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
- Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
- Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
- Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
- Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
- Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
- Opt_skip_balance, Opt_check_integrity,
+ Opt_acl, Opt_noacl,
+ Opt_clear_cache,
+ Opt_commit_interval,
+ Opt_compress,
+ Opt_compress_force,
+ Opt_compress_force_type,
+ Opt_compress_type,
+ Opt_degraded,
+ Opt_device,
+ Opt_fatal_errors,
+ Opt_flushoncommit, Opt_noflushoncommit,
+ Opt_inode_cache, Opt_noinode_cache,
+ Opt_max_inline,
+ Opt_barrier, Opt_nobarrier,
+ Opt_datacow, Opt_nodatacow,
+ Opt_datasum, Opt_nodatasum,
+ Opt_defrag, Opt_nodefrag,
+ Opt_discard, Opt_nodiscard,
+ Opt_nologreplay,
+ Opt_norecovery,
+ Opt_ratio,
+ Opt_rescan_uuid_tree,
+ Opt_skip_balance,
+ Opt_space_cache, Opt_no_space_cache,
+ Opt_space_cache_version,
+ Opt_ssd, Opt_nossd,
+ Opt_ssd_spread, Opt_nossd_spread,
+ Opt_subvol,
+ Opt_subvolid,
+ Opt_thread_pool,
+ Opt_treelog, Opt_notreelog,
+ Opt_usebackuproot,
+ Opt_user_subvol_rm_allowed,
+
+ /* Deprecated options */
+ Opt_alloc_start,
+ Opt_recovery,
+ Opt_subvolrootid,
+
+ /* Debugging options */
+ Opt_check_integrity,
Opt_check_integrity_including_extent_data,
- Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
- Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
- Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
- Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_usebackuproot,
- Opt_nologreplay, Opt_norecovery,
+ Opt_check_integrity_print_mask,
+ Opt_enospc_debug, Opt_noenospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
@@ -333,58 +362,63 @@ enum {
};
static const match_table_t tokens = {
- {Opt_degraded, "degraded"},
- {Opt_subvol, "subvol=%s"},
- {Opt_subvolid, "subvolid=%s"},
- {Opt_device, "device=%s"},
- {Opt_nodatasum, "nodatasum"},
- {Opt_datasum, "datasum"},
- {Opt_nodatacow, "nodatacow"},
- {Opt_datacow, "datacow"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_barrier, "barrier"},
- {Opt_max_inline, "max_inline=%s"},
- {Opt_alloc_start, "alloc_start=%s"},
- {Opt_thread_pool, "thread_pool=%d"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_clear_cache, "clear_cache"},
+ {Opt_commit_interval, "commit=%u"},
{Opt_compress, "compress"},
{Opt_compress_type, "compress=%s"},
{Opt_compress_force, "compress-force"},
{Opt_compress_force_type, "compress-force=%s"},
- {Opt_ssd, "ssd"},
- {Opt_ssd_spread, "ssd_spread"},
- {Opt_nossd, "nossd"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_notreelog, "notreelog"},
- {Opt_treelog, "treelog"},
- {Opt_nologreplay, "nologreplay"},
- {Opt_norecovery, "norecovery"},
+ {Opt_degraded, "degraded"},
+ {Opt_device, "device=%s"},
+ {Opt_fatal_errors, "fatal_errors=%s"},
{Opt_flushoncommit, "flushoncommit"},
{Opt_noflushoncommit, "noflushoncommit"},
- {Opt_ratio, "metadata_ratio=%d"},
+ {Opt_inode_cache, "inode_cache"},
+ {Opt_noinode_cache, "noinode_cache"},
+ {Opt_max_inline, "max_inline=%s"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_datacow, "datacow"},
+ {Opt_nodatacow, "nodatacow"},
+ {Opt_datasum, "datasum"},
+ {Opt_nodatasum, "nodatasum"},
+ {Opt_defrag, "autodefrag"},
+ {Opt_nodefrag, "noautodefrag"},
{Opt_discard, "discard"},
{Opt_nodiscard, "nodiscard"},
+ {Opt_nologreplay, "nologreplay"},
+ {Opt_norecovery, "norecovery"},
+ {Opt_ratio, "metadata_ratio=%u"},
+ {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
+ {Opt_skip_balance, "skip_balance"},
{Opt_space_cache, "space_cache"},
+ {Opt_no_space_cache, "nospace_cache"},
{Opt_space_cache_version, "space_cache=%s"},
- {Opt_clear_cache, "clear_cache"},
+ {Opt_ssd, "ssd"},
+ {Opt_nossd, "nossd"},
+ {Opt_ssd_spread, "ssd_spread"},
+ {Opt_nossd_spread, "nossd_spread"},
+ {Opt_subvol, "subvol=%s"},
+ {Opt_subvolid, "subvolid=%s"},
+ {Opt_thread_pool, "thread_pool=%u"},
+ {Opt_treelog, "treelog"},
+ {Opt_notreelog, "notreelog"},
+ {Opt_usebackuproot, "usebackuproot"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
- {Opt_enospc_debug, "enospc_debug"},
- {Opt_noenospc_debug, "noenospc_debug"},
+
+ /* Deprecated options */
+ {Opt_alloc_start, "alloc_start=%s"},
+ {Opt_recovery, "recovery"},
{Opt_subvolrootid, "subvolrootid=%d"},
- {Opt_defrag, "autodefrag"},
- {Opt_nodefrag, "noautodefrag"},
- {Opt_inode_cache, "inode_cache"},
- {Opt_noinode_cache, "noinode_cache"},
- {Opt_no_space_cache, "nospace_cache"},
- {Opt_recovery, "recovery"}, /* deprecated */
- {Opt_usebackuproot, "usebackuproot"},
- {Opt_skip_balance, "skip_balance"},
+
+ /* Debugging options */
{Opt_check_integrity, "check_int"},
{Opt_check_integrity_including_extent_data, "check_int_data"},
- {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
- {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
- {Opt_fatal_errors, "fatal_errors=%s"},
- {Opt_commit_interval, "commit=%d"},
+ {Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
+ {Opt_enospc_debug, "enospc_debug"},
+ {Opt_noenospc_debug, "noenospc_debug"},
#ifdef CONFIG_BTRFS_DEBUG
{Opt_fragment_data, "fragment=data"},
{Opt_fragment_metadata, "fragment=metadata"},
@@ -579,6 +613,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, NOSSD);
btrfs_clear_and_info(info, SSD,
"not using ssd optimizations");
+ /* Fallthrough */
+ case Opt_nossd_spread:
btrfs_clear_and_info(info, SSD_SPREAD,
"not using spread ssd allocation scheme");
break;
@@ -594,12 +630,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
ret = match_int(&args[0], &intarg);
if (ret) {
goto out;
- } else if (intarg > 0) {
- info->thread_pool_size = intarg;
- } else {
+ } else if (intarg == 0) {
ret = -EINVAL;
goto out;
}
+ info->thread_pool_size = intarg;
break;
case Opt_max_inline:
num = match_strdup(&args[0]);
@@ -658,16 +693,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
- if (ret) {
+ if (ret)
goto out;
- } else if (intarg >= 0) {
- info->metadata_ratio = intarg;
- btrfs_info(info, "metadata ratio %d",
- info->metadata_ratio);
- } else {
- ret = -EINVAL;
- goto out;
- }
+ info->metadata_ratio = intarg;
+ btrfs_info(info, "metadata ratio %u",
+ info->metadata_ratio);
break;
case Opt_discard:
btrfs_set_and_info(info, DISCARD,
@@ -762,17 +792,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
- if (ret) {
+ if (ret)
goto out;
- } else if (intarg >= 0) {
- info->check_integrity_print_mask = intarg;
- btrfs_info(info,
- "check_integrity_print_mask 0x%x",
- info->check_integrity_print_mask);
- } else {
- ret = -EINVAL;
- goto out;
- }
+ info->check_integrity_print_mask = intarg;
+ btrfs_info(info, "check_integrity_print_mask 0x%x",
+ info->check_integrity_print_mask);
break;
#else
case Opt_check_integrity_including_extent_data:
@@ -798,24 +822,18 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
- if (ret < 0) {
- btrfs_err(info, "invalid commit interval");
- ret = -EINVAL;
+ if (ret)
goto out;
- }
- if (intarg > 0) {
- if (intarg > 300) {
- btrfs_warn(info,
- "excessive commit interval %d",
- intarg);
- }
- info->commit_interval = intarg;
- } else {
+ if (intarg == 0) {
btrfs_info(info,
- "using default commit interval %ds",
+ "using default commit interval %us",
BTRFS_DEFAULT_COMMIT_INTERVAL);
- info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ } else if (intarg > 300) {
+ btrfs_warn(info, "excessive commit interval %d",
+ intarg);
}
+ info->commit_interval = intarg;
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
@@ -932,8 +950,8 @@ static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
{
substring_t args[MAX_OPT_ARGS];
char *opts, *orig, *p;
- char *num = NULL;
int error = 0;
+ u64 subvolid;
if (!options)
return 0;
@@ -963,18 +981,15 @@ static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
}
break;
case Opt_subvolid:
- num = match_strdup(&args[0]);
- if (num) {
- *subvol_objectid = memparse(num, NULL);
- kfree(num);
- /* we want the original fs_tree */
- if (!*subvol_objectid)
- *subvol_objectid =
- BTRFS_FS_TREE_OBJECTID;
- } else {
- error = -EINVAL;
+ error = match_u64(&args[0], &subvolid);
+ if (error)
goto out;
- }
+
+ /* we want the original fs_tree */
+ if (subvolid == 0)
+ subvolid = BTRFS_FS_TREE_OBJECTID;
+
+ *subvol_objectid = subvolid;
break;
case Opt_subvolrootid:
pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n");
@@ -1284,7 +1299,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
- seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
+ seq_printf(seq, ",thread_pool=%u", info->thread_pool_size);
if (btrfs_test_opt(info, COMPRESS)) {
compress_type = btrfs_compress_type2str(info->compress_type);
if (btrfs_test_opt(info, FORCE_COMPRESS))
@@ -1340,12 +1355,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
info->check_integrity_print_mask);
#endif
if (info->metadata_ratio)
- seq_printf(seq, ",metadata_ratio=%d",
- info->metadata_ratio);
+ seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
- seq_printf(seq, ",commit=%d", info->commit_interval);
+ seq_printf(seq, ",commit=%u", info->commit_interval);
#ifdef CONFIG_BTRFS_DEBUG
if (btrfs_test_opt(info, FRAGMENT_DATA))
seq_puts(seq, ",fragment=data");
@@ -1690,7 +1704,7 @@ out:
}
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
- int new_pool_size, int old_pool_size)
+ u32 new_pool_size, u32 old_pool_size)
{
if (new_pool_size == old_pool_size)
return;
@@ -1758,8 +1772,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
unsigned long old_opts = fs_info->mount_opt;
unsigned long old_compress_type = fs_info->compress_type;
u64 old_max_inline = fs_info->max_inline;
- int old_thread_pool_size = fs_info->thread_pool_size;
- unsigned int old_metadata_ratio = fs_info->metadata_ratio;
+ u32 old_thread_pool_size = fs_info->thread_pool_size;
+ u32 old_metadata_ratio = fs_info->metadata_ratio;
int ret;
sync_filesystem(sb);
@@ -2290,11 +2304,18 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
struct list_head *head;
struct rcu_string *name;
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ /*
+ * Lightweight locking of the devices. We should not need
+ * device_list_mutex here as we only read the device data and the list
+ * is protected by RCU. Even if a device is deleted during the list
+ * traversal we'll still get valid data; the freeing callback will wait
+ * at least until the rcu_read_unlock.
+ */
+ rcu_read_lock();
cur_devices = fs_info->fs_devices;
while (cur_devices) {
head = &cur_devices->devices;
- list_for_each_entry(dev, head, dev_list) {
+ list_for_each_entry_rcu(dev, head, dev_list) {
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
continue;
if (!dev->name)
@@ -2306,14 +2327,12 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
}
if (first_dev) {
- rcu_read_lock();
name = rcu_dereference(first_dev->name);
seq_escape(m, name->str, " \t\n\\");
- rcu_read_unlock();
} else {
WARN_ON(1);
}
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ rcu_read_unlock();
return 0;
}
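
Illustrative sketch of the RCU pattern described in the comment above (the pr_info() consumer is a stand-in, not something the patch adds):

	struct btrfs_device *dev;

	rcu_read_lock();
	list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
		struct rcu_string *name;

		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
		    !dev->name)
			continue;
		name = rcu_dereference(dev->name);
		pr_info("btrfs: device %s\n", name->str);
	}
	rcu_read_unlock();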
@@ -2355,7 +2374,7 @@ static int __init btrfs_interface_init(void)
return misc_register(&btrfs_misc);
}
-static void btrfs_interface_exit(void)
+static __cold void btrfs_interface_exit(void)
{
misc_deregister(&btrfs_misc);
}
@@ -2376,22 +2395,18 @@ static void __init btrfs_print_mod_info(void)
", ref-verify=on"
#endif
"\n",
- btrfs_crc32c_impl());
+ crc32c_impl());
}
static int __init init_btrfs_fs(void)
{
int err;
- err = btrfs_hash_init();
- if (err)
- return err;
-
btrfs_props_init();
err = btrfs_init_sysfs();
if (err)
- goto free_hash;
+ return err;
btrfs_init_compress();
@@ -2472,8 +2487,7 @@ free_cachep:
free_compress:
btrfs_exit_compress();
btrfs_exit_sysfs();
-free_hash:
- btrfs_hash_exit();
+
return err;
}
@@ -2493,7 +2507,6 @@ static void __exit exit_btrfs_fs(void)
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
btrfs_exit_compress();
- btrfs_hash_exit();
}
late_initcall(init_btrfs_fs);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a8bafed931f4..ca067471cd46 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -272,7 +272,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj,
{
struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
struct btrfs_block_group_cache *block_group;
- int index = to_raid_kobj(kobj)->raid_type;
+ int index = btrfs_bg_flags_to_raid_index(to_raid_kobj(kobj)->flags);
u64 val = 0;
down_read(&sinfo->groups_sem);
@@ -923,7 +923,7 @@ out1:
return ret;
}
-void btrfs_exit_sysfs(void)
+void __cold btrfs_exit_sysfs(void)
{
sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9786d8cd0aa6..e74278170806 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -278,8 +278,7 @@ int btrfs_run_sanity_tests(void)
}
}
ret = btrfs_test_extent_map();
- if (ret)
- goto out;
+
out:
btrfs_destroy_test_fs();
return ret;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 70c993f01670..c23bd00bdd92 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -343,7 +343,7 @@ static void test_case_4(struct extent_map_tree *em_tree)
__test_case_4(em_tree, SZ_4K);
}
-int btrfs_test_extent_map()
+int btrfs_test_extent_map(void)
{
struct extent_map_tree *em_tree;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 90204b166643..160eb2fba726 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -63,7 +63,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
btrfs_set_extent_generation(leaf, item, 1);
btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
block_info = (struct btrfs_tree_block_info *)(item + 1);
- btrfs_set_tree_block_level(leaf, block_info, 1);
+ btrfs_set_tree_block_level(leaf, block_info, 0);
iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
if (parent > 0) {
btrfs_set_extent_inline_ref_type(leaf, iref,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 04f07144b45c..5c4cf0f9146b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -37,22 +37,16 @@
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
- [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE |
- __TRANS_START),
- [TRANS_STATE_COMMIT_START] = (__TRANS_USERSPACE |
- __TRANS_START |
- __TRANS_ATTACH),
- [TRANS_STATE_COMMIT_DOING] = (__TRANS_USERSPACE |
- __TRANS_START |
+ [TRANS_STATE_BLOCKED] = __TRANS_START,
+ [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
+ [TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN),
- [TRANS_STATE_UNBLOCKED] = (__TRANS_USERSPACE |
- __TRANS_START |
+ [TRANS_STATE_UNBLOCKED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK),
- [TRANS_STATE_COMPLETED] = (__TRANS_USERSPACE |
- __TRANS_START |
+ [TRANS_STATE_COMPLETED] = (__TRANS_START |
__TRANS_ATTACH |
__TRANS_JOIN |
__TRANS_JOIN_NOLOCK),
@@ -126,9 +120,9 @@ static void clear_btree_io_tree(struct extent_io_tree *tree)
spin_unlock(&tree->lock);
}
-static noinline void switch_commit_roots(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info)
+static noinline void switch_commit_roots(struct btrfs_transaction *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root, *tmp;
down_write(&fs_info->commit_root_sem);
@@ -319,7 +313,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
root->last_trans < trans->transid) || force) {
WARN_ON(root == fs_info->extent_root);
- WARN_ON(root->commit_root != root->node);
+ WARN_ON(!force && root->commit_root != root->node);
/*
* see below for IN_TRANS_SETUP usage rules
@@ -449,11 +443,7 @@ static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
return 0;
- if (type == TRANS_USERSPACE)
- return 1;
-
- if (type == TRANS_START &&
- !atomic_read(&fs_info->open_ioctl_trans))
+ if (type == TRANS_START)
return 1;
return 0;
@@ -508,8 +498,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
*/
if (num_items && root != fs_info->chunk_root) {
qgroup_reserved = num_items * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved,
- enforce_qgroups);
+ ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
+ enforce_qgroups);
if (ret)
return ERR_PTR(ret);
@@ -593,7 +583,7 @@ again:
got_it:
btrfs_record_root_in_trans(h, root);
- if (!current->journal_info && type != TRANS_USERSPACE)
+ if (!current->journal_info)
current->journal_info = h;
return h;
@@ -606,7 +596,7 @@ alloc_fail:
btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
num_bytes);
reserve_fail:
- btrfs_qgroup_free_meta(root, qgroup_reserved);
+ btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
return ERR_PTR(ret);
}
@@ -658,14 +648,6 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
return trans;
}
-struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root,
- unsigned int num_items)
-{
- return start_transaction(root, num_items, TRANS_START,
- BTRFS_RESERVE_FLUSH_LIMIT, true);
-}
-
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
@@ -678,12 +660,6 @@ struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root
BTRFS_RESERVE_NO_FLUSH, true);
}
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
-{
- return start_transaction(root, 0, TRANS_USERSPACE,
- BTRFS_RESERVE_NO_FLUSH, true);
-}
-
/*
* btrfs_attach_transaction() - catch the running transaction
*
@@ -789,8 +765,7 @@ out:
void btrfs_throttle(struct btrfs_fs_info *fs_info)
{
- if (!atomic_read(&fs_info->open_ioctl_trans))
- wait_current_trans(fs_info);
+ wait_current_trans(fs_info);
}
static int should_end_transaction(struct btrfs_trans_handle *trans)
@@ -806,7 +781,6 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
- struct btrfs_fs_info *fs_info = trans->fs_info;
int updates;
int err;
@@ -818,7 +792,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
updates = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (updates) {
- err = btrfs_run_delayed_refs(trans, fs_info, updates * 2);
+ err = btrfs_run_delayed_refs(trans, updates * 2);
if (err) /* Error code will also eval true */
return err;
}
@@ -826,6 +800,27 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
return should_end_transaction(trans);
}
+static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
+
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ if (!trans->block_rsv) {
+ ASSERT(!trans->bytes_reserved);
+ return;
+ }
+
+ if (!trans->bytes_reserved)
+ return;
+
+ ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
+ trace_btrfs_space_reservation(fs_info, "transaction",
+ trans->transid, trans->bytes_reserved, 0);
+ btrfs_block_rsv_release(fs_info, trans->block_rsv,
+ trans->bytes_reserved);
+ trans->bytes_reserved = 0;
+}
+
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int throttle)
{
@@ -843,11 +838,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
return 0;
}
- btrfs_trans_release_metadata(trans, info);
+ btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, info);
+ btrfs_create_pending_block_groups(trans);
trans->delayed_ref_updates = 0;
if (!trans->sync) {
@@ -864,16 +859,15 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
must_run_delayed_refs = 2;
}
- btrfs_trans_release_metadata(trans, info);
+ btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, info);
+ btrfs_create_pending_block_groups(trans);
btrfs_trans_release_chunk_metadata(trans);
- if (lock && !atomic_read(&info->open_ioctl_trans) &&
- should_end_transaction(trans) &&
+ if (lock && should_end_transaction(trans) &&
READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
spin_lock(&info->trans_lock);
if (cur_trans->state == TRANS_STATE_RUNNING)
@@ -1072,40 +1066,33 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
}
/*
- * when btree blocks are allocated, they have some corresponding bits set for
- * them in one of two extent_io trees. This is used to make sure all of
- * those extents are on disk for transaction or log commit
+ * When btree blocks are allocated, the corresponding extents are marked dirty.
+ * This function ensures such extents are persisted on disk for transaction or
+ * log commit.
+ *
+ * @trans: transaction whose dirty pages we'd like to write
*/
-static int btrfs_write_and_wait_marked_extents(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *dirty_pages, int mark)
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
{
int ret;
int ret2;
+ struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct blk_plug plug;
blk_start_plug(&plug);
- ret = btrfs_write_marked_extents(fs_info, dirty_pages, mark);
+ ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
blk_finish_plug(&plug);
ret2 = btrfs_wait_extents(fs_info, dirty_pages);
+ clear_btree_io_tree(&trans->transaction->dirty_pages);
+
if (ret)
return ret;
- if (ret2)
+ else if (ret2)
return ret2;
- return 0;
-}
-
-static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- int ret;
-
- ret = btrfs_write_and_wait_marked_extents(fs_info,
- &trans->transaction->dirty_pages,
- EXTENT_DIRTY);
- clear_btree_io_tree(&trans->transaction->dirty_pages);
-
- return ret;
+ else
+ return 0;
}
/*
@@ -1155,9 +1142,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
* failures will cause the file system to go offline. We still need
* to clean up the delayed refs.
*/
-static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
struct list_head *io_bgs = &trans->transaction->io_bgs;
struct list_head *next;
@@ -1173,7 +1160,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
@@ -1192,7 +1179,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
return ret;
/* run_qgroups might have added some more refs */
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
again:
@@ -1209,7 +1196,7 @@ again:
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
}
@@ -1218,7 +1205,7 @@ again:
ret = btrfs_write_dirty_block_groups(trans, fs_info);
if (ret)
return ret;
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret)
return ret;
}
@@ -1251,9 +1238,9 @@ void btrfs_add_dead_root(struct btrfs_root *root)
/*
* update all the cowonly tree roots on disk
*/
-static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *gang[8];
int i;
int ret;
@@ -1297,7 +1284,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
- btrfs_qgroup_free_meta_all(root);
+ btrfs_qgroup_free_meta_all_pertrans(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1366,15 +1353,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
return 0;
/*
+ * Ensure dirty @src will be committed. Otherwise, after the upcoming
+ * commit_fs_roots() and switch_commit_roots(), any dirty but not
+ * recorded root will never be updated again, causing an outdated root
+ * item.
+ */
+ record_root_in_trans(trans, src, 1);
+
+ /*
* We are going to commit transaction, see btrfs_commit_transaction()
* comment for reason locking tree_log_mutex
*/
mutex_lock(&fs_info->tree_log_mutex);
- ret = commit_fs_roots(trans, fs_info);
+ ret = commit_fs_roots(trans);
if (ret)
goto out;
- ret = btrfs_qgroup_account_extents(trans, fs_info);
+ ret = btrfs_qgroup_account_extents(trans);
if (ret < 0)
goto out;
@@ -1397,11 +1392,11 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* like chunk and root tree, as they won't affect qgroup.
* And we don't write super to avoid half committed status.
*/
- ret = commit_cowonly_roots(trans, fs_info);
+ ret = commit_cowonly_roots(trans);
if (ret)
goto out;
- switch_commit_roots(trans->transaction, fs_info);
- ret = btrfs_write_and_wait_transaction(trans, fs_info);
+ switch_commit_roots(trans->transaction);
+ ret = btrfs_write_and_wait_transaction(trans);
if (ret)
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction for qgroup");
@@ -1430,9 +1425,10 @@ out:
* the creation of the pending snapshots, just return 0.
*/
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
struct btrfs_pending_snapshot *pending)
{
+
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_key key;
struct btrfs_root_item *new_root_item;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -1524,7 +1520,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* otherwise we corrupt the FS during
* snapshot
*/
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret) { /* Transaction aborted */
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1620,7 +1616,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1674,7 +1670,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
}
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1699,8 +1695,7 @@ no_free_objectid:
/*
* create all the snapshots we've scheduled for creation
*/
-static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
{
struct btrfs_pending_snapshot *pending, *next;
struct list_head *head = &trans->transaction->pending_snapshots;
@@ -1708,7 +1703,7 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
list_for_each_entry_safe(pending, next, head, list) {
list_del(&pending->list);
- ret = create_pending_snapshot(trans, fs_info, pending);
+ ret = create_pending_snapshot(trans, pending);
if (ret)
break;
}
@@ -1861,10 +1856,9 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
}
-static void cleanup_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int err)
+static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
DEFINE_WAIT(wait);
@@ -1904,7 +1898,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
btrfs_put_transaction(cur_trans);
btrfs_put_transaction(cur_trans);
- trace_btrfs_transaction_commit(root);
+ trace_btrfs_transaction_commit(trans->root);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -1959,13 +1953,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
- ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+ ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {
btrfs_end_transaction(trans);
return ret;
}
- btrfs_trans_release_metadata(trans, fs_info);
+ btrfs_trans_release_metadata(trans);
trans->block_rsv = NULL;
cur_trans = trans->transaction;
@@ -1978,9 +1972,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
smp_wmb();
if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, fs_info);
+ btrfs_create_pending_block_groups(trans);
- ret = btrfs_run_delayed_refs(trans, fs_info, 0);
+ ret = btrfs_run_delayed_refs(trans, 0);
if (ret) {
btrfs_end_transaction(trans);
return ret;
@@ -2008,12 +2002,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
run_it = 1;
mutex_unlock(&fs_info->ro_block_group_mutex);
- if (run_it)
- ret = btrfs_start_dirty_block_groups(trans, fs_info);
- }
- if (ret) {
- btrfs_end_transaction(trans);
- return ret;
+ if (run_it) {
+ ret = btrfs_start_dirty_block_groups(trans);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+ }
}
spin_lock(&fs_info->trans_lock);
@@ -2061,7 +2056,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto cleanup_transaction;
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret)
goto cleanup_transaction;
@@ -2069,7 +2064,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
extwriter_counter_read(cur_trans) == 0);
/* some pending stuffs might be added after the previous flush. */
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret)
goto cleanup_transaction;
@@ -2106,7 +2101,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* deal with them in create_pending_snapshot(), which is the
* core function of the snapshot creation.
*/
- ret = create_pending_snapshots(trans, fs_info);
+ ret = create_pending_snapshots(trans);
if (ret) {
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
@@ -2122,13 +2117,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* because all the tree which are snapshoted will be forced to COW
* the nodes and leaves.
*/
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret) {
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
@@ -2157,7 +2152,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
mutex_lock(&fs_info->tree_log_mutex);
- ret = commit_fs_roots(trans, fs_info);
+ ret = commit_fs_roots(trans);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
@@ -2179,7 +2174,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* commit_fs_roots() can call btrfs_save_ino_cache(), which generates
* new delayed refs. Must handle them or qgroup can be wrong.
*/
- ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
+ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
@@ -2190,14 +2185,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* Since fs roots are all committed, we can get a quite accurate
* new_roots. So let's do quota accounting.
*/
- ret = btrfs_qgroup_account_extents(trans, fs_info);
+ ret = btrfs_qgroup_account_extents(trans);
if (ret < 0) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
goto scrub_continue;
}
- ret = commit_cowonly_roots(trans, fs_info);
+ ret = commit_cowonly_roots(trans);
if (ret) {
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
@@ -2229,7 +2224,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
- switch_commit_roots(cur_trans, fs_info);
+ switch_commit_roots(cur_trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
ASSERT(list_empty(&cur_trans->io_bgs));
@@ -2241,7 +2236,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
sizeof(*fs_info->super_copy));
btrfs_update_commit_device_size(fs_info);
- btrfs_update_commit_device_bytes_used(fs_info, cur_trans);
+ btrfs_update_commit_device_bytes_used(cur_trans);
clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
@@ -2256,7 +2251,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_wait);
- ret = btrfs_write_and_wait_transaction(trans, fs_info);
+ ret = btrfs_write_and_wait_transaction(trans);
if (ret) {
btrfs_handle_fs_error(fs_info, ret,
"Error while writing out transaction");
@@ -2273,7 +2268,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto scrub_continue;
- btrfs_finish_extent_commit(trans, fs_info);
+ btrfs_finish_extent_commit(trans);
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
@@ -2319,13 +2314,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
scrub_continue:
btrfs_scrub_continue(fs_info);
cleanup_transaction:
- btrfs_trans_release_metadata(trans, fs_info);
+ btrfs_trans_release_metadata(trans);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;
- cleanup_transaction(trans, trans->root, ret);
+ cleanup_transaction(trans, ret);
return ret;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6beee072b1bd..b6c94ce33503 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -69,6 +69,22 @@ struct btrfs_transaction {
struct list_head pending_chunks;
struct list_head switch_commits;
struct list_head dirty_bgs;
+
+ /*
+ * There is no explicit lock which protects io_bgs; rather, its
+ * consistency is implied by the fact that all the sites which modify
+ * it do so under some form of transaction critical section, namely:
+ *
+ * - btrfs_start_dirty_block_groups - This function can only ever be
+ * run by one of the transaction committers. Refer to
+ * BTRFS_TRANS_DIRTY_BG_RUN usage in btrfs_commit_transaction
+ *
+ *  - btrfs_write_dirty_block_groups - this is called by
+ * commit_cowonly_roots from transaction critical section
+ * (TRANS_STATE_COMMIT_DOING)
+ *
+ * - btrfs_cleanup_dirty_bgs - called on transaction abort
+ */
struct list_head io_bgs;
struct list_head dropped_roots;
@@ -89,21 +105,18 @@ struct btrfs_transaction {
#define __TRANS_FREEZABLE (1U << 0)
-#define __TRANS_USERSPACE (1U << 8)
#define __TRANS_START (1U << 9)
#define __TRANS_ATTACH (1U << 10)
#define __TRANS_JOIN (1U << 11)
#define __TRANS_JOIN_NOLOCK (1U << 12)
#define __TRANS_DUMMY (1U << 13)
-#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
#define TRANS_ATTACH (__TRANS_ATTACH)
#define TRANS_JOIN (__TRANS_JOIN | __TRANS_FREEZABLE)
#define TRANS_JOIN_NOLOCK (__TRANS_JOIN_NOLOCK)
-#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
- __TRANS_ATTACH)
+#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
@@ -186,15 +199,11 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
struct btrfs_root *root,
unsigned int num_items,
int min_factor);
-struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root,
- unsigned int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index c3c8d48f6618..8871286c1a91 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -30,7 +30,6 @@
#include "tree-checker.h"
#include "disk-io.h"
#include "compression.h"
-#include "hash.h"
/*
* Error message should follow the following format:
@@ -53,7 +52,8 @@
* Allows callers to customize the output.
*/
__printf(4, 5)
-static void generic_err(const struct btrfs_root *root,
+__cold
+static void generic_err(const struct btrfs_fs_info *fs_info,
const struct extent_buffer *eb, int slot,
const char *fmt, ...)
{
@@ -65,10 +65,10 @@ static void generic_err(const struct btrfs_root *root,
vaf.fmt = fmt;
vaf.va = &args;
- btrfs_crit(root->fs_info,
+ btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
- root->objectid, btrfs_header_bytenr(eb), slot, &vaf);
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf);
va_end(args);
}
@@ -77,7 +77,8 @@ static void generic_err(const struct btrfs_root *root,
* offset has its own meaning.
*/
__printf(4, 5)
-static void file_extent_err(const struct btrfs_root *root,
+__cold
+static void file_extent_err(const struct btrfs_fs_info *fs_info,
const struct extent_buffer *eb, int slot,
const char *fmt, ...)
{
@@ -91,10 +92,11 @@ static void file_extent_err(const struct btrfs_root *root,
vaf.fmt = fmt;
vaf.va = &args;
- btrfs_crit(root->fs_info,
+ btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
- btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
- btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf);
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, key.offset, &vaf);
va_end(args);
}
@@ -102,26 +104,26 @@ static void file_extent_err(const struct btrfs_root *root,
* Return 0 if the btrfs_file_extent_##name is aligned to @alignment
* Else return 1
*/
-#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \
+#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment) \
({ \
if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
- file_extent_err((root), (leaf), (slot), \
+ file_extent_err((fs_info), (leaf), (slot), \
"invalid %s for file extent, have %llu, should be aligned to %u", \
(#name), btrfs_file_extent_##name((leaf), (fi)), \
(alignment)); \
(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \
})
-static int check_extent_data_item(struct btrfs_root *root,
+static int check_extent_data_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
struct btrfs_file_extent_item *fi;
- u32 sectorsize = root->fs_info->sectorsize;
+ u32 sectorsize = fs_info->sectorsize;
u32 item_size = btrfs_item_size_nr(leaf, slot);
if (!IS_ALIGNED(key->offset, sectorsize)) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"unaligned file_offset for file extent, have %llu should be aligned to %u",
key->offset, sectorsize);
return -EUCLEAN;
@@ -130,7 +132,7 @@ static int check_extent_data_item(struct btrfs_root *root,
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid type for file extent, have %u expect range [0, %u]",
btrfs_file_extent_type(leaf, fi),
BTRFS_FILE_EXTENT_TYPES);
@@ -142,14 +144,14 @@ static int check_extent_data_item(struct btrfs_root *root,
* and must be caught in open_ctree().
*/
if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid compression for file extent, have %u expect range [0, %u]",
btrfs_file_extent_compression(leaf, fi),
BTRFS_COMPRESS_TYPES);
return -EUCLEAN;
}
if (btrfs_file_extent_encryption(leaf, fi)) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid encryption for file extent, have %u expect 0",
btrfs_file_extent_encryption(leaf, fi));
return -EUCLEAN;
@@ -157,7 +159,7 @@ static int check_extent_data_item(struct btrfs_root *root,
if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) {
/* Inline extent must have 0 as key offset */
if (key->offset) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid file_offset for inline file extent, have %llu expect 0",
key->offset);
return -EUCLEAN;
@@ -171,7 +173,7 @@ static int check_extent_data_item(struct btrfs_root *root,
/* Uncompressed inline extent size must match item size */
if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START +
btrfs_file_extent_ram_bytes(leaf, fi)) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid ram_bytes for uncompressed inline extent, have %u expect %llu",
item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START +
btrfs_file_extent_ram_bytes(leaf, fi));
@@ -182,40 +184,41 @@ static int check_extent_data_item(struct btrfs_root *root,
/* Regular or preallocated extent has fixed item size */
if (item_size != sizeof(*fi)) {
- file_extent_err(root, leaf, slot,
+ file_extent_err(fs_info, leaf, slot,
"invalid item size for reg/prealloc file extent, have %u expect %zu",
item_size, sizeof(*fi));
return -EUCLEAN;
}
- if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) ||
- CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) ||
- CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) ||
- CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize))
+ if (CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, ram_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_bytenr, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, disk_num_bytes, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, offset, sectorsize) ||
+ CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, num_bytes, sectorsize))
return -EUCLEAN;
return 0;
}
-static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+static int check_csum_item(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf, struct btrfs_key *key,
+ int slot)
{
- u32 sectorsize = root->fs_info->sectorsize;
- u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy);
+ u32 sectorsize = fs_info->sectorsize;
+ u32 csumsize = btrfs_super_csum_size(fs_info->super_copy);
if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"invalid key objectid for csum item, have %llu expect %llu",
key->objectid, BTRFS_EXTENT_CSUM_OBJECTID);
return -EUCLEAN;
}
if (!IS_ALIGNED(key->offset, sectorsize)) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"unaligned key offset for csum item, have %llu should be aligned to %u",
key->offset, sectorsize);
return -EUCLEAN;
}
if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"unaligned item size for csum item, have %u should be aligned to %u",
btrfs_item_size_nr(leaf, slot), csumsize);
return -EUCLEAN;
@@ -228,7 +231,8 @@ static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf,
* which represents inode number
*/
__printf(4, 5)
-static void dir_item_err(const struct btrfs_root *root,
+__cold
+static void dir_item_err(const struct btrfs_fs_info *fs_info,
const struct extent_buffer *eb, int slot,
const char *fmt, ...)
{
@@ -242,14 +246,15 @@ static void dir_item_err(const struct btrfs_root *root,
vaf.fmt = fmt;
vaf.va = &args;
- btrfs_crit(root->fs_info,
+ btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu, %pV",
- btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
- btrfs_header_bytenr(eb), slot, key.objectid, &vaf);
+ btrfs_header_level(eb) == 0 ? "leaf" : "node",
+ btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+ key.objectid, &vaf);
va_end(args);
}
-static int check_dir_item(struct btrfs_root *root,
+static int check_dir_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
@@ -268,7 +273,7 @@ static int check_dir_item(struct btrfs_root *root,
/* header itself should not cross item boundary */
if (cur + sizeof(*di) > item_size) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"dir item header crosses item boundary, have %zu boundary %u",
cur + sizeof(*di), item_size);
return -EUCLEAN;
@@ -277,7 +282,7 @@ static int check_dir_item(struct btrfs_root *root,
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
if (dir_type >= BTRFS_FT_MAX) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"invalid dir item type, have %u expect [0, %u)",
dir_type, BTRFS_FT_MAX);
return -EUCLEAN;
@@ -285,14 +290,14 @@ static int check_dir_item(struct btrfs_root *root,
if (key->type == BTRFS_XATTR_ITEM_KEY &&
dir_type != BTRFS_FT_XATTR) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"invalid dir item type for XATTR key, have %u expect %u",
dir_type, BTRFS_FT_XATTR);
return -EUCLEAN;
}
if (dir_type == BTRFS_FT_XATTR &&
key->type != BTRFS_XATTR_ITEM_KEY) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"xattr dir type found for non-XATTR key");
return -EUCLEAN;
}
@@ -305,21 +310,21 @@ static int check_dir_item(struct btrfs_root *root,
name_len = btrfs_dir_name_len(leaf, di);
data_len = btrfs_dir_data_len(leaf, di);
if (name_len > max_name_len) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"dir item name len too long, have %u max %u",
name_len, max_name_len);
return -EUCLEAN;
}
- if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
- dir_item_err(root, leaf, slot,
+ if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(fs_info)) {
+ dir_item_err(fs_info, leaf, slot,
"dir item name and data len too long, have %u max %u",
name_len + data_len,
- BTRFS_MAX_XATTR_SIZE(root->fs_info));
+ BTRFS_MAX_XATTR_SIZE(fs_info));
return -EUCLEAN;
}
if (data_len && dir_type != BTRFS_FT_XATTR) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"dir item with invalid data len, have %u expect 0",
data_len);
return -EUCLEAN;
@@ -329,7 +334,7 @@ static int check_dir_item(struct btrfs_root *root,
/* header and name/data should not cross item boundary */
if (cur + total_size > item_size) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"dir item data crosses item boundary, have %u boundary %u",
cur + total_size, item_size);
return -EUCLEAN;
@@ -347,7 +352,7 @@ static int check_dir_item(struct btrfs_root *root,
(unsigned long)(di + 1), name_len);
name_hash = btrfs_name_hash(namebuf, name_len);
if (key->offset != name_hash) {
- dir_item_err(root, leaf, slot,
+ dir_item_err(fs_info, leaf, slot,
"name hash mismatch with key, have 0x%016x expect 0x%016llx",
name_hash, key->offset);
return -EUCLEAN;
@@ -362,7 +367,7 @@ static int check_dir_item(struct btrfs_root *root,
/*
* Common point to switch the item-specific validation.
*/
-static int check_leaf_item(struct btrfs_root *root,
+static int check_leaf_item(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
@@ -370,24 +375,23 @@ static int check_leaf_item(struct btrfs_root *root,
switch (key->type) {
case BTRFS_EXTENT_DATA_KEY:
- ret = check_extent_data_item(root, leaf, key, slot);
+ ret = check_extent_data_item(fs_info, leaf, key, slot);
break;
case BTRFS_EXTENT_CSUM_KEY:
- ret = check_csum_item(root, leaf, key, slot);
+ ret = check_csum_item(fs_info, leaf, key, slot);
break;
case BTRFS_DIR_ITEM_KEY:
case BTRFS_DIR_INDEX_KEY:
case BTRFS_XATTR_ITEM_KEY:
- ret = check_dir_item(root, leaf, key, slot);
+ ret = check_dir_item(fs_info, leaf, key, slot);
break;
}
return ret;
}
-static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
+static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
bool check_item_data)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
/* No valid key type is 0, so all key should be larger than this key */
struct btrfs_key prev_key = {0, 0, 0};
struct btrfs_key key;
@@ -420,7 +424,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
eb = btrfs_root_node(check_root);
/* if leaf is the root, then it's fine */
if (leaf != eb) {
- generic_err(check_root, leaf, 0,
+ generic_err(fs_info, leaf, 0,
"invalid nritems, have %u should not be 0 for non-root leaf",
nritems);
free_extent_buffer(eb);
@@ -453,7 +457,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
/* Make sure the keys are in the right order */
if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"bad key order, prev (%llu %u %llu) current (%llu %u %llu)",
prev_key.objectid, prev_key.type,
prev_key.offset, key.objectid, key.type,
@@ -472,7 +476,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
item_end_expected = btrfs_item_offset_nr(leaf,
slot - 1);
if (btrfs_item_end_nr(leaf, slot) != item_end_expected) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"unexpected item end, have %u expect %u",
btrfs_item_end_nr(leaf, slot),
item_end_expected);
@@ -486,7 +490,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
*/
if (btrfs_item_end_nr(leaf, slot) >
BTRFS_LEAF_DATA_SIZE(fs_info)) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"slot end outside of leaf, have %u expect range [0, %u]",
btrfs_item_end_nr(leaf, slot),
BTRFS_LEAF_DATA_SIZE(fs_info));
@@ -496,7 +500,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
/* Also check if the item pointer overlaps with btrfs item. */
if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) >
btrfs_item_ptr_offset(leaf, slot)) {
- generic_err(root, leaf, slot,
+ generic_err(fs_info, leaf, slot,
"slot overlaps with its data, item end %lu data start %lu",
btrfs_item_nr_offset(slot) +
sizeof(struct btrfs_item),
@@ -509,7 +513,7 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
* Check if the item size and content meet other
* criteria
*/
- ret = check_leaf_item(root, leaf, &key, slot);
+ ret = check_leaf_item(fs_info, leaf, &key, slot);
if (ret < 0)
return ret;
}
@@ -522,18 +526,19 @@ static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf,
return 0;
}
-int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf)
+int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf)
{
- return check_leaf(root, leaf, true);
+ return check_leaf(fs_info, leaf, true);
}
-int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf)
{
- return check_leaf(root, leaf, false);
+ return check_leaf(fs_info, leaf, false);
}
-int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
+int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
{
unsigned long nr = btrfs_header_nritems(node);
struct btrfs_key key, next_key;
@@ -541,12 +546,12 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
u64 bytenr;
int ret = 0;
- if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) {
- btrfs_crit(root->fs_info,
+ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
+ btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
- root->objectid, node->start,
+ btrfs_header_owner(node), node->start,
nr == 0 ? "small" : "large", nr,
- BTRFS_NODEPTRS_PER_BLOCK(root->fs_info));
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
return -EUCLEAN;
}
@@ -556,21 +561,21 @@ int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node)
btrfs_node_key_to_cpu(node, &next_key, slot + 1);
if (!bytenr) {
- generic_err(root, node, slot,
+ generic_err(fs_info, node, slot,
"invalid NULL node pointer");
ret = -EUCLEAN;
goto out;
}
- if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) {
- generic_err(root, node, slot,
+ if (!IS_ALIGNED(bytenr, fs_info->sectorsize)) {
+ generic_err(fs_info, node, slot,
"unaligned pointer, have %llu should be aligned to %u",
- bytenr, root->fs_info->sectorsize);
+ bytenr, fs_info->sectorsize);
ret = -EUCLEAN;
goto out;
}
if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) {
- generic_err(root, node, slot,
+ generic_err(fs_info, node, slot,
"bad key order, current (%llu %u %llu) next (%llu %u %llu)",
key.objectid, key.type, key.offset,
next_key.objectid, next_key.type,
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 3d53e8d6fda0..aba542755710 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -25,14 +25,15 @@
* Will check not only the item pointers, but also every possible member
* in item data.
*/
-int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf);
+int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *leaf);
/*
* Less strict leaf checker.
* Will only check item pointers, not reading item data.
*/
-int btrfs_check_leaf_relaxed(struct btrfs_root *root,
+int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf);
-int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node);
+int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index cb65089127cc..c09dbe4bd6e7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -39,7 +39,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int level;
int next_key_ret = 0;
u64 last_ret = 0;
- u64 min_trans = 0;
if (root->fs_info->extent_root == root) {
/*
@@ -81,7 +80,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
path->keep_locks = 1;
- ret = btrfs_search_forward(root, &key, path, min_trans);
+ ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
if (ret < 0)
goto out;
if (ret > 0) {
@@ -130,7 +129,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
*/
path->slots[1] = btrfs_header_nritems(path->nodes[1]);
next_key_ret = btrfs_find_next_key(root, path, &key, 1,
- min_trans);
+ BTRFS_OLDEST_GENERATION);
if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 434457794c27..c91babc6aa4b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -21,12 +21,12 @@
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
+#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
-#include "hash.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
@@ -286,7 +286,7 @@ struct walk_control {
* inside it
*/
int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen);
+ struct walk_control *wc, u64 gen, int level);
};
/*
@@ -294,7 +294,7 @@ struct walk_control {
*/
static int process_one_buffer(struct btrfs_root *log,
struct extent_buffer *eb,
- struct walk_control *wc, u64 gen)
+ struct walk_control *wc, u64 gen, int level)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret = 0;
@@ -304,7 +304,7 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
- ret = btrfs_read_buffer(eb, gen);
+ ret = btrfs_read_buffer(eb, gen, level, NULL);
if (ret)
return ret;
}
@@ -853,7 +853,6 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_dir_item *di)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
char *name;
int name_len;
@@ -887,7 +886,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
else
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
out:
kfree(name);
iput(inode);
@@ -1007,7 +1006,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
u64 ref_index, char *name, int namelen,
int *search_done)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *victim_name;
int victim_name_len;
@@ -1065,7 +1063,7 @@ again:
kfree(victim_name);
if (ret)
return ret;
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
if (ret)
return ret;
*search_done = 1;
@@ -1136,8 +1134,7 @@ again:
victim_name_len);
if (!ret)
ret = btrfs_run_delayed_items(
- trans,
- fs_info);
+ trans);
}
iput(victim_parent);
kfree(victim_name);
@@ -2098,7 +2095,6 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct inode *dir,
struct btrfs_key *dir_key)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
struct extent_buffer *eb;
int slot;
@@ -2162,7 +2158,7 @@ again:
ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
BTRFS_I(inode), name, name_len);
if (!ret)
- ret = btrfs_run_delayed_items(trans, fs_info);
+ ret = btrfs_run_delayed_items(trans);
kfree(name);
iput(inode);
if (ret)
@@ -2410,17 +2406,16 @@ out:
* back refs).
*/
static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
- struct walk_control *wc, u64 gen)
+ struct walk_control *wc, u64 gen, int level)
{
int nritems;
struct btrfs_path *path;
struct btrfs_root *root = wc->replay_dest;
struct btrfs_key key;
- int level;
int i;
int ret;
- ret = btrfs_read_buffer(eb, gen);
+ ret = btrfs_read_buffer(eb, gen, level, NULL);
if (ret)
return ret;
@@ -2537,6 +2532,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level >= BTRFS_MAX_LEVEL);
while (*level > 0) {
+ struct btrfs_key first_key;
+
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
cur = path->nodes[*level];
@@ -2549,6 +2546,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+ btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
blocksize = fs_info->nodesize;
parent = path->nodes[*level];
@@ -2559,7 +2557,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return PTR_ERR(next);
if (*level == 1) {
- ret = wc->process_func(root, next, wc, ptr_gen);
+ ret = wc->process_func(root, next, wc, ptr_gen,
+ *level - 1);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2567,7 +2566,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
- ret = btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen,
+ *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2597,7 +2597,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
- ret = btrfs_read_buffer(next, ptr_gen);
+ ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;
@@ -2647,7 +2647,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
root_owner = btrfs_header_owner(parent);
ret = wc->process_func(root, path->nodes[*level], wc,
- btrfs_header_generation(path->nodes[*level]));
+ btrfs_header_generation(path->nodes[*level]),
+ *level);
if (ret)
return ret;
@@ -2729,7 +2730,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
/* was the root node processed? if not, catch it here */
if (path->nodes[orig_level]) {
ret = wc->process_func(log, path->nodes[orig_level], wc,
- btrfs_header_generation(path->nodes[orig_level]));
+ btrfs_header_generation(path->nodes[orig_level]),
+ orig_level);
if (ret)
goto out;
if (wc->free) {
@@ -3972,6 +3974,7 @@ fill_holes:
ASSERT(ret == 0);
src = src_path->nodes[0];
i = 0;
+ need_find_last_extent = true;
}
btrfs_item_key_to_cpu(src, &key, i);
@@ -4006,6 +4009,36 @@ fill_holes:
break;
*last_extent = extent_end;
}
+
+ /*
+ * Check if there is a hole between the last extent found in our leaf
+ * and the first extent in the next leaf. If there is one, we need to
+ * log an explicit hole so that at replay time we can punch the hole.
+ */
+ if (ret == 0 &&
+ key.objectid == btrfs_ino(inode) &&
+ key.type == BTRFS_EXTENT_DATA_KEY &&
+ i == btrfs_header_nritems(src_path->nodes[0])) {
+ ret = btrfs_next_leaf(inode->root, src_path);
+ need_find_last_extent = true;
+ if (ret > 0) {
+ ret = 0;
+ } else if (ret == 0) {
+ btrfs_item_key_to_cpu(src_path->nodes[0], &key,
+ src_path->slots[0]);
+ if (key.objectid == btrfs_ino(inode) &&
+ key.type == BTRFS_EXTENT_DATA_KEY &&
+ *last_extent < key.offset) {
+ const u64 len = key.offset - *last_extent;
+
+ ret = btrfs_insert_file_extent(trans, log,
+ btrfs_ino(inode),
+ *last_extent, 0,
+ 0, len, 0, len,
+ 0, 0, 0);
+ }
+ }
+ }
/*
* Need to let the callers know we dropped the path so they should
* re-search.
@@ -5517,7 +5550,6 @@ out:
* the last committed transaction
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
struct btrfs_inode *inode,
struct dentry *parent,
const loff_t start,
@@ -5525,6 +5557,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
int inode_only,
struct btrfs_log_ctx *ctx)
{
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct super_block *sb;
struct dentry *old_parent = NULL;
@@ -5550,7 +5583,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) {
+ if (btrfs_root_refs(&root->root_item) == 0) {
ret = 1;
goto end_no_trans;
}
@@ -5682,7 +5715,7 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry,
+ struct dentry *dentry,
const loff_t start,
const loff_t end,
struct btrfs_log_ctx *ctx)
@@ -5690,8 +5723,8 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
- parent, start, end, LOG_INODE_ALL, ctx);
+ ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
+ start, end, LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -5953,7 +5986,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct dentry *parent)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
- struct btrfs_root *root = inode->root;
/*
* this will force the logging code to walk the dentry chain
@@ -5970,7 +6002,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 0,
- LLONG_MAX, LOG_INODE_EXISTS, NULL);
+ return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 483027f9a7f4..88abc43312a1 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -65,7 +65,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry,
+ struct dentry *dentry,
const loff_t start,
const loff_t end,
struct btrfs_log_ctx *ctx);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 726f928238d0..9916f03430bc 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -282,7 +282,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
key.offset = 0;
again_search_slot:
- ret = btrfs_search_forward(root, &key, path, 0);
+ ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION);
if (ret) {
if (ret > 0)
ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b2d05c6b1c56..93f8f17cacca 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
+#include <linux/list_sort.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
@@ -278,7 +279,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev,
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_cleanup_fs_uuids(void)
+void __exit btrfs_cleanup_fs_uuids(void)
{
struct btrfs_fs_devices *fs_devices;
@@ -708,7 +709,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
device->devid != BTRFS_DEV_REPLACE_DEVID) {
fs_devices->rw_devices++;
- list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+ list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
}
brelse(bh);
@@ -895,7 +896,11 @@ error:
return ERR_PTR(-ENOMEM);
}
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
+/*
+ * After we have read the system tree and know the devids belonging to
+ * this filesystem, remove any device that does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
struct btrfs_device *device, *next;
struct btrfs_device *latest_dev = NULL;
@@ -1103,6 +1108,20 @@ out:
return ret;
}
+static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct btrfs_device *dev1, *dev2;
+
+ dev1 = list_entry(a, struct btrfs_device, dev_list);
+ dev2 = list_entry(b, struct btrfs_device, dev_list);
+
+ if (dev1->devid < dev2->devid)
+ return -1;
+ else if (dev1->devid > dev2->devid)
+ return 1;
+ return 0;
+}
+
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder)
{
@@ -1113,6 +1132,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->opened++;
ret = 0;
} else {
+ list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = __btrfs_open_devices(fs_devices, flags, holder);
}
mutex_unlock(&uuid_mutex);
@@ -1916,12 +1936,12 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_lock(&uuid_mutex);
num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
WARN_ON(num_devices < 1);
num_devices--;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
@@ -2047,7 +2067,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devices;
- WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+ lockdep_assert_held(&fs_info->fs_devices->device_list_mutex);
/*
* in case of fs with no seed, srcdev->fs_devices will point
@@ -2237,7 +2257,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
u64 super_flags;
- BUG_ON(!mutex_is_locked(&uuid_mutex));
+ lockdep_assert_held(&uuid_mutex);
if (!fs_devices->seeding)
return -EINVAL;
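Several hunks in this series swap open-coded assertions such as BUG_ON(!mutex_is_locked(...)) or ASSERT(mutex_is_locked(...)) for lockdep_assert_held(), which, with lockdep enabled, is meant to check that the current task actually holds the lock rather than merely that somebody does, and compiles away otherwise. A rough userspace analogue of the "held by me" idea, with an explicit owner field standing in for the tracking lockdep does in the kernel (illustrative only, single-threaded toy):

        #include <assert.h>
        #include <pthread.h>
        #include <stdio.h>

        /* Illustrative only: in the kernel, lockdep tracks ownership for us. */
        struct owned_mutex {
                pthread_mutex_t lock;
                pthread_t owner;
                int held;
        };

        static void om_lock(struct owned_mutex *m)
        {
                pthread_mutex_lock(&m->lock);
                m->owner = pthread_self();
                m->held = 1;
        }

        static void om_unlock(struct owned_mutex *m)
        {
                m->held = 0;
                pthread_mutex_unlock(&m->lock);
        }

        /* Userspace stand-in for lockdep_assert_held(): caller must own the lock. */
        static void assert_held(struct owned_mutex *m)
        {
                assert(m->held && pthread_equal(m->owner, pthread_self()));
        }

        int main(void)
        {
                struct owned_mutex m = { .lock = PTHREAD_MUTEX_INITIALIZER };

                om_lock(&m);
                assert_held(&m);        /* fine: this thread holds it */
                om_unlock(&m);
                printf("ok\n");
                return 0;
        }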
@@ -2642,7 +2662,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
- ASSERT(list_empty(&srcdev->resized_list));
device->commit_total_bytes = srcdev->commit_total_bytes;
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
@@ -2666,19 +2685,6 @@ error:
return ret;
}
-void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
- struct btrfs_device *tgtdev)
-{
- u32 sectorsize = fs_info->sectorsize;
-
- WARN_ON(fs_info->fs_devices->rw_devices == 0);
- tgtdev->io_width = sectorsize;
- tgtdev->io_align = sectorsize;
- tgtdev->sector_size = sectorsize;
- tgtdev->fs_info = fs_info;
- set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &tgtdev->dev_state);
-}
-
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device)
{
@@ -2984,7 +2990,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
- ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
+ lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
ret = btrfs_can_relocate(fs_info, chunk_offset);
if (ret)
@@ -2997,6 +3003,16 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (ret)
return ret;
+ /*
+ * We add the kobjects here (and after forcing data chunk creation)
+ * since relocation is the only place we'll create chunks of a new
+ * type at runtime. The only place where we'll remove the last
+ * chunk of a type is the call immediately below this one. Even
+ * so, we're protected against races with the cleaner thread since
+ * we're covered by the delete_unused_bgs_mutex.
+ */
+ btrfs_add_raid_kobjects(fs_info);
+
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@@ -3124,6 +3140,8 @@ static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
if (ret < 0)
return ret;
+ btrfs_add_raid_kobjects(fs_info);
+
return 1;
}
}
@@ -3892,12 +3910,12 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
}
num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
BUG_ON(num_devices < 1);
num_devices--;
}
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
@@ -4202,7 +4220,8 @@ static int btrfs_uuid_scan_kthread(void *data)
key.offset = 0;
while (1) {
- ret = btrfs_search_forward(root, &key, path, 0);
+ ret = btrfs_search_forward(root, &key, path,
+ BTRFS_OLDEST_GENERATION);
if (ret) {
if (ret > 0)
ret = 0;
@@ -4672,7 +4691,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
-#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info) \
+#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
- sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
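The BTRFS_MAX_DEVS() change above only makes the macro take fs_info directly instead of reaching through a root, but the arithmetic it wraps, how many stripes fit into a single chunk item in a leaf, is easy to sketch. The sizes below are rough placeholders rather than authoritative on-disk values; the trailing "+ 1" accounts for the one stripe that struct btrfs_chunk itself embeds:

        #include <stdio.h>

        /* Ballpark numbers only; the real values depend on the on-disk format. */
        #define NODESIZE        16384u
        #define HEADER_SIZE     101u    /* sizeof(struct btrfs_header), assumed  */
        #define ITEM_SIZE       25u     /* sizeof(struct btrfs_item), assumed    */
        #define CHUNK_SIZE      80u     /* sizeof(struct btrfs_chunk), assumed   */
        #define STRIPE_SIZE     32u     /* sizeof(struct btrfs_stripe), assumed  */

        int main(void)
        {
                unsigned int max_item = NODESIZE - HEADER_SIZE - ITEM_SIZE;
                unsigned int max_devs = (max_item - CHUNK_SIZE) / STRIPE_SIZE + 1;

                printf("max stripes per chunk item: %u\n", max_devs);
                return 0;
        }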
@@ -4713,10 +4732,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
BUG_ON(!alloc_profile_is_valid(type, 0));
- if (list_empty(&fs_devices->alloc_list))
+ if (list_empty(&fs_devices->alloc_list)) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info, "%s: no writable device", __func__);
return -ENOSPC;
+ }
- index = __get_raid_index(type);
+ index = btrfs_bg_flags_to_raid_index(type);
sub_stripes = btrfs_raid_array[index].sub_stripes;
dev_stripes = btrfs_raid_array[index].dev_stripes;
@@ -4729,7 +4751,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = SZ_1G;
max_chunk_size = 10 * max_stripe_size;
if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+ devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
@@ -4738,7 +4760,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_stripe_size = SZ_256M;
max_chunk_size = max_stripe_size;
if (!devs_max)
- devs_max = BTRFS_MAX_DEVS(info->chunk_root);
+ devs_max = BTRFS_MAX_DEVS(info);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = SZ_32M;
max_chunk_size = 2 * max_stripe_size;
@@ -4797,8 +4819,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (ret == 0)
max_avail = max_stripe_size * dev_stripes;
- if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
+ if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
+ btrfs_debug(info,
+ "%s: devid %llu has no free space, have=%llu want=%u",
+ __func__, device->devid, max_avail,
+ BTRFS_STRIPE_LEN * dev_stripes);
continue;
+ }
if (ndevs == fs_devices->rw_devices) {
WARN(1, "%s: found more than %llu devices\n",
@@ -4821,8 +4849,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
/* round down to number of usable stripes */
ndevs = round_down(ndevs, devs_increment);
- if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
+ if (ndevs < devs_min) {
ret = -ENOSPC;
+ if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
+ btrfs_debug(info,
+ "%s: not enough devices with free space: have=%d minimum required=%d",
+ __func__, ndevs, devs_min);
+ }
goto error;
}
@@ -4856,18 +4889,17 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
* and compare that answer with the max chunk size
*/
if (stripe_size * data_stripes > max_chunk_size) {
- u64 mask = (1ULL << 24) - 1;
-
stripe_size = div_u64(max_chunk_size, data_stripes);
/* bump the answer up to a 16MB boundary */
- stripe_size = (stripe_size + mask) & ~mask;
+ stripe_size = round_up(stripe_size, SZ_16M);
- /* but don't go higher than the limits we found
- * while searching for free extents
+ /*
+ * But don't go higher than the limits we found while searching
+ * for free extents
*/
- if (stripe_size > devices_info[ndevs-1].max_avail)
- stripe_size = devices_info[ndevs-1].max_avail;
+ stripe_size = min(devices_info[ndevs - 1].max_avail,
+ stripe_size);
}
/* align to BTRFS_STRIPE_LEN */
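The 16MB alignment above used to be open-coded with a mask; round_up(stripe_size, SZ_16M) performs the same operation for a power-of-two boundary. A quick userspace check of the equivalence, where round_up_pow2() is a local stand-in for the kernel macro:

        #include <inttypes.h>
        #include <stdio.h>

        #define SZ_16M (16 * 1024 * 1024ULL)

        /* Generic power-of-two round up, same idea as the kernel's round_up(). */
        static uint64_t round_up_pow2(uint64_t x, uint64_t align)
        {
                return (x + align - 1) & ~(align - 1);
        }

        int main(void)
        {
                uint64_t stripe_size = 123456789;       /* arbitrary example value */
                uint64_t mask = (1ULL << 24) - 1;       /* the old open-coded form */

                printf("old: %" PRIu64 "\n", (stripe_size + mask) & ~mask);
                printf("new: %" PRIu64 "\n", round_up_pow2(stripe_size, SZ_16M));
                return 0;       /* both lines print the same 16MB-aligned value */
        }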
@@ -5068,7 +5100,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
{
u64 chunk_offset;
- ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+ lockdep_assert_held(&fs_info->chunk_mutex);
chunk_offset = find_next_chunk(fs_info);
return __btrfs_alloc_chunk(trans, chunk_offset, type);
}
@@ -5209,11 +5241,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
ret = 1;
free_extent_map(em);
- btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
fs_info->dev_replace.tgtdev)
ret++;
- btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
return ret;
}
@@ -5254,13 +5286,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
- struct map_lookup *map, int first, int num,
- int optimal, int dev_replace_is_ongoing)
+ struct map_lookup *map, int first,
+ int dev_replace_is_ongoing)
{
int i;
+ int num_stripes;
+ int preferred_mirror;
int tolerance;
struct btrfs_device *srcdev;
+ ASSERT((map->type &
+ (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)));
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ num_stripes = map->sub_stripes;
+ else
+ num_stripes = map->num_stripes;
+
+ preferred_mirror = first + current->pid % num_stripes;
+
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
@@ -5274,10 +5318,10 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
* mirror is available
*/
for (tolerance = 0; tolerance < 2; tolerance++) {
- if (map->stripes[optimal].dev->bdev &&
- (tolerance || map->stripes[optimal].dev != srcdev))
- return optimal;
- for (i = first; i < first + num; i++) {
+ if (map->stripes[preferred_mirror].dev->bdev &&
+ (tolerance || map->stripes[preferred_mirror].dev != srcdev))
+ return preferred_mirror;
+ for (i = first; i < first + num_stripes; i++) {
if (map->stripes[i].dev->bdev &&
(tolerance || map->stripes[i].dev != srcdev))
return i;
@@ -5287,7 +5331,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
/* we couldn't find one that doesn't fail. Just return something
* and the io error handling code will clean up eventually
*/
- return optimal;
+ return preferred_mirror;
}
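With this refactor the "which copy should this reader prefer" policy, current->pid modulo the number of copies, lives inside find_live_mirror() instead of being computed by both callers (the later __btrfs_map_block() hunks drop the old optimal argument accordingly). A toy userspace version of the same read-spreading idea, illustrative only:

        #include <stdio.h>
        #include <unistd.h>

        /* Spread readers across the available copies by hashing on the pid. */
        static int preferred_mirror(int first, int num_stripes)
        {
                return first + (int)(getpid() % num_stripes);
        }

        int main(void)
        {
                int num_stripes = 2;    /* e.g. a two-copy RAID1 profile */

                printf("this process would prefer stripe %d\n",
                       preferred_mirror(0, num_stripes));
                return 0;
        }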
static inline int parity_smaller(u64 a, u64 b)
@@ -5779,10 +5823,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (!bbio_ret)
goto out;
- btrfs_dev_replace_lock(dev_replace, 0);
+ btrfs_dev_replace_read_lock(dev_replace);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (!dev_replace_is_ongoing)
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
else
btrfs_dev_replace_set_lock_blocking(dev_replace);
@@ -5814,8 +5858,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
stripe_index = mirror_num - 1;
else {
stripe_index = find_live_mirror(fs_info, map, 0,
- map->num_stripes,
- current->pid % map->num_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index + 1;
}
@@ -5843,8 +5885,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
int old_stripe_index = stripe_index;
stripe_index = find_live_mirror(fs_info, map,
stripe_index,
- map->sub_stripes, stripe_index +
- current->pid % map->sub_stripes,
dev_replace_is_ongoing);
mirror_num = stripe_index - old_stripe_index + 1;
}
@@ -5984,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
out:
if (dev_replace_is_ongoing) {
btrfs_dev_replace_clear_lock_blocking(dev_replace);
- btrfs_dev_replace_unlock(dev_replace, 0);
+ btrfs_dev_replace_read_unlock(dev_replace);
}
free_extent_map(em);
return ret;
@@ -6618,7 +6658,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices;
int ret;
- BUG_ON(!mutex_is_locked(&uuid_mutex));
+ lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
fs_devices = fs_info->fs_devices->seed;
@@ -7358,20 +7398,20 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
}
/* Must be invoked during the transaction commit */
-void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *transaction)
+void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct extent_map *em;
struct map_lookup *map;
struct btrfs_device *dev;
int i;
- if (list_empty(&transaction->pending_chunks))
+ if (list_empty(&trans->pending_chunks))
return;
/* In order to kick the device replace finish process */
mutex_lock(&fs_info->chunk_mutex);
- list_for_each_entry(em, &transaction->pending_chunks, list) {
+ list_for_each_entry(em, &trans->pending_chunks, list) {
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 28c28eeadff3..d1fcaea9fef5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -422,7 +422,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
struct btrfs_fs_devices **fs_devices_ret);
int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
struct btrfs_device *device, struct btrfs_device *this_dev);
int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
@@ -436,7 +436,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u8 *uuid);
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
const char *device_path, u64 devid);
-void btrfs_cleanup_fs_uuids(void);
+void __exit btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
@@ -476,8 +476,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
-void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
- struct btrfs_device *tgtdev);
void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path);
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
@@ -546,9 +544,30 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
btrfs_dev_stat_set(dev, index, 0);
}
+/*
+ * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
+ * can be used as index to access btrfs_raid_array[].
+ */
+static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
+{
+ if (flags & BTRFS_BLOCK_GROUP_RAID10)
+ return BTRFS_RAID_RAID10;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+ return BTRFS_RAID_RAID1;
+ else if (flags & BTRFS_BLOCK_GROUP_DUP)
+ return BTRFS_RAID_DUP;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+ return BTRFS_RAID_RAID0;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+ return BTRFS_RAID_RAID5;
+ else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return BTRFS_RAID_RAID6;
+
+ return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
+}
+
void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
-void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
- struct btrfs_transaction *transaction);
+void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans);
struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
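The new inline helper above turns a block-group type bitmask into a btrfs_raid_types value that can index btrfs_raid_array[], replacing __get_raid_index() in the chunk allocator hunk earlier. A standalone sketch of the same mapping; the flag bits and enum ordering below are placeholders chosen for the example, not the real on-disk constants:

        #include <stdio.h>

        /* Placeholder flag values for illustration only. */
        #define BG_RAID0        (1ULL << 3)
        #define BG_RAID1        (1ULL << 4)
        #define BG_DUP          (1ULL << 5)
        #define BG_RAID10       (1ULL << 6)
        #define BG_RAID5        (1ULL << 7)
        #define BG_RAID6        (1ULL << 8)

        enum raid_index { RAID10, RAID1, DUP, RAID0, SINGLE, RAID5, RAID6 };

        static const char *const raid_names[] = {
                [RAID10] = "raid10", [RAID1] = "raid1", [DUP] = "dup",
                [RAID0] = "raid0", [SINGLE] = "single", [RAID5] = "raid5",
                [RAID6] = "raid6",
        };

        /* Mirrors the shape of btrfs_bg_flags_to_raid_index() above. */
        static enum raid_index bg_flags_to_raid_index(unsigned long long flags)
        {
                if (flags & BG_RAID10)
                        return RAID10;
                if (flags & BG_RAID1)
                        return RAID1;
                if (flags & BG_DUP)
                        return DUP;
                if (flags & BG_RAID0)
                        return RAID0;
                if (flags & BG_RAID5)
                        return RAID5;
                if (flags & BG_RAID6)
                        return RAID6;
                return SINGLE;  /* no raid bit set */
        }

        int main(void)
        {
                printf("%s\n", raid_names[bg_flags_to_raid_index(BG_RAID10)]);
                return 0;
        }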
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index de7d072c78ef..e1e8177deb5e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -33,7 +33,7 @@
#include "locking.h"
-ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size)
{
struct btrfs_dir_item *di;
@@ -233,7 +233,7 @@ out:
/*
* @value: "" makes the attribute empty, NULL removes it
*/
-int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+int btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
@@ -374,7 +374,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
const char *name, void *buffer, size_t size)
{
name = xattr_full_name(handler, name);
- return __btrfs_getxattr(inode, name, buffer, size);
+ return btrfs_getxattr(inode, name, buffer, size);
}
static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
@@ -383,7 +383,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
size_t size, int flags)
{
name = xattr_full_name(handler, name);
- return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+ return btrfs_setxattr(NULL, inode, name, buffer, size, flags);
}
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
@@ -448,8 +448,8 @@ static int btrfs_initxattrs(struct inode *inode,
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
- err = __btrfs_setxattr(trans, inode, name,
- xattr->value, xattr->value_len, 0);
+ err = btrfs_setxattr(trans, inode, name, xattr->value,
+ xattr->value_len, 0);
kfree(name);
if (err < 0)
break;
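The '@value: "" makes the attribute empty, NULL removes it' convention noted in the hunk above has a close parallel in the userspace xattr syscalls, where a zero-length setxattr() stores an empty attribute and removexattr() deletes it. A small Linux-only illustration; the path is a placeholder and error handling is minimal:

        #include <stdio.h>
        #include <sys/xattr.h>

        int main(void)
        {
                const char *path = "/tmp/example-file";     /* placeholder path */

                /* "" with size 0: the attribute exists but carries no value. */
                if (setxattr(path, "user.demo", "", 0, 0) != 0)
                        perror("setxattr");

                /* A NULL value in the btrfs helper corresponds to removal. */
                if (removexattr(path, "user.demo") != 0)
                        perror("removexattr");

                return 0;
        }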
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 15fc4743dc70..e215a3212a2a 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -23,13 +23,14 @@
extern const struct xattr_handler *btrfs_xattr_handlers[];
-extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+int btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size);
-extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+int btrfs_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags);
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr);