From 44f714dae50a2e795d3268a6831762aa6fa54f55 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Jun 2016 16:11:13 +0100 Subject: Btrfs: improve performance on fsync against new inode after rename/unlink With commit 56f23fdbb600 ("Btrfs: fix file/data loss caused by fsync after rename and new inode") we got simple fix for a functional issue when the following sequence of actions is done: at transaction N create file A at directory D at transaction N + M (where M >= 1) move/rename existing file A from directory D to directory E create a new file named A at directory D fsync the new file power fail The solution was to simply detect such scenario and fallback to a full transaction commit when we detect it. However this turned out to had a significant impact on throughput (and a bit on latency too) for benchmarks using the dbench tool, which simulates real workloads from smbd (Samba) servers. For example on a test vm (with a debug kernel): Unpatched: Throughput 19.1572 MB/sec 32 clients 32 procs max_latency=1005.229 ms Patched: Throughput 23.7015 MB/sec 32 clients 32 procs max_latency=809.206 ms The patched results (this patch is applied) are similar to the results of a kernel with the commit 56f23fdbb600 ("Btrfs: fix file/data loss caused by fsync after rename and new inode") reverted. This change avoids the fallback to a transaction commit and instead makes sure all the names of the conflicting inode (the one that had a name in a past transaction that matches the name of the new file in the same parent directory) are logged so that at log replay time we don't lose neither the new file nor the old file, and the old file gets the name it was renamed to. This also ends up avoiding a full transaction commit for a similar case that involves an unlink instead of a rename of the old file: at transaction N create file A at directory D at transaction N + M (where M >= 1) remove file A create a new file named A at directory D fsync the new file power fail Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 8 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c05f69a8ec42..5fa624cd815d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4469,7 +4469,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, static int btrfs_check_ref_name_override(struct extent_buffer *eb, const int slot, const struct btrfs_key *key, - struct inode *inode) + struct inode *inode, + u64 *other_ino) { int ret; struct btrfs_path *search_path; @@ -4528,7 +4529,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, search_path, parent, name, this_name_len, 0); if (di && !IS_ERR(di)) { - ret = 1; + struct btrfs_key di_key; + + btrfs_dir_item_key_to_cpu(search_path->nodes[0], + di, &di_key); + if (di_key.type == BTRFS_INODE_ITEM_KEY) { + ret = 1; + *other_ino = di_key.objectid; + } else { + ret = -EAGAIN; + } goto out; } else if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -4718,16 +4728,71 @@ again: if ((min_key.type == BTRFS_INODE_REF_KEY || min_key.type == BTRFS_INODE_EXTREF_KEY) && BTRFS_I(inode)->generation == trans->transid) { + u64 other_ino = 0; + ret = btrfs_check_ref_name_override(path->nodes[0], path->slots[0], - &min_key, inode); + &min_key, inode, + &other_ino); if (ret < 0) { err = ret; goto out_unlock; } else if (ret > 0) { - err = 1; - btrfs_set_log_full_commit(root->fs_info, trans); - goto out_unlock; + struct btrfs_key inode_key; + struct inode *other_inode; + + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, + logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + btrfs_release_path(path); + inode_key.objectid = other_ino; + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + other_inode = btrfs_iget(root->fs_info->sb, + &inode_key, root, + NULL); + /* + * If the other inode that had a conflicting dir + * entry was deleted in the current transaction, + * we don't need to do more work nor fallback to + * a transaction commit. + */ + if (IS_ERR(other_inode) && + PTR_ERR(other_inode) == -ENOENT) { + goto next_key; + } else if (IS_ERR(other_inode)) { + err = PTR_ERR(other_inode); + goto out_unlock; + } + /* + * We are safe logging the other inode without + * acquiring its i_mutex as long as we log with + * the LOG_INODE_EXISTS mode. We're safe against + * concurrent renames of the other inode as well + * because during a rename we pin the log and + * update the log with the new name before we + * unpin it. + */ + err = btrfs_log_inode(trans, root, other_inode, + LOG_INODE_EXISTS, + 0, LLONG_MAX, ctx); + iput(other_inode); + if (err) + goto out_unlock; + else + goto next_key; } } @@ -4795,7 +4860,7 @@ next_slot: ins_nr = 0; } btrfs_release_path(path); - +next_key: if (min_key.offset < (u64)-1) { min_key.offset++; } else if (min_key.type < max_key.type) { @@ -4989,8 +5054,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; - if (IS_ROOT(parent)) + if (IS_ROOT(parent)) { + inode = d_inode(parent); + if (btrfs_must_commit_transaction(trans, inode)) + ret = 1; break; + } parent = dget_parent(parent); dput(old_parent); -- cgit v1.2.3 From df2c95f33e0a28b22509e4ee85365eedf32a1056 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 15 Aug 2016 10:36:52 +0800 Subject: btrfs: qgroup: Fix qgroup incorrectness caused by log replay When doing log replay at mount time(after power loss), qgroup will leak numbers of replayed data extents. The cause is almost the same of balance. So fix it by manually informing qgroup for owner changed extents. The bug can be detected by btrfs/119 test case. Cc: Mark Fasheh Signed-off-by: Qu Wenruo Reviewed-and-Tested-by: Goldwyn Rodrigues Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fff3f3efa436..ffe92da81b8a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "backref.h" #include "hash.h" #include "compression.h" +#include "qgroup.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * Manually record dirty extent, as here we did a shallow + * file extent item copy and skip normal backref update, + * but modifying extent tree all by ourselves. + * So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree + * (doesn't affect qgroup) to fs/file tree(affects qgroup) + */ + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + btrfs_file_extent_disk_bytenr(eb, item), + btrfs_file_extent_disk_num_bytes(eb, item), + GFP_NOFS); + if (ret < 0) + goto out; + if (ins.objectid > 0) { u64 csum_start; u64 csum_end; -- cgit v1.2.3 From 28a235931b56d4e7bdd51f6733daf95f2b269da8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 23 Aug 2016 21:13:51 +0100 Subject: Btrfs: fix lockdep warning on deadlock against an inode's log mutex Commit 44f714dae50a ("Btrfs: improve performance on fsync against new inode after rename/unlink"), which landed in 4.8-rc2, introduced a possibility for a deadlock due to double locking of an inode's log mutex by the same task, which lockdep reports with: [23045.433975] ============================================= [23045.434748] [ INFO: possible recursive locking detected ] [23045.435426] 4.7.0-rc6-btrfs-next-34+ #1 Not tainted [23045.436044] --------------------------------------------- [23045.436044] xfs_io/3688 is trying to acquire lock: [23045.436044] (&ei->log_mutex){+.+...}, at: [] btrfs_log_inode+0x13a/0xc95 [btrfs] [23045.436044] but task is already holding lock: [23045.436044] (&ei->log_mutex){+.+...}, at: [] btrfs_log_inode+0x13a/0xc95 [btrfs] [23045.436044] other info that might help us debug this: [23045.436044] Possible unsafe locking scenario: [23045.436044] CPU0 [23045.436044] ---- [23045.436044] lock(&ei->log_mutex); [23045.436044] lock(&ei->log_mutex); [23045.436044] *** DEADLOCK *** [23045.436044] May be due to missing lock nesting notation [23045.436044] 3 locks held by xfs_io/3688: [23045.436044] #0: (&sb->s_type->i_mutex_key#15){+.+...}, at: [] btrfs_sync_file+0x14e/0x425 [btrfs] [23045.436044] #1: (sb_internal#2){.+.+.+}, at: [] __sb_start_write+0x5f/0xb0 [23045.436044] #2: (&ei->log_mutex){+.+...}, at: [] btrfs_log_inode+0x13a/0xc95 [btrfs] [23045.436044] stack backtrace: [23045.436044] CPU: 4 PID: 3688 Comm: xfs_io Not tainted 4.7.0-rc6-btrfs-next-34+ #1 [23045.436044] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.9.1-0-gb3ef39f-prebuilt.qemu-project.org 04/01/2014 [23045.436044] 0000000000000000 ffff88022f5f7860 ffffffff8127074d ffffffff82a54b70 [23045.436044] ffffffff82a54b70 ffff88022f5f7920 ffffffff81092897 ffff880228015d68 [23045.436044] 0000000000000000 ffffffff82a54b70 ffffffff829c3f00 ffff880228015d68 [23045.436044] Call Trace: [23045.436044] [] dump_stack+0x67/0x90 [23045.436044] [] __lock_acquire+0xcbb/0xe4e [23045.436044] [] ? mark_lock+0x24/0x201 [23045.436044] [] ? mark_held_locks+0x5e/0x74 [23045.436044] [] lock_acquire+0x12f/0x1c3 [23045.436044] [] ? lock_acquire+0x12f/0x1c3 [23045.436044] [] ? btrfs_log_inode+0x13a/0xc95 [btrfs] [23045.436044] [] ? btrfs_log_inode+0x13a/0xc95 [btrfs] [23045.436044] [] mutex_lock_nested+0x77/0x3a7 [23045.436044] [] ? btrfs_log_inode+0x13a/0xc95 [btrfs] [23045.436044] [] ? btrfs_release_delayed_node+0xb/0xd [btrfs] [23045.436044] [] btrfs_log_inode+0x13a/0xc95 [btrfs] [23045.436044] [] ? btrfs_log_inode+0x13a/0xc95 [btrfs] [23045.436044] [] ? vprintk_emit+0x453/0x465 [23045.436044] [] btrfs_log_inode+0x66e/0xc95 [btrfs] [23045.436044] [] log_new_dir_dentries+0x26c/0x359 [btrfs] [23045.436044] [] btrfs_log_inode_parent+0x4a6/0x628 [btrfs] [23045.436044] [] btrfs_log_dentry_safe+0x5a/0x75 [btrfs] [23045.436044] [] btrfs_sync_file+0x304/0x425 [btrfs] [23045.436044] [] vfs_fsync_range+0x8c/0x9e [23045.436044] [] vfs_fsync+0x1c/0x1e [23045.436044] [] do_fsync+0x31/0x4a [23045.436044] [] SyS_fsync+0x10/0x14 [23045.436044] [] entry_SYSCALL_64_fastpath+0x18/0xa8 [23045.436044] [] ? trace_hardirqs_off_caller+0x3f/0xaa An example reproducer for this is: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir /mnt/dir $ touch /mnt/dir/foo $ sync $ mv /mnt/dir/foo /mnt/dir/bar $ touch /mnt/dir/foo $ xfs_io -c "fsync" /mnt/dir/bar This is because while logging the inode of file bar we end up logging its parent directory (since its inode has an unlink_trans field matching the current transaction id due to the rename operation), which in turn logs the inodes for all its new dentries, so that the new inode for the new file named foo gets logged which in turn triggered another logging attempt for the inode we are fsync'ing, since that inode had an old name that corresponds to the name of the new inode. So fix this by ensuring that when logging the inode for a new dentry that has a name matching an old name of some other inode, we don't log again the original inode that we are fsync'ing. Fixes: 44f714dae50a ("Btrfs: improve performance on fsync against new inode after rename/unlink") Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 5 +++-- fs/btrfs/tree-log.h | 5 ++++- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3391f2adf0c8..fea31a4a6e36 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2070,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - btrfs_init_log_ctx(&ctx); + btrfs_init_log_ctx(&ctx, inode); ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ffe92da81b8a..e935035ac034 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2823,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); - btrfs_init_log_ctx(&root_log_ctx); + btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); @@ -4757,7 +4757,8 @@ again: if (ret < 0) { err = ret; goto out_unlock; - } else if (ret > 0) { + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(ctx->inode)) { struct btrfs_key inode_key; struct inode *other_inode; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a9f1b75d080d..ab858e31ccbc 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -30,15 +30,18 @@ struct btrfs_log_ctx { int log_transid; int io_err; bool log_new_dentries; + struct inode *inode; struct list_head list; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, + struct inode *inode) { ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; ctx->log_new_dentries = false; + ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); } -- cgit v1.2.3 From cbd60aa7cd17d81a434234268c55192862147439 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 6 Sep 2016 05:37:40 -0700 Subject: Btrfs: remove root_log_ctx from ctx list before btrfs_sync_log returns We use a btrfs_log_ctx structure to pass information into the tree log commit, and get error values out. It gets added to a per log-transaction list which we walk when things go bad. Commit d1433debe added an optimization to skip waiting for the log commit, but didn't take root_log_ctx out of the list. This patch makes sure we remove things before exiting. Signed-off-by: Chris Mason Fixes: d1433debe7f4346cf9fc0dafc71c3137d2a97bc4 cc: stable@vger.kernel.org # 3.15+ --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs/tree-log.c') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e935035ac034..ef9c55bc7907 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2867,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; -- cgit v1.2.3